Implementing a Simple Search Engine in Python (with Code)
- October 6, 2019
- Notes
Today we'll build a simple search engine with Python.
At its core, a search engine does three things: preprocess the data, tokenize it to build an index, and answer queries.
(Here we assume all of the data is UTF-8 encoded.)
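Before diving into the code, it helps to keep the core data structure in mind: an inverted index, i.e. a mapping from each term to the list of documents that contain it. A minimal in-memory sketch (the two sample documents are made up for illustration):

```python
# Minimal inverted-index sketch; the two "documents" are made-up examples.
docs = {1: 'python search engine', 2: 'python web crawler'}

index = {}
for doc_id, text in docs.items():
    for term in text.split():                 # the real code below uses jieba instead of split()
        index.setdefault(term, []).append(doc_id)

print(index['python'])  # -> [1, 2]: both documents contain "python"
```

The MySQL `word` table built later in this post stores exactly this mapping, with each posting list kept as a space-separated string of document ids.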
We start by crawling a website and collecting all of its URLs:
```python
import urllib.request
import urllib.parse
import bs4

def crawl(pages, depth=2):
    for i in range(depth):
        newpages = set()
        for page in pages:
            try:
                c = urllib.request.urlopen(page)
            except:
                print('Invalid page:', page)
                continue
            soup = bs4.BeautifulSoup(c.read(), 'html.parser')
            links = soup('a')
            for link in links:
                if 'href' in dict(link.attrs):
                    url = urllib.parse.urljoin(page, link['href'])
                    if url.find("'") != -1:
                        continue
                    url = url.split('#')[0]   # drop the fragment part
                    if url[0:4] == 'http':    # keep only http/https links
                        newpages.add(url)
        pages = newpages
```
The loop grabs every link on the current page so that we collect as many URLs as possible. We use a set rather than a list to avoid duplicate pages, and the crawled URLs can then be stored in a file, in MySQL, or in MongoDB.
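A minimal way to kick off the crawl (the seed URL below is only an example, not from the original post):

```python
# Hypothetical seed URL, for illustration only.
seed_pages = {'http://example.com'}
crawl(seed_pages, depth=2)
```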
```python
import sys

# Temporarily redirect stdout to lujing.txt so every path printed by
# GetFileList ends up in that file.
output = sys.stdout
outputfile = open('lujing.txt', 'w')
sys.stdout = outputfile
filelist = GetFileList(lujing, [])   # `lujing` (the directory to scan) is not defined in this snippet
outputfile.close()
sys.stdout = output                  # restore the original stdout
```
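The GetFileList helper is never shown in the post. A minimal sketch of what it might do, assuming it just walks a directory and emits every file path (the name and call signature follow the line above, but the body is my own guess):

```python
import os

def GetFileList(directory, file_list):
    # Walk `directory` recursively; print each file path (captured into
    # lujing.txt by the stdout redirect above) and also collect it in file_list.
    for root, dirs, files in os.walk(directory):
        for name in files:
            path = os.path.join(root, name)
            print(path)
            file_list.append(path)
    return file_list
```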
Next, read the generated path file lujing.txt and process each text file it points to, stripping the HTML tags:
```python
import re
import chardet

# Read the path file lujing.txt and, for each path, strip the HTML tags
# and save the plain text into the TXT folder under the original name.
for line in open("lujing.txt"):
    print(line)
    # line = line[0:-2]
    line1 = line[0:12]
    line2 = line[13:16]
    line3 = line[17:-1]
    line4 = line[17:-6]
    line = line1 + '\\' + line2 + '\\' + line3   # rebuild the Windows path
    print(line4)
    path = line
    fb = open(path, "rb")
    data = fb.read()
    bianma = chardet.detect(data)['encoding']  # detect the file's encoding and reuse it below
    page = open(line, 'r', encoding=bianma, errors='ignore').read()
    dr = re.compile(r'<[^>]+>', re.S)  # regex matching HTML tags
    dd = dr.sub('', page)              # strip the tags
    print(dd)
    fname = 'TXT' + "\\" + line4 + ".txt"
    # print(fname)
    f = open(fname, "w+", encoding=bianma)  # write the de-tagged text to the TXT folder as a .txt file
    # fo = open(fname, "w+")
    f.write(dd)
```
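To see what the tag-stripping regex does on its own, a quick standalone check (the sample HTML string is made up):

```python
import re

html = '<html><body><p>Hello, search engine!</p></body></html>'  # made-up sample
print(re.sub(r'<[^>]+>', '', html))  # -> Hello, search engine!
```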
Now we tokenize the text and build the index.
Since most readers are more familiar with SQL, I'll write the MySQL version here; if you need a MongoDB version, message me on the official WeChat account.
```python
import jieba
import chardet
import pymysql
import importlib, sys
importlib.reload(sys)

# If you use MongoDB instead:
# from pymongo import MongoClient
# client = MongoClient('localhost', 27017)
# apiDB = client['urlDB']            # serverDB_name: test_nodedata
# questionnaires = apiDB['weburl']
# data = list(questionnaires.find())

conn = pymysql.connect(host="localhost", user="root",
                       password="123456", db="suoyin", port=3307)
c = conn.cursor()
c.execute('drop table if exists doc')
c.execute('create table doc (id int primary key, link text)')
c.execute('drop table if exists word')
c.execute('create table word (term varchar(25) primary key, list text)')
conn.commit()


def Fenci():
    num = 0
    for line in open("url.txt"):
        lujing = line
        print(lujing)
        num += 1
        print(line)
        line = line[17:-5]
        print(line)
        line = 'TXT' + '\\' + line + 'Txt'   # line is the location of the de-tagged file
        print(line)                          # file name
        path = line
        fb = open(path, "rb")
        data = fb.read()
        bianma = chardet.detect(data)['encoding']  # detect the file encoding
        print(bianma)
        # page = open(line, 'r', encoding=bianma, errors='ignore').read()
        # page1 = page.decode('UTF-8')
        if bianma == 'UTF-16':
            data = data.decode('UTF-16')
            data = data.encode('utf-8')
        word = jieba.cut_for_search(data)
        seglist = list(word)
        print(seglist)
        c = conn.cursor()  # create a cursor
        c.execute('insert into doc values(%s,%s)', (num, lujing))
        # build the term table from the tokens
        for word in seglist:
            # check whether this term is already in the database
            c.execute('select list from word where term=%s', (word,))
            result = c.fetchall()
            if len(result) == 0:
                # not there yet: insert a new posting list
                docliststr = str(num)
                c.execute('insert into word values(%s,%s)', (word, docliststr))
            else:
                # already there: append this document id to its posting list
                docliststr = result[0][0]
                docliststr += ' ' + str(num)
                c.execute('update word set list=%s where term=%s', (docliststr, word))
    conn.commit()
    conn.close()


Fenci()
```
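For reference, jieba.cut_for_search returns both whole words and the shorter words contained inside them, which is why a posting list can contain the same document id more than once; the query step below counts those repeats as term frequency. A quick standalone check (the sample sentence is made up):

```python
import jieba

# Made-up sample sentence, just to show the tokenizer's output.
print(list(jieba.cut_for_search('我們使用python搭建簡易的搜索引擎')))
```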
The last step is the query:
```python
import pymysql
import jieba
import math

conn = pymysql.connect(host="localhost", user="root",
                       password="123456", db="suoyin", port=3307)
c = conn.cursor()
c.execute('select count(*) from doc')
N = 1 + c.fetchall()[0][0]  # total number of documents

target = input('Enter search terms: ')
seggen = jieba.cut_for_search(target)
score = {}  # document id -> relevance score

for word in seggen:
    print('query term:', word)
    # compute this term's contribution to the scores
    tf = {}  # document id -> term frequency
    c.execute('select list from word where term=%s', (word,))
    result = c.fetchall()
    if len(result) > 0:
        doclist = result[0][0]
        doclist = doclist.split(' ')
        # convert the string into a list of ints
        doclist = [int(x) for x in doclist]
        # document frequency of the current word
        df = len(set(doclist))
        idf = math.log(N / df)
        print('idf:', idf)
        for num in doclist:
            if num in tf:
                tf[num] = tf[num] + 1
            else:
                tf[num] = 1
        # tf counting is done, now accumulate the scores
        for num in tf:
            if num in score:
                # this document already has a score, so add to it
                score[num] = score[num] + tf[num] * idf
            else:
                score[num] = tf[num] * idf

sortedlist = sorted(score.items(), key=lambda d: d[1], reverse=True)

cnt = 0
for num, docscore in sortedlist:
    cnt = cnt + 1
    c.execute('select link from doc where id=%s', (num,))
    url = c.fetchall()[0][0]
    print("Result Ranking:", cnt)
    print('url:', url, 'match degree:', docscore)
    if cnt > 20:
        break
if cnt == 0:
    print('No result')
```
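The ranking is a plain TF-IDF sum: for each query term, idf = log(N / df), and a document's score is the sum over the query terms of tf × idf. As a made-up worked example: with N = 101 documents and a term appearing in df = 10 of them, idf = ln(101/10) ≈ 2.31, so a document containing that term 3 times gets roughly 3 × 2.31 ≈ 6.9 added to its score.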
Done.