Building a simple search engine in Python (with code)

  • October 6, 2019
  • Notes

Today we'll use Python to build a simple search engine.

At its core, a search engine boils down to three things: preprocessing the data, segmenting the text to build an index, and querying that index.

(Here we assume all of the data is UTF-8 encoded.)

We start by collecting all the URLs from a website:

import urllib.request
import urllib.parse
import bs4


def crawl(pages, depth=2):
    for i in range(depth):
        newpages = set()
        for page in pages:
            try:
                c = urllib.request.urlopen(page)
            except Exception:
                print('Invalid page:', page)
                continue
            soup = bs4.BeautifulSoup(c.read(), 'html.parser')
            links = soup('a')  # shorthand for soup.find_all('a')
            for link in links:
                if 'href' in dict(link.attrs):
                    url = urllib.parse.urljoin(page, link['href'])
                    if url.find("'") != -1:
                        continue
                    url = url.split('#')[0]  # drop fragment identifiers
                    if url[0:4] == 'http':   # keep only http/https links
                        newpages.add(url)
        pages = newpages  # crawl the newly found links on the next pass
    return pages

The loop grabs every link on the current page so that we collect as many URLs as possible. We use a set rather than a list to avoid duplicates. The crawled URLs can then be saved to a file, to MySQL, or to MongoDB.
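The post doesn't show the storage step itself. As a minimal sketch, assuming crawl() returns its final URL set as in the version above, the URLs could be written one per line to url.txt; the seed URL is only a placeholder, and the later indexing step expects the author's own fixed-width line format, so this just shows the general idea:

seed_pages = ['https://example.com/']  # placeholder seed URL, not from the original post
urls = crawl(seed_pages, depth=2)

# Write one URL per line so later steps can read them back.
with open('url.txt', 'w', encoding='utf-8') as f:
    for url in urls:
        f.write(url + '\n')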

import sys

# Redirect stdout to lujing.txt so the printed file paths are captured in a file.
# GetFileList and lujing ("path") are defined elsewhere in the author's code and are not shown here.
output = sys.stdout
outputfile = open('lujing.txt', 'w')
sys.stdout = outputfile
filelist = GetFileList(lujing, [])
sys.stdout = output  # restore stdout
outputfile.close()

Read the generated path file lujing.txt and process each text file it lists:

import re
import chardet

# Read the generated path file lujing.txt and, for each path, strip the HTML tags.
# The slice indices below depend on the fixed path format the author uses in lujing.txt.
for line in open("lujing.txt"):
    print(line)
    line1 = line[0:12]
    line2 = line[13:16]
    line3 = line[17:-1]
    line4 = line[17:-6]
    line = line1 + '\\' + line2 + '\\' + line3
    print(line4)
    path = line
    fb = open(path, "rb")
    data = fb.read()
    bianma = chardet.detect(data)['encoding']  # detect the file's encoding and use it below
    page = open(line, 'r', encoding=bianma, errors='ignore').read()
    dr = re.compile(r'<[^>]+>', re.S)  # regex that matches HTML tags
    dd = dr.sub('', page)
    print(dd)
    # Write the tag-stripped text into the TXT folder, keeping the original name as a .txt file
    fname = 'TXT' + '\\' + line4 + '.txt'
    f = open(fname, "w+", encoding=bianma)
    f.write(dd)
    f.close()

Next comes word segmentation and indexing.

Since most readers are more familiar with SQL, I'll write the MySQL version here; if you need the MongoDB version, message the official account.
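As a quick illustration of what jieba's search-mode segmentation produces (a minimal standalone sketch; the sample sentence is arbitrary):

import jieba

# Search mode emits long words plus their shorter sub-words, which makes the
# index more forgiving for partial queries.
sample = '我們使用Python搭建簡易的搜索引擎'
print(list(jieba.cut_for_search(sample)))
# Typically includes both '搜索引擎' and its parts such as '搜索' and '引擎'.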

import jieba
import chardet
import pymysql

# If you prefer MongoDB:
# from pymongo import MongoClient
# client = MongoClient('localhost', 27017)
# apiDB = client['urlDB']
# questionnaires = apiDB['weburl']
# data = list(questionnaires.find())

conn = pymysql.connect(host="localhost", user="root",
                       password="123456", db="suoyin", port=3307)
c = conn.cursor()
c.execute('drop table if exists doc')
c.execute('create table doc (id int primary key, link text)')
c.execute('drop table if exists word')
c.execute('create table word (term varchar(25) primary key, list text)')
conn.commit()


def Fenci():
    num = 0
    for line in open("url.txt"):
        lujing = line
        print(lujing)
        num += 1
        line = line[17:-5]  # slice indices depend on the author's url.txt format
        line = 'TXT' + '\\' + line + '.txt'  # path of the tag-stripped file saved above
        print(line)
        path = line
        fb = open(path, "rb")
        data = fb.read()
        bianma = chardet.detect(data)['encoding']  # detect the file's encoding
        print(bianma)
        if bianma == 'UTF-16':
            data = data.decode('UTF-16')
            data = data.encode('utf-8')
        word = jieba.cut_for_search(data)  # search-mode segmentation
        seglist = list(word)
        print(seglist)

        # Record the document itself
        c = conn.cursor()
        c.execute('insert into doc values(%s, %s)', (num, lujing))
        # Update the term table for every segmented word
        for word in seglist:
            # Check whether the term is already in the database
            c.execute('select list from word where term=%s', (word,))
            result = c.fetchall()
            if len(result) == 0:
                # New term: start its posting list with this document id
                docliststr = str(num)
                c.execute('insert into word values(%s, %s)', (word, docliststr))
            else:
                # Existing term: append this document id to its posting list
                docliststr = result[0][0]
                docliststr += ' ' + str(num)
                c.execute('update word set list=%s where term=%s', (docliststr, word))
        conn.commit()


Fenci()
conn.close()
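After Fenci() runs, the word table is a simple inverted index: each term maps to a space-separated string of document ids, and doc maps those ids back to file paths. A quick way to inspect it (an illustrative sketch, repeating the connection parameters from above):

import pymysql

conn = pymysql.connect(host="localhost", user="root",
                       password="123456", db="suoyin", port=3307)
c = conn.cursor()
c.execute('select term, list from word limit 5')
for term, doclist in c.fetchall():
    print(term, '->', doclist)  # e.g. a term -> '1 3 7' (space-separated document ids)
conn.close()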

The last step is the query:

import pymysql
import jieba
import math

conn = pymysql.connect(host="localhost", user="root",
                       password="123456", db="suoyin", port=3307)
c = conn.cursor()
c.execute('select count(*) from doc')
N = 1 + c.fetchall()[0][0]  # total number of documents (plus one)
target = input('Enter a search query: ')
seggen = jieba.cut_for_search(target)
score = {}  # document id -> relevance score
for word in seggen:
    print('Query term:', word)
    # Compute this term's contribution to the scores
    tf = {}  # document id -> term frequency
    c.execute('select list from word where term=%s', (word,))
    result = c.fetchall()
    if len(result) > 0:
        doclist = result[0][0]
        doclist = doclist.split(' ')
        # Convert the posting-list string into a list of ints
        doclist = [int(x) for x in doclist]
        # Document frequency of the current word
        df = len(set(doclist))
        idf = math.log(N / df)
        print('idf:', idf)
        for num in doclist:
            if num in tf:
                tf[num] = tf[num] + 1
            else:
                tf[num] = 1
        # tf counting done; now accumulate the tf-idf scores
        for num in tf:
            if num in score:
                # The document already has a score, so add to it
                score[num] = score[num] + tf[num] * idf
            else:
                score[num] = tf[num] * idf

sortedlist = sorted(score.items(), key=lambda d: d[1], reverse=True)

cnt = 0
for num, docscore in sortedlist:
    cnt = cnt + 1
    c.execute('select link from doc where id=%s', (num,))
    url = c.fetchall()[0][0]
    print("Result Ranking:", cnt)
    print('url:', url, 'match degree:', docscore)
    if cnt > 20:
        break
if cnt == 0:
    print('No result')
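The ranking above is a plain tf-idf sum: for each query term the code computes idf = ln(N / df), where N is the document count (plus one, as in the code above) and df is how many documents contain the term; every document containing the term then gets tf × idf added to its score. A tiny worked example with made-up numbers:

import math

# Illustrative numbers only: 11 documents in total, the term appears in 3 of
# them, and twice in the document being scored.
N, df, tf = 11, 3, 2
idf = math.log(N / df)   # ≈ 1.30
print(tf * idf)          # ≈ 2.60 is added to that document's score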

Done.