Building a Simple Search Engine in Python (with Code)

  • October 6, 2019
  • Notes

Today we will use Python to build a simple search engine.

At its core, a search engine boils down to three steps: preprocessing the data, segmenting the text to build an index, and running queries against that index.

(Here we assume all of the data is UTF-8 encoded.)
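Before the full crawler-plus-MySQL pipeline below, here is a minimal in-memory sketch of those three steps. It is purely illustrative: the two documents and the dictionary-based index are made up, but it uses jieba for segmentation just like the rest of the post.

import jieba

# two hypothetical "preprocessed" documents, keyed by doc id
docs = {1: '今天天气很好', 2: '今天我们用 python 搭建搜索引擎'}

# build an inverted index: term -> set of doc ids that contain it
index = {}
for doc_id, text in docs.items():
    for term in jieba.cut_for_search(text):
        index.setdefault(term, set()).add(doc_id)

# query: look up which documents contain the term '搜索'
print(index.get('搜索', set()))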

First, we collect all the URLs from a website:

import urllib.request
import urllib.parse
import bs4

def crawl(pages, depth=2):
    for i in range(depth):
        newpages = set()
        for page in pages:
            try:
                c = urllib.request.urlopen(page)
            except Exception:
                print('Invalid page:', page)
                continue
            soup = bs4.BeautifulSoup(c.read(), 'html.parser')
            links = soup('a')
            for link in links:
                if 'href' in dict(link.attrs):
                    url = urllib.parse.urljoin(page, link['href'])
                    if url.find("'") != -1:
                        continue
                    url = url.split('#')[0]      # drop the fragment part
                    if url[0:3] == 'http':       # keep only http/https links
                        newpages.add(url)
        pages = newpages
    return pages                                 # final set of discovered URLs

The loop grabs every link on the current page, following links as broadly as possible. We use a set rather than a list so that duplicate URLs are discarded automatically. The crawled URLs can then be stored in a plain file, in MySQL, or in MongoDB.
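The indexing step later in this post reads its URL list from url.txt, so the simplest option is to dump the set returned by crawl() into that file. A minimal sketch; the helper name save_urls and the seed URL are made up for illustration:

def save_urls(urls, path='url.txt'):
    # one URL per line, so the indexing step can read them back line by line
    with open(path, 'w', encoding='utf-8') as f:
        for url in sorted(urls):
            f.write(url + '\n')

# example usage with a hypothetical seed page
# save_urls(crawl({'http://example.com/'}))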

import sys

output = sys.stdout                    # keep a reference to the original stdout
outputfile = open('lujing.txt', 'w')
sys.stdout = outputfile                # everything print()ed below goes into lujing.txt
file_list = GetFileList(lujing, [])    # lujing is the directory holding the saved pages
sys.stdout = output                    # restore stdout
outputfile.close()
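GetFileList itself is not shown in the post. A plausible minimal version, assuming it simply walks a directory tree recursively, printing (and collecting) every file path, so that the redirected stdout writes one path per line into lujing.txt:

import os

def GetFileList(dir, fileList):
    # recursively collect every file path under dir
    if os.path.isfile(dir):
        fileList.append(dir)
        print(dir)                     # this print is what ends up in lujing.txt
    elif os.path.isdir(dir):
        for name in os.listdir(dir):
            GetFileList(os.path.join(dir, name), fileList)
    return fileList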

Now read the generated path file lujing.txt and process each file it lists:

# Read the generated path file lujing.txt and, following it, strip the HTML tags from each file
import re
import chardet

for line in open("lujing.txt"):
    print(line)
    # line = line[0:-2]
    line1 = line[0:12]
    line2 = line[13:16]
    line3 = line[17:-1]
    line4 = line[17:-6]
    line = line1 + '\\' + line2 + '\\' + line3   # rebuild the Windows path (slice offsets match the author's directory layout)
    print(line4)
    path = line
    fb = open(path, "rb")
    data = fb.read()
    bianma = chardet.detect(data)['encoding']    # detect this file's encoding and process it accordingly
    page = open(line, 'r', encoding=bianma, errors='ignore').read()
    dr = re.compile(r'<[^>]+>', re.S)            # regex matching HTML tags
    dd = dr.sub('', page)                        # strip the tags
    print(dd)
    fname = 'TXT' + '\\' + line4 + '.txt'
    # print(fname)
    f = open(fname, "w+", encoding=bianma)       # save the de-tagged text into the TXT folder, keeping the original name
    # fo = open(fname, "w+")
    f.write(dd)
    f.close()

Next comes word segmentation and building the index:

Since most readers are familiar with SQL, I will use the MySQL version here; if you need the MongoDB version, send a message to the public account.

import jieba
import chardet
import pymysql

# If you are using MongoDB instead:
# from pymongo import MongoClient
# client = MongoClient('localhost', 27017)
# apiDB = client['urlDB']    # serverDB_name: test_nodedata
# questionnaires = apiDB['weburl']
# data = list(questionnaires.find())

conn = pymysql.connect(host="localhost", user="root",
                       password="123456", db="suoyin", port=3307)
c = conn.cursor()
c.execute('drop table if exists doc')
c.execute('create table doc (id int primary key, link text)')
c.execute('drop table if exists word')
c.execute('create table word (term varchar(25) primary key, list text)')
conn.commit()


def Fenci():
    num = 0
    for line in open("url.txt"):
        lujing = line
        print(lujing)
        num += 1
        line = line[17:-5]
        line = 'TXT' + '\\' + line + '.txt'          # path of the de-tagged text file
        print(line)
        path = line
        fb = open(path, "rb")
        data = fb.read()
        bianma = chardet.detect(data)['encoding']    # detect the file encoding
        print(bianma)
        if bianma == 'UTF-16':
            data = data.decode('UTF-16')
            data = data.encode('utf-8')
        word = jieba.cut_for_search(data)
        seglist = list(word)
        print(seglist)

        # record the document, then update the term -> doc-id-list table
        c = conn.cursor()
        c.execute('insert into doc values(%s, %s)', (num, lujing))
        for word in seglist:
            # check whether this term is already in the database
            c.execute('select list from word where term=%s', (word,))
            result = c.fetchall()
            if len(result) == 0:
                # not there yet: start a new doc-id list
                docliststr = str(num)
                c.execute('insert into word values(%s, %s)', (word, docliststr))
            else:
                # already there: append this doc id to the stored list
                docliststr = result[0][0]
                docliststr += ' ' + str(num)
                c.execute('update word set list=%s where term=%s', (docliststr, word))
        conn.commit()
    conn.close()


Fenci()
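Once Fenci() has finished, you can sanity-check the index by reading a few rows of the word table directly. This quick check is not part of the original pipeline; it only reuses the connection settings and table layout defined above:

import pymysql

conn = pymysql.connect(host="localhost", user="root",
                       password="123456", db="suoyin", port=3307)
c = conn.cursor()
c.execute('select term, list from word limit 5')
for term, doclist in c.fetchall():
    print(term, '->', doclist)   # each term maps to a space-separated list of doc ids
conn.close()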

The last step is the query:

import pymysql
import jieba
import math

conn = pymysql.connect(host="localhost", user="root",
                       password="123456", db="suoyin", port=3307)
c = conn.cursor()
c.execute('select count(*) from doc')
N = 1 + c.fetchall()[0][0]  # total number of documents (+1 to avoid a zero denominator)
target = input('Enter a search query: ')
seggen = jieba.cut_for_search(target)
score = {}  # doc id -> relevance score
for word in seggen:
    print('query term:', word)
    # compute this term's contribution to the score
    tf = {}  # doc id -> term frequency
    c.execute('select list from word where term=%s', (word,))
    result = c.fetchall()
    if len(result) > 0:
        doclist = result[0][0]
        doclist = doclist.split(' ')
        # convert the string of ids into a list of ints
        doclist = [int(x) for x in doclist]
        # document frequency of the current word
        df = len(set(doclist))
        idf = math.log(N / df)
        print('idf:', idf)
        for num in doclist:
            if num in tf:
                tf[num] = tf[num] + 1
            else:
                tf[num] = 1
        # tf counting done; now accumulate tf-idf into the score
        for num in tf:
            if num in score:
                # the document already has a score, so accumulate
                score[num] = score[num] + tf[num] * idf
            else:
                score[num] = tf[num] * idf

sortedlist = sorted(score.items(), key=lambda d: d[1], reverse=True)

cnt = 0
for num, docscore in sortedlist:
    cnt = cnt + 1
    c.execute('select link from doc where id=%s', (num,))
    url = c.fetchall()[0][0]
    print("Result Ranking:", cnt)
    print('url:', url, 'match degree:', docscore)
    if cnt > 20:
        break
if cnt == 0:
    print('No result')
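The ranking above is a plain TF-IDF sum: each query term contributes tf × idf to every document it appears in, with idf = log(N / df), where N is the document count plus one (as in the code) and df is the number of documents containing the term. A quick check of the arithmetic on made-up numbers:

import math

N = 1 + 10               # suppose 10 documents are indexed; the code adds 1
df = 2                   # the query term appears in 2 of them
idf = math.log(N / df)   # log(11 / 2), roughly 1.70
tf = 3                   # the term occurs 3 times in one matching document
print(tf * idf)          # that document's score rises by roughly 5.11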

Done.