静态html提取正文的API和开源算法
- 2019 年 10 月 5 日
- 笔记
版权声明:本文为博主原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://blog.csdn.net/xc_zhou/article/details/100994876
1.arex https://github.com/ahkimkoo/arex 2.Html2Article http://www.cnblogs.com/jasondan/p/3497757.html
主要python包:requests+lxml+jparser+url2io。
其中jparser、url2io都用于网页文本正文提取,url2io准确率高,但不稳定,解析错误时则调用jparser。通过两者结合使用来提高正文提取的效果。
jparser
- 安装
pip install jparser
- 使用
可参考官网:https://pypi.org/project/jparser/0.0.10/
url2io
- 下载安装,即下载url2io.py文件。可以到这个github项目上下载:https://github.com/Neo-Luo/scrapy_baidu ,也可以到github主页下载最新版:https://github.com/url2io/url2io-python-sdk/
- 官网注册 获取token:http://url2io.applinzi.com/
- 使用:https://github.com/url2io/url2io-python-sdk/
- url2io python3
#coding: utf-8
#
# This program is free software. It comes without any warranty, to
# the extent permitted by applicable law. You can redistribute it
# and/or modify it under the terms of the Do What The Fuck You Want
# To Public License, Version 2, as published by Sam Hocevar. See
# http://sam.zoy.org/wtfpl/COPYING (copied as below) for more details.
#
#                DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
#                        Version 2, December 2004
#
#     Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
#
#     Everyone is permitted to copy and distribute verbatim or modified
#     copies of this license document, and changing it is allowed as long
#     as the name is changed.
#
#                DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
#       TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
#
#      0. You just DO WHAT THE FUCK YOU WANT TO.

"""a simple url2io sdk

example:
    api = API(token)
    api.article(url='http://www.url2io.com/products', fields=['next', 'text'])
"""

__all__ = ['APIError', 'API']

DEBUG_LEVEL = 1

import sys
import socket
import json
import urllib
import urllib.request
import urllib.error
import urllib.parse
from urllib.request import urlopen
import time
# ``Iterable`` lives in ``collections.abc``; the alias in ``collections``
# was removed in Python 3.10, so importing it from there breaks the module.
from collections.abc import Iterable


class APIError(Exception):
    """Error raised when a url2io request fails.

    Raised by the API proxy for an HTTP error response, and (with
    ``code == -1``) when the response body cannot be JSON-decoded.
    """

    # HTTP status code (-1 for a JSON decode failure)
    code = None
    # request URL
    url = None
    # server response body; or detailed error information
    body = None

    def __init__(self, code, url, body):
        # Populate ``Exception.args`` as well so the error pickles and
        # prints sensibly through generic exception handling.
        super().__init__(code, url, body)
        self.code = code
        self.url = url
        self.body = body

    def __str__(self):
        return 'code={s.code}\nurl={s.url}\n{s.body}'.format(s=self)

    __repr__ = __str__


class API(object):
    """Entry point of the SDK.

    Every path listed in ``_APIS`` is exposed as a callable attribute,
    e.g. ``api.article(url=..., fields=[...])``.
    """

    token = None
    server = 'http://api.url2io.com/'

    decode_result = True
    timeout = None
    max_retries = None
    retry_delay = None

    def __init__(self, token, srv=None, decode_result=True, timeout=30,
                 max_retries=5, retry_delay=3):
        """:param token: API token sent with every request
        :param srv: The API server address
        :param decode_result: whether to json_decode the result
        :param timeout: HTTP request timeout in seconds (``None`` disables it)
        :param max_retries: maximal number of retries after catching URL error
            or socket error
        :param retry_delay: time to sleep before retrying"""
        self.token = token
        if srv:
            self.server = srv
        self.decode_result = decode_result
        # Test for None first: ``None >= 0`` raises TypeError on Python 3.
        assert timeout is None or timeout >= 0
        assert max_retries >= 0
        self.timeout = timeout
        self.max_retries = max_retries
        self.retry_delay = retry_delay

        _setup_apiobj(self, self, [])

    def update_request(self, request):
        """overwrite this function to update the request before sending it to
        server"""
        pass


def _setup_apiobj(self, apiobj, path):
    """Attach one ``_APIProxy`` attribute to *self* per child of *path*.

    :param self: the object receiving the attributes (an ``API`` or a proxy)
    :param apiobj: the owning ``API`` instance
    :param path: list of path components already consumed
    """
    if self is not apiobj:
        self._api = apiobj
        self._urlbase = apiobj.server + '/'.join(path)

    lvl = len(path)
    done = set()
    for i in _APIS:
        if len(i) <= lvl:
            continue
        cur = i[lvl]
        if i[:lvl] == path and cur not in done:
            done.add(cur)
            setattr(self, cur, _APIProxy(apiobj, i[:lvl + 1]))


class _APIProxy(object):
    """Callable bound to a single API path of an ``API`` instance."""

    _api = None
    _urlbase = None

    def __init__(self, apiobj, path):
        _setup_apiobj(self, apiobj, path)

    def __call__(self, post=False, *args, **kwargs):
        """Send the request and return the (optionally JSON-decoded) response.

        Example keyword arguments for ``/article``::

            url = 'http://xxxx.xxx',
            fields = ['next',],

        :raises TypeError: on positional arguments or a non-bool *post*
        :raises APIError: on an HTTP error response or undecodable JSON
        """
        if len(args):
            raise TypeError('only keyword arguments are allowed')
        if not isinstance(post, bool):
            raise TypeError('post argument can only be True or False')

        url = self.geturl(**kwargs)

        request = urllib.request.Request(url)
        self._api.update_request(request)

        retry = self._api.max_retries
        while True:
            retry -= 1
            try:
                ret = urlopen(request, timeout=self._api.timeout).read()
                break
            except urllib.error.HTTPError as e:
                # HTTPError is a URLError subclass, so it must be handled
                # first: an HTTP error status is never retried.
                raise APIError(e.code, url, e.read())
            except (socket.error, urllib.error.URLError) as e:
                if retry < 0:
                    raise
                _print_debug('caught error: {}; retrying'.format(e))
                time.sleep(self._api.retry_delay)

        if self._api.decode_result:
            try:
                ret = json.loads(ret)
            except ValueError:
                # JSONDecodeError and UnicodeDecodeError both derive from
                # ValueError.
                raise APIError(-1, url, 'json decode error, value={0!r}'.format(ret))
        return ret

    def _mkarg(self, kargs):
        """change the argument list (stringify values, add the api token)

        Non-string iterables are joined with ``,``; everything else is passed
        as text. ``urllib.parse.urlencode`` quotes ``str`` values as UTF-8 by
        default, so no explicit encoding is needed here.

        :return: the new argument list"""
        args = dict(kargs)
        args['token'] = self._api.token
        result = {}
        for k, v in args.items():
            if isinstance(v, Iterable) and not isinstance(v, str):
                result[k] = ','.join([str(i) for i in v])
            else:
                result[k] = v if isinstance(v, str) else str(v)
        return result

    def geturl(self, **kargs):
        """return the request url"""
        return self._urlbase + '?' + urllib.parse.urlencode(self._mkarg(kargs))


def _print_debug(msg):
    """Write *msg* to stderr when ``DEBUG_LEVEL`` is truthy."""
    if DEBUG_LEVEL:
        sys.stderr.write(str(msg) + '\n')


_APIS = [
    '/article',
    #'/images',
]

# Split each path into its components, e.g. '/article' -> ['article'].
_APIS = [i.split('/')[1:] for i in _APIS]
- 主要代码
# -*- coding:utf-8 -*-
import url2io
import requests
import time
from jparser import PageModel
from newspaper import Article

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}


def get_url2io(url):
    """Extract the article text of *url* through the url2io web service.

    Uses the module-level ``api`` object, which the ``__main__`` section
    below creates. Best effort: any failure yields an empty string so the
    caller can fall back to ``get_jparser``.

    :param url: address of the page to extract
    :return: the article text with CR/LF characters removed, or ``''``
    """
    try:
        ret = api.article(url=url, fields=['text', 'next'])
        # Strip line breaks (carriage return and newline), not the letters
        # 'r' and 'n'.
        return ret['text'].replace('\r', '').replace('\n', '')
    except Exception:
        # Deliberately broad: this function is a best-effort extractor.
        return ''


def get_jparser(url):
    """Download *url* and extract its main text with jparser.

    Best effort: any failure (network, decoding, parsing) yields ``''``.

    :param url: address of the page to extract
    :return: concatenation of all text segments found by jparser, or ``''``
    """
    try:
        # Without a timeout requests may wait on a stalled server forever.
        response = requests.get(url, headers=headers, timeout=30)
        en_code = response.encoding
        de_code = response.apparent_encoding
        if de_code is None:
            # No detected encoding: trust the declared one, else assume UTF-8.
            de_code = en_code or 'utf-8'
        elif de_code in ('ISO-8859-1', 'ISO-8859-2', 'Windows-1254', 'UTF-8-SIG'):
            # The author treats these detection results as UTF-8 pages.
            de_code = 'utf-8'
        # Decode the raw bytes directly; round-tripping ``response.text``
        # through ``en_code`` is lossy and fails when ``en_code`` is None.
        html = response.content.decode(de_code, errors='ignore')
        pm = PageModel(html)
        result = pm.extract()
        ans = [x['data'] for x in result['content'] if x['type'] == 'text']
        return ''.join(ans)
    except Exception:
        # Deliberately broad: this function is a best-effort extractor.
        return ''


if __name__ == '__main__':
    token = '111111111'  # register on the url2io website to obtain a token
    api = url2io.API(token)
    # Sample pages; the last assignment is the one actually used.
    url = 'https://36kr.com/p/5245238'
    url = 'http://sc.stock.cnfol.com/ggzixun/20190909/27678429.shtml'
    url = 'https://news.pedaily.cn/201908/445881.shtml'
    # content=get_url2io(url)
    content = get_jparser(url)
    print(content)
Python Goose的使用:
代码比较方便,但是有些网址没有解析出来。 示例代码如下所示:
from goose import Goose
from goose.text import StopWordsChinese

url = 'http://www.chinanews.com/gj/2014/11-19/6791729.shtml'
# The configuration key is 'stopwords_class'; the misspelt 'stipwords_class'
# meant the Chinese stop-word list was never applied.
g = Goose({'stopwords_class': StopWordsChinese})
article = g.extract(url=url)
# Call form of print works on both Python 2 and Python 3.
print(article.cleaned_text[:150])
结果:效果不好,有些网址解析不出来。

基于行块分布函数的通用网页正文抽取 http://wenku.baidu.com/link?url=TOBoIHWT_k68h5z8k_Pmqr-wJMPfCy2q64yzS8hxsgTg4lMNH84YVfOCWUfvfORTlccMWe5Bd1BNVf9dqIgh75t4VQ728fY2Rte3x3CQhaS
网页正文及内容图片提取算法 http://www.jianshu.com/p/d43422081e4b
这一算法的主要原理基于两点:
正文区密度:在去除HTML中所有tag之后,正文区字符密度更高,较少出现多行空白; 行块长度:非正文区域的内容在单独标签(行块)中一般较短。
测试源码: https://github.com/rainyear/cix-extractor-py/blob/master/extractor.py#L9
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
import requests as req
import re

DBUG = 0

# ``[\s\S]`` matches any character including newlines.
reBODY = r'<body.*?>([\s\S]*?)</body>'
reCOMM = r'<!--.*?-->'
reTRIM = r'<{0}.*?>([\s\S]*?)</{0}>'
# Any tag, plus every whitespace character except the newline.
reTAG = r'<[\s\S]*?>|[ \t\r\f\v]'
reIMG = re.compile(r'<img[\s\S]*?src=[\'|"]([\s\S]*?)[\'|"][\s\S]*?>')


class Extractor():
    """Extract the main text of a web page from its line-block length
    distribution."""

    def __init__(self, url="", blockSize=3, timeout=5, image=False):
        """
        :param url: address of the page to fetch
        :param blockSize: number of consecutive lines summed into one block
        :param timeout: HTTP timeout in seconds passed to ``requests.get``
        :param image: when true, keep image URLs as ``{{src}}`` placeholders
        """
        self.url = url
        self.blockSize = blockSize
        self.timeout = timeout
        self.saveImage = image
        self.rawPage = ""
        self.ctexts = []
        self.cblocks = []

    def getRawPage(self):
        """Fetch ``self.url``.

        The response is always decoded as UTF-8, whatever the server declares.

        :return: ``(status_code, text)``
        """
        resp = req.get(self.url, timeout=self.timeout)

        if DBUG: print(resp.encoding)

        resp.encoding = "UTF-8"

        return resp.status_code, resp.text

    # Remove all tags, including style and script contents, but keep the
    # original newline characters (\n).
    def processTags(self):
        """Strip comments, ``<style>``/``<script>`` elements, tags and
        non-newline whitespace from ``self.body`` in place."""
        self.body = re.sub(reCOMM, "", self.body)
        self.body = re.sub(reTRIM.format("script"), "", re.sub(reTRIM.format("style"), "", self.body))
        # self.body = re.sub(r"[\n]+","\n", re.sub(reTAG, "", self.body))
        self.body = re.sub(reTAG, "", self.body)

    # Split the page into lines; block i is the text of lines
    # [i, i + blockSize), and cblocks holds each block's length by line number.
    def processBlocks(self):
        """Locate the densest run of lines in ``self.body`` and return it.

        Sets ``ctexts``, ``textLens``, ``cblocks``, ``start`` and ``end``.

        :return: the selected lines joined without separators
        """
        self.ctexts = self.body.split("\n")
        self.textLens = [len(text) for text in self.ctexts]

        lines = len(self.ctexts)
        blockCount = max(lines - self.blockSize - 1, 0)
        self.cblocks = [sum(self.textLens[i:i + self.blockSize]) for i in range(blockCount)]

        if not self.cblocks:
            # Too few lines to form a single block: the whole text is the
            # content (avoids max() on an empty list).
            self.start, self.end = 0, lines
            return "".join(self.ctexts)

        maxTextLen = max(self.cblocks)

        if DBUG: print(maxTextLen)

        minTextLen = min(self.textLens)  # loop invariant, computed once
        self.start = self.end = self.cblocks.index(maxTextLen)
        while self.start > 0 and self.cblocks[self.start] > minTextLen:
            self.start -= 1
        # Bound by the number of blocks so cblocks is never indexed past
        # its end.
        while self.end < blockCount and self.cblocks[self.end] > minTextLen:
            self.end += 1
        if self.end == blockCount:
            # The content runs to the last block; keep the trailing lines
            # that this block covers.
            self.end = lines

        return "".join(self.ctexts[self.start:self.end])

    # To extract images from the content area, keep the <img> source before
    # the tags are stripped.
    def processImages(self):
        """Replace every ``<img>`` tag in ``self.body`` with ``{{src}}``."""
        self.body = reIMG.sub(r'{{\1}}', self.body)

    # The content lies in the longest block; processBlocks extends it on both
    # sides until the block length drops to the minimum.
    def getContext(self):
        """Fetch the page and return its extracted main text."""
        code, self.rawPage = self.getRawPage()
        found = re.search(reBODY, self.rawPage)
        # Fall back to the whole page when there is no <body> element.
        self.body = found.group(1) if found else self.rawPage

        if DBUG: print(code, self.rawPage)

        if self.saveImage:
            self.processImages()
        self.processTags()
        return self.processBlocks()
        # print(len(self.body.strip("\n")))


if __name__ == '__main__':
    ext = Extractor(url="http://blog.rainy.im/2015/09/02/web-content-and-main-image-extractor/", blockSize=5, image=False)
    print(ext.getContext())
以上算法基本可以应对大部分(中文)网页正文的提取,针对有些网站正文图片多于文字的情况,可以采用保留 <img> 标签中图片链接的方法,增加正文密度。
目前少量测试发现的问题有: 1)文章分页或动态加载的网页; 2)评论长度过长喧宾夺主的网页。
参考: https://blog.csdn.net/weixin_43098787/article/details/88633973 https://www.cnblogs.com/zhaobang/p/7472091.html https://blog.csdn.net/levy_cui/article/details/51481306 https://www.v2ex.com/t/309948