python爬虫学习01–电子书爬取
- 2020 年 7 月 13 日
- 笔记
- Python, Python电子书爬虫
python爬虫学习01–电子书爬取
1.获取网页信息
import requests

'''
Step 1: fetch the raw HTML of a single chapter page and print it.
'''

if __name__ == '__main__':
    # A scheme-relative URL ('//...') makes requests raise MissingSchema;
    # the scheme must be explicit.
    target = 'https://www.xsbiquge.com/78_78513/108078.html'
    req = requests.get(url=target)
    # The site serves UTF-8; set the encoding before reading .text so the
    # Chinese content is decoded correctly.
    req.encoding = 'utf-8'
    print(req.text)
2.引入BeautifulSoup对网页内容进行解析
import requests
from bs4 import BeautifulSoup

'''
Step 2: parse the chapter page with BeautifulSoup and print the
<div id="content"> element that holds the chapter text.
'''

if __name__ == '__main__':
    # Explicit scheme required: '//...' raises requests MissingSchema.
    target = 'https://www.xsbiquge.com/78_78513/108078.html'
    req = requests.get(url=target)
    req.encoding = 'utf-8'  # decode the Chinese page correctly
    html = req.text
    bs = BeautifulSoup(html, 'lxml')
    # The chapter body lives in <div id="content"> on this site.
    texts = bs.find('div', id='content')
    print(texts)
3.切分数据,去掉空格,提取文字
import requests
from bs4 import BeautifulSoup

'''
Step 3: extract the plain chapter text and split it into paragraphs.

texts.text grabs all the text, .strip() removes surrounding newlines, and
.split('\xa0' * 4) cuts the text into paragraphs, because every paragraph
on this site starts with four non-breaking spaces.
'''

if __name__ == '__main__':
    # Explicit scheme required: '//...' raises requests MissingSchema.
    target = 'https://www.xsbiquge.com/78_78513/108078.html'
    req = requests.get(url=target)
    req.encoding = 'utf-8'  # decode the Chinese page correctly
    html = req.text
    bs = BeautifulSoup(html, 'lxml')
    texts = bs.find('div', id='content')
    print(texts.text.strip().split('\xa0' * 4))
4.查看章节列表
import requests
from bs4 import BeautifulSoup

'''
Step 4: fetch the book's index page and print every chapter link element.
'''

if __name__ == '__main__':
    # Explicit scheme required: '//...' raises requests MissingSchema.
    target = 'https://www.xsbiquge.com/78_78513/'
    req = requests.get(url=target)
    req.encoding = 'utf-8'  # decode the Chinese page correctly
    html = req.text
    bs = BeautifulSoup(html, 'lxml')
    # The table of contents lives in <div id="list">; each chapter is an <a>.
    chapters = bs.find('div', id='list')
    chapters = chapters.find_all('a')
    for chapter in chapters:
        print(chapter)
5.获取章节目录和章节链接
import requests
from bs4 import BeautifulSoup

'''
Step 5: print every chapter title together with its absolute URL.
'''

if __name__ == '__main__':
    # Explicit scheme required: '//...' raises requests MissingSchema.
    server = 'https://www.xsbiquge.com'  # hrefs on the index are site-relative
    target = 'https://www.xsbiquge.com/78_78513/'
    req = requests.get(url=target)
    req.encoding = 'utf-8'  # decode the Chinese page correctly
    html = req.text
    bs = BeautifulSoup(html, 'lxml')
    # The table of contents lives in <div id="list">; each chapter is an <a>.
    chapters = bs.find('div', id='list')
    chapters = chapters.find_all('a')
    for chapter in chapters:
        url = chapter.get('href')
        print("《" + chapter.string + "》")
        print(server + url)  # join the relative href to the site root
6.整合数据,下载电子书文档
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

'''
Step 6: download every chapter of the book and save it to a text file,
showing progress with tqdm.
'''

def get_content(target):
    """Fetch one chapter page and return its paragraphs as a list of strings.

    The chapter body lives in <div id="content">; paragraphs are separated
    by four non-breaking spaces ('\xa0' * 4), so splitting on that marker
    yields one string per paragraph.
    """
    req = requests.get(url=target)
    req.encoding = 'utf-8'  # decode the Chinese page correctly
    html = req.text
    bf = BeautifulSoup(html, 'lxml')
    texts = bf.find('div', id='content')
    content = texts.text.strip().split('\xa0' * 4)
    return content

if __name__ == '__main__':
    # Explicit scheme required: '//...' raises requests MissingSchema.
    server = 'https://www.xsbiquge.com'  # hrefs on the index are site-relative
    book_name = '《元尊》.txt'
    target = 'https://www.xsbiquge.com/78_78513/'
    req = requests.get(url=target)
    req.encoding = 'utf-8'
    html = req.text
    chapter_bs = BeautifulSoup(html, 'lxml')
    chapters = chapter_bs.find('div', id='list')
    chapters = chapters.find_all('a')
    # Open the output file once in 'w' mode instead of reopening it in 'a'
    # mode for every chapter: this is cheaper and, more importantly, keeps a
    # re-run of the script from appending a duplicate copy of the book.
    with open(book_name, 'w', encoding='utf-8') as f:
        for chapter in tqdm(chapters):
            chapter_name = chapter.string
            url = server + chapter.get('href')
            content = get_content(url)
            f.write("《" + chapter_name + "》")
            f.write('\n')
            f.write('\n'.join(content))
            f.write('\n')
PS:下载的时候可能会有点慢,下载一本书大概需要十几分钟;以后学到新的方法后会加以改善。
