爬取豆瓣电影排行top250
- 2020 年 1 月 16 日
- 笔记
功能描述V1.0:
爬取豆瓣电影排行top250
功能分析:
使用的库
1、time
2、json
3、requests
4、BeautifulSoup
5、RequestException
上机实验室:
"""Scrape the Douban movie Top 250 chart (https://movie.douban.com/top250).

Author: Li Duo    Date: 2019-4-27    Version: V1.0
Fetches all 10 result pages (25 entries each) and appends one JSON line
per movie to result.txt.
"""
import time
import json
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException


def get_one_page(url):
    """Fetch one result page; return its HTML text, or None on any failure."""
    try:
        # Browser User-Agent: Douban rejects the default requests UA.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        # Network-level failure: treat the same as a bad page.
        return None


def parse_one_page(html):
    """Yield a dict (index, title, actor, score) for each movie on the page."""
    soup = BeautifulSoup(html, 'lxml')
    ol_list = soup.find('ol', {'class': 'grid_view'})
    # Iterate the entries the page actually contains instead of assuming
    # exactly 25 (the original used range(25) and would crash on a short page).
    for entry in ol_list.find_all('li'):
        yield {
            'index': entry.find('em', {'class': ''}).text.strip(),
            'title': entry.find('span', {'class': 'title'}).text.strip(),
            'actor': entry.find('p', {'class': ''}).text.strip(),
            'score': entry.find('span', {'class': 'rating_num'}).text.strip(),
        }


def write_to_file(content):
    """Append one record to result.txt as a JSON line (UTF-8, not escaped)."""
    with open('result.txt', 'a', encoding='utf-8') as f:
        # BUG FIX: the original appended a literal 'n' instead of a newline.
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(start):
    """Scrape one page of 25 entries beginning at offset *start*."""
    url = 'https://movie.douban.com/top250?start=' + str(start)
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    for i in range(0, 250, 25):
        main(start=i)
        time.sleep(1)  # throttle requests to be polite to the server
功能描述V2.0:
爬取豆瓣电影排行top250
功能分析:
使用的库
1、time
2、re
3、requests
4、RequestException
上机实验室:
"""Scrape the Douban movie Top 250 chart (https://movie.douban.com/top250).

Author: Li Duo    Date: 2019-4-8    Version: V2.0
Regex-based variant: fetches all 10 result pages and appends ten labelled
fields per movie to top_250.txt.
"""
import re
import time
import requests
from requests.exceptions import RequestException


def get_one_page(url):
    """Fetch one result page; return its HTML text, or None on any failure."""
    try:
        # Browser User-Agent: Douban rejects the default requests UA.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        # Network-level failure: treat the same as a bad page.
        return None


# Compiled once at module level instead of on every parse_one_page call.
# Ten capture groups: rank, title, alternate title, director, year,
# country/region, genre, score, vote count, short quote.
_MOVIE_PATTERN = re.compile(
    '<div.*?class="item">.*?'
    '<div.*?class="pic">.*?'
    '<em.*?class="">(.*?)</em>.*?'
    '<div.*?class="info">.*?'
    '<span.*?class="title">(.*?)</span>.*?'
    '<span.*?class="other">(.*?)</span>.*?'
    '<div.*?class="bd">.*?'
    '<p.*?class="">.*?'
    # BUG FIX: restored the stripped backslashes (\s) so the director
    # field actually matches.
    r'导演:\s(.*?)\s.*?<br>'
    '(.*?) / '
    '(.*?) / (.*?)</p>.*?'
    '<div.*?class="star">.*?'
    '<span.*?class="rating_num".*?property="v:average">'
    '(.*?)</span>.*?'
    '<span>(.*?)人评价</span>.*?'
    '<span.*?class="inq">(.*?)</span>',
    re.S)


def parse_one_page(html):
    """Extract all movies on the page.

    Returns a list of 10-element lists in the order:
    [rank, title, alt_title, director, year, country, genre,
     score, vote_count, quote].
    """
    movie_list = []
    for movie in _MOVIE_PATTERN.findall(html):
        movie_list.append([
            movie[0],
            movie[1],
            movie[2].lstrip(' / '),   # drop the leading " / " separator
            movie[3],
            movie[4].lstrip(),        # year carries leading whitespace
            movie[5],
            movie[6].strip(),
            movie[7],
            movie[8],
            movie[9],
        ])
    return movie_list


def write_to_file(movie_list):
    """Append the page's records to top_250.txt, one labelled line per field.

    BUG FIXES vs. the original:
    - opened in 'w' mode on every call, so each page truncated the file and
      only the final 25 entries survived; 'a' preserves all pages;
    - wrote a literal 'n' instead of a newline after each field;
    - called f.close() inside the with-block (redundant; the context
      manager already closes the file).
    """
    with open('top_250.txt', 'a', encoding='utf-8') as f:
        for movie in movie_list:
            f.write('电影排名:' + movie[0] + '\n')
            f.write('电影名称:' + movie[1] + '\n')
            f.write('电影别名:' + movie[2] + '\n')
            f.write('导演:' + movie[3] + '\n')
            f.write('上映年份:' + movie[4] + '\n')
            f.write('制作国家/地区:' + movie[5] + '\n')
            f.write('电影类别:' + movie[6] + '\n')
            f.write('评分:' + movie[7] + '\n')
            f.write('参评人数:' + movie[8] + '\n')
            f.write('简短影评:' + movie[9] + '\n')
            f.write('\n')
        print('成功写入文件,共有%d条记录……' % len(movie_list))


def main(start):
    """Scrape one page of 25 entries beginning at offset *start*."""
    url = 'https://movie.douban.com/top250?start=' + str(start)
    html = get_one_page(url)
    movie_list = parse_one_page(html)
    write_to_file(movie_list)


if __name__ == '__main__':
    for i in range(0, 250, 25):
        main(start=i)
        time.sleep(1)  # throttle requests to be polite to the server
补充说明:
1、