爬取豆瓣電影排行top250

  • 2020 年 1 月 16 日
  • 筆記

功能描述V1.0:

爬取豆瓣電影排行top250

功能分析:

使用的庫

1、time

2、json

3、requests

4、BeautifulSoup

5、RequestException

上機實驗室:

"""
Scrape the Douban Movie Top 250 chart and append each entry to result.txt.

Author: Li Duo
Date:   2019-4-27
Version: V1.0
"""

import time
import json
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException


def get_one_page(url):
    """Fetch one chart page; return the HTML text, or None on any failure."""
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    """Yield one dict per movie on the page: index, title, actor blurb, score."""
    soup = BeautifulSoup(html, 'lxml')
    ol_list = soup.find('ol', {'class': 'grid_view'})
    # Iterate the entries actually present instead of assuming exactly 25,
    # so a short final page cannot raise IndexError.
    for move_value in ol_list.find_all('li'):
        yield {
            'index': move_value.find('em', {'class': ''}).text.strip(),
            'title': move_value.find('span', {'class': 'title'}).text.strip(),
            'actor': move_value.find('p', {'class': ''}).text.strip(),
            'score': move_value.find('span', {'class': 'rating_num'}).text.strip()
        }


def write_to_file(content):
    """Append one movie record to result.txt as a UTF-8 JSON line."""
    with open('result.txt', 'a', encoding='utf-8') as f:
        # BUG FIX: the original appended the literal character 'n' instead of
        # a newline, producing one unreadable run-on line.
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(start):
    """Scrape one page of the chart (offset `start`) and persist every entry."""
    url = 'https://movie.douban.com/top250?start=' + str(start)
    html = get_one_page(url)
    if html is None:
        # Fetch failed; skip this page rather than crash BeautifulSoup on None.
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    # 10 pages x 25 movies; pause between requests to be polite to the server.
    for i in range(0, 250, 25):
        main(start=i)
        time.sleep(1)

功能描述V2.0:

爬取豆瓣電影排行top250

功能分析:

使用的庫

1、time

2、requests

3、RequestException

上機實驗室:

"""
Scrape the Douban Movie Top 250 chart with a single regex and write a
human-readable report to top_250.txt.

Author: Li Duo
Date:   2019-4-8
Version: V2.0
"""

import re
import time
import requests
from requests.exceptions import RequestException


def get_one_page(url):
    """Fetch one chart page; return the HTML text, or None on any failure."""
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    """Extract the 10 fields of every movie on the page.

    Returns a list of 10-element lists:
    [rank, title, alias, director, year, country, genre, score, vote count, quote].
    """
    # Raw strings keep the regex escapes intact. BUG FIX: the original pattern
    # had lost its backslashes ('導演:s(.*?)s'), so '\s' never matched and the
    # whole pattern found nothing.
    pattern = re.compile(
        r'<div.*?class="item">.*?'
        r'<div.*?class="pic">.*?'
        r'<em.*?class="">(.*?)</em>.*?'
        r'<div.*?class="info">.*?'
        r'<span.*?class="title">(.*?)</span>.*?'
        r'<span.*?class="other">(.*?)</span>.*?'
        r'<div.*?class="bd">.*?'
        r'<p.*?class="">.*?'
        r'導演:\s(.*?)\s.*?<br>'
        r'(.*?) / '
        r'(.*?) / (.*?)</p>.*?'
        r'<div.*?class="star">.*?'
        r'<span.*?class="rating_num".*?property="v:average">'
        r'(.*?)</span>.*?'
        r'<span>(.*?)人評價</span>.*?'
        r'<span.*?class="inq">(.*?)</span>', re.S)
    movies = re.findall(pattern, html)
    movie_list = []
    for movie in movies:
        movie_list.append([movie[0],
                           movie[1],
                           movie[2].lstrip(' / '),
                           movie[3],
                           movie[4].lstrip(),
                           movie[5],
                           movie[6].strip(),
                           movie[7],
                           movie[8],
                           movie[9]])
    return movie_list


def write_to_file(movie_list):
    """Overwrite top_250.txt with one labelled paragraph per movie."""
    # BUG FIX: every write originally ended with the literal character 'n'
    # instead of a newline; the explicit f.close() inside the 'with' block
    # was redundant and has been removed.
    with open('top_250.txt', 'w', encoding='utf-8') as f:
        for movie in movie_list:
            f.write('電影排名:' + movie[0] + '\n')
            f.write('電影名稱:' + movie[1] + '\n')
            f.write('電影別名:' + movie[2] + '\n')
            f.write('導演:' + movie[3] + '\n')
            f.write('上映年份:' + movie[4] + '\n')
            f.write('製作國家/地區:' + movie[5] + '\n')
            f.write('電影類別:' + movie[6] + '\n')
            f.write('評分:' + movie[7] + '\n')
            f.write('參評人數:' + movie[8] + '\n')
            f.write('簡短影評:' + movie[9] + '\n')
            f.write('\n')
        print('成功寫入文件,共有%d條記錄……' % len(movie_list))


def main(start):
    """Scrape one page of the chart (offset `start`) and write its entries."""
    url = 'https://movie.douban.com/top250?start=' + str(start)
    html = get_one_page(url)
    if html is None:
        # Fetch failed; skip this page rather than feed None to the parser.
        return
    movie_list = parse_one_page(html)
    write_to_file(movie_list)


if __name__ == '__main__':
    # 10 pages x 25 movies; pause between requests to be polite to the server.
    for i in range(0, 250, 25):
        main(start=i)
        time.sleep(1)

補充說明:

1、