python笔记:用Python实现简单的爬虫

  • 2019 年 11 月 22 日
  • 笔记

示例

做了一个简单的爬虫,使用 Python 3,涉及代理的使用、关闭 SSL 验证、返回 JSON 的处理。功能:用来查火车票。

# Simple 12306 train-ticket availability crawler (Python 3).
# Demonstrates: routing requests through an HTTP proxy, disabling SSL
# certificate verification, and parsing a JSON response.
import urllib.request
import json
import codecs
import time
import datetime
import ssl

# Disable SSL certificate verification process-wide (12306's certificate
# chain was not trusted at the time this was written).
# NOTE(review): this weakens security for every HTTPS request the process makes.
ssl._create_default_https_context = ssl._create_unverified_context


def GetInfo(train_date='2016-10-01', from_station='SZQ', to_station='LDQ'):
    """Poll 12306 for left-ticket info until a second-class seat is found.

    Prints every train's second-class ("二等") seat count on each poll and
    returns as soon as any train's count is neither "无" (none) nor "-"
    (not for sale).

    Args:
        train_date:   departure date, YYYY-MM-DD (defaults preserve the
                      original hard-coded query).
        from_station: origin station telecode.
        to_station:   destination station telecode.
    """
    # Route requests through the corporate proxy.
    # SECURITY NOTE(review): credentials are hard-coded in the proxy URL;
    # move them to an environment variable or config file.
    proxy_handler = urllib.request.ProxyHandler(
        {'https': 'http://y003460:[email protected]:8080'})
    opener = urllib.request.build_opener(proxy_handler)
    # Install the opener once, not on every loop iteration.
    urllib.request.install_opener(opener)

    url = ('https://kyfw.12306.cn/otn/leftTicket/queryT?'
           'leftTicketDTO.train_date=' + train_date +
           '&leftTicketDTO.from_station=' + from_station +
           '&leftTicketDTO.to_station=' + to_station +
           '&purpose_codes=ADULT')

    while True:
        try:
            # `with` closes the HTTP response even if JSON parsing fails
            # (the original leaked the connection on every poll).
            with urllib.request.urlopen(url, timeout=8) as resp:
                reader = codecs.getreader("utf-8")
                train_result = json.load(reader(resp))

            # print(train_result)
            train_datas = train_result['data']
            for item in train_datas:
                train_single_data = item['queryLeftNewDTO']
                print(train_single_data['station_train_code'], "二等",
                      train_single_data['ze_num'])
                # Anything other than "无"/"-" means seats are available.
                if (train_single_data['ze_num'] != "无"
                        and train_single_data['ze_num'] != "-"):
                    return
            nowtime = datetime.datetime.now()
            print(nowtime.strftime("%Y-%m-%d %H:%M:%S-%f"))
            time.sleep(8)
        except Exception as errors:
            # Best-effort polling loop: report the failure and keep retrying.
            print("一个错误", errors)


GetInfo()
print("找到了")

技术

获取网页

py2

# Fetch a page via a pass-through proxy and decode it as UTF-8 (Python 2).
handler = urllib2.ProxyHandler({})
urllib2.install_opener(urllib2.build_opener(handler))

# download text
target = URL.format(args[1])
res = urllib2.urlopen(target).read().decode("utf-8")

py3

# Fetch a page via a pass-through proxy and decode the stream as UTF-8 (Python 3).
handler = urllib.request.ProxyHandler({})
urllib.request.install_opener(urllib.request.build_opener(handler))

# download text
response = urllib.request.urlopen(URL.format(args[1]))
res = codecs.getreader("utf-8")(response).read()