python笔记:用Python实现简单的爬虫
- 2019 年 11 月 22 日
- 笔记
示例
做了一个简单的爬虫,使用 Python 3。涉及:代理的使用、关闭 SSL 验证、返回 JSON 的处理。功能:用来查火车票。
import urllib.request
import json
import codecs
import time
import datetime
import ssl

# WARNING: this disables HTTPS certificate verification process-wide.
# Acceptable for a throwaway script behind a corporate proxy, but do not
# copy this into production code.
ssl._create_default_https_context = ssl._create_unverified_context


def GetInfo(train_date='2016-10-01', from_station='SZQ', to_station='LDQ'):
    """Poll the 12306 left-ticket API until a second-class seat is available.

    Prints each train's code and its second-class ("二等") seat count on every
    poll; returns as soon as any train shows a count other than "无" / "-".
    Network errors are caught and the poll simply retries, so this loops
    until a seat appears.

    Args:
        train_date: query date, "YYYY-MM-DD" (defaults to the original
            hard-coded value for backward compatibility).
        from_station: 12306 station telecode of the departure station.
        to_station: 12306 station telecode of the arrival station.
    """
    query_url = ('https://kyfw.12306.cn/otn/leftTicket/queryT?'
                 'leftTicketDTO.train_date={}'
                 '&leftTicketDTO.from_station={}'
                 '&leftTicketDTO.to_station={}&'
                 'purpose_codes=ADULT'.format(train_date, from_station,
                                              to_station))
    while True:
        try:
            # SECURITY: proxy credentials are hard-coded in the URL.
            # Move them to an environment variable or config file.
            proxy_handler = urllib.request.ProxyHandler(
                {'https': 'http://y003460:[email protected]:8080'})
            opener = urllib.request.build_opener(proxy_handler)
            urllib.request.install_opener(opener)

            resp = urllib.request.urlopen(query_url, timeout=8)
            # json.load wants text; wrap the byte stream in a UTF-8 reader.
            reader = codecs.getreader("utf-8")
            train_result = json.load(reader(resp))

            train_datas = train_result['data']
            for item in train_datas:
                train_single_data = item['queryLeftNewDTO']
                print(train_single_data['station_train_code'], "二等",
                      train_single_data['ze_num'])
                # "无" (none) and "-" (not for sale) mean no seat; anything
                # else means tickets are available — stop polling.
                if (train_single_data['ze_num'] != "无"
                        and train_single_data['ze_num'] != "-"):
                    return
            nowtime = datetime.datetime.now()
            print(nowtime.strftime("%Y-%m-%d %H:%M:%S-%f"))
            time.sleep(8)
        except Exception as errors:
            # Broad catch is deliberate: keep the poller alive through
            # transient network/proxy/JSON errors and retry.
            print("一个错误", errors)


GetInfo()
print("找到了")
技术
获取网页
py2
# Python 2: install an opener with an empty proxy map (i.e. direct
# connection), then fetch a URL and decode its body as UTF-8 text.
handler = urllib2.ProxyHandler({})
url_opener = urllib2.build_opener(handler)
urllib2.install_opener(url_opener)
# download text
target = URL.format(args[1])
response = urllib2.urlopen(target)
raw_body = response.read()
res = raw_body.decode("utf-8")
py3
# Python 3: install an opener with an empty proxy map (i.e. direct
# connection), then fetch a URL and read it as UTF-8 text through a
# codecs stream reader.
handler = urllib.request.ProxyHandler({})
url_opener = urllib.request.build_opener(handler)
urllib.request.install_opener(url_opener)
# download text
response = urllib.request.urlopen(URL.format(args[1]))
utf8_reader = codecs.getreader("utf-8")
res = utf8_reader(response).read()