python 爬取mm信息
# -*- coding:utf-8 -*-
"""Crawl the mm.taobao.com ranking list and print every model's details.

For each ranking page the script selects the ``.list-item`` entries, fetches
the ``model_info_show.htm`` page that belongs to each entry, and prints the
fields found in both documents to stdout.  A failure on a single entry prints
``error``; a failure on a whole page prints ``<page>page is error``.  In both
cases the crawl continues with the next entry or page.
"""
import re  # not used below; kept in case other parts of the file rely on it
import sys

import requests
from bs4 import BeautifulSoup

if sys.version_info[0] == 2:
    # Python 2 only: the labels below are UTF-8 byte strings that get
    # concatenated with unicode text from BeautifulSoup, which needs a UTF-8
    # default encoding.  Python 3 strings need no such workaround.
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf-8')

LIST_URL = 'http://mm.taobao.com/json/request_top_list.htm?page=%d'
INFO_URL = "http://mm.taobao.com/self/info/model_info_show.htm?user_id=%d"
PAGE_ENCODING = 'gb2312'

# requests applies no timeout by default, so a stalled connection would
# otherwise block the crawl forever.
REQUEST_TIMEOUT = 10  # seconds

BANNER_LEFT = "***********************************"
BANNER_RIGHT = "*********************************"


def _fetch_soup(url):
    """Download *url*, decode it as gb2312 and return it parsed with lxml.

    Raises whatever ``requests.get`` raises (including a timeout after
    ``REQUEST_TIMEOUT`` seconds); callers report the failure and move on.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.encoding = PAGE_ENCODING
    return BeautifulSoup(response.text, 'lxml')


def _small_cell(info, css_class):
    """Return the ``<p>`` text of the ``mm-p-small-cell`` item *css_class*."""
    return info.find('li', {'class': 'mm-p-small-cell ' + css_class}).p.string


def _print_model(model):
    """Print all details of one ``.list-item`` element from a ranking page.

    Fields are printed one by one, so an entry with a missing element leaves
    a partial record on stdout before the exception reaches the caller.

    Args:
        model: BeautifulSoup tag for one ``.list-item`` entry.

    Raises:
        Exception: any lookup, parsing or network error; a missing element
            typically surfaces as ``TypeError`` or ``AttributeError``.
    """
    follow = model.find('span', {'class': 'friend-follow J_FriendFollow'})
    model_id = follow['data-userid']
    # int() rejects a non-numeric id before it is put into the URL.
    info = _fetch_soup(INFO_URL % int(model_id))

    name_link = model.find('a', {'class': 'lady-name'})
    banner = BANNER_LEFT + name_link.string + BANNER_RIGHT
    print(banner)
    print("模特的名字:" + name_link.string)

    top = model.find('p', {'class': 'top'})
    print("模特的年龄:" + top.em.strong.string)
    print("生日:" + info.find('li', {'class': 'mm-p-cell-left'}).span.string)

    blood = info.find_all('li', {'class': 'mm-p-cell-right'})[1].span.string
    if blood is None:
        # An empty <span> yields None; print a placeholder instead of failing.
        blood = "无"
    print("血型:" + blood)

    print("学校/专业:" + info.find_all('li')[5].span.string)
    print("身高:" + _small_cell(info, 'mm-p-height'))
    print("体重:" + _small_cell(info, 'mm-p-weight'))
    print("三围:" + _small_cell(info, 'mm-p-size'))
    print("罩杯:" + _small_cell(info, 'mm-p-bar'))
    # 'mm-p-shose' is the class name the code looks up; do not "correct" it.
    print("鞋码:" + _small_cell(info, 'mm-p-shose'))

    print("模特所在地:" + top.span.string)
    print("模特的id:" + model_id)

    stats = model.find_all('p')[1]
    print("模特的标签:" + stats.em.string)
    print("模特的粉丝数:" + stats.strong.string)

    rank_cell = model.find('div', {'class': 'popularity'}).dl.dt
    # Only the first non-blank text fragment of the cell is printed.
    print("模特的排名:" + next(iter(rank_cell.stripped_strings)))

    print(model.find('ul', {'class': 'info-detail'}).get_text(" ", strip=True))
    # The hrefs/src are concatenated after "http:", i.e. they are expected
    # to be protocol-relative ("//host/path").
    print("模特的个人资料页面:" + "http:" + name_link['href'])
    print("模特的个人作品页面:" + "http:" + model.find('a', {'class': 'lady-avatar'})['href'])
    print("模特的个人头像:" + "http:" + model.find('img')['src'])
    print(banner)
    # Blank separator between records.
    print("\n")


def crawl(first_page=1, stop_page=4300):
    """Print the details of every model on pages ``first_page..stop_page-1``.

    Args:
        first_page: first ranking page to fetch.
        stop_page: page number at which to stop (exclusive).
    """
    for num in range(first_page, stop_page):
        try:
            models = _fetch_soup(LIST_URL % num).select(".list-item")
        except Exception:
            # ``Exception`` rather than a bare ``except`` so Ctrl-C still
            # stops the crawl.  ``num`` is an int and must be converted
            # before it can be concatenated with the message.
            print(str(num) + "page is error")
            continue

        for model in models:
            try:
                _print_model(model)
            except Exception:
                # Best effort: one broken entry must not stop the page.
                print("error")


if __name__ == "__main__":
    crawl()