import requests
import re
'''
Sample search-result URLs for the keyword '书包' (backpack); the only
parameter that changes between pages is s, the item offset (44 per page):
page 1: https://s.taobao.com/search?q=%E4%B9%A6%E5%8C%85&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306
page 2: https://s.taobao.com/search?q=%E4%B9%A6%E5%8C%85&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306&bcoffset=3&ntoffset=3&p4ppushleft=1%2C48&s=44
page 3: https://s.taobao.com/search?q=%E4%B9%A6%E5%8C%85&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306&bcoffset=0&ntoffset=6&p4ppushleft=1%2C48&s=88
'''
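# The q parameter above is the URL-encoded keyword; %E4%B9%A6%E5%8C%85
# decodes to '书包'. A quick standard-library check:
#   >>> from urllib.parse import quote, unquote
#   >>> unquote('%E4%B9%A6%E5%8C%85')
#   '书包'
#   >>> quote('书包')
#   '%E4%B9%A6%E5%8C%85'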
def get_html_text(url):
    '''Fetch url and return the page text, or an empty string on failure.'''
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()                # raise on 4xx/5xx responses
        r.encoding = r.apparent_encoding    # guess encoding from the body
        return r.text
    except requests.RequestException:
        return ''  # request failed; empty text makes parse_page a no-op
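# Note: Taobao search has required a logged-in session for several years, so
# a bare request like the one above typically gets a login page back and the
# parser finds nothing. A common workaround is to send the cookies from your
# own logged-in browser session (a sketch; the cookie value below is a
# placeholder, not a working credential):
#
#   headers = {
#       'User-Agent': 'Mozilla/5.0',
#       'Cookie': '<cookies copied from a logged-in taobao.com session>',
#   }
#   r = requests.get(url, headers=headers, timeout=30)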
# The item data is embedded in the page as script-generated key/value pairs
# rather than ordinary HTML tags, so searching the raw text with regular
# expressions is simpler than parsing it with BeautifulSoup.
# Regex notes: r'...' raw strings keep backslashes literal (so \d reaches the
# regex engine intact), and .*? matches non-greedily (minimal match),
# stopping at the first closing quote instead of the last one.
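# A quick illustration of minimal matching on a made-up sample string:
#   >>> import re
#   >>> s = '"raw_title":"a","raw_title":"b"'
#   >>> re.findall(r'"raw_title":".*"', s)    # greedy: runs to the last "
#   ['"raw_title":"a","raw_title":"b"']
#   >>> re.findall(r'"raw_title":".*?"', s)   # minimal: stops at the first "
#   ['"raw_title":"a"', '"raw_title":"b"']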
def parse_page(list_info, html):
    '''Extract price, location and title fields from the raw page text.'''
    try:
        list_price = re.findall(r'"view_price":"[\d.]*"', html)
        list_title = re.findall(r'"raw_title":".*?"', html)
        list_location = re.findall(r'"item_loc":".*?"', html)
        #list_num_payment = re.findall(r'"view_sales":".*?"', html)
        for i in range(len(list_price)):
            # each match looks like '"view_price":"128.00"'; split off the
            # key (maxsplit=1, since a title may itself contain colons) and
            # strip the surrounding quotes from the value
            price = list_price[i].split(':', 1)[1].strip('"')
            title = list_title[i].split(':', 1)[1].strip('"')
            location = list_location[i].split(':', 1)[1].strip('"')
            #num_payment = list_num_payment[i].split(':', 1)[1].strip('"')
            #list_info.append([price, num_payment, location, title])
            list_info.append([price, location, title])
    except Exception:
        print('Exception while parsing the page')
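# An equivalent sketch (not called below): capture groups make re.findall
# return the field values directly, so no split/strip step is needed, and
# zip() keeps the three lists in step even if their lengths differ.
def parse_page_with_groups(list_info, html):
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    titles = re.findall(r'"raw_title":"(.*?)"', html)
    locations = re.findall(r'"item_loc":"(.*?)"', html)
    for price, title, location in zip(prices, titles, locations):
        list_info.append([price, location, title])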
def print_goods_info(list_info):
    '''Print the collected items as a numbered table.'''
    #tplt = '{:4}\t{:8}\t{:8}\t{:12}\t{:20}'   # variant with payment count
    tplt = '{:4}\t{:8}\t{:12}\t{:20}'
    #print(tplt.format('No.', 'Price', 'Buyers', 'Ship from', 'Title'))
    print(tplt.format('No.', 'Price', 'Ship from', 'Title'))
    count = 0
    for goods in list_info:
        count += 1
        #print(tplt.format(count, goods[0], goods[1], goods[2], goods[3]))
        print(tplt.format(count, goods[0], goods[1], goods[2]))
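# Alignment note: str.format pads with ASCII spaces, which are narrower than
# CJK characters, so columns holding Chinese titles will drift. One common
# fix is to pad with the full-width space chr(12288) as the fill character:
#   '{0:{1}<20}'.format(title, chr(12288))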
if __name__ == '__main__':
    goods = '书包'    # search keyword ('backpack')
    depth = 2         # number of result pages to crawl
    start_url = 'https://s.taobao.com/search?q=' + goods
    list_info = []
    for i in range(depth):
        try:
            url = start_url + '&s=' + str(44 * i)   # 44 items per page
            html = get_html_text(url)
            parse_page(list_info, html)
        except Exception:
            continue  # if one page fails, skip it so the rest still run
    print_goods_info(list_info)