#抓取淘宝数据
import json
import os
import re
import sqlite3
import string
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
class Getdata:
    """Fetch Taobao search pages and extract product data from their text.

    All methods are static; call them on the class, e.g.
    ``Getdata.parsePage(items, html)``.
    """

    @staticmethod
    def getHTMLText(url, header, timeout=10):
        """Download *url* and return the response body as text.

        Args:
            url: Address to request.
            header: Mapping of HTTP headers sent with the request.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the program indefinitely.

        Returns:
            The decoded response text, or ``""`` when the request fails or
            the server answers with an error status.
        """
        try:
            r = requests.get(url, headers=header, timeout=timeout)
            r.raise_for_status()
            # Let requests guess the charset from the body instead of
            # trusting the (possibly missing) header.
            r.encoding = r.apparent_encoding
            return r.text
        except requests.RequestException:
            return ""

    @staticmethod
    def _decodeJsonString(raw):
        """Decode the inside of a JSON string literal; return *raw* if invalid."""
        try:
            return json.loads('"' + raw + '"')
        except ValueError:
            return raw

    @staticmethod
    def parsePage(ilt, html):
        """Append ``[price, title]`` pairs found in *html* to *ilt*.

        Prices come from ``"view_price":"..."`` fields and titles from
        ``"raw_title":"..."`` fields; the n-th price is paired with the n-th
        title. Both values are appended as strings.

        Args:
            ilt: List that receives the extracted pairs (mutated in place).
            html: Page text to scan.
        """
        try:
            # Capture groups replace the former split(":") + eval(), which
            # executed scraped text and cut titles at the first colon.
            plt = re.findall(r'"view_price":"([\d.]*)"', html)
            # Allow backslash-escaped characters (e.g. \") inside the title.
            tlt = re.findall(r'"raw_title":"((?:[^"\\]|\\.)*)"', html)
        except TypeError:
            # html was not a string (e.g. None).
            print("爬取失败")
            return
        for price, raw_title in zip(plt, tlt):
            ilt.append([price, Getdata._decodeJsonString(raw_title)])
        if len(tlt) < len(plt):
            # Some prices have no matching title; report it as the original
            # did, after keeping the pairs that could be matched.
            print("爬取失败")

    @staticmethod
    def GetCount(html):
        """Return the page count announced by ``"totalPage":N`` in *html*.

        When the field occurs several times the last occurrence wins.
        Returns ``0`` when *html* is empty or contains no such field.
        """
        if not html:
            return 0
        total = re.findall(r'"totalPage":(\d+)', html)
        if not total:
            return 0
        return int(total[-1])

    @staticmethod
    def printGoodsList(ilt):
        """Print *ilt* as a numbered, tab-separated table of price and title."""
        tplt = "{:4}\t{:8}\t{:16}"
        print(tplt.format("序号", "商品价格", "商品名称"))
        for count, g in enumerate(ilt, start=1):
            print(tplt.format(count, g[0], g[1]))
class DatabaseMannege:
    """Create and fill the ``GoodMsg`` table in the ``taobao.db`` SQLite file.

    All methods are static; call them on the class.
    """

    # Create the data table.
    @staticmethod
    def CreateDataBase():
        """Open (creating if needed) ``taobao.db`` and create table ``GoodMsg``.

        Failures are reported on stdout rather than raised. The connection is
        always closed before returning.
        """
        try:
            db = sqlite3.connect("taobao.db")
        except sqlite3.Error:
            print("创建数据库失败")
            # Without a connection there is nothing further to do.
            return
        try:
            db.execute('create table GoodMsg(id varchar(10),price varchar(10),name varchar(40))')
            db.commit()
        except sqlite3.Error:
            print("创建表失败或表已经存在")
        finally:
            db.close()

    # Write the data into the database.
    @staticmethod
    def InsertDatabase(data):
        """Insert each ``[price, name]`` item of *data* into ``GoodMsg``.

        Each row is committed individually; a row that cannot be inserted is
        reported on stdout and skipped so the remaining rows still go in.

        Args:
            data: Iterable of indexable items where index 0 is the price and
                index 1 is the product name.
        """
        db = sqlite3.connect("taobao.db")
        try:
            cur = db.cursor()
            for item in data:
                try:
                    print("开始插入")
                    # Bound parameters instead of string formatting: scraped
                    # titles may contain quotes or SQL fragments.
                    cur.execute(
                        "insert into GoodMsg(price,name) values(?,?)",
                        (item[0], item[1]),
                    )
                    db.commit()
                    print("插入成功")
                except (sqlite3.Error, LookupError, TypeError):
                    print('插入失败')
        finally:
            db.close()
class Main:
    """Interactive entry point: search Taobao for a keyword and store results."""

    @staticmethod
    def main():
        """Prompt for a product keyword, scrape every result page and save it.

        Reads the keyword from stdin, asks the first result page for the
        total page count, then for each page extracts price/title pairs,
        prints the accumulated list and inserts that page's rows into the
        database. A page that fails is reported and skipped.
        """
        print("请输入查询商品")
        goods = input()
        infoList = []
        # Percent-encode the keyword so characters such as '&', '#' or spaces
        # cannot alter the query string.
        start_url = "https://s.taobao.com/search?q=" + quote(goods, safe="")
        # NOTE(review): this is a hard-coded browser session cookie committed
        # to source. It will expire and it exposes account data; consider
        # loading it from configuration or the environment instead.
        header = {"cookie":"thw=cn; cna=ktJ/FI8k0gQCAbaLv4XUGVvh; tg=0; enc=%2FDi9xgv2fnznKtXV88N9fUTdV6UcRLyw3G6h3pjdwcpbHwkSTh%2FO1B1zsb29cDTL5N8TU0t4TdkRNxzvKIn4Ig%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; tracknick=1052071694www; t=0a525deca2dff81647d91643519e7e37; UM_distinctid=16b9bd49a2a5ef-031997ebe67ce2-37c143e-144000-16b9bd49a2b92e; miid=1364685100501550517; _cc_=W5iHLLyFfA%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; _m_h5_tk=98af7fdaf32be92fe72127eda6e0044e_1571041861081; _m_h5_tk_enc=ca1bdc50118e6ce4e5fd587ccc946e6c; mt=ci%3D-1_0; v=0; cookie2=1aac9317cb43d8f5dfab37bd0222fcf9; _tb_token_=578e3e4e7eedb; JSESSIONID=021AC0B7547DE41EE0944D2ECB89C106; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; l=dBjS2MZrqT2zAZFsBOCgSZ1_aY79jIRAguWbYNq9i_5BK6L_qNbOkg25WFp6cjWfthYB4NSLztv9-etkiKy06Pt-g3fPNxDc.; isg=BHR0oiuylQB4VAH5skFM2Q9IRTLsTpjNHWdJ-w7VA_-CeRTDNlnkx4w7_fEEgdCP",
                  "user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"}
        html = Getdata.getHTMLText(start_url, header)
        if not html:
            # getHTMLText returns "" on failure; there is no page count to read.
            print("爬取失败")
            return
        depth = Getdata.GetCount(html)
        DatabaseMannege.CreateDataBase()
        for i in range(depth):
            try:
                # Each result page is addressed by an item offset in steps of 44.
                url = start_url + "&s=" + str(44 * i)
                html = Getdata.getHTMLText(url, header)
                # Collect this page separately so only its rows are inserted;
                # inserting the cumulative list re-wrote earlier pages every
                # iteration and duplicated them in the database.
                page_items = []
                Getdata.parsePage(page_items, html)
                infoList.extend(page_items)
                Getdata.printGoodsList(infoList)
                DatabaseMannege.InsertDatabase(page_items)
            except Exception as exc:
                # Keep going with the next page, but say what went wrong
                # instead of silently swallowing it.
                print("爬取失败:", exc)
                continue
# Start the interactive scraper only when this file is executed as a script,
# so importing the module does not prompt for input or hit the network.
if __name__ == "__main__":
    Main.main()