Scraping the major e-commerce platforms
Crawl book listings from Tmall, JD.com, Dangdang, and Kongfuzi (kongfz.com) based on a search keyword.
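Each platform section below exposes one entry function. As a rough sketch of how they fit together (note that every section redefines its own getHTMLText, so in practice each scraper would live in its own module; this aggregator is not part of the original code and only shows the call shapes):

def search_all(key, limit=10):
    # hypothetical aggregator over the four entry functions defined below
    results = []
    results += tianmao(key, limit) or []    # Tmall: returns a list, or "" on failure
    jingdong(key, results, limit)           # JD: appends into the list it is given
    results += dangdang(key, limit) or []   # Dangdang: returns a list, or "" on failure
    results += kongfuzi(key, limit) or []   # Kongfuzi: returns a list, or "" on failure
    return results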
Tmall
import requests
from bs4 import BeautifulSoup
import bs4

proxies = {
    'http': 'http://117.135.153.10:80'
}
headers = {'User-Agent': 'Mozilla/5.0'}

def getHTMLText(url):  # generic page fetcher
    try:
        r = requests.get(url, headers=headers, proxies=proxies, timeout=10)  # go through the proxy, with a timeout
        r.raise_for_status()
        r.encoding = 'gbk'
    except:
        print("Tmall search failed")
        return ""
    return r.text

def getUnivList(allUnivList, text, Limit):
    bookInfo = []  # per-item fields: 0 price, 1 title, 2 review count, 3 image, 4 shop name, 5 platform, 6 detail-page URL, 7 filter tag, 8 item ID
    soup = BeautifulSoup(text, "html.parser")  # the result page carries everything we need
    books = soup.findAll('div', class_='product', limit=Limit)  # one product card per book
    length = len(books)
    if length == 0:
        return ""  # empty search result: return immediately
    for book in books:
        if isinstance(book, bs4.element.Tag):
            productPrice = book.findAll('p', class_='productPrice')  # price
            bookInfo.append(str(productPrice[0].text)[2:])  # drop the currency prefix
            productTitle = book.findAll('p', class_='productTitle')  # title, truncated to 22 characters
            if len(productTitle):
                if len(str(productTitle[0].text.strip())) <= 22:
                    bookInfo.append(str(productTitle[0].text.strip()))
                else:
                    bookInfo.append(str(productTitle[0].text.strip())[0:22] + "...")
            else:
                bookInfo.append('')
            productStatus = book.findAll('p', class_='productStatus')  # review count
            bookInfo.append(str(productStatus[0].text))
            productImg = book.findAll('div', class_='productImg-wrap')  # image
            try:
                productImg = productImg[0].a.img['src']
            except:
                productImg = productImg[0].a.img['data-ks-lazyload']  # lazy-loaded images keep the URL here
            bookInfo.append('https:' + str(productImg))
            productShopName = book.findAll('a', class_='productShop-name')  # shop name
            bookInfo.append(str(productShopName[0].text))
            bookInfo.append('天猫')
            link = book.findAll('a', class_='productImg')  # detail-page link
            bookInfo.append('https:' + str(link[0]['href']))
            bookInfo.append('2')
            bookInfo.append(str(book['data-id']))
            allUnivList.append(bookInfo)
            bookInfo = []  # reset the temporary list

def tianmao(key, Limit):  # known bug when the search returns fewer than 10 results
    allUnivList = []
    url = 'https://list.tmall.com/search_product.htm?q=' + key
    text = getHTMLText(url)
    if text == "":
        return ""  # nothing fetched: return empty
    getUnivList(allUnivList, text, Limit)
    return allUnivList
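A quick usage sketch (the keyword is just an example; the fields follow the layout documented in getUnivList):

books = tianmao('三体', 10)
for b in books:
    print(b[0], b[1], b[4])  # price, title, shop name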
JD.com
import bs4
import json
import requests
from bs4 import BeautifulSoup

proxies = {
    'http': 'http://117.135.153.10:80'
}
headers = {'User-Agent': 'Mozilla/5.0'}

def getHTMLText(url):  # generic page fetcher
    try:
        r = requests.get(url, headers=headers, proxies=proxies, timeout=10)  # go through the proxy, with a timeout
        r.raise_for_status()
        if url.find('https://item.jd.com/') != -1:
            r.encoding = 'gbk'  # item detail pages are GBK-encoded
        else:
            r.encoding = 'utf-8'
    except:
        print("Search failed")
        return ""
    return r.text

def getPrice(url):
    try:
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
        r.encoding = 'utf-8'
        result = json.loads(r.text)
        price = result[0]['p']  # the price API returns a JSON array; 'p' is the current price
    except:
        print("Failed to fetch the price")
        return ""
    return price
def getUnivList(allUnivList, text, Limit):
    bookUrls = []
    bookPriceURLs = []
    bookInfo = []  # per-item fields: 0 price, 1 title, 2 review count, 3 image, 4 shop name, 5 platform, 6 detail-page URL
    soup = BeautifulSoup(text, "html.parser")
    bookIDs = soup.findAll('li', {'class': 'gl-item'}, limit=Limit)  # one <li class="gl-item"> per result; data-sku carries the item ID
    length = len(bookIDs)
    if length == 0:
        return ""  # empty search result: return immediately
    commentNum = soup.findAll('div', class_='p-commit', limit=Limit)  # review counts
    img = soup.findAll('div', class_='p-img', limit=Limit)  # images
    for i in range(length):
        bookUrls.append('https://item.jd.com/' + str(bookIDs[i].get('data-sku')) + '.html')  # item detail-page URLs
        bookPriceURLs.append('https://p.3.cn/prices/mgets?skuIds=J_' + str(bookIDs[i].get('data-sku')))  # price-API URLs
    for i in range(length):  # known bug: errors when an element is missing; guard with isinstance(div, bs4.element.Tag) and fall back to ''
        text = getHTMLText(bookUrls[i])
        if text == "":
            return ""  # nothing fetched: return empty
        soup = BeautifulSoup(text, "html.parser")
        bookInfo.append(getPrice(bookPriceURLs[i]))  # price
        name = soup.findAll('div', class_='sku-name', limit=1)  # title, truncated to 22 characters
        if len(name):
            if len(str(name[0].text.strip())) <= 22:
                bookInfo.append(str(name[0].text.strip()))
            else:
                bookInfo.append(str(name[0].text.strip())[0:22] + "...")
        else:
            bookInfo.append('')
        bookInfo.append(str(commentNum[i].text.strip()))  # review count
        if str(img[i].img.get('src')) == 'None':
            bookInfo.append('http:' + str(img[i].img.get('source-data-lazy-img')))  # lazy-loaded image
        else:
            bookInfo.append('http:' + str(img[i].img.get('src')))  # image
        shopName = soup.findAll('div', class_='J-hove-wrap', limit=1)  # shop name
        if len(shopName):
            shopName = shopName[0].findAll('a', limit=1)  # findAll() works on any tag, not just the soup root
            if len(shopName):
                bookInfo.append(str(shopName[0].text.strip()))
            else:
                bookInfo.append('京东自营')
        else:
            bookInfo.append('')
        bookInfo.append("京东")
        bookInfo.append(bookUrls[i])
        allUnivList.append(bookInfo)
        bookInfo = []

def jingdong(key, allUnivList, Limit):  # known bug when the search returns fewer than 10 results
    url = 'https://search.jd.com/Search?keyword=' + key + '&enc=utf-8&wq=' + key
    text = getHTMLText(url)
    if text == "":
        return ""  # nothing fetched: return empty
    getUnivList(allUnivList, text, Limit)
def main():
    key = ""  # fill in a search keyword
    Limit = 10
    allUnivList = []
    jingdong(key, allUnivList, 20)

main()
# r.request.headers shows the headers the request was actually sent with
# reading a missing attribute with tag.get() returns None
# isinstance(div, bs4.element.Tag) checks that a lookup really produced a tag
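A small illustration of those notes (the HTML snippet is made up for the example):

from bs4 import BeautifulSoup
import bs4

soup = BeautifulSoup('<div><img src="a.jpg"></div>', 'html.parser')
img = soup.find('img')
print(img.get('src'))        # 'a.jpg'
print(img.get('data-lazy'))  # None: .get() returns None for a missing attribute (img['data-lazy'] would raise KeyError)
missing = soup.find('span')  # no <span> in the document: find() returns None
print(isinstance(img, bs4.element.Tag))      # True: a real tag, safe to read from
print(isinstance(missing, bs4.element.Tag))  # False: guard with isinstance before indexing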
'''from selenium import webdriver
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument('--headless')
driver = webdriver.Chrome(options=chrome_options)
driver.get("https://item.jd.com/12146215.html")
element = driver.find_element_by_id("jd-price").text
print(element)'''
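One possible refactor for this section: every JD result costs one detail-page request plus one price request, so a shared requests.Session, which reuses TCP connections, cuts per-request overhead. A minimal sketch, not part of the original code:

import requests

session = requests.Session()  # one connection pool shared by all requests
session.headers.update({'User-Agent': 'Mozilla/5.0'})

def fetch(url, encoding='utf-8'):  # illustrative replacement for the repeated requests.get calls above
    try:
        r = session.get(url, timeout=10)
        r.raise_for_status()
        r.encoding = encoding
        return r.text
    except Exception:
        return ""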
Dangdang
import requests
from bs4 import BeautifulSoup
import bs4

proxies = {
    'http': 'http://123.59.199.100:80'
}
headers = {'User-Agent': 'Mozilla/5.0'}

def getHTMLText(url):  # generic page fetcher
    try:
        r = requests.get(url, headers=headers, proxies=proxies, timeout=10)  # go through the proxy, with a timeout
        r.raise_for_status()
        r.encoding = 'gbk'
    except:
        print("Dangdang search failed")
        return ""
    return r.text

def getUnivList(allUnivList, text, Limit):
    bookInfo = []  # per-item fields: 0 price, 1 title, 2 review count, 3 image, 4 shop name, 5 platform, 6 detail-page URL, 7 filter tag, 8 item ID
    soup = BeautifulSoup(text, "html.parser")
    books = soup.findAll('ul', class_='bigimg', limit=1)  # the result list: one <li> per book
    books = books[0].findAll('li', limit=Limit)
    length = len(books)
    if length == 0:
        return ""  # empty search result: return immediately
    for book in books:
        if isinstance(book, bs4.element.Tag):
            productPrice = book.findAll('span', class_='search_now_price')  # price
            bookInfo.append(str(productPrice[0].text))
            productTitle = book.findAll('p', class_='name')  # title, truncated to 22 characters
            if len(productTitle):
                if len(str(productTitle[0].text.strip())) <= 22:
                    bookInfo.append(str(productTitle[0].text.strip()))
                else:
                    bookInfo.append(str(productTitle[0].text.strip())[0:22] + "...")
            else:
                bookInfo.append('')
            productStatus = book.findAll('p', class_='search_star_line')  # review count
            bookInfo.append(str(productStatus[0].text))
            productImg = book.findAll('a', class_='pic')  # image
            link = productImg[0]['href']  # detail-page URL
            try:
                productImg = productImg[0].img['data-original']  # lazy-loaded image
            except:
                productImg = productImg[0].img['src']
            bookInfo.append(str(productImg))
            productShopName = book.findAll('p', class_='search_shangjia')  # shop name: third-party sellers have this tag, self-operated items do not
            if len(productShopName):
                productShopName = productShopName[0].find('a')['title']
                bookInfo.append(productShopName)
            else:
                bookInfo.append('当当自营')
            bookInfo.append('当当')
            bookInfo.append(link)
            bookInfo.append('3')
            bookInfo.append(book['id'][1:])  # strip the leading character from the li id to get the item ID
            allUnivList.append(bookInfo)
            bookInfo = []

def dangdang(key, Limit):  # known bug when the search returns fewer than 10 results
    allUnivList = []
    url = 'http://search.dangdang.com/?key=' + key + '&act=input'
    text = getHTMLText(url)
    if text == "":
        return ""  # nothing fetched: return empty
    getUnivList(allUnivList, text, Limit)
    print(allUnivList)
    return allUnivList
Kongfuzi
import requests
import json

proxies = {
}
headers = {'User-Agent': 'Mozilla/5.0'}

def getHTMLText(url):  # generic page fetcher
    try:
        r = requests.get(url, headers=headers, proxies=proxies, timeout=10)
        r.raise_for_status()
        r.encoding = 'utf-8'
    except:
        print("Kongfuzi search failed")
        return ""
    return r.text

def getFee(userId, itemId):
    # the shipping-fee endpoint is JSONP: the JSON payload comes back wrapped in a jQuery callback
    url = ('http://shop.kongfz.com/book/shopsearch/getShippingFee'
           '?callback=jQuery11120008516125003493968_1545017785216'
           '&params={%22params%22:[{%22userId%22:%22' + userId + '%22,%22itemId%22:%22'
           + itemId + '%22}],%22area%22:%221006000000%22}')
    text = getHTMLText(url)
    if text == "":
        return "邮费获取失败"  # shipping-fee lookup failed
    text = text[text.find('(') + 1:text.rfind(')')]  # strip the JSONP wrapper; replace(')', '') would also eat any ')' inside the data
    feeAll = json.loads(text)
    fee = '快递:' + feeAll['data'][0]['fee'][0]['totalFee']  # '快递' = express shipping
    return fee
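The unwrap above is tied to one hard-coded callback name. If the callback ever changes, a generic JSONP unwrapper is safer; a sketch, not part of the original code:

import json
import re

def jsonp_to_json(text):
    # extract the payload from callbackName({...}); works for any callback name
    m = re.match(r'^[^(]*\((.*)\)\s*;?\s*$', text, re.S)
    return json.loads(m.group(1)) if m else json.loads(text)

getFee could then call feeAll = jsonp_to_json(text) without knowing the callback name at all.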
def getUnivList(allUnivList, text, Limit):
    bookdetail = json.loads(text)  # this endpoint returns JSON directly, so no HTML parsing is needed
    bookInfo = []  # per-item fields: 0 price, 1 title, 2 shipping fee (in the review-count slot), 3 image, 4 shop name, 5 book condition (in the platform slot), 6 detail-page URL, 7 filter tag
    books = bookdetail['data']['itemList']
    length = len(books)
    if length == 0:
        return ""
    if length > Limit:
        length = Limit
    for i in range(length):
        bookInfo.append(float(books[i]['price']))
        bookInfo.append(str(books[i]['itemname_snippet']).replace('<b>', '').replace('</b>', ''))  # strip the <b> highlight tags from the title
        bookInfo.append(getFee(str(books[i]['userid']), str(books[i]['itemid'])))  # look up the shipping fee by user ID and item ID
        if str(books[i]['imgurl']) == "":
            bookInfo.append("")
        else:
            bookInfo.append('http://www.kfzimg.com/' + str(books[i]['imgurl']))
        bookInfo.append(str(books[i]['shopname'])[0:15])
        bookInfo.append(str(books[i]['qualityname']))
        bookInfo.append('http://book.kongfz.com/' + str(books[i]['shopid']) + '/' + str(books[i]['itemid']) + '/')
        bookInfo.append('1')
        allUnivList.append(bookInfo)
        bookInfo = []

def kongfuzi(key, Limit):  # known bug when the search returns fewer than 10 results
    allUnivList = []
    url = 'http://search.kongfz.com/product_result/?key=' + key + '&exact=1&type=1&ajaxdata=1'
    text = getHTMLText(url)
    if text == "":
        return ""  # nothing fetched: return empty
    getUnivList(allUnivList, text, Limit)
    print(allUnivList)
    return allUnivList

kongfuzi("三体", 10)
