# -*- coding: utf-8 -*-
import random
import requests
from bs4 import BeautifulSoup
def crawl_tb_product():
    """
    Crawl a Taobao/Tmall product listing page.
    :return: None; the scraped lists are printed to stdout.
    """
    # Tmall product listing URL
    url = 'https://www.tmall.com/mlist/cp_bGFibyBsYWJvILPH0rDSvcn6.html'
    # Spoof a browser so the request looks like it comes from Firefox
    headers = {
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0'
    }
    # Route the request through a random proxy; each entry is "ip port"
    proxys = ['61.135.217.7 80',
              '58.56.108.226 43296',
              '58.240.232.126 57505',
              '182.111.64.7 41766',
              '59.32.37.118 8010',
              '114.225.169.36 53128',
              '59.173.74.169 8010',
              '121.31.155.184 8123',
              '121.31.192.215 8123',
              '175.175.216.210 1133',
              '13.121.242.49 808',
              '115.219.109.1 8010']
    # Pick one proxy at random and build a requests-style proxies mapping
    p = random.choice(proxys)
    print(p)
    ip = p.split()  # entries are whitespace-separated "ip port"
    proxy = 'http://' + ip[0] + ':' + ip[1]
    proxies = {'http': proxy}
    try:
        # A timeout keeps a dead proxy from hanging the request indefinitely
        r = requests.get(url, headers=headers, proxies=proxies, timeout=10)
        soup = BeautifulSoup(r.text, "lxml")
        all_product = soup.find_all('div', attrs={'class': 'product'})
        name_list = []
        price_list = []
        client_list = []
        image_url_list = []
        for product in all_product:
            # Product name
            name = product.select('.productTitle a')[0].get_text()
            name_list.append(name)
            # Product image; the src is protocol-relative, so prepend the scheme
            image_url = product.select('.productImg-wrap a img')[0].attrs['src']
            image_url_list.append('https:' + image_url)
            # Price
            price = product.select('.productPrice em')[0].get_text()
            price_list.append(price)
            # Shop / seller
            client = product.select('.productShop')[0].get_text().strip()
            client_list.append(client)
        # Print the scraped lists
        print(name_list)
        print(price_list)
        print(client_list)
        print(image_url_list)
        print('Query succeeded!')
    except Exception as e:
        print(e)
        print('Failed to crawl the page!')
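

if __name__ == '__main__':
    # Minimal usage sketch, added for illustration (the original module only
    # defines the function without calling it). The hard-coded proxies above
    # may well be stale, in which case the request fails and execution lands
    # in the except branch.
    crawl_tb_product()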