NDGD团队(五)
在苏宁(胡)的呼吁下
其余两人开始进入selenium的学习
淘宝(刘)成果
import time
from selenium import webdriver
import requests
import pymysql
import re #正则表达式
def get_conn():
    """Open and return a connection to the local `tb` MySQL database."""
    connection = pymysql.connect(
        host='localhost',
        user='root',
        passwd='password',  # NOTE(review): hard-coded credentials; move to config
        db='tb',
        charset="utf8",
    )
    print("数据库连接成功")
    return connection
def search_product(key):
    '''Type *key* into Taobao's search box and submit the search.

    The page-count extraction below is commented out, so this function
    currently returns None; callers ignore the result.
    '''
    driver.find_element_by_id('q').send_keys(key)
    driver.find_element_by_class_name('btn-search').click()
    driver.maximize_window()
    time.sleep(15)  # fixed wait for the results page to finish loading
    # page = driver.find_element_by_xpath('//*[@id="mainsrp-pager"]/div/div/div/div[1]').text  # locate the page-count element
    # page = re.findall('(\d+)', page)[0]
    # return int(page)  # total number of result pages
def get_product():
    """Scrape every product card on the currently displayed Taobao results
    page and insert one row per product into the `tb` table.

    Relies on the module-level selenium ``driver`` already showing a
    search-results page.  Opens (and now also closes) its own DB connection.
    """
    time_str = time.strftime("%Y{}%m{}%d")
    date = time_str.format("-", "-")  # e.g. "2021-05-10"
    origin = "淘宝"
    conn = get_conn()
    cursor = conn.cursor()
    headers = {
        "user-Agent": "Mozilla/5.0 (Windows NT 10.0;Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
    }
    divs = driver.find_elements_by_xpath('//div[@class="items"]/div[@class="item J_MouserOnverReq "]')
    sql = "insert into tb (date, id, price, name, link, image, origin) values (%s, %s, %s, %s, %s, %s, %s)"
    # Compiled once, outside the loop (loop-invariant).
    # FIX: the original pattern ended in "<>", which can never match the
    # closing "</li>" tag; corrected to mirror the JD scraper's pattern.
    getK = re.compile(r'<li title=".*?">款号: (.*?)</li>')
    for div in divs:
        info = div.find_element_by_xpath('.//div[@class="row row-2 title"]/a').text  # product title
        # FIX: the original XPath was './', which is not a valid XPath
        # expression and raises InvalidSelectorException.  Use the price
        # node of the card instead — TODO confirm against the live markup.
        price = div.find_element_by_xpath('.//div[contains(@class, "price")]/strong').text + "元"
        deal = div.find_element_by_xpath('.//div[@class="deal-cnt"]').text  # number of buyers
        name = div.find_element_by_xpath('.//div[@class="shop"]/a').text  # shop name
        url1 = div.find_element_by_xpath('.//div[@class="pic"]/a').get_attribute("data-href")
        url = "https:" + url1
        picture = div.find_element_by_xpath('.//div[@class="pic"]/a/img').get_attribute("data-src")
        res = requests.get(url, headers=headers).text
        # FIX: join the matches instead of str()-ing the list, so the DB
        # stores "ABC123" rather than the Python repr "['ABC123']".
        keyL = ''.join(re.findall(getK, res))
        print(info, price, deal, name, url, picture, keyL, sep='| ')
        cursor.execute(sql, [date, keyL, price, name, url, picture, origin])
    conn.commit()  # one commit for the whole page
    print("插入数据完毕")
    cursor.close()
    conn.close()  # FIX: the connection was previously leaked
def main():
    """Drive the Taobao scrape: search once, then walk result pages 1-21.

    Uses the module-level ``driver`` and ``keyword`` set in the
    ``__main__`` guard.
    """
    print('正在爬取第1页数据')
    # FIX: the original called get_conn() here and leaked the connection
    # (get_product opens its own); also search_product returns None, so
    # binding its result to `page` was dead code.
    search_product(keyword)  # performs the search; page 1 is now displayed
    get_product()
    page_num = 1
    # The query string q stays fixed; each results page holds 44 items,
    # so page n starts at offset s = n * 44.
    while page_num < 21:
        print('*' * 100)
        print('正在爬取第{}页的数据'.format(page_num + 1))
        print('*' * 100)
        driver.get('https://s.taobao.com/search?q={}&s={}'.format(keyword, page_num * 44))  # build the page URL
        driver.implicitly_wait(10)  # implicit wait for elements to appear
        driver.maximize_window()
        get_product()
        page_num += 1
if __name__ == '__main__':
    # Prompt for the search keyword, open Taobao in Chrome, then scrape.
    keyword = input("请输入要搜索的商品:")
    driver = webdriver.Chrome()
    driver.get('https://www.taobao.com/')
    main()
京东(高)成果
from bs4 import BeautifulSoup
from selenium import webdriver
import mysql.connector
import time
import re
import datetime
import requests
# Hard-coded JD session cookies captured from a logged-in browser session,
# sent with every per-product requests.get call below.
# NOTE(review): these expire; the detail-page scraping will degrade once
# they do — refresh them or load from config.
cookies = {
    '__jdu': '16196590092911498836650',
    'shshshfpa': 'a828a369-a728-e182-048e-b5072447be6e-1619659011',
    '__jdv': '76161171|baidu|-|organic|not set|1620086427678',
    'PCSYCityID': 'CN_130000_130100_0',
    'shshshfpb': 'tOCI1z%20PGPaR3CEfad3X5Hg%3D%3D',
    'user-key': 'f073a896-8985-417f-97cb-f814516b4745',
    'areaId': '5',
    'ipLoc-djd': '5-199-221-47311',
    'o2State': '{%22webp%22:true}',
    'mt_xid': 'V2_52007VwMVUlVfVlIeSBFfDWcDF1pUUVBdGE0YbAMzU0UAXQ9WRk8eTQ4ZYlNAB0FRVgoeVRkMVWYEF1sPCAUNGnkaXQZnHxNWQVhTSx5BElwHbAAXYl1oUmodQBpdAmELEVFeaFZeHEs%3D',
    '__jda': '76161171.16196590092911498836650.1619659009.1620572190.1620609100.16',
    '__jdc': '76161171',
    'shshshfp': 'ab1973b5bc2435b07e462e35ab1154e9',
    '__jdb': '76161171.31.16196590092911498836650|16.1620609100',
    'shshshsID': 'fa3eef235617f6894add8c87cdccdd86_10_1620610403184',
}
# Browser-like request headers, sent with the per-product requests.get
# calls so the detail pages are served as they would be to Chrome.
headers = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="90", "Google Chrome";v="90"',
    'sec-ch-ua-mobile': '?0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 FS',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Sec-Fetch-Site': 'cross-site',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document',
    'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,ja;q=0.6',
}
# response = requests.get('https://www.jd.com/', headers=headers, cookies=cookies)

# Module-level resources shared by all functions below.
# NOTE(review): these run at import time — importing this module launches
# Chrome and connects to MySQL as a side effect.
driver = webdriver.Chrome()  # selenium-controlled Chrome instance
resultlist = []  # per-page scrape results; cleared by Parse_Html_Page()
date = datetime.datetime.now().strftime('%Y-%m-%d')  # scrape date, e.g. "2021-05-10"
# Connection to the local `jd` MySQL database.
# NOTE(review): credentials are hard-coded; move to config/env.
mydb = mysql.connector.connect(
    host="localhost",
    user="root",
    passwd="629329",
    database="jd"
)
mycursor = mydb.cursor()
def insertMysql(good_list, page):
    """Insert every scraped row from *good_list* into the `pachong` table.

    Each row is [id, price, name, link, image] as built by
    Parse_Html_Page(); *page* is accepted for interface compatibility but
    is not used.  Commits once after all rows are executed.
    """
    insert_stmt = (
        "INSERT INTO pachong (date, id, price, name, link, image, origin) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s)"
    )
    for good_id, good_price, good_name, link, image in good_list:
        mycursor.execute(
            insert_stmt,
            (date, good_id, good_price, good_name, link, image, "京东"),
        )
        print("插入成功")
    mydb.commit()
def next_page(page):
    """Advance to results page *page*, then parse it and persist its rows.

    For pages 2..30 this clicks the "next page" button first; page 1 is
    assumed to already be displayed after the search.

    NOTE(review): the original paste lost its indentation — reconstructed
    so that Parse_Html_Page()/insertMysql() run for page 1 as well (only
    the click is conditional); confirm against the original source.
    """
    if page>1 and page<=30:
        driver.find_elements_by_xpath("//*[@id ='J_bottomPage']/span[1]/a[9]")[0].click()  # simulate clicking the "next page" button
    Parse_Html_Page()
    insertMysql(resultlist,page)
def Parse_Html_Page():
    """Parse the currently displayed JD results page into ``resultlist``.

    Each entry appended is [id, price, name, link, image].  Scrolls to the
    bottom first because JD lazy-loads each 60-item page in two batches of
    30, then fetches every product's detail page to extract its article
    number ("货号").
    """
    resultlist.clear()  # avoid accumulating rows from the previous page
    time.sleep(5)
    # Scroll to the bottom so the lazily-loaded second batch of items renders.
    js = "var q=document.documentElement.scrollTop=10000"
    driver.execute_script(js)
    time.sleep(5)  # give the newly-triggered items time to load
    html = driver.page_source  # full page source after everything rendered
    soup = BeautifulSoup(html, "html.parser")
    # All product cards live under the #J_goodsList list.
    goodslist = soup.select("#J_goodsList>ul>li")
    # FIX: hoisted out of the loop — the pattern is loop-invariant and was
    # being recompiled for every product.
    getK = re.compile(r"<li title='.*?'>货号:(.*?)</li>")
    for good in goodslist:
        temp = []
        good_price = good.find("i").text
        good_name = good.find_all("em")[1].text
        # select() returns a ResultSet; index [0] to get the usable Tag.
        Link = good.select("div > div.p-img > a")
        link = "https:" + Link[0]["href"]
        print(link)
        # Fetch the product detail page to extract the article number.
        res = requests.get(link, headers=headers, cookies=cookies).text
        good_id = ''.join(re.findall(getK, res))  # renamed: `id` shadowed the builtin
        print(good_id)
        imgsrc = good.select("div > div.p-img > a>img")
        # Lazy-loaded images keep the real URL in data-lazy-img until the
        # attribute flips to "done", after which src holds it.
        if imgsrc[0]["data-lazy-img"] == "done":
            image = "https:" + imgsrc[0]["src"]
        else:
            image = "https:" + imgsrc[0]["data-lazy-img"]
        print(image)
        temp.append(good_id)
        temp.append(good_price)
        temp.append(good_name)
        temp.append(link)
        temp.append(image)
        resultlist.append(temp)
def main_index(key):
    """Open jd.com, search for *key*, and scrape every results page.

    Reads the total page count from the pager and calls next_page() for
    each page.  Any exception (missing element, DB error, network failure)
    is printed and aborts the crawl — nothing is retried.
    """
    driver.get("https://www.jd.com/")  # open the JD home page
    driver.maximize_window()  # maximize the window
    try:
        driver.find_element_by_id("key").send_keys(key)  # type the keyword into the search box
        driver.find_element_by_xpath("//div[@id='search']/div/div[2]/button").click()  # simulate clicking the search button
        time.sleep(5)
        total=driver.find_elements_by_xpath("//div[@id='J_bottomPage']/span[2]/em/b")[0].text  # read the total page count
        for i in range(1,int(total)+1):  # iterate pages starting from page 1
            next_page(i)
    except Exception as e:
        print(e)
if __name__=="__main__":
start_time=time.time()
print("爬虫开始时间%s" %start_time)
main_index("衣服")
end_time=time.time()
print("共耗时%s" %(end_time-start_time))

浙公网安备 33010602011771号