from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import re,json
from pyquery import PyQuery as pq
# Single browser session (Internet Explorer driver) shared by the whole script.
driver = webdriver.Ie()
# Explicit wait bound to that session: up to 20 s per condition, polled every 0.2 s.
wait = WebDriverWait(driver, timeout=20, poll_frequency=0.2)
# Page from which the search is started.
url = "http://taobao.com"
def search(max_retries=3):
    """Open the start page, search for "美食" and scrape the first result page.

    The whole sequence (load page, type keyword, submit, wait for the pager,
    scrape products) is retried from scratch when a wait times out.

    Args:
        max_retries: Number of additional attempts made after a timeout
            before giving up. Negative values are treated as 0.

    Returns:
        The text of the pager's "total" element (contains the page count).

    Raises:
        selenium.common.exceptions.TimeoutException: If every attempt
            timed out.
    """
    # WebDriverWait.until raises Selenium's TimeoutException, which is NOT a
    # subclass of the builtin TimeoutError, so it has to be caught by name.
    from selenium.common.exceptions import TimeoutException

    attempts = max(max_retries, 0) + 1
    for attempt in range(attempts):
        try:
            driver.get(url)
            # The search box being present is used as the "page loaded" signal.
            search_box = wait.until(
                EC.presence_of_element_located((By.ID, "q"))
            )
            search_box.send_keys("美食")
            submit = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     "#J_TSearchForm > div.search-button > button")
                )
            )
            submit.click()
            # Pager element holding the total number of pages.
            total = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.total")
                )
            )
            get_products()
            return total.text
        except TimeoutException:
            # Bounded retry instead of unbounded recursion; re-raise once the
            # budget is exhausted so the caller sees the real failure.
            if attempt == attempts - 1:
                raise
def next_page(page_number, max_retries=3):
    """Jump to result page ``page_number`` via the pager form and scrape it.

    Types the page number into the pager's input, clicks the confirm button,
    waits until the pager highlights that page number, then scrapes the
    products. The whole sequence is retried when a wait times out.

    Args:
        page_number: Page to navigate to.
        max_retries: Number of additional attempts made after a timeout
            before giving up. Negative values are treated as 0.

    Raises:
        selenium.common.exceptions.TimeoutException: If every attempt
            timed out.
    """
    # WebDriverWait.until raises Selenium's TimeoutException, which is NOT a
    # subclass of the builtin TimeoutError, so it has to be caught by name.
    from selenium.common.exceptions import TimeoutException

    pager = "#mainsrp-pager > div > div > div"
    attempts = max(max_retries, 0) + 1
    for attempt in range(attempts):
        try:
            # Page-number input box of the pager form.
            page_input = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, pager + " > div.form > input")
                )
            )
            # Confirm button of the pager form.
            submit = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     pager + " > div.form > span.btn.J_Submit")
                )
            )
            page_input.clear()
            page_input.send_keys(page_number)
            submit.click()
            # The jump is complete once the active pager item shows the number.
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR,
                     pager + " > ul > li.item.active > span"),
                    str(page_number),
                )
            )
            get_products()
            return
        except TimeoutException:
            # Bounded retry instead of unbounded recursion; re-raise once the
            # budget is exhausted so the caller sees the real failure.
            if attempt == attempts - 1:
                raise
def get_products():
    """Parse the currently loaded result page and persist every product found.

    Waits until at least one product node is present, parses the page source
    with PyQuery, and for each product node builds a dict with the keys
    ``image``, ``price``, ``deal``, ``title``, ``shop`` and ``location``,
    prints it, and passes it to ``save_data``.
    """
    wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#mainsrp-itemlist .items .item")
        )
    )
    # Parse a snapshot of the page rather than querying the live DOM.
    document = pq(driver.page_source)
    for node in document('#mainsrp-itemlist .items .item').items():
        image_src = node.find('.pic .img').attr('src')
        price_text = node.find('.price').text()
        # Drop the last three characters of the deal text (per the original
        # author's note, the trailing "人付款" suffix).
        deal_count = node.find('.deal-cnt').text()[:-3]
        title_text = node.find(".title").text()
        shop_name = node.find(".shop").text()
        shop_location = node.find(".location").text()
        product = {
            'image': image_src,
            'price': price_text,
            'deal': deal_count,
            'title': title_text,
            'shop': shop_name,
            'location': shop_location,
        }
        print(product)
        save_data(product)
def save_data(result):
    """Append ``result`` as one JSON line to the file ``淘宝商品信息``.

    Args:
        result: JSON-serialisable object (here, a product dict).
    """
    # Append-only mode is sufficient (nothing is read back), and the ``with``
    # block closes the file, so no explicit close() is needed.
    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    with open('淘宝商品信息', 'a', encoding="utf-8") as f:
        f.write(json.dumps(result, ensure_ascii=False) + '\n')
def main():
    """Run the crawl: search, read the page count, then visit pages 2..total.

    The browser is always shut down at the end, even if a step fails.

    Raises:
        ValueError: If the pager text returned by ``search`` has no digits.
    """
    try:
        total_text = search()
        # The pager text contains more than the number; keep only the first
        # run of digits. A raw string avoids the invalid "\d" escape warning.
        match = re.search(r'(\d+)', total_text)
        if match is None:
            raise ValueError(
                "could not find a page count in pager text: %r" % (total_text,)
            )
        total = int(match.group(1))
        print(total)
        # Page 1 was already scraped by search(), so start from page 2.
        for page in range(2, total + 1):
            next_page(page)
    finally:
        # Release the browser process regardless of how the crawl ended.
        driver.quit()
# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()