1 from selenium import webdriver
2 from selenium.common.exceptions import TimeoutException
3 from selenium.webdriver.common.by import By
4 from selenium.webdriver.support.ui import WebDriverWait
5 from selenium.webdriver.support import expected_conditions as EC
6 import re
7 from pyquery import PyQuery as pq
8 from config import *
9 import pymongo
10
# MongoDB: connect with the URL from config and select the target database.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# PhantomJS browser shared by all scraping functions in this module.
browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
browser.set_window_size(1400, 900)

# Explicit wait with a 10-second timeout (polling every 0.5 s by default):
# execution continues as soon as the awaited condition holds, and a
# TimeoutException is raised if it does not hold within the timeout.
wait = WebDriverWait(browser, 10)
def search(keyword='美食'):
    """Search Taobao for *keyword* and scrape the first result page.

    Loads the Taobao home page, types the keyword into the search box,
    clicks the search button, waits for the pager's "total" element and
    then calls ``get_product`` on the loaded result page.

    Args:
        keyword: Text typed into the search box. Defaults to '美食'.

    Returns:
        The text of the pager's ``div.total`` element.

    A ``TimeoutException`` raised by any wait restarts the whole search.
    The retry is a loop rather than a recursive call, so repeated timeouts
    cannot exhaust the call stack.
    """
    while True:
        print("正在搜索")
        try:
            browser.get('https://www.taobao.com/')
            # Search input box.
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#q")))
            # Search button; wait until it can actually be clicked.
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     "#J_TSearchForm > div.search-button > button")))
            search_box.send_keys(keyword)
            submit.click()
            # The pager's total element only exists once results have loaded.
            total = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.total")))
            get_product()
            return total.text
        except TimeoutException:
            # Start over from the home page.
            continue
def next_page(page_number):
    """Jump to result page *page_number* and scrape it.

    Clears the pager's page-number box, types the requested page, clicks
    the submit button, waits until the pager's active item shows that page
    number, and then calls ``get_product``.

    Args:
        page_number: Page to jump to; it is typed into the pager input and
            compared (as a string) against the active pager item.

    A ``TimeoutException`` raised by any wait repeats the whole attempt.
    The retry is a loop rather than a recursive call, so repeated timeouts
    cannot exhaust the call stack.
    """
    while True:
        print("正在翻页", page_number)
        try:
            page_box = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.form > input")))
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")))
            # Replace whatever page number is currently in the box.
            page_box.clear()
            page_box.send_keys(page_number)
            submit.click()
            # The jump is complete once the highlighted pager item shows
            # the requested page number.
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR,
                 "#mainsrp-pager > div > div > div > ul > li.item.active > span"),
                str(page_number)))
            get_product()
            return
        except TimeoutException:
            continue
def get_product():
    """Parse every product on the current result page and store each one.

    Waits until at least one product element is present, parses the
    browser's page source with PyQuery, then builds a dict per product,
    prints it and passes it to ``save_to_mongo``.
    """
    # Descendant selector: the spaces between the parts are significant.
    item_selector = "#mainsrp-itemlist .items .item"
    wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, item_selector)))

    page = pq(browser.page_source)
    for entry in page(item_selector).items():
        deal_text = entry.find('.deal-cnt').text()
        product = dict(
            image=entry.find('.pic .img').attr('src'),
            price=entry.find('.price').text(),
            # Drop the last three characters of the deal count text
            # (presumably a fixed textual suffix -- confirm against the page).
            deal=deal_text[:-3],
            title=entry.find('.title').text(),
            shop=entry.find('.shop').text(),
            location=entry.find('.location').text(),
        )
        print(product)
        save_to_mongo(product)
def save_to_mongo(result):
    """Insert one product document into the ``MONGO_TABLE`` collection.

    Args:
        result: Dict describing a single product.

    Any failure is reported on stdout together with the exception and is
    not re-raised, so one record that cannot be stored does not stop the
    crawl.
    """
    try:
        # insert_one is the supported single-document insert; the legacy
        # Collection.insert is deprecated in PyMongo 3 and removed in
        # PyMongo 4.
        db[MONGO_TABLE].insert_one(result)
    except Exception as exc:
        print('存储失败->', result, exc)
    else:
        print('存储成功->', result)
def main(max_pages=19):
    """Run the crawl: search, then walk the following result pages.

    Args:
        max_pages: Highest page number to visit. Defaults to 19, the last
            page the loop visited when its bounds were fixed.

    The first page is scraped by ``search``; pages 2 up to
    ``min(total, max_pages)`` are scraped by ``next_page``, where ``total``
    is the first integer found in the pager text returned by ``search``.
    Capping at ``total`` avoids requesting a page that does not exist,
    which ``next_page`` would retry on without end.

    The browser is always shut down, even if scraping raises.
    """
    try:
        total_text = search()
        total = int(re.search(r'\d+', total_text).group(0))
        last_page = min(total, max_pages)
        for page in range(2, last_page + 1):
            next_page(page)
    finally:
        # quit() ends the WebDriver session and its driver process;
        # close() would only close the current window.
        browser.quit()


if __name__ == '__main__':
    main()
# ---- config.py: settings loaded by the scraper via "from config import *" ----
# MongoDB connection target passed to pymongo.MongoClient.
MONGO_URL = 'localhost'
# Name of the database selected on the Mongo client.
MONGO_DB = 'taobao'
# Name of the collection that scraped products are inserted into.
MONGO_TABLE = 'product'
# Command-line flags handed to PhantomJS through service_args:
# do not load images, and enable the disk cache.
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
# (end of config.py)