1 from selenium import webdriver
2 from selenium.webdriver.common.by import By
3 from selenium.webdriver.support.ui import WebDriverWait
4 from selenium.webdriver.support import expected_conditions as EC
5 from selenium.common.exceptions import TimeoutException
6 import re
7 from pyquery import PyQuery
8 from day01.config import *
9 import pymongo
# Shared MongoDB handles: one client for the whole run and the target
# database named by the MONGO_DB setting.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# Single Chrome session reused by every scraping function, plus an explicit
# wait bound to it with a 10-second timeout.
browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10)
15
def _login_if_prompted():
    """Fill in and submit the login form if one shows up after searching.

    Returns:
        True when a login form was found and submitted, False when no login
        field appeared within the wait timeout.

    Raises:
        TimeoutException: if the login field is present but the password
            field or the submit button does not become available in time.
    """
    try:
        login = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#fm-login-id"))
        )
    except TimeoutException:
        # wait.until() raises instead of returning None, so "no login form"
        # must be handled here rather than with an `is not None` check.
        return False

    # NOTE(review): placeholder credentials are hard-coded; real ones should
    # come from configuration or the environment.
    login.send_keys("********")
    password = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#fm-login-password"))
    )
    password.send_keys("*********")
    login_button = wait.until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "#login-form > div.fm-btn > button")
        )
    )
    login_button.click()
    return True


def search(keyword="美食", max_retries=3):
    """Search Taobao for *keyword*, scrape the first result page, and return
    the text of the pager's "total" element.

    Args:
        keyword: Text typed into the search box. Defaults to the value the
            script always used.
        max_retries: How many additional attempts to make after a timeout
            before giving up.

    Returns:
        The text of the ``div.total`` pager element (the caller extracts the
        page count from it).

    Raises:
        TimeoutException: if every attempt times out.
    """
    attempts = max(1, max_retries + 1)
    for attempt in range(1, attempts + 1):
        try:
            browser.get("https://www.taobao.com")
            # Search input box.
            input_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#q"))
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     "#J_TSearchForm > div.search-button > button")
                )
            )
            input_box.send_keys(keyword)
            submit.click()

            # The login form is optional; its absence must not restart the
            # whole search.
            _login_if_prompted()

            total_page = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.total")
                )
            )
            get_products()
            return total_page.text
        except TimeoutException:
            # Bounded retry: re-raise on the last attempt instead of
            # recursing without limit.
            if attempt == attempts:
                raise
42
def next_page(page_number, max_retries=3):
    """Jump to result page *page_number* via the pager form and scrape it.

    Args:
        page_number: Page to navigate to.
        max_retries: How many additional attempts to make after a timeout
            before giving up.

    Raises:
        TimeoutException: if every attempt times out.
    """
    page_text = str(page_number)
    attempts = max(1, max_retries + 1)
    for attempt in range(1, attempts + 1):
        try:
            input_page = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.form > input")
                )
            )
            confirm_button = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")
                )
            )
            input_page.clear()
            input_page.send_keys(page_text)
            confirm_button.click()
            # Confirm the jump landed: the highlighted pager item must show
            # the requested page number.
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > ul > li.item.active > span"),
                    page_text,
                )
            )
            get_products()
            return
        except TimeoutException:
            # Bounded retry: re-raise on the last attempt instead of
            # recursing without limit.
            if attempt == attempts:
                raise
56
def get_products():
    """Parse every product card on the current result page and store each one.

    Waits until at least one item is present, parses the browser's page
    source with PyQuery, and passes one dict per item to ``save_to_mongo``.
    """
    item_selector = "#mainsrp-itemlist .items .item"
    wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, item_selector))
    )

    # Parse a snapshot of the rendered page rather than querying live elements.
    page = PyQuery(browser.page_source)
    for card in page(item_selector).items():
        save_to_mongo({
            'image': card.find('.pic .img').attr('src'),
            'price': card.find('.price').text(),
            # Drops the last three characters of the deal-count text
            # (presumably a unit suffix -- verify against the live page).
            'deal': card.find('.deal-cnt').text()[:-3],
            'title': card.find('.title').text(),
            'shop': card.find('.shop').text(),
            'location': card.find('.location').text(),
        })
77
def save_to_mongo(result):
    """Insert one document into the ``MONGO_TABLE`` collection.

    Best-effort: any failure is printed and swallowed so that one bad record
    does not abort the scrape.

    Args:
        result: The document (dict) to store.
    """
    try:
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the supported single-document insert.
        db[MONGO_TABLE].insert_one(result)
    except Exception as e:
        print("存储到mongodb异常,%s"%e)
    else:
        print("存储到mongodb成功")
84
85
def main():
    """Run the full scrape: search, read the page count, then visit pages 2..N.

    The first page is scraped by ``search()`` itself; the remaining pages are
    handled by ``next_page()``. The browser session is always shut down at the
    end, whether or not the scrape succeeded.

    Raises:
        ValueError: if the pager text returned by ``search()`` contains no
            number.
    """
    try:
        result = search()
        # Raw string avoids the invalid "\d" escape; the first integer in
        # the pager text is taken as the total page count.
        match = re.search(r"(\d+)", result)
        if match is None:
            raise ValueError(
                "no page count found in pager text: %r" % (result,)
            )
        total = int(match.group(1))
        for i in range(2, total + 1):
            next_page(i)
    finally:
        # Release the Chrome window and its driver process.
        browser.quit()


if __name__ == '__main__':
    main()