1 import re
2 from selenium import webdriver
3 from selenium.common.exceptions import TimeoutException
4 from selenium.webdriver.common.by import By
5 from selenium.webdriver.support.ui import WebDriverWait
6 from selenium.webdriver.support import expected_conditions as EC
7 from pyquery import PyQuery as pq
8
9 import pymongo
10
# --- MongoDB settings: host, database name and collection name. ---
MONGO_URL = 'localhost'
MONGO_DB = 'taobao'
MONGO_TABLE = 'product'

# Command-line switches passed to the PhantomJS process.
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']

# Term typed into the site's search box by search().
KEYWORD = '美食'

# Shared database handle used by save_to_mongo().
client = pymongo.MongoClient(host=MONGO_URL)
db = client[MONGO_DB]

# Shared headless browser; the window is sized before any page is loaded.
browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
browser.set_window_size(1400, 900)

# Explicit-wait helper bound to the shared browser (timeout argument: 10).
wait = WebDriverWait(browser, 10)
26
def search(max_retries=10):
    """Search Taobao for KEYWORD and scrape the first page of results.

    Loads the home page, types KEYWORD into the search box, submits the
    form, waits for the pager's ``div.total`` element and then calls
    get_products() for the page that was loaded.

    A timeout makes the whole sequence start over from the home page.
    The retries run in a loop rather than by recursion, so repeated
    timeouts neither grow the call stack nor continue without limit.

    Args:
        max_retries: Number of additional attempts made after the first
            one times out. Negative values are treated as 0.

    Returns:
        The text of the pager's ``div.total`` element.

    Raises:
        TimeoutException: If the first attempt and every retry timed out.
    """
    retries_allowed = max(max_retries, 0)
    for attempt in range(retries_allowed + 1):
        # Printed once per attempt, as each retry is a fresh search.
        print('正在搜索')
        try:
            browser.get('https://www.taobao.com')
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
            search_box.send_keys(KEYWORD)
            submit.click()
            total = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
            get_products()
            return total.text
        except TimeoutException:
            # Out of retries: let the caller see the timeout.
            if attempt == retries_allowed:
                raise
44
45
def next_page(page_number, max_retries=10):
    """Jump to result page ``page_number`` via the pager form and scrape it.

    Types the page number into the pager's input box, clicks the submit
    button, waits until the pager's active item shows ``page_number`` and
    then calls get_products().

    A timeout makes the whole sequence start over. The retries run in a
    loop rather than by recursion, so repeated timeouts neither grow the
    call stack nor continue without limit.

    Args:
        page_number: Page to navigate to.
        max_retries: Number of additional attempts made after the first
            one times out. Negative values are treated as 0.

    Raises:
        TimeoutException: If the first attempt and every retry timed out.
    """
    retries_allowed = max(max_retries, 0)
    for attempt in range(retries_allowed + 1):
        # Printed once per attempt, as each retry is a fresh page jump.
        print('正在翻页', page_number)
        try:
            page_box = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input'))
            )
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
            # Clear first so the new number replaces whatever the box holds.
            page_box.clear()
            page_box.send_keys(page_number)
            submit.click()
            # The jump is confirmed once the highlighted pager item shows the number.
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                str(page_number)))
            get_products()
            return
        except TimeoutException:
            # Out of retries: let the caller see the timeout.
            if attempt == retries_allowed:
                raise
62
63
def get_products():
    """Parse the currently loaded result page and store every listed item.

    Waits until at least one item node is present, parses the browser's
    page source with PyQuery, and for each item builds a dict with the
    keys image, price, deal, title, shop and location. Each dict is
    printed and then passed to save_to_mongo().
    """
    item_selector = '#mainsrp-itemlist .items .item'
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, item_selector)))

    page = pq(browser.page_source)
    for entry in page(item_selector).items():
        image = entry.find('.pic .img').attr('src')
        price = entry.find('.price').text()
        # Drops the last three characters of the deal text
        # (presumably a fixed unit suffix -- confirm against the live page).
        deal = entry.find('.deal-cnt').text()[:-3]
        title = entry.find('.title').text()
        shop = entry.find('.shop').text()
        location = entry.find('.location').text()

        product = {
            'image': image,
            'price': price,
            'deal': deal,
            'title': title,
            'shop': shop,
            'location': location,
        }
        print(product)
        save_to_mongo(product)
80
81
def save_to_mongo(result):
    """Insert one product document into the MONGO_TABLE collection.

    Uses ``Collection.insert_one`` because ``Collection.insert`` is
    deprecated since PyMongo 3.0 and removed in PyMongo 4.0.

    Storage is best-effort: any exception raised by the insert is caught
    and reported with a failure message, so one bad document does not
    stop the crawl.

    Args:
        result: Dict describing a single product.
    """
    try:
        outcome = db[MONGO_TABLE].insert_one(result)
        if outcome.inserted_id:
            print('存储到MONGODB成功', result)
    except Exception:
        print('存储到MONGODB失败', result)
88
89
def main():
    """Run the crawl: search, then walk result pages 2..total.

    The page count is the first run of digits in the text returned by
    search(). Any exception ends the crawl with a short message; the
    traceback is also written to stderr so the cause is not lost. The
    browser is always shut down at the end.
    """
    try:
        total_text = search()
        # Raw string so that \d is a regex escape, not a string escape.
        page_count = int(re.search(r'(\d+)', total_text).group(1))
        # Page 1 was already scraped by search().
        for page in range(2, page_count + 1):
            next_page(page)
    except Exception:
        import traceback

        print('出错啦')
        traceback.print_exc()
    finally:
        # quit() ends the driver session and its browser process;
        # close() would only close the current window.
        browser.quit()
100
# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()