'''Scrape web page content with Selenium.'''
import re
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
from config import *
# Alternative PhantomJS driver, currently disabled (SERVICE_ARGS presumably comes from config):
# driver = webdriver.PhantomJS(service_args=SERVICE_ARGS)
# Browser session shared by the functions below; created as soon as this module is loaded.
driver = webdriver.Chrome()
# Shared explicit wait bound to the driver: each wait.until(...) gives up after 10 seconds.
wait = WebDriverWait(driver, 10)
# driver.set_window_size(1400,900) # Original note: with this call the page content could be scraped; without it a TimeOut error occurred.
def search(max_retries=3):
    """Search Tmall for KEYWORD and return the text of the pager's skip form.

    Loads the Tmall home page, types KEYWORD into the search box, submits,
    clicks the ``#J_Filter > a.fType-w`` filter link (presumably the shop
    view -- confirm against the live page), prints the shops found on the
    first result page via ``get_shopname()`` and finally returns the text of
    the "skip to page" form, which the caller parses for the page count.

    Args:
        max_retries: How many times the whole sequence is attempted when a
            wait times out. The original implementation retried through
            unbounded recursion, which could only end in ``RecursionError``.

    Returns:
        The visible text of the pager skip form.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    last_error = None
    for _attempt in range(max_retries):
        print('正在搜索')
        try:
            driver.get('http://www.tmall.com')
            s_input = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#mq')))
            sumbit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#mallSearch > form > fieldset > div > button')))
            s_input.send_keys(KEYWORD)
            sumbit.click()
            shop = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#J_Filter > a.fType-w')))
            shop.click()
            totle = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR,
                 '#content > div > div.ui-page > div > b.ui-page-skip > form')))
            # A timeout while reading the shop list also restarts the search,
            # exactly as in the original try block.
            get_shopname()
            return totle.text
        except TimeoutException as err:
            print('TimeOut')
            last_error = err
    raise TimeoutException(
        'search did not complete after %d attempts' % max_retries
    ) from last_error
def next_page(page_num, max_retries=3):
    """Jump to result page ``page_num`` and print the shops listed on it.

    Types the page number into the pager's "skip to" input, submits the
    form, waits until the highlighted current-page marker contains
    ``str(page_num)`` and then calls ``get_shopname()``.

    Args:
        page_num: Page number to navigate to.
        max_retries: How many times the jump is attempted when a wait times
            out. The original implementation retried through unbounded
            recursion, which could only end in ``RecursionError``.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    pager = '#content > div > div.ui-page > div > '
    last_error = None
    for _attempt in range(max_retries):
        print('正在翻页', page_num)
        try:
            s_input = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR,
                 pager + 'b.ui-page-skip > form > input.ui-page-skipTo')))
            sumbit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, pager + 'b.ui-page-skip > form > button')))
            s_input.clear()
            s_input.send_keys(page_num)
            sumbit.click()
            # Only scrape once the pager confirms the requested page is shown.
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, pager + 'b.ui-page-num > b.ui-page-cur'),
                str(page_num)))
            get_shopname()
            return
        except TimeoutException as err:
            print('TimeOut')
            last_error = err
    raise TimeoutException(
        'page %s did not load after %d attempts' % (page_num, max_retries)
    ) from last_error
def get_shopname():  # Collect each shop's name, link and score
    """Print one dict per shop header found on the current result page.

    Waits for at least one ``#J_ItemList .shopBox .shopHeader`` element,
    parses the page source with pyquery and prints, for every shop header, a
    dict with the keys ``shopmessage``, ``shoplink`` and ``shop_score``.

    ``shoplink`` is ``None`` when the shop anchor has no ``href`` (the
    original raised ``TypeError`` here). An ``href`` that already starts
    with ``http://`` or ``https://`` is kept as is instead of being prefixed
    a second time; any other value is prefixed with ``http:`` as before.

    Raises:
        TimeoutException: If no shop header appears before the wait expires.
    """
    selector = '#J_ItemList .shopBox .shopHeader'
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
    doc = pq(driver.page_source)
    for item in doc(selector).items():
        href = item.find('.sHe-shop').attr('href')
        if not href:
            shoplink = None
        elif href.startswith(('http://', 'https://')):
            shoplink = href
        else:
            shoplink = 'http:' + href
        shopname = {
            'shopmessage': item.find('.shopHeader-info').text(),
            'shoplink': shoplink,
            'shop_score': item.find('.shopDsr-con').text(),
        }
        print(shopname)
# def login():
# login_sumbit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_Quick2Static')))
# print('点击使用用户名和密码登录')
# login_sumbit.click()
# user = driver.find_element_by_id('TPL_username_1')
# print('输入用户名')
# user.send_keys(USER)
# password = driver.find_element_by_id('TPL_password_1')
# print('输入密码')
# password.send_keys(PASSWORD)
# sumbit = driver.find_element_by_id('J_SubmitStatic')
# sumbit.click()
# return driver.page_source
def main():
    """Run the whole crawl: search, then visit every remaining result page.

    The first integer found in the pager text returned by ``search()`` is
    used as the total page count. Pages 2..total are visited with a
    two-second pause after each one. The browser is closed when the crawl
    finishes or fails, so no Chrome process is left behind.

    Raises:
        ValueError: If the pager text contains no number.
    """
    try:
        pager_text = search()
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        match = re.search(r'(\d+)', pager_text)
        if match is None:
            raise ValueError(
                'could not find a page count in pager text: %r' % pager_text)
        totle = int(match.group(1))
        for num in range(2, totle + 1):
            next_page(num)
            time.sleep(2)
    finally:
        driver.quit()
# Start the crawl only when this file is executed as a script.
if __name__ == '__main__':
    main()