import re  # regular expressions
from multiprocessing import Pool  # process pool

from pyquery import PyQuery as pq  # HTML parsing
from requests.exceptions import RequestException  # base class of requests' errors
from selenium import webdriver
from selenium.common.exceptions import TimeoutException  # raised by WebDriverWait.until on timeout
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from config import *  # settings from config.py in the current directory (e.g. SERVICE_ARGS)
# Echo the PhantomJS service arguments that were loaded from config.
print(SERVICE_ARGS)

# Headless PhantomJS browser shared by every function in this module.
# For debugging with a visible window, webdriver.Chrome() can be used instead
# (requires chromedriver to be installed).
bro = webdriver.PhantomJS(service_args=SERVICE_ARGS)

# Shared explicit wait with a 10-second timeout. It must exist at module
# level: search(), next_page() and get_pr() all reference it as a global, and
# without it every wait.until(...) call fails with NameError.
wait = WebDriverWait(bro, 10)

# bro.set_window_size(1400, 900)  # optional: set the browser window size
def search():
    """Open the Zongheng search page, search for '都市' and scrape page one.

    Loads the search page, types the keyword into the search box, submits the
    form, waits for the ``#totalPage`` element and then calls ``get_pr()`` to
    print the books on the first results page.

    Returns:
        The text of the ``#totalPage`` element.

    If any wait times out, the whole search is retried and the retry's result
    is returned. Retries are unbounded, so a page that never loads eventually
    ends in ``RecursionError``.
    """
    print('正在搜索...')
    try:
        bro.get('http://search.zongheng.com')
        # Named search_box rather than `input` so the builtin is not shadowed.
        search_box = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#commSearch > div > input.search-text.fl')))
        submit = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#commSearch > div > input.search-btn.fr')))
        search_box.send_keys('都市')
        submit.click()
        total = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#totalPage')))
        get_pr()
        return total.text
    except TimeoutException:
        # Propagate the retry's result; without `return` the caller would
        # receive None after a retried search.
        return search()
def next_page(total):
    """Jump to results page ``total`` through the pager box and scrape it.

    Args:
        total: The page number to jump to. It is typed into the pager's
            page-number input and then compared against the text of the
            pager's active link to detect that the page has loaded.

    After the requested page is active, ``get_pr()`` is called to print its
    books. If any wait times out, the same page is retried. Retries are
    unbounded, so a page that never loads eventually ends in
    ``RecursionError``.
    """
    print('正在翻页', total)
    try:
        # Named page_box rather than `input` so the builtin is not shadowed.
        page_box = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'body > div.wrap > div.search-html-box.clearfix > div.search-main.fl > div.search-tab > div.search_d_pagesize > input.search_d_page_value')))
        submit = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'body > div.wrap > div.search-html-box.clearfix > div.search-main.fl > div.search-tab > div.search_d_pagesize > input.search_d_page_submit')))
        page_box.clear()
        page_box.send_keys(str(total))
        submit.click()
        # The page is considered loaded once the active pager link shows the
        # requested page number.
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, 'body > div.wrap > div.search-html-box.clearfix > div.search-main.fl > div.search-tab > div.search_d_pagesize > a.active'),
            str(total)))
        get_pr()
    except TimeoutException:
        # wait.until() signals a timeout with TimeoutException, matching the
        # handler in search(); requests' RequestException is never raised by
        # these Selenium calls, so catching it would leave the retry dead.
        next_page(total)
def get_pr():
    """Print one dict per book found on the currently loaded results page.

    Waits until the results container is present, parses the browser's page
    source with PyQuery, and runs a regular expression over the HTML of each
    matched container. Every regex match is printed as a dict with the keys
    '图片', '地址', '书名', '作者', '连载' and '介绍'.
    """
    container_css = 'body > div.wrap > div.search-html-box.clearfix > div.search-main.fl > div.search-tab'
    # Eight capture groups per book; group index 5 is captured but not used
    # in the printed dict.
    book_re = re.compile('.*?src="(.*?)" onerro.*?class="tit"><a href="(.*?)" target.*?">(.*?)</a>.*?}">(.*?)</a>.*?">(.*?)</a>.*?</em><span>(.*?)</span>.*?em><span>(.*?)</span>.*?<p>.*?(.*?)</p>', re.S)

    wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, container_css)))
    document = pq(bro.page_source)

    for container in document(container_css).items():
        for fields in book_re.findall(str(container)):
            print({
                '图片': fields[0],
                '地址': fields[1],
                '书名': fields[2],
                # The author value is the concatenation of two adjacent captures.
                '作者': fields[3] + fields[4],
                '连载': fields[6],
                '介绍': fields[7],
            })
def main():
    """Run the crawl: search, then visit every remaining results page.

    ``search()` scrapes page one and returns the total page count as text;
    pages 2 through that count are then loaded one by one with
    ``next_page()``. The browser is shut down when the crawl finishes or
    fails.
    """
    try:
        total = search()
        for page in range(2, int(total) + 1):
            next_page(page)
    finally:
        # quit() ends the whole driver session, whereas close() only closes
        # the current window. Running it in `finally` means the browser is
        # also released when the crawl raises.
        bro.quit()
if __name__ == '__main__':
    # NOTE(review): the call to main() below is commented out, so running this
    # file as a script executes the module-level setup but never starts the
    # crawl — confirm whether that is intentional.
    # pool = Pool()
    # pool.map(main)
    # main()
    pass