#coding=utf-8
import time
import requests
from lxml import etree
from pymongo import MongoClient
from selenium import webdriver
# --- Storage and browser setup ----------------------------------------------
# Host string is passed to MongoClient as-is; "IP" looks like a placeholder
# for the real server address -- TODO confirm before running.
MONGO_HOST = "IP"
MONGO_PORT = 27017
# Location of the chromedriver binary handed to Selenium.
CHROMEDRIVER_PATH = r"D:\chromedriver_win32\chromedriver.exe"

client = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
# Every scraped topic is stored as one document in Automobile.wenda_autohome.
db = client["Automobile"]
collection = db["wenda_autohome"]
# Empty user name / password: presumably filled in for a real deployment.
db.authenticate("", "")
# Single shared browser session used by the scraping loop below.
driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH)
def splist(l, s):
    """Split the sequence `l` into consecutive chunks of at most `s` items.

    The last chunk is shorter when ``len(l)`` is not a multiple of `s`;
    an empty sequence yields an empty list.

    Args:
        l: Any sliceable sequence (list, tuple, str, ...).
        s: Chunk size; must be a positive integer.

    Returns:
        A list of slices of `l`, in order.

    Raises:
        ValueError: If `s` is smaller than 1.
    """
    if s < 1:
        raise ValueError("chunk size s must be a positive integer")
    # Step directly from chunk start to chunk start instead of visiting
    # every index and filtering with a modulo test.
    return [l[start:start + s] for start in range(0, len(l), s)]
def _read_expanded_answer(position):
    """Click the "more" link of the answer at 1-based `position` and return
    the full answer text found on the page the browser shows afterwards."""
    more_link = ("//div[@class='card-reply-wrap'][" + str(position)
                 + "]//a[@class='more']")
    driver.find_element_by_xpath(more_link).click()
    answer_tree = etree.HTML(driver.page_source)
    fragments = answer_tree.xpath(
        "//div[@class='answer-content']/div"
        "/div[@class='ahe__area ahe__block ahe__text']/p/text()")
    return ''.join(fragments)


def _scrape_topic(url):
    """Load the topic page at `url` and return its record as a dict, or
    None when the page has no question title or no answers."""
    time.sleep(1)  # pause between page loads
    driver.get(url)
    tree = etree.HTML(driver.page_source)

    question = tree.xpath("//h1[@class='card-title']/text()")
    raw_answers = tree.xpath("//a[@class='text']/text()")
    if not question or not raw_answers:
        return None

    answers = []
    for position, raw in enumerate(raw_answers, 1):
        # Fixed offsets cut the surrounding characters off the text node;
        # presumably whitespace padding in the markup -- TODO confirm
        # against the live page.
        text = raw[41:-37]
        if text.endswith('...'):
            # Truncated preview: follow the "more" link for the full text,
            # then reload the topic page so the remaining links resolve.
            try:
                text = _read_expanded_answer(position)
                time.sleep(1)
                driver.get(url)
            except Exception as e:
                # Best effort: keep whatever text we have and move on.
                print(e)
        answers.append(text)

    keywords = tree.xpath("//ul[@class='card-tag-list']/li/text()")
    description = ''.join(tree.xpath(
        "//div[@class='ahe__area ahe__block ahe__text']/p/text()"))
    # Praise counters are grouped two at a time; presumably one
    # (like, dislike) pair per answer -- TODO confirm.
    praise_counts = tree.xpath("//span[@class='js-praise-count']/text()")

    return {
        'keywords': keywords,
        'question': question[0],
        # Key spelling kept as-is so new documents use the same field name
        # the script has always written.
        'discription': description,
        'answer': answers,
        'zancai': splist(praise_counts, 2),
        'url': url,
    }


# Walk the topic id range and store every topic that has a question and at
# least one answer. The finally clause makes sure the browser window is
# closed even when a page load or a database write raises.
try:
    for topic_id in range(36726, 40202):
        url = 'https://wenda.autohome.com.cn/topic/detail/' + str(topic_id)
        record = _scrape_topic(url)
        if record is not None:
            # NOTE(review): Collection.insert is deprecated in PyMongo 3 in
            # favour of insert_one; switch once the installed version is
            # confirmed.
            collection.insert(record)
finally:
    driver.close()