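# Hands-on project: simulate a login to DXY (Dingxiangyuan) and scrape the
# basic user information and the reply content of every post on a forum page.
#
# DXY forum thread: http://www.dxy.cn/bbs/thread/626626#626626
# DXY username: xxxx
# Password: ABcd1234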
from selenium import webdriver
import time
from lxml import etree
class DingxiangyuanLogin():
    """Log in to DXY (Dingxiangyuan) with Selenium and scrape one forum thread.

    ``run`` drives a Chrome browser through the login form, opens the thread
    page, and parses every reply (author block, author rank block, post body)
    out of the rendered HTML with lxml.
    """

    # Desktop Chrome user agent presented by the automated browser.
    USER_AGENT = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    )
    LOGIN_URL = "https://auth.dxy.cn/accounts/login?"
    THREAD_URL = "http://www.dxy.cn/bbs/thread/626626#626626"
    # Seconds to pause after each page transition so the page can settle.
    PAGE_WAIT_SECONDS = 3

    def run(self, username='xxxx', password='ABcd1234', thread_url=THREAD_URL):
        """Log in, fetch the thread page and return the parsed replies.

        Args:
            username: Account name typed into the login form.
            password: Password typed into the login form.
            thread_url: Forum thread page to scrape after logging in.

        Returns:
            A list of dicts, one per reply node found on the page, with the
            keys ``auth_name``, ``auth_rank`` and ``reply_content``.
        """
        options = webdriver.ChromeOptions()
        # No quotes around the value: the argument is handed to Chrome as-is,
        # so quotes would end up inside the user-agent string itself.
        options.add_argument('--user-agent=' + self.USER_AGENT)
        chrome_driver = webdriver.Chrome(options=options)
        try:
            self._login(chrome_driver, username, password)
            chrome_driver.get(thread_url)
            page_source = chrome_driver.page_source
        finally:
            # Always shut the browser down, even when a step above fails,
            # so no Chrome/chromedriver process is left behind.
            chrome_driver.quit()
        return self._parse_replies(page_source)

    def _login(self, driver, username, password):
        """Fill in and submit the username/password login form."""
        # Local import: ``By`` ships with the selenium package this file
        # already depends on; the ``find_element(By.…)`` form works on both
        # Selenium 3 and 4, unlike the removed ``find_element_by_*`` helpers.
        from selenium.webdriver.common.by import By

        driver.get(self.LOGIN_URL)
        time.sleep(self.PAGE_WAIT_SECONDS)

        # The second link inside the tab bar is the "PC login" tab, which
        # switches the page to the username/password form.
        tab_bar = driver.find_element(By.CLASS_NAME, 'login__tab_wp')
        tab_bar.find_elements(By.TAG_NAME, 'a')[1].click()
        time.sleep(self.PAGE_WAIT_SECONDS)

        driver.find_element(By.NAME, 'username').send_keys(username)
        driver.find_element(By.NAME, 'password').send_keys(password)
        driver.find_element(By.CLASS_NAME, 'form__button').click()
        # Give the login request time to finish before the caller navigates
        # away; presumably the session is only established once the submit
        # round-trip completes -- TODO confirm against the live site.
        time.sleep(self.PAGE_WAIT_SECONDS)

    def _parse_replies(self, page_source):
        """Extract author, rank and body text of every reply in the HTML."""
        tree = etree.HTML(page_source)
        # Every reply lives in a <div> whose id starts with "post_".
        replies = tree.xpath('//div[starts-with(@id, "post_")]')
        print("replies: " + str(replies))

        reply_list = []
        for reply in replies:
            print('reply: ' + str(reply))
            auth_name = self._first_text(reply.xpath('.//div[@class="auth"]'))
            auth_rank = self._first_text(
                reply.xpath('.//div[@class="info clearfix"]'))
            print("auth_rank: " + str(auth_rank))
            reply_content = self._first_text(
                reply.xpath('.//td[@class="postbody"]'))
            print('reply_content: ' + str(reply_content))
            reply_list.append({
                'auth_name': auth_name,
                'auth_rank': auth_rank,
                'reply_content': reply_content,
            })
        return reply_list

    @staticmethod
    def _first_text(nodes):
        """Return the stripped text of the first node, or '' if there is none.

        Guards against "post_*" divs that lack the requested sub-node, which
        would otherwise raise IndexError and abort the whole scrape.
        """
        if not nodes:
            return ''
        return nodes[0].xpath('string(.)').strip()
# Run the scraper only when this file is executed as a script, so that
# importing the module does not launch a browser as a side effect.
if __name__ == "__main__":
    DingxiangyuanLogin().run()