# 案例-爬取天涯论坛教师通讯录:xpath
# 地址: https://bbs.tianya.cn/m/post-140-393974-1.shtml
# xpath代码,保存到CSV文件
import requests
import json
from lxml import etree
import csv
# Request headers sent with every page fetch; the User-Agent is a desktop
# Chrome browser string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/54.0.2840.99 Safari/537.36"
    ),
}

# URL of the first page of the thread.
url = 'https://bbs.tianya.cn/m/post-140-393974-1.shtml'
def list_to_str(lst):
    """Concatenate the string items of a list into a single string.

    Args:
        lst: A list of strings; items are joined in order with no separator.

    Returns:
        The concatenated string ('' for an empty list), or None when
        ``lst`` is not a list.

    Raises:
        TypeError: If the list contains a non-string item.
    """
    # Callers are expected to pass a list; anything else yields None
    # instead of raising.
    if not isinstance(lst, list):
        return None
    # str.join avoids the quadratic cost of repeated concatenation.
    return ''.join(lst)
def get_link(url):
    """Fetch one page of the thread and append its posts to the CSV file.

    Each post block matched on the page produces one row of
    ``[nick_name, name_info, contents]`` appended to
    "天涯论坛教师通讯录.csv".

    Args:
        url: Address of the page to download and parse.

    Returns:
        None. The only result is the rows appended to the CSV file.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    resp = requests.get(url, headers=headers, timeout=10)
    resp.encoding = 'UTF-8'
    html_tree = etree.HTML(resp.text)
    posts = html_tree.xpath('//div[@class="content"]/div[contains(@class,"item-ht")]')
    if not posts:
        # Nothing matched on this page, so leave the CSV file untouched.
        return

    # Open the output once per page rather than once per row; the with
    # statement closes the file, so no explicit close() is needed.
    with open("天涯论坛教师通讯录.csv", "a", newline="", encoding="utf-8") as cf:
        writer = csv.writer(cf)
        for post in posts:
            nick_names = post.xpath('./div[1]/a/h4/text()')
            name_infos = post.xpath('./div[1]/a/p/text()')
            # A post missing either author node gets an empty field
            # instead of aborting the page with an IndexError.
            nick_name = nick_names[0].strip() if nick_names else ''
            name_info = name_infos[0] if name_infos else ''
            contents = list_to_str(post.xpath('./div[2]/div/p/text()'))
            writer.writerow([nick_name, name_info, contents])
# Pagination: fetch pages 1 through 47 of the thread, one request per page.
url = 'https://bbs.tianya.cn/m/post-140-393974-1.shtml'
for page in range(1, 48):
    get_link("https://bbs.tianya.cn/m/post-140-393974-{}.shtml".format(page))
浙公网安备 33010602011771号