Case study: scraping the Tianya forum teacher contact list with XPath

URL: https://bbs.tianya.cn/m/post-140-393974-1.shtml

 

XPath code that saves the results to a CSV file

import requests
from lxml import etree
import csv

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
url  = 'https://bbs.tianya.cn/m/post-140-393974-1.shtml'

def list_to_str(lst):
    # Join a list of text fragments into a single string; lst must be a list
    if isinstance(lst, list):
        return ''.join(lst)

def get_link(url):
    resp = requests.get(url, headers=headers)
    resp.encoding = 'UTF-8'
    html_tree = etree.HTML(resp.text)
    # Each post block: div.content > div whose class contains "item-ht"
    data = html_tree.xpath('//div[@class="content"]/div[contains(@class,"item-ht")]')
    for item in data:
        # Poster nickname, with surrounding whitespace stripped
        nick_name = item.xpath('./div[1]/a/h4/text()')[0]
        nick_name = nick_name.strip()
        # The post body can be split across several <p> tags; join them into one string
        contents = item.xpath('./div[2]/div/p/text()')
        contents = list_to_str(contents)
        name_info = item.xpath('./div[1]/a/p/text()')[0]
        # print("nick_name ==>", nick_name)
        # print("name_info ==>", name_info)
        # print("contents ==>", contents)
        # Append one row per post; the with-block closes the file, so no explicit close() is needed
        with open("天涯论坛教师通讯录.csv", "a", newline="", encoding="utf-8") as cf:
            w = csv.writer(cf)
            w.writerow([nick_name, name_info, contents])

# Pagination: the thread spans pages 1-47
for page in range(1, 48):
    full_url = "https://bbs.tianya.cn/m/post-140-393974-{}.shtml".format(page)
    get_link(full_url)
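
As a side note, the script above reopens the CSV file once per row. Below is a minimal refactor sketch (not from the original post): open the file once, write a header row, and pass the csv.writer into get_link. The header names and the extra writer parameter are assumptions for illustration; the XPath logic is unchanged.

import requests
import csv
from lxml import etree

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}

def get_link(url, writer):
    # Same XPath extraction as above, but rows go to an already-open csv.writer
    resp = requests.get(url, headers=headers)
    resp.encoding = 'UTF-8'
    html_tree = etree.HTML(resp.text)
    for item in html_tree.xpath('//div[@class="content"]/div[contains(@class,"item-ht")]'):
        nick_name = item.xpath('./div[1]/a/h4/text()')[0].strip()
        contents = ''.join(item.xpath('./div[2]/div/p/text()'))
        name_info = item.xpath('./div[1]/a/p/text()')[0]
        writer.writerow([nick_name, name_info, contents])

with open("天涯论坛教师通讯录.csv", "w", newline="", encoding="utf-8") as cf:
    w = csv.writer(cf)
    w.writerow(["nick_name", "name_info", "contents"])  # assumed column names
    for page in range(1, 48):
        get_link("https://bbs.tianya.cn/m/post-140-393974-{}.shtml".format(page), w)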
