【Python】爬虫的简单实现

1、BeautifulSoup提取信息

from bs4 import BeautifulSoup
import requests

def getpage(url):
    """Fetch *url* and print the text of each child of the ``book-list`` div.

    The page is downloaded with a mobile-browser ``user-agent`` header,
    decoded as UTF-8 and parsed with BeautifulSoup using the lxml parser.
    If the page contains no ``<div class="book-list">``, nothing is printed.

    Args:
        url: Address of the page to download.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Mobile Safari/537.36"
    }
    # Without a timeout, requests may wait forever on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content.decode("utf-8"), "lxml")
    book_list = soup.find("div", {"class": "book-list"})
    if book_list is None:
        # find() returns None when no matching div exists; there is nothing
        # to print, and dereferencing None would raise AttributeError.
        return
    # .children yields only the direct children of the div, not every
    # descendant.
    for child in book_list.children:
        print(child.get_text())  # print the text content of each child node

# Demo run: scrape the Qidian home page with the BeautifulSoup-based getpage().
url = "https://www.qidian.com/"
getpage(url)

2、xpath提取信息

import requests
from lxml import etree

def gethtml(url):
    """Fetch *url* and print the link texts selected by a fixed XPath query.

    The page is downloaded with a mobile-browser ``user-agent`` header,
    decoded as UTF-8 and parsed with ``lxml.etree.HTML``. The XPath selects
    the text nodes of the ``<a>`` elements inside the list items of the
    ``book-list`` div, and the resulting list is printed as-is.

    Args:
        url: Address of the page to download.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Mobile Safari/537.36"
    }
    # Without a timeout, requests may wait forever on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()
    # Force UTF-8 so response.text does not depend on requests' encoding guess.
    response.encoding = "utf-8"
    html = etree.HTML(response.text)
    data = html.xpath('/html/body/div[@class="wrap"]/div[@class="index-two-wrap box-center mb40 cf"]/div[@class="book-list-wrap mr30 fl"]/div[@class="book-list"]/ul/li/a/text()')
    # The matched text nodes are printed unmodified as one list; surrounding
    # whitespace in each entry is not stripped.
    print(data)

# Demo run: scrape the Qidian home page with the XPath-based gethtml().
url = "https://www.qidian.com/"
gethtml(url)

 

posted @ 2023-06-08 09:53  山鬼谣`  阅读(26)  评论(0)    收藏  举报