【Python】爬虫的简单实现
1、BeautifulSoup提取信息
from bs4 import BeautifulSoup
import requests


def getpage(url, timeout=10):
    """Fetch *url* and print the text of each child of the book-list div.

    The page is requested with a mobile Chrome user-agent header, decoded as
    UTF-8 and parsed with BeautifulSoup using the lxml parser. The first
    ``<div class="book-list">`` is located and the text of every direct
    child node is printed, one ``print`` call per child.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before requests raises a
            timeout error. Without a timeout a stalled connection would
            block forever.

    Returns:
        None. Output goes to stdout. Nothing is printed when the page has
        no ``<div class="book-list">`` element.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Mobile Safari/537.36"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content.decode("utf-8"), "lxml")

    book_list = soup.find("div", {"class": "book-list"})
    if book_list is None:
        # find() returns None when nothing matches; bail out instead of
        # failing with an AttributeError on `.children`.
        return

    # `.children` yields only the direct children of the div (use
    # `.descendants` for the whole subtree). Text nodes between tags are
    # included, so whitespace-only entries are printed as well.
    for child in book_list.children:
        print(child.get_text())  # print the text content of each child


url = "https://www.qidian.com/"
getpage(url)
2、xpath提取信息
import requests
from lxml import etree


def gethtml(url, timeout=10):
    """Fetch *url* and print the link texts of the book list found via XPath.

    The page is requested with a mobile Chrome user-agent header, its
    encoding is forced to UTF-8, and the HTML is parsed with ``lxml.etree``.
    An absolute XPath expression selects the text of every ``ul/li/a`` under
    the ``book-list`` div, and the resulting list is printed as a whole.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before requests raises a
            timeout error. Without a timeout a stalled connection would
            block forever.

    Returns:
        None. The list of matched strings is printed to stdout; it is an
        empty list when the XPath matches nothing.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Mobile Safari/537.36"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Set the encoding explicitly so `response.text` is decoded as UTF-8
    # rather than with whatever encoding requests would pick on its own.
    response.encoding = "utf-8"
    html = etree.HTML(response.text)

    # The class predicates compare the full attribute value, so this path
    # only matches while the page keeps exactly these class strings.
    data = html.xpath(
        '/html/body/div[@class="wrap"]/div[@class="index-two-wrap box-center mb40 cf"]/div[@class="book-list-wrap mr30 fl"]/div[@class="book-list"]/ul/li/a/text()'
    )
    # Alternative: loop over `data` and print `item.strip()` to get one
    # entry per line with the surrounding whitespace removed.
    print(data)


url = "https://www.qidian.com/"
gethtml(url)