爬虫基础：用 bs4 和 XPath 两种方式提取标签下的所有文本
bs4方式
import requests
from lxml import etree
from bs4 import BeautifulSoup
import time
import os
# Fetch a single page and extract all text nested under <div id="content">
# using BeautifulSoup.
headers = {
    # Browser-style User-Agent string sent with the request.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}
url = 'http://www.biquge.info/32_32050/12195102.html'
#url = 'https://cl.fs55.xyz/htm_data/2008/20/4050969.html'

# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url=url, headers=headers, timeout=10)
# Stop on 4xx/5xx instead of parsing an error page as if it were the content.
response.raise_for_status()
# Force UTF-8 when decoding response.text.
response.encoding = 'utf-8'

# bs4 approach
soup = BeautifulSoup(response.text, 'lxml')
content_div = soup.find('div', id='content')
if content_div is None:
    # find() returns None when nothing matches; report that explicitly
    # instead of failing with an AttributeError on `.text`.
    raise ValueError('no <div id="content"> found at {}'.format(url))
# `.text` joins every string nested under the tag. Each run of four
# non-breaking spaces becomes a blank line (presumably the site's paragraph
# indent -- confirm against the page).
content = content_div.text.replace('\xa0' * 4, '\n\n')
xpath方式
import requests
from lxml import etree
from bs4 import BeautifulSoup
import time
import os
# Fetch a single page and extract all text nested under <div id="content">
# using lxml and XPath.
headers = {
    # Browser-style User-Agent string sent with the request.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}
url = 'http://www.biquge.info/32_32050/12195102.html'
#url = 'https://cl.fs55.xyz/htm_data/2008/20/4050969.html'

# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url=url, headers=headers, timeout=10)
# Stop on 4xx/5xx instead of parsing an error page as if it were the content.
response.raise_for_status()
# Force UTF-8 when decoding response.text.
response.encoding = 'utf-8'

# XPath approach
tree = etree.HTML(response.text)
content_nodes = tree.xpath('//div[@id="content"]')
if not content_nodes:
    # xpath() returns an empty list when nothing matches; report that
    # explicitly instead of failing with an IndexError on `[0]`.
    raise ValueError('no <div id="content"> found at {}'.format(url))
# string(.) yields the concatenated text of the node and all its descendants.
# Each run of four non-breaking spaces becomes a blank line (presumably the
# site's paragraph indent -- confirm against the page).
content = content_nodes[0].xpath('string(.)').replace('\xa0' * 4, '\n\n')