爬虫基础-bs4方式和xpath方式提取标签下所有文本

 bs4方式

import requests
from lxml import etree
from bs4 import BeautifulSoup
import time
import os

# Request headers: send a desktop-browser User-Agent string with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}
url = 'http://www.biquge.info/32_32050/12195102.html'

# Bound the request with a timeout so an unresponsive server cannot hang the
# script forever, and fail on an HTTP error status instead of going on to
# parse an error page as if it were the chapter.
response = requests.get(url=url, headers=headers, timeout=10)
response.raise_for_status()
# Force UTF-8 decoding of the body before reading response.text.
response.encoding = 'utf-8'

# bs4 approach: parse the page, locate <div id="content"> and collect all of
# the text beneath it (including text of nested tags).
soup = BeautifulSoup(response.text, 'lxml')
content_div = soup.find('div', id='content')
if content_div is None:
    # find() returns None when nothing matches; report that explicitly rather
    # than failing with an AttributeError on `.text`.
    raise ValueError('no <div id="content"> element found in ' + url)
# Replace each run of four non-breaking spaces (presumably the site's
# paragraph indent -- verify against the page) with a blank line.
content = content_div.text.replace('\xa0' * 4, '\n\n')

 xpath方式

import requests
from lxml import etree
from bs4 import BeautifulSoup
import time
import os

# Request headers: send a desktop-browser User-Agent string with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}
url = 'http://www.biquge.info/32_32050/12195102.html'

# Bound the request with a timeout so an unresponsive server cannot hang the
# script forever, and fail on an HTTP error status instead of going on to
# parse an error page as if it were the chapter.
response = requests.get(url=url, headers=headers, timeout=10)
response.raise_for_status()
# Force UTF-8 decoding of the body before reading response.text.
response.encoding = 'utf-8'

# XPath approach: parse the page with lxml and select every <div id="content">.
tree = etree.HTML(response.text)
content_nodes = tree.xpath('//div[@id="content"]')
if not content_nodes:
    # An empty result list means the page has no matching element; report that
    # explicitly rather than failing with a bare IndexError on `[0]`.
    raise ValueError('no <div id="content"> element found in ' + url)
# string(.) yields the concatenated text of the node and all its descendants.
# Replace each run of four non-breaking spaces (presumably the site's
# paragraph indent -- verify against the page) with a blank line.
content = content_nodes[0].xpath('string(.)').replace('\xa0' * 4, '\n\n')

 

 

posted @ 2020-08-15 17:15  消磨_时间  阅读(637)  评论(0)  编辑  收藏  举报