import requests
import requests.adapters
from bs4 import BeautifulSoup
from lxml import etree
from pyquery import PyQuery as pq
import re


def get_url_txt(url, headers, encoding, params=None):
    """Fetch a URL and return the decoded body as text, or '' on failure."""
    ret = ''
    try:
        session = requests.Session()
        # Mount an adapter with retries; assigning to
        # requests.adapters.DEFAULT_RETRIES after import has no effect,
        # because HTTPAdapter only reads it as a default argument.
        adapter = requests.adapters.HTTPAdapter(max_retries=5)
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        # session.keep_alive is not a real requests attribute; keep-alive is
        # disabled through the Connection header instead.
        session.headers['Connection'] = 'close'
        # Query parameters belong in params= for a GET request, not data=.
        response = session.get(url, headers=headers, params=params)
        if response.status_code == 200:
            response.encoding = encoding
            ret = response.text
        response.close()
        session.close()
    except Exception as e:
        print(e)
    return ret


def get_url_byte(url, headers, params=None):
    """Fetch a URL and return the raw response bytes, or b'' on failure."""
    ret = b''
    try:
        session = requests.Session()
        adapter = requests.adapters.HTTPAdapter(max_retries=5)
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        session.headers['Connection'] = 'close'
        response = session.get(url, headers=headers, params=params)
        if response.status_code == 200:
            ret = response.content
        response.close()
        session.close()
    except Exception as e:
        print(e)
    return ret
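

# A minimal usage sketch for get_url_byte, which the __main__ block below never
# calls: fetch a binary resource and write the raw bytes to disk. The URL and
# filename are hypothetical placeholders, not taken from the original script.
def demo_get_url_byte():
    headers = {'User-Agent': 'Mozilla/5.0'}
    img = get_url_byte('https://example.com/logo.png', headers=headers)
    if img:
        with open('logo.png', 'wb') as f:
            f.write(img)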


if __name__ == '__main__':
    url1 = 'https://www.baidu.com/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
    encoding = 'utf8'
    text = get_url_txt(url=url1, headers=headers, encoding=encoding)
    # print(text)
"""
<a href="http://news.baidu.com" name="tj_trnews" class="mnav">新闻</a>
selector #u1 > a:nth-child(2)
xpath //*[@id="u1"]/a[2]
full xpath /html/body/div[1]/div[1]/div/div[3]/a[2]
"""
pass # bs4 .get_text() .get('属性')
    soup = BeautifulSoup(text, 'lxml')
    # soup.prettify() returns pretty-printed HTML; its result is unused here
    rets = soup.select('#u1 > a:nth-child(2)')
    print(rets)
    print(rets[0].get_text(), rets[0].get('href'))
    pass  # xpath: //text(), //@attribute
    tree = etree.HTML(text)
    rets1 = tree.xpath('//*[@id="u1"]/a[2]//text()')  # text nodes
    rets2 = tree.xpath('//*[@id="u1"]/a[2]//@href')  # attribute values
    print(rets1, rets2)
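    # The reference block above also lists a "full xpath" for the same element;
    # this mirrors it, but absolute paths break whenever the page layout shifts.
    rets3 = tree.xpath('/html/body/div[1]/div[1]/div/div[3]/a[2]//text()')
    print(rets3)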
    pass  # PyQuery: tag 'tagname', id '#id-value', class '.class-value'
    doc = pq(text)
    tags = doc('a')  # select by tag name
    print(len(tags), tags)
    print(pq(tags[0]).text(), pq(tags[0]).attr('href'))
    ids = doc('#u1')  # select by id
    print(len(ids), ids)
    classes = doc('.mnav')  # select by class
    print(len(classes), classes)
    print(pq(classes[0]).text(), pq(classes[0]).attr('href'))
    pass  # re
    """
    Extract the characters between two delimiters (delimiters excluded):
    ret = re.findall(r'>(.*?)<', s)  # text between > and <
    Delete the characters between two delimiters (delimiters included):
    del_between = re.compile(r'\{.*?\}')  # everything between { and }
    ret = del_between.sub('', s)
    Delete specific characters:
    ret = re.sub(r'\s', '', s)  # strip all whitespace, not only spaces
    Extract Chinese characters, letters, and digits:
    ret = re.findall(r'[\u4e00-\u9fa5a-zA-Z0-9]', s)
    ret = re.findall(r'[\u4e00-\u9fa5]', s)
    ret = re.findall(r'[a-z]', s)
    ret = re.findall(r'[A-Z]', s)
    ret = re.findall(r'[0-9]', s)
    """
pass