# Scrape news links, headlines, and images from the autohome.com.cn news page.
import os
import uuid
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

# timeout prevents the script from hanging forever on a stalled connection
response = requests.get(
    url="https://www.autohome.com.cn/news/",
    timeout=10,
)
response.raise_for_status()  # fail fast on HTTP errors instead of parsing an error page
# apparent_encoding sniffs the page's real charset (this site serves GBK, not UTF-8)
response.encoding = response.apparent_encoding
soup = BeautifulSoup(response.text, features='html.parser')
target = soup.find(id='auto-channel-lazyload-article')
if target is None:
    # the container id is site-specific; a layout change would otherwise crash
    # below with an opaque AttributeError
    raise SystemExit("news container 'auto-channel-lazyload-article' not found")
li_list = target.find_all('li')  # one <li> per news entry
for i in li_list:
    a = i.find("a")
    if not a:
        continue  # decorative <li> with no link
    print(a.attrs.get('href'))  # news URL
    txt = a.find('h3')
    print("对象: ", txt)
    if txt is not None:
        print("文本:", txt.text)  # news headline
    # Download the entry's thumbnail image, if any.
    img = a.find('img')
    if img is None:
        continue
    img_url = img.attrs.get('src')
    if not img_url:
        continue  # lazy-loaded entries may carry no usable src
    print(img_url)
    # only scheme-relative URLs ("//host/path") need the https: prefix;
    # blindly prepending would corrupt already-absolute URLs
    if img_url.startswith("//"):
        img_url = "https:" + img_url
    img_response = requests.get(img_url, timeout=10)
    img_response.raise_for_status()
    # keep the image's real extension (.jpg/.png/...) instead of a bogus ".img"
    ext = os.path.splitext(urlparse(img_url).path)[1] or ".jpg"
    file_name = str(uuid.uuid4()) + ext
    with open(file_name, 'wb') as f:
        f.write(img_response.content)  # .content is the raw binary payload