# Scrape news links, headlines, and images from the autohome.com.cn news page.
import os
import uuid
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

# timeout prevents the script from hanging forever on a stalled connection
response = requests.get(
    url="https://www.autohome.com.cn/news/",
    timeout=10,
)
response.raise_for_status()  # fail fast on HTTP errors instead of parsing an error page
# apparent_encoding sniffs the page's real charset (this site serves GBK, not UTF-8)
response.encoding = response.apparent_encoding
soup = BeautifulSoup(response.text, features='html.parser')
target = soup.find(id='auto-channel-lazyload-article')
if target is None:
    # the container id is site-specific; a layout change would otherwise crash
    # below with an opaque AttributeError
    raise SystemExit("news container 'auto-channel-lazyload-article' not found")
li_list = target.find_all('li')  # one <li> per news entry
for i in li_list:
    a = i.find("a")
    if not a:
        continue  # decorative <li> with no link
    print(a.attrs.get('href'))  # news URL
    txt = a.find('h3')
    print("对象: ", txt)
    if txt is not None:
        print("文本:", txt.text)  # news headline
    # Download the entry's thumbnail image, if any.
    img = a.find('img')
    if img is None:
        continue
    img_url = img.attrs.get('src')
    if not img_url:
        continue  # lazy-loaded entries may carry no usable src
    print(img_url)
    # only scheme-relative URLs ("//host/path") need the https: prefix;
    # blindly prepending would corrupt already-absolute URLs
    if img_url.startswith("//"):
        img_url = "https:" + img_url
    img_response = requests.get(img_url, timeout=10)
    img_response.raise_for_status()
    # keep the image's real extension (.jpg/.png/...) instead of a bogus ".img"
    ext = os.path.splitext(urlparse(img_url).path)[1] or ".jpg"
    file_name = str(uuid.uuid4()) + ext
    with open(file_name, 'wb') as f:
        f.write(img_response.content)  # .content is the raw binary payload