python 爬虫知识点

1、使用库:requests、BeautifulSoup

2、requests

response = requests.get(
url='https://www.autohome.com.cn/news/'
)

response.encoding = response.apparent_encoding
response.text
response.content
response.status_code

3、BeautifulSoup
转换成soup对象
soup = BeautifulSoup(response.text,features='html.parser') #默认用html.parser,生产用lxml,性能更好
根据id查找
soup.find(id="chazy")
查找li、div、img等html标签下的文本
target = soup.find(id="auto-channel-lazyload-article").find('li') # 找到第一个li
li_list = soup.find(id="auto-channel-lazyload-article").find_all('li') # 找到所有li


4、简单示例
import requests
from bs4 import BeautifulSoup

# Fetch the autohome news listing page; use the detected (apparent) encoding
# so the Chinese page text decodes correctly.
response = requests.get(
    url='https://www.autohome.com.cn/news/'
)
response.encoding = response.apparent_encoding
print(response.status_code)
soup = BeautifulSoup(response.text, features='html.parser')  # html.parser is the default; use lxml in production for better performance

# Locate the article-list container once, then pull its <li> entries.
container = soup.find(id="auto-channel-lazyload-article")
target = container.find('li')        # first li
li_list = container.find_all('li')   # all li

for li in li_list:
    a = li.find('a')  # the anchor wrapping each article entry
    if not a:
        # Separator/ad <li> items carry no link — skip them instead of
        # crashing on a.attrs below (original code had a no-op `pass` here).
        continue
    print(a.attrs)
    print(a.attrs.get('href'))

    # Derive a title for the image file name; the original referenced an
    # undefined `title` variable. The article title is the <h3> inside <a>.
    h3 = a.find('h3')
    title = h3.text.strip() if h3 else a.text.strip()

    img = li.find('img').get('src')
    # Image src values on this site are protocol-relative ("//..."); requests
    # needs a full URL with a scheme.
    if img.startswith('//'):
        img = 'https:' + img
    res = requests.get(img)
    file_name = "%s.jpg" % (title,)
    with open(file_name, 'wb') as f:
        f.write(res.content)


posted @ 2018-07-09 15:31  yoyo008  阅读(182)  评论(0编辑  收藏  举报