Getting Started with Python Web Scraping (Part 1)
A summary of Python web scraping basics:
Scraping an image
import requests
from bs4 import BeautifulSoup
url="https://pixiv.re/91881094.jpg"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
}
with open("./c.jpg", "wb") as f:
response=requests.get(url,headers=headers).content
f.write(response)
Note that the file is opened in "wb" mode, which writes binary data.
.content returns the response body as raw bytes,
which is generally what you use for images and other binary files.
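As a quick aside, a minimal sketch (the URL is just a placeholder) of the difference between .text and .content:
import requests

r = requests.get("https://example.com/page")   # placeholder URL
html_str = r.text       # decoded text (str), for HTML / JSON
raw_bytes = r.content   # raw bytes, for images and other binary files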
Sending a POST request
Type in an English word and get back Baidu Translate's suggestions.
import json
import requests
from bs4 import BeautifulSoup
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
}
s=input()
url="https://fanyi.baidu.com/sug"
data={
'kw':s
}
response=requests.post(url,data,headers=headers)
dic=response.json()
with open("./d.json","w",encoding='utf-8') as f:
json.dump(dic,f,ensure_ascii=False)
Note the POST format: the data dict carries the form fields, which you can read from the request payload shown in the browser's developer tools.
The response is JSON; when saving it, pass ensure_ascii=False because it contains Chinese.
For GET requests, the query string goes in params instead; see the sketch below.
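A minimal sketch of a GET request with query parameters (httpbin.org is used here only as a throwaway test endpoint, not part of the original example):
import requests

params = {'kw': 'hello'}                              # query-string parameters
response = requests.get("https://httpbin.org/get", params=params)
print(response.json())                                # httpbin echoes back the query it received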
Using regular expressions
import requests
from bs4 import BeautifulSoup
import re
import os
url="https://movie.douban.com/chart"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
}
html=requests.get(url=url,headers=headers).text
# <img src="https://img2.doubanio.com/view/photo/s_ratio_poster/public/p2897743122.webp" width="75" alt="伸冤人3" class="">
pattern=re.compile('<img src="(.*?)" width.*?alt', re.S)   # the re.S flag is passed to re.compile
src_list=pattern.findall(html)
if not os.path.exists("./pic"):
    os.makedirs("./pic")
tot=0
for src in src_list:
    tot=tot+1
    response=requests.get(url=src,headers=headers).content
    img_path="./pic/"+str(tot)+'.jpg'
    with open(img_path,"wb") as f:
        f.write(response)
For example, to grab poster images from Douban's movie chart, first download the whole page, then use a regular expression to extract each image's URL, and then save each image just like in the first example.
The os module is used to create the output directory.
Fetching JSON, then accessing the URLs you need through dicts and lists
import json
import requests
from bs4 import BeautifulSoup
import re
import os
url="https://movie.douban.com/j/chart/top_list"
param={
'type': '24',
'interval_id': '100:90',
'action':'',
'start': '20',
'limit': '20',
}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
}
file_path='./pic_of_movie'
if not os.path.exists(file_path):
    os.makedirs(file_path)
tot=0
for id in range(0,40,20):
    param['start']=str(id)
    # print(param['start'])
    response=requests.get(url=url,params=param,headers=headers).json()
    for item in response:
        s=item['cover_url']
        pic=requests.get(url=s,headers=headers).content
        pic_path = file_path + '/' + str(tot) + '.jpg'
        tot=tot+1
        with open(pic_path,'wb') as f:
            f.write(pic)
Fixing garbled Chinese text
import json
import requests
from bs4 import BeautifulSoup
import re
import os
url="https://sanguo.5000yan.com/965.html"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
}
response=requests.get(url=url,headers=headers)
response.encoding="utf-8"
with open("./a.txt","w",encoding="utf-8") as f:
    f.write(response.text)
Usually, when the page is encoded in UTF-8 and the output file is also written as UTF-8, there is no problem, but some pages still come out garbled; setting response.encoding explicitly, as above, fixes it.
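An alternative sketch, assuming the page declares no usable charset: let requests guess the encoding from the page body via apparent_encoding (the URL below is a placeholder):
import requests

response = requests.get("https://example.com/")        # placeholder URL
response.encoding = response.apparent_encoding         # detected from the body, e.g. "utf-8" or "GB2312"
print(response.text[:200])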
Spaces in CSS selectors
The following has a problem:
a_list=soup.select('main-content container>ul>li>a')
The class attribute here actually contains two classes separated by a space; in a CSS selector a space means a descendant, so chain the classes with "." instead and put the tag name in front:
a_list=soup.select('main.main-content.container>ul>li>a')
bs4
import json
import requests
from bs4 import BeautifulSoup
import re
import os
url="https://sanguo.5000yan.com/"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
}
response=requests.get(url=url,headers=headers)
response.encoding='utf-8'
html_text=response.text
soup=BeautifulSoup(html_text,'lxml')
a_list=soup.select('main.main-content.container>ul>li>a')
with open("./sanguo.txt","w",encoding='utf-8') as f:
for a in a_list:
title=a.string
detail_src=a['href']
# print(title)
# print(detail_src)
detail_response=requests.get(url=detail_src,headers=headers)
detail_response.encoding='utf-8'
detail_text=detail_response.text
# f.write(detail_text)
detail_soup=BeautifulSoup(detail_text,'lxml')
content=detail_soup.find('div',class_='grap').text
coutent=content.replace('\n', '')
f.write(title+':'+content)
select finds the matching tags.
A tag's attributes are accessed with [].
A tag's text is read with .text / .string:
.text returns all the text beneath the tag,
while .string returns only the tag's own direct string.
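A small self-contained sketch (toy HTML, not from the original post) showing the difference:
from bs4 import BeautifulSoup

soup = BeautifulSoup('<p>direct text<span>nested text</span></p>', 'lxml')
p = soup.find('p')
print(p.text)                       # 'direct textnested text' -- all descendant text
print(p.string)                     # None -- .string is None when the tag has more than one child
print(soup.find('span').string)     # 'nested text' -- the tag's single direct string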
XPath
/ denotes one level of the hierarchy; at the start of an expression it means locating from the root node
// denotes any number of levels and locates from anywhere in the document
[@class="content"] locates by attribute
Index positions start at 1
/text() returns a tag's direct text, //text() returns all text beneath it; both return lists
/@attr returns a list of attribute values
Partial parsing: start an expression with "./" and append the rest of the XPath to it
Also, before writing any code you can press Ctrl+F in the browser's Elements panel to check that your XPath expression matches what you expect; a small sketch of these rules follows.
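A toy illustration of the rules above (the HTML string is made up):
from lxml import etree

html = '<div class="content"><ul><li><a href="/a">first</a></li><li><a href="/b">second</a></li></ul></div>'
tree = etree.HTML(html)
print(tree.xpath('//div[@class="content"]/ul/li[1]/a/text()'))   # ['first'] -- index starts at 1
print(tree.xpath('//div[@class="content"]//text()'))             # all text beneath the div
print(tree.xpath('//li/a/@href'))                                 # ['/a', '/b'] -- attribute list
li = tree.xpath('//li')[0]
print(li.xpath('./a/text()'))                                     # ['first'] -- partial parse starting from "."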
import json
from lxml import etree
import requests
import re
import os
url="https://www.58.com/ershoufang/"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
}
response=requests.get(url=url,headers=headers).text
tree=etree.HTML(response)
a_list=tree.xpath('//td[@class="t"]/a')
with open("./a.txt","w",encoding='utf-8') as f:
for a in a_list:
title=a.xpath("./text()")[0]
f.write(title+'\n')
tbody
If a copied XPath contains tbody, just remove it. At first I was puzzled why I kept getting an empty list; then I noticed the editor had underlined tbody, looked it up, and it really was the tbody. Luckily I didn't lose too much time on it.
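A toy illustration (made-up HTML) of why this happens: the browser inserts tbody into the DOM, so DevTools shows it, but the raw HTML that requests downloads usually does not contain it, and the copied XPath matches nothing:
from lxml import etree

raw_html = '<table border="0"><tr><td class="t"><a>title</a></td></tr></table>'
tree = etree.HTML(raw_html)
print(tree.xpath('//table/tbody/tr'))   # [] -- no tbody in the raw source
print(tree.xpath('//table/tr'))         # [<Element tr>] -- drop tbody and it matches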
import json
from lxml import etree
import requests
import re
import os
url="https://www.58.com/ershoufang/"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
}
response=requests.get(url=url,headers=headers).text
tree=etree.HTML(response)
tr_list=tree.xpath('//table[@border="0"]/tr')
for tr in tr_list:
    title=tr.xpath("./td[@class='t']/a/text()")[0]
    money=tr.xpath("./td[@class='pred'][1]/b/text()")[0]
    print(title+" "+money+"\n")
Batch-downloading images with XPath and pagination
import json
from lxml import etree
from fake_useragent import UserAgent
import requests
import re
import os
# http://www.netbian.com/fengjing/index_2.htm
s="http://www.netbian.com/fengjing/"
ua=UserAgent()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
}
topfile_path="./downpic"
for page in range(9,10):
    file_path=topfile_path+f'/page{page}'
    if not os.path.exists(file_path):
        os.makedirs(file_path)
    tot=0
    url=s+f"index_{page}.htm"
    headers['User-Agent']=ua.random
    response=requests.get(url=url,headers=headers).text
    tree=etree.HTML(response)
    a_list=tree.xpath("//div[@class='list']/ul/li/a")
    for a in a_list:
        img_src=a.xpath("./img/@src")[0]
        tot=tot+1
        headers['User-Agent'] = ua.random
        content=requests.get(url=img_src,headers=headers).content
        save_path=file_path+'/'+str(tot)+'.jpg'
        with open(save_path, "wb") as f:
            f.write(content)
fake_useragent can generate a fresh User-Agent (ua.random) for every request.
IP proxies, and fixing gbk-garbled text
import json
from lxml import etree
from fake_useragent import UserAgent
import requests
import re
import os
import time
import random
ua = UserAgent()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
}
# proxy_list=[{'http': 'http://60.205.132.71:80', 'https': 'https://60.205.132.71:80'},
# {'http': 'http://183.236.232.160:8080', 'https': 'https://183.236.232.160:8080'}]
def get_pageid():
    return int(random.random()*100)+1
def get_newip():
    # scrape one page of free proxies from kuaidaili and append them to proxy_list
    # time.sleep(3)
    page_id = random.randint(1,50)
    ip_url = f"https://www.kuaidaili.com/free/inha/{page_id}"
    # print(page)
    ua = UserAgent()
    ip_text = requests.get(url=ip_url,headers=headers).text
    tree = etree.HTML(ip_text)
    ip_list = tree.xpath('//div[@class="table-section"]//td[@data-title="IP"]')
    post_list = tree.xpath('//div[@class="table-section"]//td[@data-title="PORT"]')
    for a, b in zip(ip_list,post_list):
        ip = a.xpath('./text()')[0].strip()
        port = b.xpath('./text()')[0].strip()
        proxy_list.append({
            'ip': ip,
            'port': port
        })
def get_proxy(ele):
    # build a requests-style proxies dict from one proxy entry
    return {
        "http": "http://" + ele["ip"] + ":" + ele["port"],
        "https": "https://" + ele["ip"] + ":" + ele["port"]
    }
def ask(surl):
    # try one request through a random proxy; drop the proxy on failure
    # and refill the pool when it runs low
    id = random.randint(0, len(proxy_list)-1)
    aproxy = get_proxy(proxy_list[id])
    headers['User-Agent'] = ua.random
    print(aproxy)
    try:
        r = requests.get(url=surl, headers=headers, proxies=aproxy, timeout=3)
        if r.status_code == 200:
            return 1, r
        else:
            print("fail!")
            return 0, 0
    except Exception:
        print("fail!")
        proxy_list.remove(proxy_list[id])
        if len(proxy_list) < 5:
            get_newip()
        return 0, 0
def work(surl):
    # keep retrying with different proxies until one succeeds
    m_state, m_response = ask(surl)
    while m_state == 0:
        m_state, m_response = ask(surl)
    return m_response
proxy_list = []
get_newip()
url = "http://www.netbian.com/jianzhu/index.htm"
response=work(url)
content=response.text
content=content.encode('iso-8859-1').decode('gbk')
print(content)
Once the number of requests grows, you start to worry about your IP getting banned,
so we keep rotating IPs while still pretending to be a browser.
Concretely: scrape free proxies from kuaidaili into proxy_list, pick one at random for each request, drop it if it fails, and fetch a new page of proxies once the list has fewer than 5 entries.
content = content.encode('iso-8859-1').decode('gbk')
fixes the gbk-garbled text.
That said, since the proxies are free, many of them simply don't work; a request often has to go through a dozen or even dozens of them before one succeeds.