Basic request
# Use r.request.headers to inspect the request that was actually sent
import requests

url = 'https://api.airtable.com/v0/appU9QT7BUOev35GR/iDM?maxRecords=3&view=Grid%20view'
try:
    kv = {'user-agent': 'Mozilla/5.0',
          'Authorization': 'Bearer key2lrnDfzwxyrgOH'}
    r = requests.get(url, headers=kv)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(r.text)
except requests.RequestException:
    print("Request failed")
Search
import requests

keyword = "python"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
    "Cookie": 'sessionid=wplv193aex84mpp1sknyhj27bd3gdtmk; csrftoken=FJbuCEPObUYV3lsta85VKX6tD8Hz8LaDLAnowtR6Qj9TXh5n4NFjKvX0JHmX3MOX; Hm_lvt_375aa6d601368176e50751c1c6bf0e82=1544436946,1545919724,1546001336; Hm_lpvt_375aa6d601368176e50751c1c6bf0e82=1546005348',
}
try:
    # The query string is the URL-encoded form of the search term 算法导论
    r = requests.get('http://readfree.me/search/?q=%E7%AE%97%E6%B3%95%E5%AF%BC%E8%AE%BA', headers=headers)
    r.raise_for_status()
    print(r.text)
except requests.RequestException:
    print("Request failed")
Downloading an image
import os
import requests

url = "http://img11.360buyimg.com/mobilecms/s140x140_jfs/t1/5681/11/8922/341224/5baaf491E3d93e68a/78dd0833dd51a8ec.jpg"
root = "desktop/"                  # relative path for the download directory
path = root + url.split('/')[-1]   # file name taken from the last segment of the URL
try:
    if not os.path.exists(root):   # create the directory if it does not exist yet
        os.mkdir(root)
    if not os.path.exists(path):
        r = requests.get(url)
        r.raise_for_status()
        with open(path, 'wb') as f:   # the with block closes the file automatically
            f.write(r.content)        # write the raw bytes of the response
        print("File saved")
        print(path)
    else:
        print("File already exists")
except Exception:
    print("Download failed")
Regular expressions
import re

pal = re.compile(r'imooc')       # the r prefix makes a raw string; recommended for patterns
mal = pal.match('imooc hah')     # match the compiled pattern against the string
mal.group()                      # -> 'imooc'

ma = re.match(r'\[\w\]', '[a]')  # [] delimits a character class, so literal brackets must be escaped

str1 = 'hahahah=1000'
info = re.search(r'\d+', str1)   # find a number in str1; search returns only the first match

str2 = 'hahahah=1000,heheh=10000'
info = re.findall(r'\d+', str2)  # findall returns every match as a list

def add1(match):
    val = match.group()
    num = int(val) + 1
    return str(num)

str3 = 'hahahah=999'
info = re.sub(r'\d+', add1, str3)  # replace each match with the value returned by add1

str4 = 'c c++:java'
re.split(r' |:', str4)           # split the string on spaces and colons
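A small follow-on sketch using named groups, which make extracted pieces self-describing (standard re syntax, reusing nothing beyond the module import above):

# Sketch: pull out a key/value pair with named groups
m = re.search(r'(?P<name>\w+)=(?P<value>\d+)', 'heheh=10000')
if m:
    print(m.group('name'), m.group('value'))   # -> heheh 10000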
Setting a crawler proxy
import random
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}
proxies = {
    'http': 'http://117.135.153.10:80'
}   # note the scheme: the 'http' key proxies http:// URLs, an 'https' key would proxy https:// ones
# other candidates:
# 'http': 'http://125.74.14.73:80'
# 'http': 'http://124.89.174.199:8060'

def getProxies():
    # Scrape a free proxy list and swap a random entry into proxies
    url = 'https://www.kuaidaili.com/free/intr/1/'
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        r.encoding = 'utf-8'
    except requests.RequestException:
        print("Request failed")
        return proxies
    soup = BeautifulSoup(r.text, "html.parser")
    ipLists = soup.findAll('tbody', limit=1)
    ipLists = ipLists[0].findAll('tr')
    ipListsRandom = random.randint(0, 14)   # the page lists 15 proxies per table
    proxies['http'] = ('http://' + ipLists[ipListsRandom].contents[1].string
                       + ':' + ipLists[ipListsRandom].contents[3].string)
    return proxies

def checkProxies():
    # Verify the proxy by asking an echo service for our apparent IP
    try:
        r = requests.get('http://icanhazip.com', headers=headers, proxies=proxies)
        r.raise_for_status()
    except requests.RequestException:
        print("Proxy failed")
        return
    print(r.text)
    print("Proxy works")

checkProxies()
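A hedged usage sketch chaining the two helpers above: pull a fresh entry from the free list, then re-run the check (uses only the names already defined in this section):

# Sketch: refresh the proxy, then verify the new one
getProxies()     # replaces proxies['http'] with a scraped entry
checkProxies()   # prints the apparent IP if the new proxy works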