
Web Scraping Templates & Proxies

Basic request

# inspect the outgoing request headers with r.request.headers
import requests

url = 'https://api.airtable.com/v0/appU9QT7BUOev35GR/iDM?maxRecords=3&view=Grid%20view'

try:
    kv = {'user-agent': 'Mozilla/5.0',
          'Authorization': 'Bearer key2lrnDfzwxyrgOH'}
    r = requests.get(url, headers=kv)
    r.raise_for_status()
    r.encoding = r.apparent_encoding    # guess the encoding from the response body
    print(r.text)
except requests.RequestException:
    print("Request failed")

Search

import requests

keyword = "python"      # unused below; the query ('算法导论', URL-encoded) is hard-coded into the URL
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
    "Cookie": 'sessionid=wplv193aex84mpp1sknyhj27bd3gdtmk; csrftoken=FJbuCEPObUYV3lsta85VKX6tD8Hz8LaDLAnowtR6Qj9TXh5n4NFjKvX0JHmX3MOX; Hm_lvt_375aa6d601368176e50751c1c6bf0e82=1544436946,1545919724,1546001336; Hm_lpvt_375aa6d601368176e50751c1c6bf0e82=1546005348',
}
proxies = {}            # placeholder; fill in and pass proxies=proxies to requests.get if needed

try:
    r = requests.get('http://readfree.me/search/?q=%E7%AE%97%E6%B3%95%E5%AF%BC%E8%AE%BA', headers=headers)
    r.raise_for_status()
    print(r.text)
except requests.RequestException:
    print("Request failed")

Downloading images

import requests
import os

url = "http://img11.360buyimg.com/mobilecms/s140x140_jfs/t1/5681/11/8922/341224/5baaf491E3d93e68a/78dd0833dd51a8ec.jpg"
root = "desktop/"
path = root + url.split('/')[-1]    # relative path; the file name is taken from the URL

try:
    if not os.path.exists(root):    # create the directory if it does not exist yet
        os.mkdir(root)
    if not os.path.exists(path):
        r = requests.get(url)
        with open(path, 'wb') as f:     # open the file for binary writing; with closes it automatically
            f.write(r.content)          # write the raw response bytes
        print("File saved")
        print(path)
    else:
        print("File already exists")
except (requests.RequestException, OSError):
    print("Download failed")

Regular expressions

import re

pal = re.compile(r'imooc')      # the r prefix marks a raw string, so backslashes are taken literally; recommended for patterns
mal = pal.match('imooc hah')    # match the compiled pattern pal against the start of the string
mal.group()

ma = re.match(r'\[\w\]', '[a]') # [] delimits a character class in regex, so literal brackets must be escaped

str1 = 'hahahah=1000'
info = re.search(r'\d+', str1)  # find a number in str1; search returns only the first match

str2 = 'hahahah=1000,heheh=10000'
info = re.findall(r'\d+', str2) # find all numbers in str2; findall returns a list of matches

def add1(match):
    val = match.group()
    num = int(val) + 1
    return str(num)

str1 = 'hahahah=999'
info = re.sub(r'\d+', add1, str1)   # replace every match with the value returned by add1

str4 = 'c c++:java'
re.split(r' |:', str4)          # split the string on spaces and colons
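
These pieces combine naturally: capturing groups let one pattern pull out the key and the numeric value at the same time. A small sketch on the same sample string; the pattern name kv is an arbitrary choice:

import re

str2 = 'hahahah=1000,heheh=10000'
kv = re.compile(r'(\w+)=(\d+)')     # group 1: key, group 2: numeric value
for key, val in kv.findall(str2):   # findall returns (key, value) tuples when the pattern has groups
    print(key, int(val))            # hahahah 1000, then heheh 10000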

Crawler proxy setup

import random
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}
proxies = {
    'http': 'http://117.135.153.10:80'
}                                 # mind the http vs https keys
# other candidates:
# 'http': 'http://125.74.14.73:80'
# 'http': 'http://124.89.174.199:8060'

def getProxies():
    # scrape a random free proxy from kuaidaili's listing page
    url = 'https://www.kuaidaili.com/free/intr/1/'
    try:
        r = requests.get(url, headers=headers)    # a proxy and a timeout could also be set here
        r.raise_for_status()
        if url.find('https://item.jd.com/') != -1:    # JD product pages use GBK
            r.encoding = 'gbk'
        else:
            r.encoding = 'utf-8'
    except requests.RequestException:
        print("Request failed")
        return proxies                            # bail out so r is not used below
    soup = BeautifulSoup(r.text, "html.parser")
    ipLists = soup.findAll('tbody', limit=1)
    ipLists = ipLists[0].findAll('tr')
    ipListsRandom = random.randint(0, len(ipLists) - 1)    # pick a random row within range
    proxies['http'] = 'http://' + ipLists[ipListsRandom].contents[1].string + ':' + ipLists[ipListsRandom].contents[3].string
    return proxies

def checkProxies():
    # verify the proxy works by asking an echo service for our visible IP
    try:
        r = requests.get('http://icanhazip.com', headers=headers, proxies=proxies)
        r.raise_for_status()
    except requests.RequestException:
        print("Proxy failed")
        return                  # bail out so r is not used below
    print(r.text)
    print("Proxy works")

checkProxies()
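
The two helpers can be chained into a simple rotation loop: keep drawing a fresh proxy from the free list until one answers. A minimal sketch built on getProxies above; the attempt limit and timeout are arbitrary assumptions:

def findWorkingProxy(max_attempts=5):
    # try up to max_attempts random free proxies, return the first one that responds
    for _ in range(max_attempts):
        candidate = getProxies()
        try:
            r = requests.get('http://icanhazip.com', headers=headers,
                             proxies=candidate, timeout=5)
            r.raise_for_status()
            return candidate        # this proxy answered, use it
        except requests.RequestException:
            continue                # dead proxy, draw another one
    return None                     # nothing usable found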