
Web Scraping (Part 3): Data Mining with the requests Library

Installing and Using requests

pip install requests
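
A quick sanity check that the install worked:

import requests
print(requests.__version__)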

Fetching Baidu's HTML with requests

import requests

url = 'http://www.baidu.com'

html = requests.get(url).content.decode()  # response body bytes decoded to a str

print(html)
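
By default, .content.decode() assumes UTF-8 and ignores the HTTP status. A slightly more defensive variant (a sketch; the status check and the apparent_encoding fallback are additions, not part of the original post):

import requests

response = requests.get('http://www.baidu.com')
response.raise_for_status()                     # raise on a 4xx/5xx status
response.encoding = response.apparent_encoding  # guess the charset from the body
print(response.text)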

 

Adding Request Headers and Parameters

import requests

url = 'http://www.baidu.com/s?'

headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36"
}

wd = {"wd":"中国"} # query parameter: search for "中国" ("China")

response = requests.get(url,params=wd,headers=headers)

data = response.text # the body as a str

data2 = response.content # the body as raw bytes

print(data)
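
requests URL-encodes the params dict into the query string; printing the final URL is a quick way to confirm what was actually requested (a check added here, not in the original post):

print(response.url)  # the requested URL, with wd percent-encoded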

 

Handling POST Requests

Use the regular expressions from the previous chapter to pull results from Youdao Translate's POST request.

import requests
import re

# Build the request headers
header={
    "User-Agent":"Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/8.9 Mobile Safari/537.36"
}

url="http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"

key="自学" # the text to translate; "自学" means "self-study"

# Form parameters the POST request submits
# (salt/sign/ts/bv are Youdao's anti-bot signature fields captured from a browser
# session; stale values may be rejected by the live API)
formdata={
   "i":key,
   "from":"AUTO",
   "to":"AUTO",
   "smartresult":"dict",
   "client":"fanyideskweb",
   "salt":"15503049709404",
   "sign":"3da914b136a37f75501f7f31b11e75fb",
   "ts":"1550304970940",
   "bv":"ab57a166e6a56368c9f95952de6192b5",
   "doctype":"json",
   "version":"2.1",
   "keyfrom":"fanyi.web",
   "action":"FY_BY_REALTIME",
   "typoResult":"false"
}

response = requests.post(url,headers=header,data=formdata)

data = response.json()   # the body parsed as JSON (a dict)

data2 = response.text    # the body as a str

data3 = response.content # the body as raw bytes

# Regex: extract the variable between "tgt":" and "}]]} , e.g. "tgt":"self-study"}]]}
pat = r'"tgt":"(.*?)"}]]}'

result = re.findall(pat,response.text)

print(result)
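
Since response.json() already parses the body, indexing the resulting dict is sturdier than a regex. A minimal sketch, assuming the response keeps the nested translateResult layout implied by the }]]} tail the pattern matches:

# data["translateResult"] is assumed to be a list of paragraphs,
# each a list of {"src": ..., "tgt": ...} segments
print(data["translateResult"][0][0]["tgt"])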

 

Proxy IPs

import requests

# Map a URL scheme to a proxy address
# proxy = {"http":"http://<proxy IP>:<port>"}
proxy = {"http":"http://118.113.247.26:9999"}

url = "http://www.baidu.com"

response = requests.get(url,proxies=proxy)

print(response)  # prints the Response object, e.g. <Response [200]>
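
Free proxies go stale quickly, so a timeout and some error handling are worth adding; a minimal sketch (the timeout value and the try/except are additions to the original):

try:
    response = requests.get(url, proxies=proxy, timeout=5)
    print(response.status_code)
except requests.exceptions.RequestException as e:
    print("proxy failed:", e)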

 

Getting Response Cookies

import requests

url = 'http://www.baidu.com'

response = requests.get(url)

# 1. Get the CookieJar object from the response
cookiejar = response.cookies

# 2. Convert the CookieJar into a plain dict
cookiedict = requests.utils.dict_from_cookiejar(cookiejar)

print(cookiedict)
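
To replay those cookies on a later request, pass the jar (or the dict) back through the cookies parameter; a quick sketch:

response2 = requests.get(url, cookies=cookiejar)
print(response2.status_code)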

 

 

Logging In with a Session

import requests

url = "http://www.renren.com/Plogin.do"

header={
    "User-Agent":"Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/8.9 Mobile Safari/537.36"
}

# Create a Session object; it persists cookies across requests
ses = requests.Session()
ses.headers.update(header)  # attach the headers above to every request this session makes

# Build the login form parameters
data={"email":"binzi_chen@126.com","password":"5tgb^YHN"}

# POST the credentials; the returned cookies are stored on the session
ses.post(url,data=data)

# Request the page that requires login
response = ses.get("http://www.renren.com/880151237/profile")

print(response.text)
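
One rough way to check whether the login actually succeeded (a heuristic, not part of the original post) is to look at the final URL after redirects:

print(response.url)  # being redirected back to a login page usually means it failed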

 

Hands-On: Scraping Music Files

This scraper works around the site's limit of two downloads per track.

import re # regular expressions
import requests
import time

# http://www.htqyy.com/top/musicList/hot?pageIndex=0&pageSize=20 # page 1
# http://www.htqyy.com/top/musicList/hot?pageIndex=1&pageSize=20 # page 2
# http://www.htqyy.com/top/musicList/hot?pageIndex=2&pageSize=20 # page 3

# pageIndex = page number - 1

# Song page:  http://www.htqyy.com/play/20
# Audio file: http://f2.htqyy.com/play8/11/mp3/8

# page = int(input("How many pages to scrape: "))

header = {
    "Referer":"http://www.htqyy.com/top/hot"
}

songID = []
songName = []

for i in range(0,2):
    url = "http://www.htqyy.com/top/musicList/hot?pageIndex="+ str(i)+"&pageSize=20"

    # Fetch the chart page HTML
    html = requests.get(url,headers=header)

    strr = html.text

    # print(strr)
    # <span class="title"><a href="/play/33" target="play" title="清晨" sid="33">清晨</a></span>

    part1 = r'title="(.*?)" sid'  # captures the song title
    part2 = r'sid="(.*?)"'        # captures the song id


    idlist = re.findall(part2,strr)
    titlelist = re.findall(part1,strr)

    songID.extend(idlist) # extend() appends every item of idlist to songID
    songName.extend(titlelist)

print("Song titles:",songName)
print("Song IDs:",songID)

for i in range(0,len(songID)):
    songUrl = "http://f2.htqyy.com/play8/" + str(songID[i]) + "/mp3/8"
    song = songName[i]

    # Download the mp3 bytes
    data = requests.get(songUrl).content

    # Write them to disk
    with open("D:\\Documents\\Music\\{}.mp3".format(song),"wb") as f:
        f.write(data)
    time.sleep(0.5)  # pause between downloads to go easy on the server
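
Song titles can contain characters Windows forbids in filenames, and the target folder may not exist yet; a defensive sketch of the write step (the sanitizing regex and the makedirs call are additions to the original):

import os
import re

save_dir = "D:\\Documents\\Music"
os.makedirs(save_dir, exist_ok=True)            # create the folder if it is missing

safe_name = re.sub(r'[\\/:*?"<>|]', "_", song)  # replace characters illegal on Windows
with open(os.path.join(save_dir, safe_name + ".mp3"), "wb") as f:
    f.write(data)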

 

posted @ 2020-08-08 12:56  Binzichen