Crawler Implementation Examples

Example 1: Renren (人人网) login with requests.session and captcha recognition

import requests
from lxml import etree
from 爬虫.old_boy.p3 import get_code_text  # local captcha recognition helper

session = requests.session()
# A session behaves almost exactly like the requests module itself: it can send
# requests in all the same ways. The difference is that when a response sets
# cookies, the session stores them automatically and attaches them to later requests.

url = 'http://www.renren.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    'Connection': 'close',
}
page_content = session.get(url=url, headers=headers).content
xpath_data = etree.HTML(page_content)
pic_url = xpath_data.xpath('//*[@id="verifyPic_login"]/@src')[0]
print(pic_url)
pic = requests.get(url=pic_url, headers=headers).content

with open('pic.jpg', 'wb') as fp:
    fp.write(pic)

# Recognize the captcha text from the saved image (a sketch of such a helper
# follows this example)
result = get_code_text('pic.jpg')

login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2019331853198'
data = {
    'captcha_type': 'web_login',
    'domain': 'renren.com',
    'email': '18744585483',
    'f': 'http%3A%2F%2Fwww.renren.com%2F970459497',
    'icode': result,
    'key_id': '1',
    'origURL': 'http://www.renren.com/home',
    'password': '9722733e821526e5879a37d439f40666e1af794712cad1fce23d83f7b2f57041',
    'rkey': '0de33e22f20835059cb6b28da4bffdc9',
}

# Log in (the session stores the cookies returned by this request)
response = session.post(url=login_url, headers=headers, data=data)

# Request the profile page of the now logged-in user
detail_url = 'http://www.renren.com/970459497'

# This request goes through the session object, so the login cookies are attached automatically
ren_response = session.get(url=detail_url, headers=headers).content
with open('./renren.html', 'wb') as fp:
    fp.write(ren_response)
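
The helper get_code_text comes from a local module (爬虫.old_boy.p3) that is not shown in this post; its job is to turn the saved captcha image into text. As a rough, hypothetical stand-in, a minimal version could be built on pytesseract (assuming the pytesseract package and the Tesseract engine are installed), though real captchas usually defeat plain OCR, and the original helper most likely called a captcha-solving service instead:

# Hypothetical stand-in for the unshown get_code_text helper.
# Assumes pytesseract + Tesseract are installed; treat this as a sketch only.
from PIL import Image
import pytesseract

def get_code_text(image_path):
    # Run the captcha image through Tesseract OCR and strip stray whitespace
    return pytesseract.image_to_string(Image.open(image_path)).strip()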
Example 2: Gushiwen (古诗文网) login through a session, behind a proxy

import requests
from lxml import etree
from 爬虫.old_boy.p3 import get_code_text

url = 'https://so.gushiwen.org/user/login.aspx?from='
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}

# Route traffic through an HTTP proxy (this sample address is likely stale by now)
proxies = {
    'http': '193.68.135.125:59278',
}

session = requests.session()
page_content = session.get(url=url, headers=headers, verify=False, proxies=proxies).content
xpath_data = etree.HTML(page_content)
# The captcha's src attribute is relative, so prepend the site root
pic_src = 'https://so.gushiwen.org' + xpath_data.xpath('//*[@id="imgCode"]/@src')[0]
pic = session.get(url=pic_src, headers=headers, verify=False, proxies=proxies).content
with open('pic.jpg', 'wb') as fp:
    fp.write(pic)

code = get_code_text('pic.jpg')
print(code)

post_url = 'https://so.gushiwen.org/user/login.aspx?from=http%3a%2f%2fso.gushiwen.org%2fuser%2fcollect.aspx'
data = {
    # __VIEWSTATE / __VIEWSTATEGENERATOR are hidden form fields that change per
    # page load; hardcoding them (as here) only works for a short while
    "__VIEWSTATE": "ahdYeAQW0HtfdBdmYQKvu1cIOsMVQy6b8+Tl3fFmuwmB//7WZsi1kJXIrAcqfvRP5UVTbb74NTJ389/H6FgBc60xjuUtXmCu6V15vp7reQ3DjcBq01LPXOubOG8=",
    "__VIEWSTATEGENERATOR": "C93BE1AE",
    "from": "http://so.gushiwen.org/user/collect.aspx",
    "email": "862032955@qq.com",
    "pwd": "123456",
    "code": code,
    "denglu": "登录",  # literal value of the site's "log in" submit button
}
session.post(url=post_url, headers=headers, data=data, verify=False, proxies=proxies)

detail_url = 'https://so.gushiwen.org/user/collect.aspx'
d_response = session.get(url=detail_url, verify=False, headers=headers, proxies=proxies).content
with open('古诗文.html', 'wb') as fp:
    fp.write(d_response)
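
Every request in this example passes verify=False, so each call prints an InsecureRequestWarning. If those warnings clutter the output, urllib3 (which ships as a dependency of requests) can silence them; a minimal sketch:

# Optional: silence the InsecureRequestWarning triggered by verify=False
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)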
Example 3: Pear Video (梨视频) concurrent video download with a thread pool

import requests
from lxml import etree
import re
from multiprocessing.dummy import Pool
import random

url = 'https://www.pearvideo.com/category_8'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}

page_content = requests.get(url=url, headers=headers, verify=False).content.decode()
xpath_data = etree.HTML(page_content)
li_list = xpath_data.xpath('//*[@id="listvideoListUl"]/li')

# Collect every video URL here so they can be downloaded concurrently later
video_url_list = []

for li in li_list:
    # Build the absolute URL of the video's detail page
    v_href = 'https://www.pearvideo.com/' + li.xpath('.//div[@class="vervideo-bd"]/a/@href')[0]
    d_response = requests.get(url=v_href, headers=headers).content.decode()
    # The real .mp4 address is embedded in the page's JavaScript as srcUrl="...",
    video_url = re.findall('srcUrl="(.*?)",', d_response, re.S)[0]
    video_url_list.append(video_url)

# Create a pool of 5 worker threads (multiprocessing.dummy.Pool uses threads, not processes)
pool = Pool(5)
download_video = lambda link: requests.get(url=link, headers=headers).content
# map returns a list containing each downloaded video's binary data
video_data_list = pool.map(download_video, video_url_list)

def save_video(data):
    # Name each file with a random number; note that collisions are possible
    i = random.randint(1, 1000)
    video_name = 'video/' + str(i) + '.mp4'
    with open(video_name, 'wb') as fp:
        fp.write(data)

pool.map(save_video, video_data_list)

pool.close()
pool.join()
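
One weakness worth noting: save_video draws a random number between 1 and 1000 for each filename, so two downloads can collide and overwrite each other. A safer sketch (a hypothetical replacement, not part of the original) names each file after the last path segment of its source URL:

# Hypothetical alternative to save_video: derive the filename from the
# video's own URL so two downloads can never clash.
import os
from urllib.parse import urlparse

def save_video_by_url(link, data):
    os.makedirs('video', exist_ok=True)               # ensure the folder exists
    filename = os.path.basename(urlparse(link).path)  # e.g. 'cont-12345.mp4'
    with open(os.path.join('video', filename), 'wb') as fp:
        fp.write(data)

# Usage: pair each URL with its downloaded bytes
# for link, data in zip(video_url_list, video_data_list):
#     save_video_by_url(link, data)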
