爬虫基础01

写在前面

　　　　逆水行舟

  1 爬虫
  2     
  3     - 基本操作
  4         概要：
  5             - 发送Http请求，Python Http请求，requests
  6             - 提取指定信息，Python 正则表达式，beautifulsoup
  7             - 数据持久化，
  8         
  9         Python两个模块
 10             - requests
 11             - beautifulsoup
 12         
 13         Http请求相关知识
 14             - 请求：
 15                 请求头
 16                     - cookie
 17                 请求体
 18                     - 发送内容
 19                     
 20             - 响应：
 21                 响应头
 22                     - 浏览器读取
 23                 响应体
 24                     - 看到的内容
 25             
 26             特殊：
 27                 - cookie
 28                 - csrftoken
 29                 - content-type:
 30                 
 31                     content-type:application/x-www-form-urlencoded
 32                     name=alex&age=18
 33                     
 34                     content-type:application/json
 35                     {name:'alex',age:18}
 36     - 性能相关
 37         - 串行： 1个人，一个任务一个任务，空余时间，玩。
 38         - 线程： 10个人，一个任务一个任务，空余时间，玩。
 39         - 进程： 10个家庭，一个任务一个任务，空余时间，玩。
 40         - 【协程】异步非阻塞：1个人，充分利用时间。
 41     
 42     - scrapy框架
 43         - 规则
 44         
 45     - redis-scrapy组件
 46     
 47     
 48     
 49 内容详细：
 50     - 基本操作，python伪造浏览器发送请求并获取指定内容
 51     
 52         pip3 install requests
 53         response = requests.get('http://www.baidu.com')
 54         response.text
 55         
 56         
 57         pip3 install beautifulsoup4
 58         from bs4 import BeautifulSoup
 59         
 60         soup = BeautifulSoup(response.text,'html.parser')
 61         soup.find(name='h3',attrs={'class':'t'})
 62         soup.find_all(name='h3')
 63         
 64         示例：爬取汽车之家新闻
 65         
 66         
 67     - 模块
 68     
 69         requests
 70             GET:
 71                 requests.get(url="http://www.oldboyedu.com")
 72                 # data="http GET / http1.1\r\nhost:oldboyedu.com\r\n....\r\n\r\n"
 73                 
 74                 requests.get(url="http://www.oldboyedu.com/index.html?p=1")
 75                 # data="http GET /index.html?p=1 http1.1\r\nhost:oldboyedu.com\r\n....\r\n\r\n"
 76                 
 77                 requests.get(url="http://www.oldboyedu.com/index.html",params={'p':1})
 78                 # data="http GET /index.html?p=1 http1.1\r\nhost:oldboyedu.com\r\n....\r\n\r\n"
 79             
 80             POST:
 81                 requests.post(url="http://www.oldboyedu.com",data={'name':'alex','age':18}) # 默认请求头：application/x-www-form-urlencoded
 82                 data="http POST / http1.1\r\nhost:oldboyedu.com\r\n....\r\n\r\nname=alex&age=18"
 83                 
 84                 
 85                 requests.post(url="http://www.oldboyedu.com",json={'name':'alex','age':18}) # 默认请求头：application/json
 86                 data="http POST / http1.1\r\nhost:oldboyedu.com\r\n....\r\n\r\n{"name": "alex", "age": 18}"
 87 
 88                 
 89                 requests.post(
 90                     url="http://www.oldboyedu.com",
 91                     params={'p':1},
 92                     json={'name':'alex','age':18}
 93                 ) # 默认请求头：application/json
 94                 
 95                 data="http POST /?p=1 http1.1\r\nhost:oldboyedu.com\r\n....\r\n\r\n{"name": "alex", "age": 18}"
 96                 
 97                 
 98                 补充：
 99                     request.body,永远有值
100                     request.POST，可能没有值
101                     
102                 
103         beautifulsoup
104             soup = BeautifulSoup('HTML格式字符串','html.parser')
105             
106             tag = soup.find(name='div',attrs={})
107             tags = soup.find_all(name='div',attrs={})
108             
109             
110             tag.find('h3').text
111             tag.find('h3').get('属性名称')
112             tag.find('h3').attrs
113     
114     
115         HTTP请求：
116             GET请求：
117                 data="http GET /index?page=1 http1.1\r\nhost:baidu.com\r\n....\r\n\r\n"
118                 
119                 
120             POST请求：
121                 data="http POST /index?page=1 http1.1\r\nhost:baidu.com\r\n....\r\n\r\nname=alex&age=18"
122                 
123                 
124             socket.sendall(data)
125     
126     
127         示例【github和抽屉】：任何一个不用验证码的网站，通过代码自动登录
128             
129             1. 按理说
130                 r1 = requests.get(url='https://github.com/login')
131                 s1 = BeautifulSoup(r1.text,'html.parser')
132                 val = s1.find(attrs={'name':'authenticity_token'}).get('value')
133                 
134                 r2 = requests.post(
135                         url= 'https://github.com/session',
136                         data={
137                             'commit': 'Sign in',
138                             'utf8': '✓',
139                             'authenticity_token': val,
140                             'login':'xxxxx',
141                             'password': 'xxxx',
142                             
143                         }
144                     )
145                     
146                 r2_cookie_dict = r2.cookies.get_dict() # {'session_id':'asdfasdfksdfoiuljksdf'}
147         
148                 保存登录状态，查看任意URL
149                 
150                 r3 = requests.get(
151                     url='xxxxxxxx',
152                     cookies=r2_cookie_dict
153                 )
154         
155                 print(r3.text) # 登录成功之后，可以查看的页面
156                 
157             2. 不按理说
158                 r1 = requests.get(url='https://github.com/login')
159                 s1 = BeautifulSoup(r1.text,'html.parser')
160                 val = s1.find(attrs={'name':'authenticity_token'}).get('value')
161                 # cookie返回给你
162                 r1_cookie_dict = r1.cookies.get_dict()
163                 
164                 
165                 r2 = requests.post(
166                         url= 'https://github.com/session',
167                         data={
168                             'commit': 'Sign in',
169                             'utf8': '✓',
170                             'authenticity_token': val,
171                             'login':'xxxxx',
172                             'password': 'xxxx',
173                             
174                         },
175                         cookies=r1_cookie_dict
176                     )
177                 # 授权
178                 r2_cookie_dict = r2.cookies.get_dict() # {}
179         
180         
181         
182                 保存登录状态，查看任意URL
183                 
184                 r3 = requests.get(
185                     url='xxxxxxxx',
186                     cookies=r1_cookie_dict
187                 )
188         
189                 print(r3.text) # 登录成功之后，可以查看的页面
190             
191             
192     - requests
193         """
194         1. method
195         2. url
196         3. params
197         4. data
198         5. json
199         6. headers
200         7. cookies
201         8. files
202         9. auth
203         10. timeout
204         11. allow_redirects
205         12. proxies
206         13. stream
207         14. cert
208         ================ session,保存请求相关信息（不推荐）===================
209         import requests
210 
211         session = requests.Session()
212 
213         i1 = session.get(url="http://dig.chouti.com/help/service")
214         i2 = session.post(
215             url="http://dig.chouti.com/login",
216             data={
217                 'phone': "8615131255089",
218                 'password': "xxooxxoo",
219                 'oneMonth': ""
220             }
221         )
222         i3 = session.post(
223             url="http://dig.chouti.com/link/vote?linksId=8589523"
224         )
225         print(i3.text)
226 
227         """
228     - beautifulsoup
229         - find()
230         - find_all()
231         - get()
232         - attrs
233         - text
234         
235 内容：
236     1. 示例：汽车之家
237     2. 示例：github和chouti
238     3. requests和beautifulsoup
239     4. 轮询和长轮询
240     5. Django
241         request.POST
242         request.body
243         
244         # content-type:xxxx
245         
246 作业：web微信
247       功能：
248         1. 二维码显示
249         2. 长轮询：check_login
250         3. 
251             - 检测是否已经扫码
252             - 扫码之后201，头像： base64:.....
253             - 点击确认200，response.text     redirect_uri=....
254         4. 可选，获取最近联系人信息
255         
256 安装：
257     twisted
258     scrapy框架
259     
260     
261

武Sir - 笔记

参考：http://www.cnblogs.com/wupeiqi/articles/6283017.html

爬虫相关
	- 基本操作
		- 概要
			- 发送http请求	requests模块
			- 提取指定信息 	正则	Beautifulsoup模块
			- 数据持久化

		- Python的2个模块
			- requests
			- Beautifulsoup

		- Http请求相关知识
			- 请求
				- 请求头 
					- cookie
				- 请求体 
					- 发送的内容
			- 响应 
				- 响应头 
					- 浏览器读取
				- 响应体
					- 看到的内容

			- 特殊
				- cookie
				- csrf_token
				- content-type 用来指定客户端按照哪种格式进行解析


	- 性能相关
		- 进程
		- 线程
		- 协程

		- 【协程】异步非阻塞：充分利用系统资源


	- scrapy框架
		- 学习scrapy的规则


	- redis&scrapy组件：完成一个简单的分布式爬虫



内容详细
	- 基本操作	Python伪造浏览器发送请求

		pip3 install requests
		pip3 install Beautifulsoup4

		import requests
		from bs4 import BeautifulSoup


		response = requests.get("http://www.baidu.com")
		response.text  ->  网页内容

		soup = BeautifulSoup(response.text,'html.parser')

		# 从上到下第一个 <h3 class='t'> 标签
		soup.find(name='h3',attrs={'class':'t'})
		# 查找全部 <h3>标签
		soup.find_all(name='h3')

		...

	模块
		requests
			response = requests.get(url='url路径')
			# 解决乱码问题
			response.encoding = response.apparent_encoding

			GET请求：
				requests.get(url='http://www.baidu.com')
				data = "http GET / ...."
				requests.get(url='http://www.baidu.com/?page=1')
				data = "http GET /?page=1 ...."
				requests.get(url='http://www.baidu.com',params={'page':1})


			POST请求：
				requests.post(url='http://www.baidu.com',data={'name':'alex','age':18}) # 默认携带请求头类型：application/x-www-form-urlencoded

				requests.post(url='http://www.baidu.com',json={'name':'alex','age':18}) # 默认携带请求头类型：application/json

				# POST请求既可以在请求体里传参，又可以在url里传参
				requests.post(url='http://www.baidu.com',params={'page':1},json={'name':'alex','age':18})



				补充：
					django里的 request.POST 里的值是django根据请求体里的数据转换过来的
						所以，如果body里的数据格式不对，那么就转换不了，导致request.POST里面没有值
					django里的 request.body 里永远有值
					django里的 request.POST 可能没有值



		BeautifulSoup
			soup = BeautifulSoup('html格式字符串','html.parser')
			tag = soup.find(name='div',attrs={...})
			tag = soup.find_all(name='div',attrs={...})

			tag.find('h3').text
			tag.find('h3').contents
			tag.find('h3').get('属性名称')
			tag.find('h3').attrs['属性名称']







服务器端不能主动给客户端发消息
但是websocket可以

- 【轮询】     	http协议，客户端轮询（每秒1次）请求服务端；一次请求，服务端收到后不管有没有新消息都立即返回
- 【长轮询】 	http协议，客户端发来请求，服务器把客户端给hang住，直到服务端收到新消息并发送给所有客户端、才断开连接；
				客户端收到消息后，再立即发请求到服务端进行下一次hang住。
				hang住，有一个超时时间，web微信超时时间是25s
				应用：web微信
- 【WebSocket】	不是http协议，建立在tcp之上
				一次连接不断开，双工通道，可以互相发送消息
				但是浏览器兼容性不太好，以后将会应用的更广泛




浏览器有同源策略
ajax发送跨域请求是接收不到结果的





http://www.cnblogs.com/wupeiqi/articles/6283017.html




#!/usr/bin/python
# -*- coding:utf-8 -*-

import json

import requests

# requests.request(method, url, **kwargs) is the function every helper below
# delegates to. Both `method` and `url` are required, so calling it with no
# arguments raises TypeError.

requests.get(url='xxx')
# ...is just shorthand for:
requests.request(method='get', url='xxx')

# dict body -> form-encoded; Content-Type: application/x-www-form-urlencoded
requests.post(url='xxx', data={'name': 'alex', 'age': 18})

# str body -> sent as-is; requests adds NO Content-Type header of its own
requests.post(url='xxx', data="name=alex&age=18")

# Neither fish nor fowl: a JSON string sent through `data` is still a plain
# str body, so the server is not told that the payload is JSON.
requests.post(url='xxx', data=json.dumps({'name': 'alex', 'age': 18}))

# Declare the type explicitly through `headers`. The header name is
# "Content-Type" (hyphen); "Content_type" would be sent as an unrelated header.
requests.post(
    url='xxx',
    data=json.dumps({'name': 'alex', 'age': 18}),
    headers={'Content-Type': 'application/json'},
)  # Content-Type: application/json

# `json=` serializes the dict and sets Content-Type: application/json itself.
requests.post(url='xxx', json={'name': 'alex', 'age': 18})


"""
1.method
2.url
3.params
4.data
5.json
6.headers
7.cookies

8.files
9.auth
10.timeout
11.allow_redirects
12.proxies
13.stream
14.cert

=================== session,保存请求相关信息  ==================
session = requests.Session()
session.get(url='xxx')
session.post(...)
"""

"""
8.files 用作文件上传
"""
# Upload a file as multipart/form-data. The keyword argument is `files`;
# `file=` is not an accepted parameter and raises TypeError.
# The with-block closes the handle once the upload has been sent.
with open('readme', 'rb') as readme:
    file_dict = {
        'f1': readme,
    }
    requests.post(url='xxx', files=file_dict)
# 发送文件，定制文件名
# file_dict = {
#   'f1': ('test.txt', open('readme', 'rb'))
# }
# requests.request(method='POST',
# url='http://127.0.0.1:8000/test/',
# files=file_dict)

# 发送文件，定制文件名
# file_dict = {
#   'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf")
# }
# requests.request(method='POST',
# url='http://127.0.0.1:8000/test/',
# files=file_dict)



"""
9.auth  基本认证    路由器登录
"""
from requests.auth import HTTPBasicAuth,HTTPDigestAuth

requests.get('https://api.github.com/user',auth=HTTPBasicAuth('gypsying','password'))


"""
timeout     (连接超时，响应超时)
"""
requests.get('http://google.com',timeout=3)
requests.get('http://google.com',timeout=(5,1))


"""
allow_redirects
"""

"""
proxies 应对IP被封的情况
"""
proxyDict = {
    "http": "61.172.249.96:80",
    "https": "http://61.185.219.126:3128",
}
proxies = {'http://10.20.1.128': 'http://10.10.1.10:5323'}

"""
stream
"""
from contextlib import closing
# closing() calls f.close() when the with-block exits, even if the loop raises.
# NOTE(review): 'xxx' is a placeholder URL; stream=True is presumably meant to
# avoid loading the whole body at once — confirm against the requests docs.
with closing(requests.get('xxx',stream=True)) as f:
    # iter_content() is called without a chunk_size, so the library default applies.
    for i in f.iter_content():
        print(i)




requests.put()
requests.delete()





BeautifulSoup
	- find()
	- find_all()
	- get()
	- attrs
	- text

soup = BeautifulSoup('html格式字符串','html.parser')
soup = BeautifulSoup('html格式字符串',features='lxml')	第三方，需额外安装，但是速度比'html.parser'更快


soup = BeautifulSoup('html格式字符串','html.parser')
tag = soup.find(attrs={'class':'c1'})
tag.name  ->  标签名字

tag = soup.find(attrs={'class':'c1'})
等价于：
tag = soup.find(class_='c1')

print(tag.attrs)

tag.attrs['id'] = 1
del tag.attrs['class']
# attrs 进行增删改查都可以


tag.children  	所有孩子
tag.descendants	所有后代
tag.find_all()	包含的所有标签，并且递归了
tag.find_all(recursive=False)	包含的所有标签，不递归

tag.clear()		清空内部元素，保留自己
tag.decompose()	递归删除所有标签，包含自己
res = tag.extract()	相当于字典的pop，其余同decompose()


tag = soup.find(class_='c1')	# 对象
tag.decode()	# 对象变成字符串
tag.encode()	# 对象变成字节

tag.find('a')
# tag = soup.find('a')
# print(tag)
# tag = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# tag = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tag)

find_all()
# tags = soup.find_all('a')
# print(tags)
 
# tags = soup.find_all('a',limit=1)
# print(tags)
 
# tags = soup.find_all(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# # tags = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tags)
 
 
# ####### 列表 #######
# v = soup.find_all(name=['a','div'])
# print(v)
 
# v = soup.find_all(class_=['sister0', 'sister'])
# print(v)
 
# v = soup.find_all(text=['Tillie'])
# print(v, type(v[0]))
 
 
# v = soup.find_all(id=['link1','link2'])
# print(v)
 
# v = soup.find_all(href=['link1','link2'])
# print(v)
 
# ####### 正则 #######
import re
# rep = re.compile('p')
# rep = re.compile('^p')
# v = soup.find_all(name=rep)
# print(v)
 
# rep = re.compile('sister.*')
# v = soup.find_all(class_=rep)
# print(v)
 
# rep = re.compile('http://www.oldboy.com/static/.*')
# v = soup.find_all(href=rep)
# print(v)
 
# ####### 方法筛选 #######
# def func(tag):
#     return tag.has_attr('class') and tag.has_attr('id')
# v = soup.find_all(name=func)
# print(v)
 
 
# ## get,获取标签属性
# tag = soup.find('a')
# v = tag.get('id')
# print(v)


from bs4.element import Tag

tag.has_attr()
tag.text  等价于 tag.get_text()


v = tag.index(tag.find('div'))


tag.text
tag.string 也可以获取内容，并扩展了修改内容
tag.string = "xxxx"
tag.stripped_strings 递归获取标签内部的所有文本片段，并去除每段首尾的空白（返回生成器，可用 list() 转成列表）
tag.children
for item in tag.children:
	print(item,type(item))




from bs4.element import Tag
tag= Tag(name='i',attrs={'id':'it'})
tag.string = "asasasasasasazxzxzx"


soup.find(id='xxx').append(tag)




""" 扩展copy模块 """
import copy
copy.deepcopy()
...



tag.wrap(tag1)
tag.unwrap()

++++++++++++++++++++++++++++++++++++


内容梳理：
	- 汽车之家新闻爬取示例
	- github和抽屉自动登录  以及 登陆后的操作
	- requests 和 Beautifulsoup 基本使用
	- 轮询和长轮询
	- Django 里 content-type问题
		request.POST 
		request.body






练习：web微信
	1. 二维码显示
	2. 长轮询 check_login() ：ajax递归  （js递归没有层数限制）
	3. 检测是否已经扫码
		- 扫码之后201：替换头像 base64:...
		src="img_path"
		或者
		src="base64:xxxxxxxx...."
		- 扫码之后继续轮询，等待用户点击确认
		- 点击确认之后，返回200 
			response.text redirect_uri=....
		- 获取最近联系人信息






下节课前安装
	twisted
	scrapy框架

服务器端不能主动给客户端发消息
但是websocket可以

- 【轮询】     	http协议，客户端轮询（每秒1次）请求服务端；一次请求，服务端收到后不管有没有新消息都立即返回
- 【长轮询】 	http协议，客户端发来请求，服务器把客户端给hang住，直到服务端收到新消息并发送给所有客户端、才断开连接；
				客户端收到消息后，再立即发请求到服务端进行下一次hang住。
				hang住，有一个超时时间，web微信超时时间是25s
				应用：web微信
- 【WebSocket】	不是http协议，建立在tcp之上
				一次连接不断开，双工通道，可以互相发送消息
				但是浏览器兼容性不太好，以后将会应用的更广泛

一、爬虫几点基础知识

- 基本操作
	- 概要
		- 发送http请求	requests模块
		- 提取指定信息 	正则	Beautifulsoup模块
		- 数据持久化

	- Python的2个模块
		- requests
		- Beautifulsoup

	- Http请求相关知识
		- 请求
			- 请求头 
				- cookie
			- 请求体 
				- 发送的内容
		- 响应 
			- 响应头 
				- 浏览器读取
			- 响应体
				- 看到的内容

		- 特殊
			- cookie
			- csrf_token
			- content-type 用来指定客户端按照哪种格式进行解析


- 性能相关
	- 进程
	- 线程
	- 协程

	- 【协程】异步非阻塞：充分利用系统资源


- scrapy框架
	- 学习scrapy的规则


- redis&scrapy组件：完成一个简单的分布式爬虫

二、爬取汽车之家新闻示例

#!/usr/bin/python
# -*- coding:utf-8 -*-

"""
Scrape the Autohome news listing page.

For every article in the listing container this prints the headline, the
article link and the summary, then saves the thumbnail under IMAGE_DIR.
"""
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

NEWS_URL = 'http://www.autohome.com.cn/news/'
IMAGE_DIR = 'autohome'
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled server blocks forever

response = requests.get(NEWS_URL, timeout=REQUEST_TIMEOUT)
# Use the encoding detected from the body; the original author notes that the
# text comes out garbled otherwise.
response.encoding = response.apparent_encoding
# response.text is str, response.content is bytes.

# BeautifulSoup turns the markup into a tree of Tag objects.
soup = BeautifulSoup(response.text, 'html.parser')
tag = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})

# find() returns None when the container is missing (e.g. the layout changed),
# so fall back to an empty list instead of crashing on None.find_all().
li_list = tag.find_all('li') if tag is not None else []

# open(path, 'wb') does not create missing directories.
os.makedirs(IMAGE_DIR, exist_ok=True)

for li in li_list:
    h3 = li.find(name='h3')
    if not h3:
        # <li> entries without a headline are not articles; skip them.
        continue
    print(h3.text)

    link = li.find(name='a')
    if link is not None:
        # Attribute lookup; link.attrs['href'] is the dict-style alternative.
        print(link.get('href'))

    summary = li.find('p')
    if summary is not None:
        print(summary.text)

    # Download the thumbnail, if the entry has one.
    img = li.find('img')
    img_url = img.get('src') if img is not None else None
    if not img_url:
        continue
    print(img_url)
    # The src values are protocol-relative ("//host/path"); urljoin fills in
    # the scheme from NEWS_URL and leaves already-absolute URLs untouched.
    res = requests.get(urljoin(NEWS_URL, img_url), timeout=REQUEST_TIMEOUT)
    img_path = os.path.join(IMAGE_DIR, img_url.split('/')[-1])
    with open(img_path, 'wb') as fw:
        fw.write(res.content)

一抹红的专属感 Macan Turbo特别版官图
//www.autohome.com.cn/news/201710/908351.html#pvareaid=102624
[汽车之家 新车官图]  日前，保时捷发布了Macan Turbo Exclusive Performance Edition的官图，作为一款特别版车...
//www3.autoimg.cn/newsdfs/g10/M0F/B2/EA/120x90_0_autohomecar__wKgH0VnqsC6AYGDFAAGFLm8dSfc007.jpg
还要怎么轻？ 路特斯Elise Cup 260官图
//www.autohome.com.cn/news/201710/908350.html#pvareaid=102624
[汽车之家 新车官图]  日前，路特斯官方宣布推出Elise Cup 260，这款车相比于已经进行进一步轻量化改造的新款Cup 250要更轻更快，全球...
//www3.autoimg.cn/newsdfs/g18/M0C/B9/7A/120x90_0_autohomecar__wKgH6FnqrhyAH3UDAAFOwoge9w4751.jpg
...

三、自动登录网站示例

参考：http://www.cnblogs.com/wupeiqi/articles/6283017.html

　　- .2种网站授权登录的方式

requests.get()  +  requests.post()

    - 方式1

　　　　1.第一次GET请求获取token

　　　　2.第二次POST请求进行验证并获取cookie

　　　　3.第三次GET/POST请求并携带cookie实现用户登录后的某些操作

 
    - 方式2

　　　　1.第一次GET请求获取token和未被授权的cookie

　　　　2.第二次POST请求并携带cookie进行验证并授权

　　　　3.第三次GET/POST请求并携带授权过的cookie实现用户登录后的某些操作

另外可以使用 requests.session() 更简单的实现：

session = requests.Session()

session.get()  + session.post()

　　- .自动登录Github并浏览个人主页

#!/usr/bin/python
# -*- coding:utf-8 -*-

import requests
from bs4 import BeautifulSoup


# Second cookie-handling approach for logging in from Python, using GitHub:
#   - the first request to https://github.com/login already returns a cookie;
#   - the second request, to https://github.com/session, submits the
#     credentials together with that cookie so it becomes authorised;
#   - the third request, to a page only visible after login (for example the
#     profile page), must carry the authorised cookies.

# Step 1: fetch the login page to obtain the CSRF token and the first cookies.
rsp1 = requests.get(url='https://github.com/login')
soup1 = BeautifulSoup(rsp1.text, 'html.parser')
# Locate the element by its name attribute, then read its value attribute.
token_input = soup1.find(attrs={'name': 'authenticity_token'})
token = token_input.get('value')
# Cookies handed out by the first request.
rsp1_cookie_dict = rsp1.cookies.get_dict()
print(token)
print(rsp1_cookie_dict)

# Step 2: submit the login form, carrying the cookies from step 1.
login_form = {
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': token,
    'login': 'gypsying',
    'password': 'xxxxxxxxx',
}
rsp2 = requests.post(
    url='https://github.com/session',
    data=login_form,
    cookies=rsp1_cookie_dict,
)
# Cookies handed out by the second request.
rsp2_cookie_dict = rsp2.cookies.get_dict()
print(rsp2_cookie_dict)

# Merge both cookie sets; values from the login response win on conflicts.
all_cookie_dict = dict(rsp1_cookie_dict, **rsp2_cookie_dict)

print(all_cookie_dict)

# Step 3: request the profile page with the merged cookies.
rsp3 = requests.get(url='https://github.com/Gypsying', cookies=all_cookie_dict)

soup3 = BeautifulSoup(rsp3.text, 'html.parser')
email_link = soup3.find(name='a', attrs={'class': 'u-email'})
email = email_link.text
print(email)  # the account e-mail shown on the profile page once logged in

　　- .自动登录抽屉并实施点赞操作

import requests
from bs4 import BeautifulSoup

index_url = "http://dig.chouti.com/"
rsp1 = requests.get(index_url)

soup = BeautifulSoup(rsp1.text, 'html.parser')
a_list = soup.find_all(attrs={'class': 'digg-a'})
# Collect the id of every news item on the front page.
id_list = []
for item in a_list:
    news_id = item.find(name='i').text
    id_list.append(news_id)

# Cookies returned by the front-page GET; at this point they are not authorised.
index_cookie = rsp1.cookies.get_dict()
login_url = "http://dig.chouti.com/login"
data = {
    'phone': 8600000000000,
    'password': 'xxxxxx',
    'oneMonth': 1
}
# Submit the credentials together with the unauthorised cookies so that the
# server authorises those same cookies.
login_ret = requests.post(url=login_url, data=data, cookies=index_cookie)
# The body is JSON. Parse it with the JSON decoder rather than eval(): eval()
# would execute whatever the server sends and fails on JSON true/false/null.
login_result = login_ret.json()
code = login_result.get('result', {}).get('code')
if "9999" == code:
    print("登录成功")
else:
    print("登录失败")
# Sample login responses:
# {"result":{"code":"8887", "message":"手机号格式不对", "data":""}}
# {"result":{"code":"21100", "message":"该手机号未注册", "data":""}}
# {"result":{"code":"29998", "message":"手机号或密码错误", "data":{}}}
#
# {"result":{"code":"9999", "message":"", "data":{"complateReg":"0","destJid":"cdu_50613120077"}}}

# Voting must carry the cookies that the login request authorised, i.e. the
# front-page cookies, not the ones returned by the login response.
for news_id in id_list:
    like_url = "http://dig.chouti.com/link/vote?linksId={}".format(news_id)
    like_ret = requests.post(url=like_url, cookies=index_cookie)
    print(like_ret.text)

# Sample vote responses:
# {"result":{"code":"30010", "message":"您已经推荐过了", "data":""}}
# {"result":{"code":"9999", "message":"推荐成功", "data":{"jid":"cdu_50613120077","likedTime":"1509378903908000","lvCount":"8","nick":"gypsy","uvCount":"1","voteTime":"小于1分钟前"}}}

四、模拟Web版微信相关操作

"""
微信网页版登录示例

GET        https://login.wx.qq.com/jslogin?appid=wx782c26e4c19acffb&redirect_uri=https%3A%2F%2Fwx.qq.com%2Fcgi-bin%2Fmmwebwx-bin%2Fwebwxnewloginpage&fun=new&lang=zh_CN&_=1508052025433
得到响应：   window.QRLogin.code = 200; window.QRLogin.uuid = "IapQqsoqcA==";

二维码src   https://login.weixin.qq.com/qrcode/IapQqsoqcA==

长轮询：     https://login.wx.qq.com/cgi-bin/mmwebwx-bin/login?loginicon=true&uuid=IapQqsoqcA==&tip=0&r=-518626217&_=1508052025438
"""

posted @ 2017-10-21 12:20 青山应回首阅读(339) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

青山

跑步、读书、数学、

爬虫基础01

写在前面

一、爬虫几点基础知识

二、爬取汽车之家新闻示例

三、自动登录网站示例

- .2种网站授权登录的方式

- .自动登录Github并浏览个人主页

- .自动登录抽屉并实施点赞操作

四、模拟Web版微信相关操作

公告