Python: Web Scraping

=======================Reposted from: https://www.cnblogs.com/wupeiqi/articles/6283017.html=================

I. requests

  Requests is an Apache2-licensed HTTP library written in Python. It is a high-level wrapper around Python's built-in modules that makes sending network requests far more pleasant for Pythoners: with Requests you can easily perform just about any operation a browser can.

1. GET requests

import requests

# without parameters
ret = requests.get('http://www.baidu.com')
print(ret)
print('=====>>>', ret.content)

# with parameters
payload = {'key1': 'value1', 'key2': 'value2'}
ret2 = requests.get('https://www.baidu.com', params=payload)
print('========>>', ret2.content)
print(ret2.url)
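A note on the two response attributes used above: ret.content holds the raw bytes of the body, while ret.text is the body decoded to str using the encoding requests guessed (override it via ret.encoding). A minimal sketch:

import requests

ret = requests.get('http://www.baidu.com')
print(ret.status_code)    # e.g. 200
print(type(ret.content))  # <class 'bytes'> - the raw body
ret.encoding = 'utf-8'    # override the guessed encoding if it is wrong
print(type(ret.text))     # <class 'str'> - the decoded body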

2. POST requests

import requests
import json

# basic POST
payload = {'key1': 'value1', 'key2': 'value2'}
ret = requests.post('http://www.baidu.com', data=payload)
print(ret.url, ret.headers)

# POST with custom headers and a JSON body
url = 'https://api.github.com/some/endpoint'
headers = {'content-type': 'application/json'}
payload = {'some': 'data'}
ret1 = requests.post(url, data=json.dumps(payload), headers=headers)
print(ret1.text)
print(ret1.cookies)
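Recent versions of requests can also do the json.dumps call and the Content-Type header for you via the json parameter; a minimal sketch of the same request to the placeholder GitHub endpoint above:

import requests

url = 'https://api.github.com/some/endpoint'
payload = {'some': 'data'}
# requests serializes payload and sets Content-Type: application/json itself
ret = requests.post(url, json=payload)
print(ret.status_code)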

3. Other request methods

requests.get(url, params=None, **kwargs)
requests.post(url, data=None, json=None, **kwargs)
requests.put(url, data=None, **kwargs)
requests.head(url, **kwargs)
requests.delete(url, **kwargs)
requests.patch(url, data=None, **kwargs)
requests.options(url, **kwargs)
  
# all of the methods above are built on top of this one
requests.request(method, url, **kwargs)
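Because every verb helper just delegates to requests.request, the two calls below produce the same request; a minimal sketch:

import requests

r1 = requests.get('http://www.baidu.com', params={'k1': 'v1'})
r2 = requests.request('GET', 'http://www.baidu.com', params={'k1': 'v1'})
print(r1.url == r2.url)  # True - both hit http://www.baidu.com/?k1=v1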

 

More parameters

def param_method_url():
    # requests.request(method='get', url='http://127.0.0.1:8000/test/')
    # requests.request(method='post', url='http://127.0.0.1:8000/test/')
    pass


def param_param():
    # - can be a dict
    # - can be a string
    # - can be bytes (ASCII range only)

    # requests.request(method='get',
    # url='http://127.0.0.1:8000/test/',
    # params={'k1': 'v1', 'k2': '水电费'})

    # requests.request(method='get',
    # url='http://127.0.0.1:8000/test/',
    # params="k1=v1&k2=水电费&k3=v3&k3=vv3")

    # requests.request(method='get',
    # url='http://127.0.0.1:8000/test/',
    # params=bytes("k1=v1&k2=k2&k3=v3&k3=vv3", encoding='utf8'))

    # error: bytes containing non-ASCII characters raise an exception
    # requests.request(method='get',
    # url='http://127.0.0.1:8000/test/',
    # params=bytes("k1=v1&k2=水电费&k3=v3&k3=vv3", encoding='utf8'))
    pass


def param_data():
    # can be a dict
    # can be a string
    # can be bytes
    # can be a file object

    # requests.request(method='POST',
    # url='http://127.0.0.1:8000/test/',
    # data={'k1': 'v1', 'k2': '水电费'})

    # requests.request(method='POST',
    # url='http://127.0.0.1:8000/test/',
    # data="k1=v1; k2=v2; k3=v3; k3=v4"
    # )

    # requests.request(method='POST',
    # url='http://127.0.0.1:8000/test/',
    # data="k1=v1;k2=v2;k3=v3;k3=v4",
    # headers={'Content-Type': 'application/x-www-form-urlencoded'}
    # )

    # requests.request(method='POST',
    # url='http://127.0.0.1:8000/test/',
    # data=open('data_file.py', mode='r', encoding='utf-8'), # file contents: k1=v1;k2=v2;k3=v3;k3=v4
    # headers={'Content-Type': 'application/x-www-form-urlencoded'}
    # )
    pass


def param_json():
    # serializes the dict to a string with json.dumps(...), sends it in the
    # request body, and sets the Content-Type header to application/json
    requests.request(method='POST',
                     url='http://127.0.0.1:8000/test/',
                     json={'k1': 'v1', 'k2': '水电费'})


def param_headers():
    # send custom request headers to the server
    requests.request(method='POST',
                     url='http://127.0.0.1:8000/test/',
                     json={'k1': 'v1', 'k2': '水电费'},
                     headers={'Content-Type': 'application/x-www-form-urlencoded'}
                     )


def param_cookies():
    # send cookies to the server
    requests.request(method='POST',
                     url='http://127.0.0.1:8000/test/',
                     data={'k1': 'v1', 'k2': 'v2'},
                     cookies={'cook1': 'value1'},
                     )
    # a CookieJar works too (the dict form is a wrapper built on top of it)
    from http.cookiejar import CookieJar
    from http.cookiejar import Cookie

    obj = CookieJar()
    obj.set_cookie(Cookie(version=0, name='c1', value='v1', port=None, domain='', path='/', secure=False, expires=None,
                          discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False,
                          port_specified=False, domain_specified=False, domain_initial_dot=False, path_specified=False)
                   )
    requests.request(method='POST',
                     url='http://127.0.0.1:8000/test/',
                     data={'k1': 'v1', 'k2': 'v2'},
                     cookies=obj)


def param_files():
    # upload a file
    # file_dict = {
    # 'f1': open('readme', 'rb')
    # }
    # requests.request(method='POST',
    # url='http://127.0.0.1:8000/test/',
    # files=file_dict)

    # upload a file with a custom filename
    # file_dict = {
    # 'f1': ('test.txt', open('readme', 'rb'))
    # }
    # requests.request(method='POST',
    # url='http://127.0.0.1:8000/test/',
    # files=file_dict)

    # upload in-memory string content under a custom filename
    # file_dict = {
    # 'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf")
    # }
    # requests.request(method='POST',
    # url='http://127.0.0.1:8000/test/',
    # files=file_dict)

    # upload with a custom filename, content type, and extra headers
    # file_dict = {
    #     'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf", 'application/text', {'k1': '0'})
    # }
    # requests.request(method='POST',
    #                  url='http://127.0.0.1:8000/test/',
    #                  files=file_dict)

    pass


def param_auth():
    from requests.auth import HTTPBasicAuth, HTTPDigestAuth

    ret = requests.get('https://api.github.com/user', auth=HTTPBasicAuth('wupeiqi', 'sdfasdfasdf'))
    print(ret.text)

    # ret = requests.get('http://192.168.1.1',
    # auth=HTTPBasicAuth('admin', 'admin'))
    # ret.encoding = 'gbk'
    # print(ret.text)

    # ret = requests.get('http://httpbin.org/digest-auth/auth/user/pass', auth=HTTPDigestAuth('user', 'pass'))
    # print(ret)
    #


def param_timeout():
    # ret = requests.get('http://google.com/', timeout=1)
    # print(ret)

    # ret = requests.get('http://google.com/', timeout=(5, 1))  # (connect timeout, read timeout)
    # print(ret)
    pass


def param_allow_redirects():
    ret = requests.get('http://127.0.0.1:8000/test/', allow_redirects=False)
    print(ret.text)


def param_proxies():
    # proxies = {
    # "http": "61.172.249.96:80",
    # "https": "http://61.185.219.126:3128",
    # }

    # proxies = {'http://10.20.1.128': 'http://10.10.1.10:5323'}

    # ret = requests.get("http://www.proxy360.cn/Proxy", proxies=proxies)
    # print(ret.headers)


    # from requests.auth import HTTPProxyAuth
    #
    # proxyDict = {
    # 'http': '77.75.105.165',
    # 'https': '77.75.105.165'
    # }
    # auth = HTTPProxyAuth('username', 'mypassword')
    #
    # r = requests.get("http://www.google.com", proxies=proxyDict, auth=auth)
    # print(r.text)

    pass


def param_stream():
    ret = requests.get('http://127.0.0.1:8000/test/', stream=True)
    print(ret.content)
    ret.close()

    # from contextlib import closing
    # with closing(requests.get('http://httpbin.org/get', stream=True)) as r:
    # # process the response here
    # for i in r.iter_content():
    # print(i)


def requests_session():
    import requests

    session = requests.Session()

    ### 1. First visit any page to obtain a cookie

    i1 = session.get(url="http://dig.chouti.com/help/service")

    ### 2. Log in, carrying the previous cookie; the backend authorizes its gpsd value
    i2 = session.post(
        url="http://dig.chouti.com/login",
        data={
            'phone': "8615131255089",
            'password': "xxxxxx",
            'oneMonth': ""
        }
    )

    i3 = session.post(
        url="http://dig.chouti.com/link/vote?linksId=8589623",
    )
    print(i3.text)

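Pulling several of these keyword arguments together into one call, a minimal sketch (httpbin.org is just a placeholder test endpoint):

import requests

ret = requests.request(
    method='GET',
    url='http://httpbin.org/get',             # placeholder test endpoint
    params={'k1': 'v1'},                      # query string
    headers={'User-Agent': 'my-spider/1.0'},  # request headers
    cookies={'session': 'abc'},               # cookies
    timeout=(5, 10),                          # (connect timeout, read timeout)
    allow_redirects=True,
)
print(ret.status_code, ret.url)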

 

 

II. BeautifulSoup

BeautifulSoup is a module that takes an HTML or XML string and parses it into a document tree; you can then use the methods it provides to quickly locate specific elements, which makes finding elements in HTML or XML simple.

1. Installing BeautifulSoup

  Run pip3 install beautifulsoup4; the package lands in the interpreter's Lib/site-packages/ directory, so it is available globally and later projects don't need to download it again.

2. BeautifulSoup methods

Instantiate an object:

from bs4 import BeautifulSoup
 
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
asdf
    <div class="title">
        <b>The Dormouse's story总共</b>
        <h1>f</h1>
    </div>
<div class="story">Once upon a time there were three little sisters; and their names were
    <a  class="sister0" id="link1">Els<span>f</span>ie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</div>
ad<br/>sf
<p class="story">...</p>
</body>
</html>
"""
 
soup = BeautifulSoup(html_doc, features="lxml")

 

from bs4 import BeautifulSoup
# html_doc: the same document string defined above
soup = BeautifulSoup(html_doc, features='lxml')
print(soup)
# find the first a tag
tag1 = soup.find(name='a')
# find all a tags
tag2 = soup.find_all(name='a')
# find the tag whose id is link1
tag3 = soup.select('#link1')
print('======>>>', tag1)
print('======>>>', tag2)
print('======>>>', tag3)
"""
Note: when features is not specified, BeautifulSoup picks a parser automatically;
when it is specified, the corresponding parser must be installed, otherwise an
error is raised ===> pip3 install lxml
"""
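If you would rather not install lxml, Python's built-in parser also works, at the cost of some speed and lenience; a minimal sketch:

from bs4 import BeautifulSoup

# html.parser ships with the standard library, so nothing extra to install
soup = BeautifulSoup("<p class='story'>...</p>", features='html.parser')
print(soup.find('p'))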

 

1.1 name: the tag name

soup = BeautifulSoup(html_doc, features='lxml')
# find the first a tag
tag1 = soup.find(name='a')
name = tag1.name
print('====>>', name)
tag1.name = 'span'  # renaming the tag rewrites it in the tree
print(soup)

 

1.2 attrs: tag attributes

soup = BeautifulSoup(html_doc, features='lxml')
# find the first a tag
tag1 = soup.find(name='a')
attr_list = tag1.attrs
print(tag1)
# replace the tag's attributes wholesale
tag1.attrs = {'k1': 123}
attr_list = tag1.attrs
print(attr_list)  # {'k1': 123}

# add a new attribute
tag1.attrs['id'] = 'new_id'
attr_list = tag1.attrs
print(attr_list)  # {'k1': 123, 'id': 'new_id'}

 

1.3 children: direct child nodes

body = soup.find('body')
child_body = body.children  # an iterable over the direct children
print(list(child_body))

 

1.4 descendants: all descendant nodes

# descendants: every node nested inside, however deep
print('======>>', list(body.descendants))
# children yields only the direct children, while descendants recursively
# yields every nested tag and text node, so the two traversals differ
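The difference shows up in the counts: children yields only the direct child nodes of body, while descendants walks the entire subtree, tags and text nodes alike; a minimal sketch against the html_doc above:

body = soup.find('body')
print(len(list(body.children)))     # direct children only
print(len(list(body.descendants)))  # every nested tag and text node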

 

1.5 clear: remove all of a tag's children (the tag itself is kept)

tag1 = soup.find('body')
print(tag1)
tag1.clear()
print(tag1)  # <body></body>

 

1.6 decompose: recursively destroy the tag and all of its children

tag1 = soup.find('body')
tag1.decompose()
print(tag1)
print(soup)  # <html><head><title>The Dormouse's story</title></head></html>

 

1.7 extract: detach the tag (and its children) from the tree and return it

tag1 = soup.find('body')
tag1_del = tag1.extract()
print(soup)  # <html><head><title>The Dormouse's story</title></head></html>
print(tag1_del)
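To keep clear, decompose, and extract straight: clear empties a tag but leaves it in the tree, decompose destroys the tag and its subtree outright, and extract detaches the subtree and returns it; a minimal sketch on a throwaway document:

from bs4 import BeautifulSoup

doc = "<div><a>one</a></div><p>two</p><span>three</span>"
soup2 = BeautifulSoup(doc, features='html.parser')
soup2.find('div').clear()            # tag kept, children gone: <div></div>
soup2.find('span').decompose()       # tag and children destroyed outright
removed = soup2.find('p').extract()  # tag detached and handed back
print(soup2)    # <div></div>
print(removed)  # <p>two</p>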

 

1.8 decode: serialize to a string (including the current tag); decode_contents (excluding the current tag)

tag1 = soup.find('body')
v1 = tag1.decode()
print(v1)
v2 = tag1.decode_contents()
print(v2)

 

1.9 encode: serialize to bytes (including the current tag); encode_contents (excluding the current tag)

tag1 = soup.find('body')
v1 = tag1.encode()
print(v1)
v2 = tag1.encode_contents()
print(v2)

 

1.10 find: get the first matching tag

tag1 = soup.find('a')
print(tag1)
# recursive: search all descendants; class_ carries a trailing underscore
# because class is a reserved keyword in Python
tag2 = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
tag3 = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
print(tag2)
print(tag3)
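Unlike find_all, find returns a single Tag, or None when nothing matches, so guard against the miss case; a minimal sketch:

tag = soup.find('table')       # there is no table in html_doc
if tag is None:
    print('no match')          # find returns None on a miss
print(soup.find_all('table'))  # find_all returns an empty list: []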

 

1.11 find_all: get all matching tags

# tags = soup.find_all('a')
# print(tags)

# tags = soup.find_all('a', limit=1)
# print(tags)

# tags = soup.find_all(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# # tags = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tags)


# ####### lists #######
# v = soup.find_all(name=['a', 'div'])
# print(v)

# v = soup.find_all(class_=['sister0', 'sister'])
# print(v)

# v = soup.find_all(text=['Tillie'])
# print(v, type(v[0]))

# v = soup.find_all(id=['link1', 'link2'])
# print(v)

# v = soup.find_all(href=['link1', 'link2'])
# print(v)

# ####### regular expressions #######
import re
# rep = re.compile('p')
# rep = re.compile('^p')
# v = soup.find_all(name=rep)
# print(v)

# rep = re.compile('sister.*')
# v = soup.find_all(class_=rep)
# print(v)

# rep = re.compile('http://www.oldboy.com/static/.*')
# v = soup.find_all(href=rep)
# print(v)

# ####### filter by function #######
# def func(tag):
#     return tag.has_attr('class') and tag.has_attr('id')
# v = soup.find_all(name=func)
# print(v)


# ## get: fetch a tag attribute
# tag = soup.find('a')
# v = tag.get('id')
# print(v)

 

1.12 has_attr: check whether a tag has a given attribute

# tag = soup.find('a')
# v = tag.has_attr('id')
# print(v)

 

1.13 get_text: get the text inside a tag

# tag = soup.find('a')
# v = tag.get_text()  # its first positional argument is a separator, not an attribute name
# print(v)
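get_text also accepts a separator that is placed between the text fragments, plus a strip flag; a minimal sketch against the first a tag of html_doc:

tag = soup.find('a')
print(tag.get_text())                 # 'Elsfie' - all inner text joined together
print(tag.get_text('|', strip=True))  # 'Els|f|ie' - fragments joined with '|'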

 

1.14 index: get a child tag's index position within another tag

# where does the div tag sit inside the body tag?
tag = soup.find('body')
v = tag.index(tag.find('div'))
print('=====>>', v)

print('====================')
# enumerate every child of the body tag together with its index
tag = soup.find('body')
for i, v in enumerate(tag):
    print(i, v)

 

1.15 is_empty_element: whether the tag is an empty (self-closing) element, i.e. one of 'br', 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base'

# tag = soup.find('br')
# v = tag.is_empty_element
# print(v)

 

1.16 Tags related to the current tag

# soup.next
# soup.next_element
# soup.next_elements
# soup.next_sibling
# soup.next_siblings
 
#
# tag.previous
# tag.previous_element
# tag.previous_elements
# tag.previous_sibling
# tag.previous_siblings
 
#
# tag.parent
# tag.parents

 

1.17 Searching among a tag's related tags

# tag.find_next(...)
# tag.find_all_next(...)
# tag.find_next_sibling(...)
# tag.find_next_siblings(...)

# tag.find_previous(...)
# tag.find_all_previous(...)
# tag.find_previous_sibling(...)
# tag.find_previous_siblings(...)

# tag.find_parent(...)
# tag.find_parents(...)

# these take the same parameters as find_all

 

1.18 select / select_one: CSS selectors

soup.select("title")

soup.select("p:nth-of-type(3)")

soup.select("body a")

soup.select("html head title")

tag = soup.select("span,a")

soup.select("head > title")

soup.select("p > a")

soup.select("p > a:nth-of-type(2)")

soup.select("p > #link1")

soup.select("body > a")

soup.select("#link1 ~ .sister")

soup.select("#link1 + .sister")

soup.select(".sister")

soup.select("[class~=sister]")

soup.select("#link1")

soup.select("a#link2")

soup.select('a[href]')

soup.select('a[href="http://example.com/elsie"]')

soup.select('a[href^="http://example.com/"]')

soup.select('a[href$="tillie"]')

soup.select('a[href*=".com/el"]')


from bs4.element import Tag

def default_candidate_generator(tag):
    # yield only descendant tags that carry an href attribute
    for child in tag.descendants:
        if not isinstance(child, Tag):
            continue
        if not child.has_attr('href'):
            continue
        yield child

tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator)
print(type(tags), tags)

# the same, limited to the first match
tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator, limit=1)
print(type(tags), tags)
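The heading also names select_one: it takes the same CSS selectors but returns just the first match (or None) instead of a list; a minimal sketch:

tag = soup.select_one('#link1')  # first match, or None if nothing matches
tags = soup.select('.sister')    # always a list
print(tag)
print(len(tags))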

 

1.19 Tag content

# tag = soup.find('span')
# print(tag.string)          # get
# tag.string = 'new content' # set
# print(soup)

# tag = soup.find('body')
# print(tag.string)
# tag.string = 'xxx'
# print(soup)

# tag = soup.find('body')
# v = tag.stripped_strings  # recursively yields the text of all inner tags
# print(v)

 

1.20 append: append a tag inside the current tag

# tag = soup.find('body')
# tag.append(soup.find('a'))
# print(soup)
#
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# tag.append(obj)
# print(soup)

 

1.21 insert: insert a tag at a given position inside the current tag

# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# tag.insert(2, obj)
# print(soup)

 

1.22 insert_after / insert_before: insert after or before the current tag

# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# # tag.insert_before(obj)
# tag.insert_after(obj)
# print(soup)

 

1.23 replace_with: replace the current tag with the given tag

# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('div')
# tag.replace_with(obj)
# print(soup)

 

1.24 setup: create relationships between tags

# tag = soup.find('div')
# a = soup.find('a')
# tag.setup(previous_sibling=a)
# print(tag.previous_sibling)

 

1.25 wrap: wrap the current tag in the given tag

# from bs4.element import Tag
# obj1 = Tag(name='div', attrs={'id': 'it'})
# obj1.string = '我是一个新来的'
#
# tag = soup.find('a')
# v = tag.wrap(obj1)
# print(soup)
 
# tag = soup.find('a')
# v = tag.wrap(soup.find('p'))
# print(soup)

 

1.26 unwrap: remove the current tag, keeping its contents in place

# tag = soup.find('a')
# v = tag.unwrap()
# print(soup)

2. Examples

Chouti (抽屉新热榜): log in and upvote a post.

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests


# ############## Approach 1 ##############
"""
# ## 1. First visit any page to obtain a cookie
i1 = requests.get(url="http://dig.chouti.com/help/service")
i1_cookies = i1.cookies.get_dict()

# ## 2. Log in, carrying the previous cookie; the backend authorizes its gpsd value
i2 = requests.post(
    url="http://dig.chouti.com/login",
    data={
        'phone': "8615131255089",
        'password': "xxooxxoo",
        'oneMonth': ""
    },
    cookies=i1_cookies
)

# ## 3. Upvote (only the already-authorized gpsd cookie is needed)
gpsd = i1_cookies['gpsd']
i3 = requests.post(
    url="http://dig.chouti.com/link/vote?linksId=8589523",
    cookies={'gpsd': gpsd}
)

print(i3.text)
"""


# ############## Approach 2 ##############
"""
import requests

session = requests.Session()
i1 = session.get(url="http://dig.chouti.com/help/service")
i2 = session.post(
    url="http://dig.chouti.com/login",
    data={
        'phone': "8615131255089",
        'password': "xxooxxoo",
        'oneMonth': ""
    }
)
i3 = session.post(
    url="http://dig.chouti.com/link/vote?linksId=8589523"
)
print(i3.text)

"""
GitHub: log in and list your repositories.

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import requests
from bs4 import BeautifulSoup

# ############## Approach 1 ##############
#
# # 1. Visit the login page and grab the authenticity_token
# i1 = requests.get('https://github.com/login')
# soup1 = BeautifulSoup(i1.text, features='lxml')
# tag = soup1.find(name='input', attrs={'name': 'authenticity_token'})
# authenticity_token = tag.get('value')
# c1 = i1.cookies.get_dict()
# i1.close()
#
# # 2. Send the authenticity_token along with the username and password
# form_data = {
#     "authenticity_token": authenticity_token,
#     "utf8": "",
#     "commit": "Sign in",
#     "login": "wupeiqi@live.com",
#     'password': 'xxoo'
# }
#
# i2 = requests.post('https://github.com/session', data=form_data, cookies=c1)
# c2 = i2.cookies.get_dict()
# c1.update(c2)
# i3 = requests.get('https://github.com/settings/repositories', cookies=c1)
#
# soup3 = BeautifulSoup(i3.text, features='lxml')
# list_group = soup3.find(name='div', class_='listgroup')
#
# from bs4.element import Tag
#
# for child in list_group.children:
#     if isinstance(child, Tag):
#         project_tag = child.find(name='a', class_='mr-1')
#         size_tag = child.find(name='small')
#         temp = "project: %s (%s); path: %s" % (project_tag.get('href'), size_tag.string, project_tag.string, )
#         print(temp)



# ############## Approach 2 ##############
# session = requests.Session()
# # 1. Visit the login page and grab the authenticity_token
# i1 = session.get('https://github.com/login')
# soup1 = BeautifulSoup(i1.text, features='lxml')
# tag = soup1.find(name='input', attrs={'name': 'authenticity_token'})
# authenticity_token = tag.get('value')
# c1 = i1.cookies.get_dict()
# i1.close()
#
# # 2. Send the authenticity_token along with the username and password
# form_data = {
#     "authenticity_token": authenticity_token,
#     "utf8": "",
#     "commit": "Sign in",
#     "login": "wupeiqi@live.com",
#     'password': 'xxoo'
# }
#
# i2 = session.post('https://github.com/session', data=form_data)
# c2 = i2.cookies.get_dict()
# c1.update(c2)
# i3 = session.get('https://github.com/settings/repositories')
#
# soup3 = BeautifulSoup(i3.text, features='lxml')
# list_group = soup3.find(name='div', class_='listgroup')
#
# from bs4.element import Tag
#
# for child in list_group.children:
#     if isinstance(child, Tag):
#         project_tag = child.find(name='a', class_='mr-1')
#         size_tag = child.find(name='small')
#         temp = "project: %s (%s); path: %s" % (project_tag.get('href'), size_tag.string, project_tag.string, )
#         print(temp)
cnblogs (博客园): log in with RSA-encrypted credentials.

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
import json
import base64

import rsa
import requests


def js_encrypt(text):
    b64der = 'MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQCp0wHYbg/NOPO3nzMD3dndwS0MccuMeXCHgVlGOoYyFwLdS24Im2e7YyhB0wrUsyYf0/nhzCzBK8ZC9eCWqd0aHbdgOQT6CuFQBMjbyGYvlVYU2ZP7kG9Ft6YV6oc9ambuO7nPZh+bvXH0zDKfi02prknrScAKC0XhadTHT3Al0QIDAQAB'
    der = base64.standard_b64decode(b64der)

    pk = rsa.PublicKey.load_pkcs1_openssl_der(der)
    v1 = rsa.encrypt(bytes(text, 'utf8'), pk)
    value = base64.encodebytes(v1).replace(b'\n', b'')
    value = value.decode('utf8')

    return value


session = requests.Session()

i1 = session.get('https://passport.cnblogs.com/user/signin')
rep = re.compile("'VerificationToken': '(.*)'")
v = re.search(rep, i1.text)
verification_token = v.group(1)

form_data = {
    'input1': js_encrypt('wptawy'),
    'input2': js_encrypt('asdfasdf'),
    'remember': False
}

i2 = session.post(url='https://passport.cnblogs.com/user/signin',
                  data=json.dumps(form_data),
                  headers={
                      'Content-Type': 'application/json; charset=UTF-8',
                      'X-Requested-With': 'XMLHttpRequest',
                      'VerificationToken': verification_token}
                  )

i3 = session.get(url='https://i.cnblogs.com/EditDiary.aspx')

print(i3.text)

Zhihu (知乎): log in with a captcha.

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import time

import requests
from bs4 import BeautifulSoup

session = requests.Session()

i1 = session.get(
    url='https://www.zhihu.com/#signin',
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    }
)

soup1 = BeautifulSoup(i1.text, 'lxml')
xsrf_tag = soup1.find(name='input', attrs={'name': '_xsrf'})
xsrf = xsrf_tag.get('value')

current_time = time.time()
i2 = session.get(
    url='https://www.zhihu.com/captcha.gif',
    params={'r': current_time, 'type': 'login'},
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    })

with open('zhihu.gif', 'wb') as f:
    f.write(i2.content)

captcha = input('Open zhihu.gif, then enter the captcha you see: ')
form_data = {
    "_xsrf": xsrf,
    'password': 'xxooxxoo',
    "captcha": captcha,  # the value the user typed, not the literal string 'captcha'
    'email': '424662508@qq.com'
}
i3 = session.post(
    url='https://www.zhihu.com/login/email',
    data=form_data,
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    }
)

i4 = session.get(
    url='https://www.zhihu.com/settings/profile',
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    }
)

soup4 = BeautifulSoup(i4.text, 'lxml')
tag = soup4.find(id='rename-section')
nick_name = tag.find('span', class_='name').string
print(nick_name)

 
