爬虫 - beautifulsoup4

爬虫 - beautifulsoup4

BeautifulSoup是一个模块, 该模块用于接收一个HTML或XML字符串, 然后将其进行格式化, 之后便可以使用它提供的方法快速查找指定元素, 从而使得在HTML或XML中查找指定元素变得简单.

属性及示例:

from bs4 import BeautifulSoup
 
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
asdf
    <div class="title">
        <b>The Dormouse's story总共</b>
        <h1>f</h1>
    </div>
<div class="story">Once upon a time there were three little sisters; and their names were
    <a  class="sister0" id="link1">Els<span>f</span>ie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</div>
ad<br/>sf
<p class="story">...</p>
</body>
</html>
"""
 
soup = BeautifulSoup(html_doc, features="lxml")
# Find the first <a> tag
tag1 = soup.find(name='a')
# Find every <a> tag
tag2 = soup.find_all(name='a')
# Select by CSS selector the tag(s) whose id is "link2"; select() gives back a list of matches
tag3 = soup.select('#link2')


安装:
pip3 install beautifulsoup4

使用示例:
from bs4 import BeautifulSoup
 
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
    ...
</body>
</html>
"""
 
soup = BeautifulSoup(html_doc, features="lxml")


1. name,标签名称
# tag = soup.find('a')
# name = tag.name # 获取
# print(name)
# tag.name = 'span' # 设置
# print(soup)


2. attrs,标签属性
# tag = soup.find('a')
# attrs = tag.attrs    # 获取
# print(attrs)
# tag.attrs = {'ik':123} # 设置
# tag.attrs['id'] = 'iiiii' # 设置
# print(soup)


3. children,所有子标签
# body = soup.find('body')
# v = body.children


4. descendants,所有子子孙孙标签
# body = soup.find('body')
# v = body.descendants


5. clear,将标签的所有子标签全部清空(保留标签名)
# tag = soup.find('body')
# tag.clear()
# print(soup)


6. decompose,递归的删除所有的标签
# body = soup.find('body')
# body.decompose()
# print(soup)


7. extract,递归的删除所有的标签,并获取删除的标签
# body = soup.find('body')
# v = body.extract()
# print(soup)


8. decode,转换为字符串(含当前标签);decode_contents(不含当前标签)
# body = soup.find('body')
# v = body.decode()
# v = body.decode_contents()
# print(v)


9. encode,转换为字节(含当前标签);encode_contents(不含当前标签)
# body = soup.find('body')
# v = body.encode()
# v = body.encode_contents()
# print(v)


10. find,获取匹配的第一个标签
# tag = soup.find('a')
# print(tag)
# tag = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# tag = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tag)


11. find_all,获取匹配的所有标签
# tags = soup.find_all('a')
# print(tags)
 
# tags = soup.find_all('a',limit=1)
# print(tags)
 
# tags = soup.find_all(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# # tags = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tags)
 
 
# ####### 列表 #######
# v = soup.find_all(name=['a','div'])
# print(v)
 
# v = soup.find_all(class_=['sister0', 'sister'])
# print(v)
 
# v = soup.find_all(text=['Tillie'])
# print(v, type(v[0]))
 
 
# v = soup.find_all(id=['link1','link2'])
# print(v)
 
# v = soup.find_all(href=['link1','link2'])
# print(v)
 
# ####### 正则 #######
import re
# rep = re.compile('p')
# rep = re.compile('^p')
# v = soup.find_all(name=rep)
# print(v)
 
# rep = re.compile('sister.*')
# v = soup.find_all(class_=rep)
# print(v)
 
# rep = re.compile('http://www.oldboy.com/static/.*')
# v = soup.find_all(href=rep)
# print(v)
 
# ####### 方法筛选 #######
# def func(tag):
#     return tag.has_attr('class') and tag.has_attr('id')
# v = soup.find_all(name=func)
# print(v)
 
 
# ## get,获取标签属性
# tag = soup.find('a')
# v = tag.get('id')
# print(v)


12. has_attr,检查标签是否具有该属性
# tag = soup.find('a')
# v = tag.has_attr('id')
# print(v)


13. get_text,获取标签内部文本内容
# tag = soup.find('a')
# v = tag.get_text()
# print(v)


14. index,检查标签在某标签中的索引位置
# tag = soup.find('body')
# v = tag.index(tag.find('div'))
# print(v)
 
# tag = soup.find('body')
# for i,v in enumerate(tag):
#     print(i,v)


15. is_empty_element,是否是空标签(是否可以是空)或者自闭合标签,

     判断是否是如下标签:'br' , 'hr', 'input', 'img', 'meta','spacer', 'link', 'frame', 'base'
# tag = soup.find('br')
# v = tag.is_empty_element
# print(v)


16. 当前的关联标签
# soup.next
# soup.next_element
# soup.next_elements
# soup.next_sibling
# soup.next_siblings
 
#
# tag.previous
# tag.previous_element
# tag.previous_elements
# tag.previous_sibling
# tag.previous_siblings
 
#
# tag.parent
# tag.parents


17. 查找某标签的关联标签
# tag.find_next(...)
# tag.find_all_next(...)
# tag.find_next_sibling(...)
# tag.find_next_siblings(...)
 
# tag.find_previous(...)
# tag.find_all_previous(...)
# tag.find_previous_sibling(...)
# tag.find_previous_siblings(...)
 
# tag.find_parent(...)
# tag.find_parents(...)
 
# 参数同find_all


18. select,select_one, CSS选择器
# Every <title> tag
soup.select("title")

# A <p> that is the third <p> among its siblings
# (nth-of-type is a pseudo-class, so it must be attached to the tag with ':')
soup.select("p:nth-of-type(3)")

# Every <a> anywhere inside <body>
soup.select("body a")

# <title> inside <head> inside <html> (descendant combinator at each step)
soup.select("html head title")

# Selector list: every <span> and every <a>
tag = soup.select("span,a")

# <title> that is a direct child of <head>
soup.select("head > title")

# <a> that is a direct child of a <p>
soup.select("p > a")

# Direct-child <a> of a <p> that is also the second <a> among its siblings
soup.select("p > a:nth-of-type(2)")

# Element with id "link1" that is a direct child of a <p>
soup.select("p > #link1")

# <a> that is a direct child of <body>
soup.select("body > a")

# Every later sibling of #link1 that has class "sister"
soup.select("#link1 ~ .sister")

# The sibling immediately after #link1, if it has class "sister"
soup.select("#link1 + .sister")

# Every element with class "sister"
soup.select(".sister")

# Every element whose class attribute contains the whole word "sister"
soup.select("[class~=sister]")

# The element with id "link1"
soup.select("#link1")

# An <a> whose id is "link2"
soup.select("a#link2")

# Every <a> that has an href attribute
soup.select('a[href]')

# <a> whose href equals the given value exactly
soup.select('a[href="http://example.com/elsie"]')

# <a> whose href starts with the given prefix
soup.select('a[href^="http://example.com/"]')

# <a> whose href ends with the given suffix
soup.select('a[href$="tillie"]')

# <a> whose href contains the given substring
soup.select('a[href*=".com/el"]')
 
 
from bs4.element import Tag
 
def default_candidate_generator(tag):
    """Yield every descendant of ``tag`` that is a ``Tag`` and has an ``href`` attribute."""
    yield from (
        node
        for node in tag.descendants
        if isinstance(node, Tag) and node.has_attr('href')
    )

# NOTE(review): `_candidate_generator` is an underscore-prefixed (private) keyword of
# select(); presumably only some bs4 releases accept it -- confirm against the installed version.
tags = soup.find('body').select(
    "a",
    _candidate_generator=default_candidate_generator,
)
print(type(tags), tags)
 
from bs4.element import Tag
def default_candidate_generator(tag):
    """Generate the descendants of ``tag`` that are ``Tag`` objects carrying an ``href``."""
    for node in tag.descendants:
        # Text nodes and tags without an href are not candidates
        if isinstance(node, Tag) and node.has_attr('href'):
            yield node

# Same selection as the previous example, additionally passing limit=1.
# NOTE(review): `_candidate_generator` is a private keyword of select();
# presumably only some bs4 releases accept it -- confirm against the installed version.
tags = soup.find('body').select(
    "a",
    _candidate_generator=default_candidate_generator,
    limit=1,
)
print(type(tags), tags)


19. 标签的内容
# tag = soup.find('span')
# print(tag.string)          # 获取
# tag.string = 'new content' # 设置
# print(soup)
 
# tag = soup.find('body')
# print(tag.string)
# tag.string = 'xxx'
# print(soup)
 
# tag = soup.find('body')
# v = tag.stripped_strings  # 递归内部获取所有标签的文本
# print(v)


20.append在当前标签内部追加一个标签
# tag = soup.find('body')
# tag.append(soup.find('a'))
# print(soup)
#
# from bs4.element import Tag
# obj = Tag(name='i',attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# tag.append(obj)
# print(soup)


21.insert在当前标签内部指定位置插入一个标签
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# tag.insert(2, obj)
# print(soup)


22. insert_after,insert_before 在当前标签后面或前面插入
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# # tag.insert_before(obj)
# tag.insert_after(obj)
# print(soup)


23. replace_with 在当前标签替换为指定标签
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('div')
# tag.replace_with(obj)
# print(soup)


24. 创建标签之间的关系
# tag = soup.find('div')
# a = soup.find('a')
# tag.setup(previous_sibling=a)
# print(tag.previous_sibling)


25. wrap,将指定标签把当前标签包裹起来
# from bs4.element import Tag
# obj1 = Tag(name='div', attrs={'id': 'it'})
# obj1.string = '我是一个新来的'
#
# tag = soup.find('a')
# v = tag.wrap(obj1)
# print(soup)
 
# tag = soup.find('a')
# v = tag.wrap(soup.find('p'))
# print(soup)


26. unwrap,去掉当前标签,将保留其包裹的标签
# tag = soup.find('a')
# v = tag.unwrap()
# print(soup)

 


解析器

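Beautiful Soup支持Python标准库中的HTML解析器(html.parser), 还支持一些第三方的解析器. 如果不安装第三方解析器, 则 Beautiful Soup 会使用 Python 标准库自带的解析器. lxml 解析器更加强大, 速度更快, 推荐安装. 较旧的Python版本(2.7.3之前以及3.2.2之前)标准库中内置的HTML解析方法不够稳定.

下面是常见解析器:

- Python标准库: BeautifulSoup(markup, "html.parser") — 内置, 无需额外安装, 速度适中
- lxml HTML解析器: BeautifulSoup(markup, "lxml") — 速度快, 容错能力强, 需要安装lxml
- lxml XML解析器: BeautifulSoup(markup, "xml") — 速度快, 支持XML, 需要安装lxml
- html5lib: BeautifulSoup(markup, "html5lib") — 容错性最好, 以浏览器的方式解析文档, 速度慢, 需要安装html5lib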

 


基本使用

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
# Parse the markup with the lxml parser
soup = BeautifulSoup(html, 'lxml')
# Print the parsed document as indented markup
print(soup.prettify())
# Print the text inside the <title> tag
print(soup.title.string)

结果

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title" name="dromouse">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    <!-- Elsie -->
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>
The Dormouse's story
View Code

三种方式

一. 标签选择器

选择元素

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# Attribute access by tag name returns the first tag with that name
print(soup.title)
print(type(soup.title))
print(soup.head)
# The document has several <p> tags; only the first one is returned
print(soup.p)

结果

<title>The Dormouse's story</title>
<class 'bs4.element.Tag'>
<head><title>The Dormouse's story</title></head>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>

通过这种方式获取标签,如果文档中有多个这样的标签,返回的结果是第一个标签的内容,如上面我们通过soup.p获取p标签,而文档中有多个p标签,但是只返回了第一个p标签内容

获取名称

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# .name is the tag's own name
print(soup.title.name)

结果: title

获取属性

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# Two equivalent ways to read the "name" attribute of the first <p>
print(soup.p.attrs['name'])
print(soup.p['name'])

结果:

dromouse

dromouse

获取内容

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p clss="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# .string is the text contained in the first <p>
print(soup.p.string)

结果:

The Dormouse's story

嵌套选择

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# Tag-name access can be chained: <head> -> <title> -> its text
print(soup.head.title.string)

结果:

The Dormouse's story

子节点和子孙结点

使用contents:

html = """
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
        <p class="story">
            Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1">
                <span>Elsie</span>
            </a>
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> 
            and
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
            and they lived at the bottom of a well.
        </p>
        <p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# .contents is a list of the first <p>'s direct children (text and tags)
print(soup.p.contents)

结果是将p标签下的所有子节点(包括文本节点和子标签)存入到了一个列表中:

['\n            Once upon a time there were three little sisters; and their names were\n            ', <a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>, '\n', <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, ' \n            and\n            ', <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, '\n            and they lived at the bottom of a well.\n        ']

使用children:

也可以获取p标签下的所有子节点内容, 和通过contents获取的结果是一样的, 但是不同的地方是soup.p.children是一个迭代对象, 而不是列表, 只能通过循环的方式获取所有的信息.

html = """
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
        <p class="story">
            Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1">
                <span>Elsie</span>
            </a>
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> 
            and
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
            and they lived at the bottom of a well.
        </p>
        <p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# .children is an iterator rather than a list, so printing it shows only the iterator object
print(soup.p.children)
# Walk the iterator to see each direct child together with its position
for i, child in enumerate(soup.p.children):
    print(i, child)

结果:

<list_iterator object at 0x1064f7dd8>
0 
            Once upon a time there were three little sisters; and their names were
            
1 <a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>
2 

3 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
4  
            and
            
5 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
6 
            and they lived at the bottom of a well.

通过contents以及children都是获取子节点,如果想要获取子孙节点可以通过descendants
print(soup.descendants)同时这种获取的结果也是一个迭代器

父节点和祖先节点

通过soup.a.parent就可以获取父节点的信息

html = """
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
        <p class="story">
            Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1">
                <span>Elsie</span>
            </a>
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> 
            and
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
            and they lived at the bottom of a well.
        </p>
        <p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# .parent is the tag that directly encloses the first <a>
print(soup.a.parent)

结果:

<p class="story">
            Once upon a time there were three little sisters; and their names were
            <a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> 
            and
            <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
            and they lived at the bottom of a well.
        </p>

通过list(enumerate(soup.a.parents))可以获取祖先节点, 这个方法返回的结果是一个列表, 会分别将a标签的父节点的信息存放到列表中, 以及父节点的父节点也放到列表中, 并且最后还会将整个文档放到列表中, 所以列表的最后一个元素以及倒数第二个元素都是存的整个文档的信息

from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# .parents iterates over every ancestor of the first <a>; list() materialises it for printing
print(list(enumerate(soup.a.parents)))

 

兄弟节点

soup.a.next_siblings 获取后面的兄弟节点
soup.a.previous_siblings 获取前面的兄弟节点
soup.a.next_sibling 获取下一个兄弟标签
soup.a.previous_sibling 获取上一个兄弟标签

html = """
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
        <p class="story">
            Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1">
                <span>Elsie</span>
            </a>
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> 
            and
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
            and they lived at the bottom of a well.
        </p>
        <p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# Siblings that come after the first <a>
print(list(enumerate(soup.a.next_siblings)))
# Siblings that come before the first <a>
print(list(enumerate(soup.a.previous_siblings)))

结果:

[(0, '\n'), (1, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>), (2, ' \n            and\n            '), (3, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>), (4, '\n            and they lived at the bottom of a well.\n        ')]
[(0, '\n            Once upon a time there were three little sisters; and their names were\n            ')]

 


二. 标准选择器

find_all

find_all(name,attrs,recursive,text,**kwargs)
可以根据标签名,属性,内容查找文档

name的用法

html='''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# find_all with a tag name returns a list of every matching tag
print(soup.find_all('ul'))
# Each element of that list is a Tag
print(type(soup.find_all('ul')[0]))

结果:

[<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>, <ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>]
<class 'bs4.element.Tag'>

同时可以针对结果再次find_all,从而获取所有的li标签信息

# Search again inside each result: list the <li> tags of every <ul>
for ul in soup.find_all('ul'):
    print(ul.find_all('li'))

结果:

[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>]
[<li class="element">Foo</li>, <li class="element">Bar</li>]

 

attrs的用法

html='''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1" name="elements">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# attrs takes a dict of attribute name -> required value
print(soup.find_all(attrs={'id': 'list-1'}))
print(soup.find_all(attrs={'name': 'elements'}))

结果:

[<ul class="list" id="list-1" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>]
[<ul class="list" id="list-1" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>]

attrs可以传入字典的方式来查找标签. 这里有个特殊的属性就是class: 因为class在python中是关键字, 不能直接作为关键字参数名, 所以如果想要按class查找, 可以使用关键字参数class_='element', 或者以字典方式传入attrs={'class': 'element'}, 例如soup.find_all('', {"class": "element"}). 常用的标签属性可以不写attrs, 直接用关键字参数, 例如id.

html='''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# Attributes can be given directly as keyword arguments
print(soup.find_all(id='list-1'))
# class is a Python keyword, so the keyword argument is spelled class_
print(soup.find_all(class_='element'))

结果:

[<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>]
[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>, <li class="element">Foo</li>, <li class="element">Bar</li>]

 

text的用法

html='''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# text= matches on text content; the result holds the matching strings, not tags
print(soup.find_all(text='Foo'))

结果: ['Foo','Foo']

 

find

find(name,attrs,recursive,text,**kwargs)

find返回的匹配结果的第一个元素

html='''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# find returns only the first match, as a single Tag
print(soup.find('ul'))
print(type(soup.find('ul')))
# No <page> tag exists in the document, so this prints None
print(soup.find('page'))

结果:

<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<class 'bs4.element.Tag'>
None

其他一些类似的用法:
find_parents()返回所有祖先节点,find_parent()返回直接父节点
find_next_siblings()返回后面所有兄弟节点,find_next_sibling()返回后面第一个兄弟节点
find_previous_siblings()返回前面所有兄弟节点,find_previous_sibling()返回前面第一个兄弟节点
find_all_next()返回节点后所有符合条件的节点, find_next()返回第一个符合条件的节点
find_all_previous()返回节点前所有符合条件的节点, find_previous()返回节点前第一个符合条件的节点

 


三. CSS选择器

通过select()直接传入CSS选择器就可以完成选择
熟悉前端的人对CSS可能更加了解,其实用法也是一样的
.表示class #表示id
标签1,标签2 - 找到所有的标签1和标签2
标签1 标签2 - 找到标签1内部的所有的标签2
[attr] - 可以通过这种方法找到具有某个属性的所有标签
[attr=value] - 例子[target=_blank]表示查找所有target=_blank的标签

 

html='''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# Elements with class "panel-heading" inside an element with class "panel"
print(soup.select('.panel .panel-heading'))
# Every <li> inside a <ul>
print(soup.select('ul li'))
# Elements with class "element" inside the element whose id is "list-2"
print(soup.select('#list-2 .element'))
# select returns a list; its items are Tag objects
print(type(soup.select('ul')[0]))

结果

[<div class="panel-heading">
<h4>Hello</h4>
</div>]
[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>, <li class="element">Foo</li>, <li class="element">Bar</li>]
[<li class="element">Foo</li>, <li class="element">Bar</li>]
<class 'bs4.element.Tag'>

获取内容

通过get_text()就可以获取文本内容

html='''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# Print the text content of every <li>
for li in soup.select('li'):
    print(li.get_text())

结果

Foo
Bar
Jay
Foo
Bar

获取属性

获取属性的时候可以通过[属性名]或者attrs[属性名] 

html='''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# Read each <ul>'s id in two equivalent ways: indexing the tag, or indexing its attrs dict
for ul in soup.select('ul'):
    print(ul['id'])
    print(ul.attrs['id'])

结果

list-1
list-1
list-2
list-2

 

 


总结

推荐使用lxml解析库,必要时使用html.parser
标签选择筛选功能弱但是速度快
建议使用find()、find_all() 查询匹配单个结果或者多个结果
如果对CSS选择器熟悉建议使用select()
记住常用的获取属性和文本值的方法

 


爬虫模拟登录

chouti - work

'''
# ############## 方式一 ##############
import requests

# 1、首先登陆任何页面,获取cookie
r1 = requests.get(
    'https://dig.chouti.com/',
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15'},
)
print(r1.cookies.get_dict())
c1 = r1.cookies.get_dict()

form_data = {
    'MIME Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'phone': '8614714922903',
    'password': 'gkl273318',
    'oneMonth': '1',
}

# 2、用户登陆,携带上一次的cookie,后台对cookie中的 gpsd 进行授权
r2 = requests.post(
    url='https://dig.chouti.com/login',
    data = form_data,
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
    },
    cookies = c1
)
print(r2.cookies.get_dict())
print(r2.text)

# 3、点赞(只需要携带已经被授权的gpsd即可)
# gpsd = c1['gpsd']  # 登入的cookies
r3 = requests.post(
    url='https://dig.chouti.com/link/vote?linksId=19561071',
    # data={'linksId': 19561071},
    cookies = c1,
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
    },

)
print(r3.text)
'''

# ############## 方式二 ##############

import requests

import os

# Login form fields. The account credentials are read from the environment
# (CHOUTI_PHONE / CHOUTI_PASSWORD) so that no real secret is committed to source;
# a missing variable fails fast with KeyError instead of posting an empty login.
form_data = {
    'MIME Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'phone': os.environ['CHOUTI_PHONE'],
    'password': os.environ['CHOUTI_PASSWORD'],
    'oneMonth': '1',
}

session = requests.Session()
# All three requests present the same browser User-Agent, so set it once on the session.
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
})

# 1. Visit the front page first so the site issues its initial cookies
#    (https, matching the login and vote requests below).
i1 = session.get(url="https://dig.chouti.com/")
print(i1.cookies.get_dict())

# 2. Log in; requests sent through the session carry its cookies automatically.
i2 = session.post(
    url="https://dig.chouti.com/login",
    data=form_data,
)
print(i2.cookies.get_dict())

# 3. Vote for a link with the same session.
i3 = session.post(
    url="https://dig.chouti.com/link/vote?linksId=19563006",
)
print(i3.text)
chouti

github - work

import requests
from bs4 import BeautifulSoup

'''
# ############## 方式一 ##############
# 1. 访问登陆页面,获取 authenticity_token
r1 = requests.get(
    url='https://github.com/login',
)
soup = BeautifulSoup(r1.text, features='html.parser')
tag = soup.find(name='input', attrs={'name': 'authenticity_token'})
authenticity_token = tag.get('value')
c1 = r1.cookies.get_dict()
print(c1)
# 2. 携带authenticity_token和用户名密码等信息,发送用户验证
form_data = {
    'MIME Type': 'application/x-www-form-urlencoded',
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': authenticity_token,
    'login': 'Charonnnnn',
    'password': 'gkl273318',
}
r2 = requests.post(
    url='https://github.com/session',
    data=form_data,
    cookies=c1,
)
# print(r2.text)
c2 = r2.cookies.get_dict()
c1.update(c2)

# 携带cookies到repositories取项目名字
r3 = requests.get('https://github.com/settings/repositories', cookies=c1)
# print(r3.text)

soup1 = BeautifulSoup(r3.text, features='html.parser')
list_group = soup1.find(name='div', class_='listgroup')

from bs4.element import Tag

for child in list_group.children:
    if isinstance(child, Tag):
        project_tag = child.find(name='a', class_='mr-1')
        size_tag = child.find(name='small')
        temp = "项目:%s(%s); 项目路径:%s" % (project_tag.get('href'), size_tag.string, project_tag.string, )
        print(temp)
'''

# ############## 方式二 ##############
import os

from bs4.element import Tag

session = requests.Session()
# 1. Load the login page and read the authenticity_token form field from it.
r1 = session.get('https://github.com/login')
soup = BeautifulSoup(r1.text, features='html.parser')
tag = soup.find(name='input', attrs={'name': 'authenticity_token'})
if tag is None:
    # find() returns None when nothing matches; report that instead of an AttributeError
    raise RuntimeError('authenticity_token input not found on the GitHub login page')
authenticity_token = tag.get('value')

# 2. Submit the login form. Credentials come from the environment
#    (GITHUB_LOGIN / GITHUB_PASSWORD) so that no real secret is committed to source.
form_data = {
    'MIME Type': 'application/x-www-form-urlencoded',
    'commit': 'Sign in',
    'utf8': '✓',  # same value as the non-session variant of this script
    'authenticity_token': authenticity_token,
    'login': os.environ['GITHUB_LOGIN'],
    'password': os.environ['GITHUB_PASSWORD'],
}
r2 = session.post('https://github.com/session', data=form_data)

# 3. Fetch the repository settings page with the logged-in session and list the projects.
r3 = session.get('https://github.com/settings/repositories')
soup1 = BeautifulSoup(r3.text, features='html.parser')
list_group = soup1.find(name='div', class_='listgroup')
if list_group is None:
    raise RuntimeError('repository list not found; the login probably failed')

for child in list_group.children:
    # Skip text nodes between the entries; only tags describe a project
    if not isinstance(child, Tag):
        continue
    project_tag = child.find(name='a', class_='mr-1')
    size_tag = child.find(name='small')
    # tag.string is the text inside the tag
    temp = "项目:%s(%s); 项目路径:%s" % (project_tag.get('href'), size_tag.string, project_tag.string, )
    print(temp)
github

cnblog - won't work(maybe verification)

# 过时了 爬不了

import re
import json
import base64

import rsa
import requests

              # MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDKJPFz0k33Xq4fCKDNQpn/ttUhLLmajOKBhVe0idsvk3rrNN6N5ESosOpd+jZ+8DQrwGQKGbDd8is5qBi5egRa6fJvTxIxj55ZkuhUmcSHMJd9CpDQhZ/9Vmh8N3/lHailfoWZTwD9SDsVqlLrqnmHBKqbzJ7q5mR09LgciUkgkwIDAQAB
def js_encrypt(text):
    """Encrypt ``text`` with the embedded RSA public key and return base64 text.

    The public key is kept as base64-encoded DER. The UTF-8 bytes of ``text``
    are encrypted with it and the ciphertext is returned as a single-line
    base64 string.
    """
    b64der = 'MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQCp0wHYbg/NOPO3nzMD3dndwS0MccuMeXCHgVlGOoYyFwLdS24Im2e7YyhB0wrUsyYf0/nhzCzBK8ZC9eCWqd0aHbdgOQT6CuFQBMjbyGYvlVYU2ZP7kG9Ft6YV6oc9ambuO7nPZh+bvXH0zDKfi02prknrScAKC0XhadTHT3Al0QIDAQAB'
    public_key = rsa.PublicKey.load_pkcs1_openssl_der(
        base64.standard_b64decode(b64der)
    )
    ciphertext = rsa.encrypt(bytes(text, encoding='utf8'), public_key)
    # b64encode emits no line breaks, so the result is already a single line
    return base64.b64encode(ciphertext).decode('utf8')


import os

session = requests.Session()

# Fetch the sign-in script bundle and pull the VerificationToken value out of it.
i1 = session.get('https://passport.cnblogs.com/scripts/signin_bundle.js?v=1spnpY8gb0K9MfNetxJoLoPjd7dN7PIKB8kMqcak-RQ1')
rep = re.compile("'VerificationToken': '(.*)'")
v = re.search(rep, i1.text)
if v is None:
    # re.search returns None when the pattern is absent; report that instead of an AttributeError
    raise RuntimeError('VerificationToken not found in the cnblogs sign-in script')
verification_token = v.group(1)

# Both form fields are sent RSA-encrypted. The values come from the environment so that
# no real secret is committed to source.
# NOTE(review): presumably input1 is the account name and input2 the password -- confirm.
form_data = {
    'input1': js_encrypt(os.environ['CNBLOGS_USERNAME']),
    'input2': js_encrypt(os.environ['CNBLOGS_PASSWORD']),
    'remember': False
}

# Post the form as a JSON body, passing the token in a request header.
i2 = session.post(url='https://passport.cnblogs.com/user/signin',
                  data=json.dumps(form_data),
                  headers={
                      'Content-Type': 'application/json; charset=UTF-8',
                      'X-Requested-With': 'XMLHttpRequest',
                      'VerificationToken': verification_token}
                  )

# Request a page with the same session and print what comes back.
i3 = session.get(url='https://i.cnblogs.com/EditDiary.aspx')

print(i3.text)
cnblog

zhihu - won't work(ban already)

#########知乎禁掉了第三方爬虫的抓取

'''
过时了 爬不了了
import time

import requests
from bs4 import BeautifulSoup

session = requests.Session()

i1 = session.get(
    url='https://www.zhihu.com/#signin',
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    }
)

soup1 = BeautifulSoup(i1.text, 'lxml')
xsrf_tag = soup1.find(name='input', attrs={'name': '_xsrf'})
xsrf = xsrf_tag.get('value')

current_time = time.time()
i2 = session.get(
    url='https://www.zhihu.com/captcha.gif',
    params={'r': current_time, 'type': 'login'},
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    })

with open('zhihu.gif', 'wb') as f:
    f.write(i2.content)

captcha = input('请打开zhihu.gif文件,查看并输入验证码:')
form_data = {
    "_xsrf": xsrf,
    'password': 'xxooxxoo',
    "captcha": 'captcha',
    'email': '424662508@qq.com'
}
i3 = session.post(
    url='https://www.zhihu.com/login/email',
    data=form_data,
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    }
)

i4 = session.get(
    url='https://www.zhihu.com/settings/profile',
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    }
)

soup4 = BeautifulSoup(i4.text, 'lxml')
tag = soup4.find(id='rename-section')
nick_name = tag.find('span',class_='name').string
print(nick_name)

'''


# 还是爬不了
# 利用requests 模拟登陆
import requests
import http.cookiejar as cookielib
import re
import time
import hmac
from hashlib import sha1
import json
import base64
from PIL import Image

# Use one session for every request so cookies persist between calls.
session = requests.session()
session.cookies = cookielib.LWPCookieJar(filename="cookies.txt")  # file the cookies are stored in
# Restore cookies saved by a previous run, if any.
try:
    session.cookies.load(ignore_discard=True)  # read cookies from the file
except OSError:
    # A missing file raises FileNotFoundError and a malformed one raises
    # LoadError; both are OSError subclasses. Anything else (for example
    # KeyboardInterrupt) is no longer swallowed.
    print("cookie 未能加载")

# Browser-like request headers.
agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
header = {
    "HOST": "www.zhihu.com",
    "Referer": "https://www.zhihu.com",
    "User-Agent": agent,
    'Connection': 'keep-alive'
}


def is_login():
    """Check whether the session is logged in, and log in if it is not.

    The inbox page is requested without following redirects; a 200 status
    code is taken to mean the session is already authenticated. Any other
    status code triggers ``zhihu_login``.
    """
    inbox = session.get(
        "https://www.zhihu.com/inbox",
        headers=header,
        allow_redirects=False,  # do not fetch the page a redirect points to
    )
    if inbox.status_code == 200:
        print("你已经登陆了")
        return
    zhihu_login("243054985@qq.com", "476493546+*9")


def get_xsrf_dc0():
    """Fetch the sign-up page and return its ``_xsrf`` and ``d_c0`` cookies.

    Both values are read from the cookies of the sign-up page response.

    :return: a ``(_xsrf, d_c0)`` tuple.
    :raises KeyError: if either cookie is missing from the response.
    """
    signup_cookies = session.get("https://www.zhihu.com/signup", headers=header).cookies
    xsrf = signup_cookies["_xsrf"]
    d_c0 = signup_cookies["d_c0"]
    return xsrf, d_c0


def get_signature(time_str):
    """Return the HMAC-SHA1 signature for a sign-in request.

    The signed message is the concatenation of the grant type, the client id,
    the source and ``time_str``, in that order, keyed with a fixed secret.

    :param time_str: the request timestamp as a string.
    :return: the signature as a hexadecimal string.
    """
    secret = 'd1b964811afb40118a12068ff74a12f4'.encode('utf-8')
    message = 'password' + 'c3cef7c66a1843f8b3a9e6a1e3160e20' + 'com.zhihu.web' + time_str
    signer = hmac.new(key=secret, msg=message.encode('utf-8'), digestmod=sha1)
    return signer.hexdigest()


def get_identifying_code(headers):
    """Return the captcha text needed for login, or ``''`` if none is required.

    The captcha endpoint is first asked whether a captcha must be solved. If
    so, the captcha image is fetched, written to ``captcha.jpg``, shown to the
    user, and the text the user types is submitted to the endpoint and
    returned.

    ``lang=en`` asks for the English captcha: the Chinese variant has the user
    pick out inverted characters, whereas the English one only needs its text
    typed in.

    :param headers: HTTP headers sent with every captcha request.
    :return: the captcha text typed by the user, or ``''`` when no captcha is
        required.
    :raises RuntimeError: if the response carries no ``show_captcha`` flag.
    """
    captcha_url = 'https://www.zhihu.com/api/v3/oauth/captcha?lang=en'

    response = session.get(captcha_url, headers=headers)
    # Does the server want a captcha at all?
    flags = re.findall(r'"show_captcha":(\w+)', response.text)
    if not flags:
        # Previously this surfaced as a bare IndexError on the empty result.
        raise RuntimeError('show_captcha flag missing from the captcha response')
    if flags[0] == 'false':
        return ''

    # Use the caller's headers for the follow-up requests as well, rather than
    # the module-level dict.
    response = session.put(captcha_url, headers=headers)
    img_base64 = json.loads(response.text)['img_base64']
    with open('captcha.jpg', 'wb') as f:
        f.write(base64.b64decode(img_base64))
    # The context manager closes the image even if show() raises.
    with Image.open('captcha.jpg') as im:
        im.show()
    captcha = input('输入验证码:')
    session.post(captcha_url, headers=headers, data={"input_text": captcha})
    return captcha


def zhihu_login(account, password):
    """Log in to Zhihu through the OAuth sign-in endpoint.

    Adds the authorization and XSRF headers to the module-level ``header``
    dict, posts the signed sign-in form, and saves the session cookies to the
    cookie file when the server answers with status 201. Otherwise a failure
    message is printed.

    :param account: the account name, sent as ``username``.
    :param password: the account password.
    """
    post_url = 'https://www.zhihu.com/api/v3/oauth/sign_in'
    # Only the XSRF token is used below; the d_c0 value is not needed here.
    xsrf_token, _ = get_xsrf_dc0()
    header.update({
        "authorization": "oauth c3cef7c66a1843f8b3a9e6a1e3160e20",  # fixed value
        "X-Xsrftoken": xsrf_token,
    })
    time_str = str(int(time.time() * 1000))  # current time in milliseconds
    # Values written as literals are fixed; they stay the same as long as
    # Zhihu does not change its anti-crawler scheme.
    post_data = {
        "client_id": "c3cef7c66a1843f8b3a9e6a1e3160e20",
        "grant_type": "password",
        "timestamp": time_str,
        "source": "com.zhihu.web",
        "password": password,
        "username": account,
        # The key used to appear twice in this literal (first as ""); only
        # the last value was ever kept, so it is now given once.
        "captcha": get_identifying_code(header),
        "lang": "en",
        "ref_source": "homepage",
        "utm_source": "",
        "signature": get_signature(time_str),
    }

    response = session.post(post_url, data=post_data, headers=header, cookies=session.cookies)
    if response.status_code == 201:
        # Save the cookies so the next run can reuse them instead of logging
        # in again. ignore_discard=True matches the flag used when loading;
        # without it, cookies marked "discard" (session cookies) are not
        # written to the file.
        session.cookies.save(ignore_discard=True)
    else:
        print("登录失败")


# Script entry point: when run directly, check the login state
# (is_login triggers a login if the session is not authenticated).
if __name__ == '__main__':
    is_login()
zhihu

leetcode - incomplete

import requests
from bs4 import BeautifulSoup

# Log in to LeetCode with a plain form POST.
#
# Only the real form fields are sent: the CSRF token scraped from the login
# page plus the "login" and "password" fields. A captured server response
# (form metadata, an error message and the whole page HTML) is not a valid
# request body, and a nested dict passed as ``data=`` is form-encoded as just
# its keys.
login_url = 'https://leetcode.com/accounts/login/'

session = requests.session()
r1 = session.get(login_url)
soup = BeautifulSoup(r1.text, features='html.parser')
token_tag = soup.find(name='input', attrs={'name': 'csrfmiddlewaretoken'})
if token_tag is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise RuntimeError('csrfmiddlewaretoken input not found on the LeetCode login page')
csrfmiddlewaretoken = token_tag.get('value')

# NOTE(review): credentials are hard-coded; consider loading them from the
# environment or a config file instead.
request_data = {
    'login': 'Charonnnnn',
    'password': 'gkl273318',
}

# The token goes in the form body next to the credentials.
req_data = dict(request_data, csrfmiddlewaretoken=csrfmiddlewaretoken)

r2 = session.post(
    url=login_url,
    data=req_data,
    # Django's CSRF check requires a same-origin Referer on HTTPS requests.
    headers={'Referer': login_url},
)
print(r2.text)
leetcode

 

posted @ 2018-05-16 14:03  Charonnnnn  阅读(182)  评论(0)    收藏  举报