beautifulsoup4 使用

爬取汽车之家新闻
beautifulsoup4的使用
代理池搭建
验证码破解之-打码平台

爬取汽车之家新闻

import requests
# pip3 install beautifulsoup4  解析html和xml，修改html和xml
from bs4 import BeautifulSoup

# Fetch the first page of the Autohome news list. The timeout keeps the script
# from blocking forever if the server never answers.
res = requests.get('https://www.autohome.com.cn/news/1/#liststart', timeout=10)
# The second argument to BeautifulSoup selects the parser:
#   'html.parser' is built in and needs no third-party module;
#   'lxml' must be installed first (pip3 install lxml).
# soup = BeautifulSoup(res.text, 'html.parser')
soup = BeautifulSoup(res.text, 'lxml')

# Other ways to locate a container, by class or by id (results unused here):
# div = soup.find(class_='article-wrapper')
# div = soup.find(id='auto-channel-lazyload-article')

# First element carrying class "article"; its <li> children are the entries.
ul = soup.find(class_='article')
# find() returns None when nothing matches, so fall back to an empty list
# instead of failing with AttributeError when the page layout differs.
li_list = ul.find_all(name='li') if ul is not None else []
for li in li_list:
    # Only <li> elements that contain an <h3> are treated as news entries.
    title_tag = li.find(name='h3')
    if title_tag is not None:
        title = title_tag.text

        # Each lookup below may come back empty; a missing part is printed as
        # an empty string rather than aborting the whole loop.
        link_tag = li.find('a')
        href = link_tag.attrs.get('href') if link_tag is not None else None
        # The scheme is prepended exactly as before ('https:' + href).
        url = 'https:' + href if href else ''

        desc_tag = li.find('p')
        desc = desc_tag.text if desc_tag is not None else ''

        img_tag = li.find(name='img')
        src = img_tag.get('src') if img_tag is not None else None
        img = 'https:' + src if src else ''

        print('''
        新闻标题：%s
        新闻地址：%s
        新闻摘要：%s
        新闻图片：%s

        ''' % (title, url, desc, img))

beautifulsoup4的使用

# 遍历文档树
# 搜索文档树（5种过滤规则）
# limit和recursive参数

import requests
# pip3 install beautifulsoup4  解析html和xml，修改html和xml
from bs4 import BeautifulSoup

# res=requests.get('https://www.autohome.com.cn/news/1/#liststart')
#
# with open('a.html','w') as f:
#     f.write(res.text)


# Sample markup parsed into `soup` for the traversal and search examples in
# this section. Note that it is not a complete document: the <body> and
# <html> elements are never closed.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p id="my_p" class="title">hello<b id="bbb" class="boldest">The Dormouse's story</b>
</p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a><a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
# The parser is fault-tolerant: markup that is not standard HTML is still parsed.
soup=BeautifulSoup(html_doc,'lxml')

# print(soup.prettify())

# 遍历文档树
#遍历文档树：即直接通过标签名字选择，特点是选择速度快，但如果存在多个相同的标签则只返回第一个
#1、用法
# head=soup.head
# print(head)
#2、获取标签的名称
# head=soup.head
# print(head.name)
#3、获取标签的属性(重点)
# p=soup.body.p
# # class可能有多个，即便有一个也放到列表中
# # print(p.attrs)
# print(p.attrs.get('class'))
# print(p['class'])
# print(p.get('class'))

#4、获取标签的内容
# p=soup.body.p
# # text会取该标签，子子孙孙的内容，拼到一起
# print(p.text)
# print(p.string)# # p下的文本只有一个时，取到，否则为None
# print(p.strings)#  生成器
# print(list(p.strings))  # #拿到一个生成器对象, 取到p下所有的文本内容,一个一个的在生成器中

#5、嵌套选择
# a=soup.body.a
# print(a.get('id'))
#6、子节点、子孙节点
# print(soup.p.contents) #p下所有子节点
# print(soup.p.children) #得到一个迭代器,包含p下所有子节点
# print(list(soup.p.children)) #把迭代器转成列表,查看p下所有子节点
#7、父节点、祖先节点
# print(soup.a.parent) #获取a标签的父节点(只有一个)
# print(soup.p.parent) #获取p标签的父节点
# print(soup.a.parents) #找到a标签所有的祖先节点，父亲的父亲，父亲的父亲的父亲...
# print(list(soup.a.parents))#找到a标签所有的祖先节点，父亲的父亲，父亲的父亲的父亲...
# print(len(list(soup.a.parents)))#找到a标签所有的祖先节点，父亲的父亲，父亲的父亲的父亲...
#8、兄弟节点
# print(soup.a.next_sibling) #下一个兄弟
# print(soup.a.previous_sibling) #上一个兄弟
#
# print(list(soup.a.next_siblings)) #下面的兄弟们=>生成器对象
# print(list(soup.a.previous_siblings)) #上面的兄弟们=>生成器对象

# 重点：取标签名，取属性值，嵌套选择


# 搜索文档树
# find()  # 只返回找到的第一个
# find_all() # 找到的所有
# 五种过滤器: 字符串、正则表达式、列表、True、方法

# 字符串过滤，过滤内容是字符串
# a=soup.find(name='a')
# res=soup.find(id='my_p')
# res=soup.find(class_='story')
# res=soup.find(href='http://example.com/elsie')

# res=soup.find(attrs={'id':'my_p'})
# res=soup.find(attrs={'class':'story'})
# print(res)

# 正则表达式
# import re
# re_b=re.compile('^b')
# res=soup.find(name=re_b)
# # res=soup.find_all(name=re_b)
# res=soup.find_all(id=re.compile('^l'))
# print(res)

# 列表

# res=soup.find_all(name=['body','b'])
# res=soup.find_all(class_=['sister','title'])
# print(res)

# True和False

# res=soup.find_all(name=True)
# res=soup.find_all(id=True)
# res=soup.find_all(id=False)
# res=soup.find_all(href=True)
# print(res)


# 方法（了解）
# def has_class_but_no_id(tag):
#     return tag.has_attr('class') and not tag.has_attr('id')
#
# print(soup.find_all(has_class_but_no_id))


#limit(限制查找的条数)
# res=soup.find_all(name=True,limit=1)
# print(res)
# recursive（是否递归查找：True找子子孙孙，False只找直接子节点）
# res=soup.body.find_all(name='b',recursive=False)
# res=soup.body.find_all(name='p',recursive=False)
# res=soup.body.find_all(name='b',recursive=True)
# print(res)

# css选择
# ret=soup.select('#my_p')
# https://www.w3school.com.cn/cssref/css_selectors.asp
# ret=soup.select('body p')  # 子子孙孙
# ret=soup.select('body>p')  # 直接子节点（儿子）
# ret=soup.select('body>p')[0].text  # 直接子节点（儿子）
# # ret=soup.select('body>p')[0].a.find()
# print(ret)



# bs4的修改文档树  软件配置文件是xml格式的

# 软件的配置文件
# ini：configparser
# conf
# xml：bs4
# yaml格式

代理池搭建

# github，下载免费代理池开源代码（建议读一下别人的代码）
# git clone git@github.com:jhao104/proxy_pool.git
# pycharm打开，修改配置文件（redis地址修改）
# 启动爬虫：
python3 proxyPool.py schedule
# 启动服务：
python3 proxyPool.py server

# Get one random proxy from the pool service listening on 127.0.0.1:5010 and
# decode the JSON reply.
requests.get("http://127.0.0.1:5010/get/").json()
# Delete a proxy from the pool.
# NOTE(review): `proxy` is not defined in this snippet; presumably it is the
# proxy address taken from the /get/ reply -- confirm against the proxy_pool README.
requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))

验证码破解之-打码平台

# 1 验证码破解 图像处理
# 2 专业打码平台，破解验证码（收费）
# 申请超级鹰，注册
# 登录，下载sdk（代码如下），填入用户名密码，软件id
#!/usr/bin/env python
# coding:utf-8

import requests
from hashlib import md5

class Chaojiying_Client():
    """Small HTTP client for the Chaojiying captcha-solving platform.

    The constructor prepares the form fields and headers that are sent with
    every request; ``PostPic`` uploads an image and ``ReportError`` reports a
    wrongly solved one.
    """

    def __init__(self, username, password, soft_id, timeout=30):
        """Store the account data used to authenticate each request.

        username: account name, sent as the ``user`` field.
        password: plain-text password; only its MD5 hex digest is kept and
            sent (as the ``pass2`` field).
        soft_id: software ID, sent as the ``softid`` field.
        timeout: seconds passed to ``requests.post`` so a stalled connection
            cannot block the caller forever. Defaults to 30.
        """
        self.username = username
        # Hash the UTF-8 encoded password; the plain text is not stored.
        self.password = md5(password.encode('utf8')).hexdigest()
        self.soft_id = soft_id
        self.timeout = timeout
        # Fields merged into the form data of every request.
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Upload a captcha image and return the decoded JSON reply.

        im: image bytes.
        codetype: captcha type code, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        # The image is sent as a multipart file field named 'userfile'.
        files = {'userfile': ('ccc.jpg', im)}
        # NOTE(review): the request goes over plain HTTP -- confirm whether the
        # service also accepts HTTPS before changing the URL.
        r = requests.post(
            'http://upload.chaojiying.net/Upload/Processing.php',
            data=params,
            files=files,
            headers=self.headers,
            timeout=self.timeout,
        )
        return r.json()

    def ReportError(self, im_id):
        """Report a wrongly solved captcha and return the decoded JSON reply.

        im_id: ID of the image whose answer was wrong.
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post(
            'http://upload.chaojiying.net/Upload/ReportError.php',
            data=params,
            headers=self.headers,
            timeout=self.timeout,
        )
        return r.json()


if __name__ == '__main__':
    # NOTE(review): account name, password and software ID are hard-coded
    # here; consider loading them from the environment or a config file.
    # The third argument is the software ID generated under
    # "User center >> Software ID" (use it in place of the sample 96001).
    chaojiying = Chaojiying_Client('306334678', 'lqz12345', '903641')
    # Path of the local image file (replace a.jpg as needed). The context
    # manager closes the file handle as soon as the bytes are read.
    with open('a.jpg', 'rb') as image_file:
        im = image_file.read()
    # 1902 is the captcha type code; see the price list on the official site.
    print(chaojiying.PostPic(im, 1902))

posted @ 2020-08-15 14:45 Joab-0429 阅读(322) 评论(0) 收藏举报

刷新页面返回顶部

Personal site

↑点击传送

xone

beautifulsoup4 使用

爬取汽车之家新闻

beautifulsoup4的使用

代理池搭建

验证码破解之-打码平台

公告