爬虫~bs4

今日内容

1 requests+bs4爬汽车之家新闻

# 今日头条 
# https://www.autohome.com.cn/news/1/#liststart


######
#2  爬取汽车之家新闻
######

from urllib.parse import urljoin

import requests
# Install with: pip3 install beautifulsoup4
from bs4 import BeautifulSoup

# Page being scraped. It is also the base used to resolve relative or
# protocol-relative ("//host/path") links found in the page.
PAGE_URL = 'https://www.autohome.com.cn/news/1/#liststart'

# Send a GET request to autohome and fetch the news list page.
# A timeout keeps the script from hanging forever on a dead connection.
ret = requests.get(PAGE_URL, timeout=10)
ret.raise_for_status()
# ret.encoding='gb2312'  # uncomment if the decoded text comes out garbled
# print(ret.text)

# Parse with bs4 (no need for re).
# Pass the text to parse and the parser name:
#   html.parser - built in, a little slower, needs no third-party module
#   lxml        - faster, but must be installed: pip3 install lxml
soup = BeautifulSoup(ret.text, 'html.parser')
# soup=BeautifulSoup(open('a.html','r'))  # an opened local file can be parsed as well

# find     -> the first match
# find_all -> every match
# Collect every <li> tag on the page.
li_list = soup.find_all(name='li')
for li in li_list:
    # li is a Tag object.
    h3 = li.find(name='h3')
    if not h3:
        # Not a news entry (navigation items etc.).
        continue

    title = h3.text

    # Any of these tags may be missing on an odd entry; guard instead of
    # crashing the whole run with AttributeError / TypeError / KeyError.
    p_tag = li.find(name='p')
    img_tag = li.find(name='img')
    a_tag = li.find(name='a')
    desc = p_tag.text if p_tag else ''
    # Tag objects support [] / .get() for attributes because the class
    # implements __getitem__ (a classic "magic method" interview question).
    img = img_tag.get('src') if img_tag else None
    url = a_tag.get('href') if a_tag else None

    # Download the picture into the current directory.
    if img:
        # urljoin handles "//host/x.jpg", "/x.jpg" and absolute URLs alike,
        # whereas 'https:' + img is only right for the protocol-relative form.
        ret_img = requests.get(urljoin(PAGE_URL, img), timeout=10)
        img_name = img.rsplit('/', 1)[-1]
        if ret_img.ok and img_name:
            with open(img_name, 'wb') as f:
                # Write the body in one go; iter_content() with its default
                # chunk size yields one byte per iteration.
                f.write(ret_img.content)
    print('''
    新闻标题：%s
    新闻摘要：%s
    新闻链接：%s
    新闻图片：%s
    '''%(title,desc,url,img))

2 bs4的使用（遍历文档树和查找文档树）

# 1 从html或者xml中提取数据的python库，修改xml
# 补充：java，配置文件基本都是xml格式，以后可能会用python修改配置文件（自动化运维平台，devops平台），mycat，自动上线，自动安装软件，配置，查看nginx日志
# 视频，生鲜，crm，鲜果配送，在线教育，cmdb      ---》（sugo平台）
# 飞猪 （旅游相关）    毒app     兔女郎

遍历文档树

from bs4 import BeautifulSoup
# Sample HTML document parsed into `soup` for the traversal/search examples.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"id="id_p"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
# The lxml parser is a third-party package: pip3 install lxml
soup=BeautifulSoup(html_doc,'lxml')
# 美化
# print(soup.prettify())

# 遍历文档树
#1、用法（通过.来查找，只能找到第一个）
# Tag对象
# head=soup.head
# title=head.title
# # print(head)
# print(title)

# p=soup.p
# print(p)
#2、获取标签的名称
#Tag对象
# p=soup.body
# print(type(p))
from  bs4.element import Tag
# print(p.name)

#3、获取标签的属性
# p=soup.p
# 方式一
# 获取class属性,可以有多个，拿到列表
# print(p['class'])
# print(p['id'])
# print(p.get('id'))
# 方式二
# print(p.attrs['class'])
# print(p.attrs.get('id'))
#4、获取标签的内容
# p=soup.p
# print(p.text) # 所有层级都拿出来拼到一起
# print(p.string) # 只有一层文本时，才能取出（否则为None）
# print(list(p.strings)) # 把每层的文本都取出来，strings是一个生成器
#5、嵌套选择
# title=soup.head.title
# print(title)
#6、子节点、子孙节点
# p1=soup.p.children   # 迭代器
# p2=soup.p.contents  # 列表
# print(list(p1))
# print(p2)
#7、父节点、祖先节点
# p1=soup.p.parent  # 直接父节点   这个是在body内的
# p2=soup.p.parents
# print(p1)
# # print(len(list(p2)))
# print(list(p2))
#8、兄弟节点
# print(soup.a.next_sibling) #下一个兄弟
# print(soup.a.previous_sibling) #上一个兄弟
#
# print(list(soup.a.next_siblings)) #下面的兄弟们=>生成器对象
# print(soup.a.previous_siblings) #上面的兄弟们=>生成器对象

查找文档树

# 查找文档树（find，find_all），速度比遍历文档树慢
# 两个配合着使用（soup.p.find()）
# 五种过滤器: 字符串、正则表达式、列表、True、方法
# 以find为例
#1 字符串查找 引号内是字符串
# p=soup.find(name='p')
# p=soup.find(name='body')
# print(p)
# 查找类名是title的所有标签,class是关键字，class_
# ret=soup.find_all(class_='title')
# href属性为http://example.com/elsie的标签
# ret=soup.find_all(href='http://example.com/elsie')
# 找id为xx的标签
# ret=soup.find_all(id='id_p')
# print(ret)

#2 正则表达式
# import re
# # reg=re.compile('^b')
# # ret=soup.find_all(name=reg)
# #找id以id开头的标签
# reg=re.compile('^id')
# ret=soup.find_all(id=reg)
# print(ret)

# 3 列表
# ret=soup.find_all(name=['body','b'])
# ret=soup.find_all(id=['id_p','link1'])
# ret=soup.find_all(class_=['id_p','link1'])
# and 关系
# ret=soup.find_all(class_='title',name='p')
# print(ret)


#4  True
# 所有有名字的标签
# ret=soup.find_all(name=True)
#所有有id的标签
# ret=soup.find_all(id=True)
# 所有有href属性的
# ret=soup.find_all(href=True)
# print(ret)

# 5 方法
# def has_class_but_no_id(tag):
#     return tag.has_attr('class') and not tag.has_attr('id')
#
# print(soup.find_all(has_class_but_no_id))

# 6 其他使用
# ret=soup.find_all(attrs={'class':"title"})
# ret=soup.find_all(attrs={'id':"id_p1",'class':'title'})
# print(ret)

# 7 拿到标签，取属性，取text
# ret=soup.find_all(attrs={'id':"id_p",'class':'title'})
# print(ret[0].text)

# 8 limit(限制条数)
# soup.find()  就是find_all limit=1
# ret=soup.find_all(name=True,limit=2)
# print(len(ret))

# 9 recursive
# recursive=False (只找儿子)不递归查找，只找第一层
# ret=soup.body.find_all(name='p',recursive=False)
# print(ret)

3 带你搭一个免费的代理池

# https://github.com/jhao104/proxy_pool
# 收费的：提供给你一个接口，每调一次这个接口，获得一个代理
# 免费：用爬虫爬取，免费代理，放到我的库中，flask，django搭一个服务（删除代理，自动测试代理可用性），每次发一个请求，获取一个代理

# 带你配置

# 1 下载，解压，用pycharm打开
# 2 安装依赖 pip install -r requirements.txt
# 3 配置Config/setting.py:
	DB_TYPE = getenv('db_type', 'redis').upper()
	DB_HOST = getenv('db_host', '127.0.0.1')
	DB_PORT = getenv('db_port', 6379)
	DB_PASSWORD = getenv('db_password', '')
# 4 本地启动redis-server

# 5 可以在cli目录下通过proxyPool.py启动：
		cd到cli
	-python proxyPool.py schedule :调度程序，它会去自动爬取免费代理
  -python proxyPool.py webserver:启动api服务，把flask启动起来
    
    如果 0.0.0.0:5010 访问不了，就改用 http://127.0.0.1:5010/get/

4 验证码破解

# 1 简单验证码，字母，数字
# 2 高级的，选择，你好，12306选择乒乓球，滑动验证（极验）

# 打码平台（自动破解验证码，需要花钱）云打码，超级鹰（12306）
http://www.yundama.com/
http://www.chaojiying.com/

# 注册账号，（充钱）把demo下载下来，运行即可

5 爬取糗事百科段子，自动通过微信发给女朋友（老板）

## 6 爬取拉勾职位
## 7 爬取cnblogs新闻
## 8 爬取红楼梦小说写入txt
```
http://www.shicimingju.com/book/hongloumeng.html
```
## 9 爬取糗事百科段子，自动通过微信发给女朋友（老板）
## 10 肯德基餐厅信息
http://www.kfc.com.cn/kfccda/storelist/index.aspx


#####
# 1 爬取糗事百科，微信自动发送
#####
# https://www.qiushibaike.com/text/
# https://www.qiushibaike.com/text/page/1/

import random

import requests
from bs4 import BeautifulSoup
# wxpy wraps the web WeChat API. Install with: pip3 install wxpy
# Import only the names used, instead of `from wxpy import *`.
from wxpy import Bot, embed

# Fetch the first page of text jokes.
ret = requests.get('https://www.qiushibaike.com/text/page/1/', timeout=10)
ret.raise_for_status()
# print(ret.text)

soup = BeautifulSoup(ret.text, "lxml")
# Each joke is a <div class="article"> that also carries an id attribute.
article_list = soup.find_all(name='div', id=True, class_='article')

# Scraped jokes are kept in a plain list here (a real project would store
# them in a database).
ll = []
for article in article_list:
    content_div = article.find(name='div', class_='content')
    span = content_div.span if content_div else None
    if span is None:
        # Layout differs from what is expected; skip rather than crash.
        continue
    ll.append(span.text)
print(ll)

# Create the WeChat bot; cache_path=True is passed so the login can be cached.
bot = Bot(cache_path=True)


# NOTE(review): register() is called without a `chats` argument, so no
# sender filter is applied here; pass a chat object (e.g. a specific friend)
# to restrict which messages trigger the reply.
@bot.register()
def recv_send_msg(recv_msg):
    """Log the incoming message text and reply with a random scraped joke.

    Returns None (no joke to send) when nothing was scraped, instead of
    letting random.choice raise IndexError on an empty list.
    """
    print('收到的消息：',recv_msg.text)
    if not ll:
        return None
    return random.choice(ll)


# Drop into an interactive shell so the script stays alive while the bot runs.
embed()

6 爬虫数据保存在mysql中

import os
from urllib.parse import urljoin

import pymysql
import requests
from bs4 import BeautifulSoup

# Page being scraped. It is also the base used to resolve relative or
# protocol-relative ("//host/path") links found in the page.
PAGE_URL = 'https://www.autohome.com.cn/news/5/#liststart'

# Directory the downloaded pictures are written to.
BASE_DIR = r'E:\新建文件夹\flask_first\pachong'
PIC_DIR = os.path.join(BASE_DIR,'pic_list')
# exist_ok=True tolerates an existing directory while still surfacing real
# failures (permissions, bad path) that `except Exception: pass` used to hide.
os.makedirs(PIC_DIR, exist_ok=True)

ret = requests.get(PAGE_URL, timeout=10)
ret.raise_for_status()
# print(ret.text)
soup = BeautifulSoup(ret.text,'lxml')
li_list = soup.find_all(name='li')
# print(li_list)

# Values are passed separately from the statement so the driver escapes them;
# building the SQL with an f-string breaks on any quote in the scraped text
# and allows SQL injection.
sql = "insert into car_home(title,content,url,img) values(%s,%s,%s,%s);"

# NOTE(review): credentials are hard-coded; move them to configuration or
# environment variables before using this outside a local demo.
conn = pymysql.Connect(host='127.0.0.1',user='root',password='123456',database='autohome')
cursor = conn.cursor()
try:
    for li in li_list:
        h3 = li.find(name='h3')
        if not h3:
            # Not a news entry.
            continue
        title = h3.text

        # Guard against entries that lack one of the expected tags.
        p_tag = li.find(name='p')
        img_tag = li.find(name='img')
        a_tag = li.find(name='a')
        desc = p_tag.text if p_tag else ''
        img = img_tag.get('src') if img_tag else None
        url = a_tag.get('href') if a_tag else None

        full_url = urljoin(PAGE_URL, url) if url else None
        full_img = urljoin(PAGE_URL, img) if img else None

        # Save the picture locally.
        if full_img:
            ret_img = requests.get(full_img, timeout=10)
            img_name = img.rsplit('/',1)[-1]
            if ret_img.ok and img_name:
                with open(os.path.join(PIC_DIR,img_name),'wb') as f:
                    # One write instead of iter_content()'s default
                    # one-byte-per-iteration chunks.
                    f.write(ret_img.content)

        print(f"""
        新闻标题:{title}
        新闻摘要:{desc}
        新闻连接:{url}
        新闻图片:{img}
        """
              )
        params = (title, desc, full_url, full_img)
        # mogrify returns the statement with the arguments bound, for logging.
        print(cursor.mogrify(sql, params))
        cursor.execute(sql, params)
        conn.commit()
finally:
    # Release the cursor and connection even if an iteration fails.
    cursor.close()
    conn.close()

1 什么是函数，什么是方法？

类来调用对象的绑定方法，这个方法就是一个普通函数

对象的绑定方法和类的绑定方法，类可以调用对象的绑定方法，它就是个普通函数

2 python中的魔法方法，分别什么作用

3 psutil

4 APScheduler（做定时任务的框架）

-celery：定时任务，异步任务，mq（redis，rabbitmq）

5 https://github.com/jhao104/django-blog

8 pyecharts可视化的模块

posted @ 2020-05-31 21:53 ^更上一层楼$ 阅读(161) 评论(0) 收藏举报

刷新页面返回顶部

天天好心情

Working and Learning make me happy !

爬虫~bs4

今日内容

1 requests+bs4爬汽车之家新闻

2 bs4的使用（遍历文档树和查找文档树）

3 带你搭一个免费的代理池

4 验证码破解

5 爬取糗事百科段子，自动通过微信发给女朋友（老板）

6 爬虫数据保存在mysql中

1 什么是函数，什么是方法？

2 python中的魔法方法，分别什么作用

3 psutil

4 APScheduler（做定时任务的框架）

5 https://github.com/jhao104/django-blog

8 pyecharts可视化的模块

公告