爬虫的知识

在我的理解中，Python的主要用途有三点：一是单机处理程序（文本、数字、文件）；二是WEB程序（django/flask）；三是爬虫。

要做爬虫，就离不了这两个第三方库：一是requests，这个大家都熟悉，要使用它来模拟浏览器的GET、POST操作，安装也比较简单，直接pip3 install requests即可。

注：如果安装完成后无法正常使用，提示requests模块没有get属性（AttributeError）时，通常是因为项目目录（当前工作目录）下有一个名为requests的文件夹或requests.py文件，遮蔽了真正的requests库；找到后将其改名或删除，即可正常使用。

二是beautifulsoup。BeautifulSoup是python的一个第三方库，在爬虫中起着网页解析器的作用，可以对下载好的网页进行页面元素分析，提取出有价值的数据。安装也比较简单，直接pip3 install bs4（这里注意，不能直接安装beautifulsoup），导入时from bs4 import BeautifulSoup即可。

示例一：

看一下这段代码：到博客园中，获取某一页关于Python的文章列表；最终生成的列表中存储了每篇文章的标题和链接，并将图片存储到项目文件夹下的img目录。

'''
A simple crawler program.
Steps:
1. Import the requests module, import BeautifulSoup from bs4, and import os
   for path handling.
2. Fetch the page at the given address with requests.get.
3. Parse the page with BeautifulSoup.
4. Look up the relevant tags in the parsed document and store them in a list.
5. Using multiple threads, read the URLs from the list, then download and
   save them.

'''

import requests
from bs4 import BeautifulSoup
import os
url = "https://www.cnblogs.com/cate/python/"

# Some sites apply anti-scraping checks to the request headers, so send a
# browser-like User-Agent. Some sites additionally require cookies.
res = requests.get(
    url=url,
    headers={'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'},
    # requests applies no timeout by default; without one a stalled server
    # would hang the script forever.
    timeout=10,
)
# Fail with a clear HTTPError instead of trying to parse an error page.
res.raise_for_status()

soup = BeautifulSoup(res.text, "html.parser")
res_div = soup.find(name='div', attrs={'id': "post_list"})

# find() returns None when the container is missing (for example after a page
# layout change); fall back to an empty list instead of raising AttributeError.
if res_div is not None:
    url_list = res_div.find_all(name='div', attrs={'class': 'post_item_body'})
else:
    url_list = []

# Filled by task() with one {'title': ..., 'link': ...} dict per post.
data_list = []
import threading
def task(item):
    """Record one post's title and link, then download its image if present.

    Args:
        item: The parsed ``div.post_item_body`` element of a single post.

    Returns:
        None. The title/link pair is appended to the module-level
        ``data_list``; the image, when the post has one, is written into the
        ``img`` directory under the current working directory.
    """
    from urllib.parse import urljoin

    title = item.find(name='h3').text
    link = item.find(name='a', attrs={'class': "titlelnk"}).attrs.get('href')
    data_list.append({'title': title, 'link': link})

    img_url = item.find(name='img', attrs={'class': 'pfs'})
    if not img_url:
        return None
    src = img_url.get('src')
    if not src:
        # An <img> tag without a src attribute: nothing to download.
        return None

    # Resolve against the page URL so protocol-relative ("//host/x.png"),
    # relative and already-absolute sources all become valid absolute URLs.
    # Blindly prefixing "https:" would corrupt an already-absolute URL.
    src = urljoin(url, src)

    # open() does not create missing directories; exist_ok makes this safe
    # when several threads reach this line at the same time.
    os.makedirs('img', exist_ok=True)
    file_name = os.path.join('img', src.rsplit('/', maxsplit=1)[1])

    # No default timeout exists in requests; bound the wait explicitly.
    ret = requests.get(src, timeout=10)
    if not ret.ok:
        # Do not save an HTTP error body as if it were an image.
        return None
    with open(file_name, 'wb') as f:
        f.write(ret.content)
if __name__ == '__main__':
    # Multithreaded variant: one thread is created and started per post
    # entry. Not recommended.
    for post in url_list:
        worker = threading.Thread(target=task, args=(post,))
        worker.start()

示例二：到抽屉网上，自动登录，并完成为文章点赞的功能。

import requests

# Browser-like User-Agent shared by every request in this example.
chouti_user_agent = 'Mozilla/5.0(Windows NT 6.1; WOW64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'

# 1. Visit the home page first to obtain the initial cookies.
# https is used here to match the login and vote requests below, so the
# cookies are obtained from the same origin they are later sent to.
rs1 = requests.get(url="https://dig.chouti.com",
                   headers={'user-agent': chouti_user_agent},
                   timeout=10)
u_cook = rs1.cookies.get_dict()

# 2. Log in while sending the cookies from step 1, so that the site
# authorizes those cookies.
rs2 = requests.post(url="https://dig.chouti.com/login",
                    headers={
                        'user-agent': chouti_user_agent,
                        'origin': 'https://dig.chouti.com',
                        'referer': 'https://dig.chouti.com/'
                    },
                    data={'phone': '86******',  # phone number
                          'password': '*****',  # password
                          'oneMonth': 1
                          },
                    cookies=u_cook,
                    timeout=10
                    )
print(rs2.text)

# 3. Vote for an article, reusing the cookies that were authorized in step 2.
rs3 = requests.post(
    url="https://dig.chouti.com/link/vote?linksId=25519526",
    headers={'user-agent': chouti_user_agent},
    cookies=u_cook,
    timeout=10)
print(rs3.text)

示例三：

自动登录github.com，并下载网页的内容。

import requests
from bs4 import BeautifulSoup
# A Session keeps one cookie jar across all requests, including cookies set
# on intermediate redirect responses. Response.cookies only holds the cookies
# of the final response, so copying it by hand after the login POST (which
# is normally followed by a redirect) can miss the cookies that mark the
# client as logged in.
gh_session = requests.Session()
gh_session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'

# 1. Load the login page to get the initial cookies and the CSRF token.
rs = gh_session.get("https://github.com/login", timeout=10)
soup = BeautifulSoup(rs.text, "html.parser")
token_input = soup.find(name='input', attrs={'name': 'authenticity_token'})
if token_input is None:
    # find() returns None when the field is absent; fail with a clear message
    # instead of an AttributeError on None.
    raise RuntimeError("authenticity_token input not found on the login page")
to_ken = token_input.attrs.get('value')

# 2. Submit the login form together with the token from step 1.
rslogin = gh_session.post(
    url="https://github.com/session",
    data={
        'commit': 'Sign in',
        'utf8': '✓',
        'authenticity_token': to_ken,
        'login': '*******',  # your own user name
        'password': '****',  # your own password
        'webauthn-support': 'supported'
    },
    timeout=10,
)

# 3. Fetch a page with the session that now carries the login cookies.
rsviem = gh_session.get(
    url='https://github.com/lzszs/learn',
    timeout=10,
)
print(rsviem.text)

示例四：利用协程（gevent）来高速并发下载。

from gevent import monkey; monkey.patch_all()
import requests
from bs4 import BeautifulSoup
import os
import gevent
from urllib.parse import urljoin  # used below to resolve image URLs

url = "https://www.cnblogs.com/cate/python/"
res = requests.get(
    url=url,
    headers={'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'},
    # requests applies no timeout by default; bound the wait explicitly.
    timeout=10,
)
# Fail with a clear HTTPError instead of trying to parse an error page.
res.raise_for_status()
soup = BeautifulSoup(res.text, "html.parser")
res_div = soup.find(name='div', attrs={'id': "post_list"})
# find() returns None when the container is missing; fall back to an empty
# list instead of raising AttributeError.
if res_div is not None:
    url_list = res_div.find_all(name='div', attrs={'class': 'post_item_body'})
else:
    url_list = []

data_list = []       # one {'title': ..., 'link': ...} dict per post
src_list = []        # absolute image URLs to download
file_name_list = []  # target path for each URL in src_list (same order)
for item in url_list:
    title = item.find(name='h3').text
    link = item.find(name='a', attrs={'class': "titlelnk"}).attrs.get('href')
    data_list.append({'title': title, 'link': link})
    img_url = item.find(name='img', attrs={'class': 'pfs'})
    if not img_url:
        continue
    src = img_url.get('src')
    if not src:
        # An <img> tag without a src attribute: nothing to download.
        continue
    # Resolve against the page URL so protocol-relative ("//host/x.png"),
    # relative and already-absolute sources all become valid absolute URLs.
    src = urljoin(url, src)
    src_list.append(src)
    file_name = os.path.join('img', src.rsplit('/', maxsplit=1)[1])
    file_name_list.append(file_name)
def task(src, file_name):
    """Download one image and save it to disk.

    Args:
        src: Absolute URL of the image to download.
        file_name: Path of the file the image bytes are written to.

    Prints ``src`` followed by ``'ok'`` after a successful save, or by
    ``'failed'`` and the HTTP status code when the server returns an error.
    """
    # No default timeout exists in requests; bound the wait explicitly.
    ret = requests.get(src, timeout=10)
    if not ret.ok:
        # Do not save an HTTP error body as if it were an image.
        print(src, 'failed', ret.status_code)
        return
    # open() does not create missing directories, so make sure the target
    # directory exists before writing.
    target_dir = os.path.dirname(file_name)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    with open(file_name, 'wb') as f:
        f.write(ret.content)
    print(src, 'ok')
# Build the coroutine job list: one spawned job per (image URL, target path)
# pair.
gevent_list = [
    gevent.spawn(task, image_src, target_path)
    for image_src, target_path in zip(src_list, file_name_list)
]


if __name__ == '__main__':
    # Hand the job list to gevent and wait until every job has finished.
    gevent.joinall(gevent_list)

posted @ 2019-04-05 15:47 LZ鱼乐阅读(169) 评论(0) 收藏举报

刷新页面返回顶部

py01

爬虫的知识

公告