初识爬虫

---恢复内容开始---

 通过路飞学城的课程使用一个登录github的例子来了解一下爬虫是如何工作的

import requests
from bs4 import BeautifulSoup
#首先把需要使用的模块导入进来

class GithubProfile:
    """Scrape a GitHub user's own profile by simulating a browser login.

    Flow: ``__init__`` fetches the login page to capture the hidden CSRF
    token and the initial cookies (``run``), ``logon`` POSTs the
    credentials with that token, and ``print_profile`` fetches and prints
    the profile page of the logged-in user.

    NOTE(review): this mirrors GitHub's 2018-era login form; the selectors
    (``authenticity_token``, ``js-flash-container``, ``octolytics-actor-login``)
    may no longer match the live site.
    """

    # Seconds before any HTTP request is aborted.  Without an explicit
    # timeout, ``requests`` can block forever on a stalled connection.
    TIMEOUT = 10

    def __init__(self, email, psw):
        self.email = email
        self.password = psw
        self.token = None    # CSRF token scraped from the login form
        self.cookies = None  # cookie dict captured from the initial GET
        self.name = None     # GitHub login name, set after a successful logon
        # Fetch token/cookies immediately so callers can go straight to logon().
        self.run()

    def run(self):
        """Fetch the login page and store the CSRF token and cookies.

        Raises:
            RuntimeError: if the hidden ``authenticity_token`` input cannot
                be found (markup changed, or the request was blocked) —
                clearer than the ``AttributeError`` a bare ``.get`` would raise.
        """
        r1 = requests.get(
            url='https://github.com/login',
            timeout=self.TIMEOUT,
        )
        soup = BeautifulSoup(r1.text, 'html.parser')
        token_input = soup.find(name='input', attrs={'name': 'authenticity_token'})
        if token_input is None:
            raise RuntimeError('authenticity_token not found on login page')
        self.token = token_input.get('value')
        self.cookies = r1.cookies.get_dict()

    def logon(self):
        """POST the credentials; return True on success, False otherwise.

        On failure the flash error message (if any) is printed; on success
        ``self.name`` is filled in from the page's octolytics meta tag.
        """
        r2 = requests.post(
            url='https://github.com/session',
            data={
                'commit': 'Sign in',
                'utf8': '✓',
                'authenticity_token': self.token,
                'login': self.email,
                'password': self.password,
            },
            cookies=self.cookies,
            timeout=self.TIMEOUT,
        )
        soup2 = BeautifulSoup(r2.text, 'html.parser')
        flash = soup2.find(name='div', id='js-flash-container')
        # A non-empty flash container means GitHub rejected the login.
        msg = self._text(flash)
        if msg:
            print(msg)
            return False
        print('login success')
        meta = soup2.find(name='meta', attrs={'name': "octolytics-actor-login"})
        if meta is None:
            # Logged in, but the page layout changed and the username
            # cannot be resolved — treat as failure rather than crash.
            return False
        self.name = meta.get('content')
        return True

    def print_profile(self):
        """Fetch the profile page for ``self.name`` and print its fields.

        Missing page elements print as empty strings instead of raising.
        """
        r3 = requests.get(
            url='https://github.com/%s' % self.name,
            cookies=self.cookies,
            timeout=self.TIMEOUT,
        )
        soup3 = BeautifulSoup(r3.text, 'html.parser')
        p_name = soup3.find(name='span', attrs={'class': 'p-name'})
        p_nickname = soup3.find(name='span', attrs={'class': 'p-nickname'})
        p_note = soup3.find(name='div', attrs={'class': 'p-note user-profile-bio'})
        h3 = soup3.find(name='h3')
        text_center = soup3.find(name='div', attrs={'class': 'text-center text-gray pt-3'})

        print('%s(%s)' % (self._text(p_nickname), self._text(p_name)))
        print(self._text(p_note))
        print(self._text(h3))
        print(self._text(text_center))

    @staticmethod
    def _text(node, default=''):
        """Return the stripped text of a BeautifulSoup node, or *default* if the node is None."""
        return node.text.strip() if node is not None else default


if __name__ == '__main__':
    # Demo run: supply real credentials here to see the profile printed.
    profile = GithubProfile('fdsfds', 'fsdf')
    logged_in = profile.logon()
    if logged_in:
        profile.print_profile()

定义一个类 GithubProfile,我们完成爬虫的全部操作都在这个类里进行。先说登录:在浏览器上登录有三个步骤——打开要登录的网站、输入账号密码、点击登录即可;使用代码模拟的过程也大致是这样。
先访问一下 https://github.com/login。使用浏览器的开发者工具检查 github 的登录框,可以发现登录表单中有一个隐藏的 input 标签:authenticity_token。我们需要带着这个 token 去登录,否则就没有模拟出正常的登录流程,无法登录成功。同时把这次响应中未加密的 cookies 也保存下来,
定义一个函数:get_login_msg(在上面的完整代码里,这个方法被命名为 run)

def get_login_msg(self):
    '''获取token与cookies'''
    r1 = requests.get(
        url='https://github.com/login',
    )
    soup = BeautifulSoup(r1.text, 'html.parser')
    self.token = soup.find(name='input', attrs={'name': 'authenticity_token'}).get('value')
    self.cookies = r1.cookies.get_dict()

  

这样就获取到了初始的登录信息,接下来带着这些信息和我们的账号密码尝试登录

def logon(self):
    '''登录并返回登录信息'''
    r2 = requests.post(
        url='https://github.com/session',
        data={
            'commit': 'Sign in',
            'utf8': '✓',
            'authenticity_token': self.token,
            'login': self.email,
            'password': self.password,
        },
        cookies=self.cookies
    )
    soup2 = BeautifulSoup(r2.text, 'html.parser')
    msg = soup2.find(name='div', id='js-flash-container').text.strip()
    if msg:
        print(msg)
        return False
    else:
        print('login success')
        self.name = soup2.find(name='meta', attrs={'name': "octolytics-actor-login"}).get('content')
        return True

  

  

这样我们的登录信息就提交出去了,然后使用开发者工具找到页面上提示登录结果的元素,
msg = soup2.find(name='div', id='js-flash-container').text.strip()

登录成功就可以获取到我们想要的信息

soup3 = BeautifulSoup(r3.text, 'html.parser')
p_name = soup3.find(name='span', attrs={'class': 'p-name'})
p_nickname = soup3.find(name='span', attrs={'class': 'p-nickname'})
p_note = soup3.find(name='div', attrs={'class': 'p-note user-profile-bio'})
h3 = soup3.find(name='h3')
text_center = soup3.find(name='div', attrs={'class': 'text-center text-gray pt-3'})

这样就完成了一个初始的登录github程序

---恢复内容结束---

posted @ 2018-07-11 22:18  Daniel_hui  阅读(115)  评论(0)    收藏  举报