初识爬虫
---恢复内容开始---
通过路飞学城的课程使用一个登录github的例子来了解一下爬虫是如何工作的
import requests
from bs4 import BeautifulSoup
#首先把需要使用的模块导入进来
class GithubProfile:
    """Log in to github.com with email/password and print the user's profile.

    Flow: fetch the login page for the CSRF token and initial cookies
    (run), POST the credentials (logon), then fetch and print the
    profile page (print_profile).

    NOTE(review): this scrapes GitHub's HTML login flow; the selectors
    below match the page layout at the time of writing and may break
    when GitHub changes its markup.
    """

    def __init__(self, email, psw):
        self.email = email
        self.password = psw
        self.token = None    # CSRF token scraped from the login form
        self.cookies = None  # cookie dict; refreshed after a successful login
        self.name = None     # GitHub login name, known only after logon()
        self.run()

    def run(self):
        """Fetch the login page to obtain the CSRF token and initial cookies."""
        r1 = requests.get(
            url='https://github.com/login',
            timeout=10,  # fail fast instead of hanging on network issues
        )
        soup = BeautifulSoup(r1.text, 'html.parser')
        token_input = soup.find(name='input', attrs={'name': 'authenticity_token'})
        if token_input is None:
            # Fail loudly here rather than with an AttributeError in logon().
            raise RuntimeError('authenticity_token not found on the login page')
        self.token = token_input.get('value')
        self.cookies = r1.cookies.get_dict()

    def logon(self):
        """POST the credentials; return True on success, False otherwise."""
        r2 = requests.post(
            url='https://github.com/session',
            data={
                'commit': 'Sign in',
                'utf8': '✓',
                'authenticity_token': self.token,
                'login': self.email,
                'password': self.password,
            },
            cookies=self.cookies,
            timeout=10,
        )
        soup2 = BeautifulSoup(r2.text, 'html.parser')
        # GitHub puts error banners ("Incorrect username or password.")
        # into the flash container; any text there means the login failed.
        flash = soup2.find(name='div', id='js-flash-container')
        msg = flash.text.strip() if flash is not None else ''
        if msg:
            print(msg)
            return False
        print('login success')
        # Merge the authenticated session cookies so later requests
        # (print_profile) run as the logged-in user; keeping only the
        # pre-login cookies would make those requests anonymous.
        self.cookies.update(r2.cookies.get_dict())
        actor = soup2.find(name='meta', attrs={'name': "octolytics-actor-login"})
        self.name = actor.get('content') if actor is not None else None
        return True

    def print_profile(self):
        """Fetch https://github.com/<name> and print the public profile fields."""
        r3 = requests.get(
            url='https://github.com/%s' % self.name,
            cookies=self.cookies,
            timeout=10,
        )
        soup3 = BeautifulSoup(r3.text, 'html.parser')

        def text_of(tag):
            # Tolerate a missing element instead of raising AttributeError.
            return tag.text.strip() if tag is not None else ''

        p_name = soup3.find(name='span', attrs={'class': 'p-name'})
        p_nickname = soup3.find(name='span', attrs={'class': 'p-nickname'})
        p_note = soup3.find(name='div', attrs={'class': 'p-note user-profile-bio'})
        h3 = soup3.find(name='h3')
        text_center = soup3.find(name='div', attrs={'class': 'text-center text-gray pt-3'})
        print('%s(%s)' % (text_of(p_nickname), text_of(p_name)))
        print(text_of(p_note))
        print(text_of(h3))
        print(text_of(text_center))
if __name__ == '__main__':
    # Demo entry point: replace the placeholder credentials with a real
    # email/password pair to see the account's profile printed.
    profile = GithubProfile('fdsfds', 'fsdf')
    if profile.logon():
        profile.print_profile()
定义一个类:GithubProfile,我们完成爬虫的操作都在这里面进行。先说登录,有三个步骤:打开要登录的网站,输入账号密码,点登录就可以了。这是在浏览器上的情况,使用代码模拟差不多也是这个流程。
先访问一下 https://github.com/login,使用浏览器的开发者工具检查 github 的登录框,可以发现表单中有一个隐藏的标签:token。我们需要带着这个 token 去登录,否则没有模拟到正常的登录流程,就无法登录成功。同时把这次响应里未加密的 cookies 也获取并保存下来。
定义一个函数:get_login_msg(即上面类中的 run 方法,负责获取初始登录信息)
def get_login_msg(self):
    """Grab the CSRF token and the initial cookies from GitHub's login page."""
    response = requests.get(
        url='https://github.com/login',
    )
    parsed = BeautifulSoup(response.text, 'html.parser')
    # The hidden <input name="authenticity_token"> carries the CSRF token
    # that must accompany the login POST.
    token_tag = parsed.find(name='input', attrs={'name': 'authenticity_token'})
    self.token = token_tag.get('value')
    self.cookies = response.cookies.get_dict()
这样就获取到初始登录信息,带着这些消息和我们的信息尝试登录
def logon(self):
    """Submit the credentials to GitHub and report whether the login worked."""
    payload = {
        'commit': 'Sign in',
        'utf8': '✓',
        'authenticity_token': self.token,
        'login': self.email,
        'password': self.password,
    }
    response = requests.post(
        url='https://github.com/session',
        data=payload,
        cookies=self.cookies,
    )
    page = BeautifulSoup(response.text, 'html.parser')
    # An empty flash container means no error banner, i.e. login succeeded.
    flash_text = page.find(name='div', id='js-flash-container').text.strip()
    if not flash_text:
        print('login success')
        meta = page.find(name='meta', attrs={'name': "octolytics-actor-login"})
        self.name = meta.get('content')
        return True
    print(flash_text)
    return False
我们的登录信息就可以提交出去了,然后使用开发者工具找到登录后返回页面中的提示信息:
# Pull the flash-banner text from the response page: non-empty text is
# GitHub's error message, so an empty string indicates a successful login.
msg = soup2.find(name='div', id='js-flash-container').text.strip()
登录成功就可以获取到我们想要的信息
# Parse the profile page and pick out the public profile elements.
soup3 = BeautifulSoup(r3.text, 'html.parser')
p_name = soup3.find(name='span', attrs={'class': 'p-name'})  # display name
p_nickname = soup3.find(name='span', attrs={'class': 'p-nickname'})  # login name
p_note = soup3.find(name='div', attrs={'class': 'p-note user-profile-bio'})  # bio text
h3 = soup3.find(name='h3')  # presumably the follower/location header — TODO confirm against the live page
text_center = soup3.find(name='div', attrs={'class': 'text-center text-gray pt-3'})  # footer stats block
这样就完成了一个初始的登录github程序
---恢复内容结束---

浙公网安备 33010602011771号