# The following approach fails to upvote: the cookie returned by the login
# response is not the one the site authorizes, so the vote request is rejected
# even though the login itself succeeds.
import requests
# Whether the request is POST or GET,
# watch out for anti-crawler measures: usually adding a request header (User-Agent) is enough.
# Log in
response_login = requests.post(
    url='https://dig.chouti.com/login',
    data={
        'phone': '8613125397685',
        'password': '478324asd',
        'oneMonth': '1'
    },
    # with a User-Agent header the request is no longer blocked
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    }
)
cookies_dict = response_login.cookies.get_dict()  # a plain dict
# print(cookies_dict)  # inspect the cookies
# Upvote
r1 = requests.get(
    url='https://dig.chouti.com/link/vote?linksId=22900531',  # with only a URL the request may be blocked; add a User-Agent header
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    },
    cookies=cookies_dict
)
print(r1.text)
# This version upvotes correctly.
import requests
# Steps 2 and 3 are both POST requests.
# Step 1: visit 抽屉新热榜 and get a cookie (not yet authorized)
r1 = requests.get(
    url='https://dig.chouti.com/all/hot/recent/1',  # URL alone may be blocked; add a User-Agent header
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    },
)
r1_cookie_dict = r1.cookies.get_dict()
# Step 2: send the username and password plus the (not yet authorized) cookie;
# again, mind the anti-crawler measures
response_login = requests.post(
    url='https://dig.chouti.com/login',
    data={
        'phone': '8613125397685',
        'password': '478324asd',
        'oneMonth': '1'
    },
    # with a User-Agent header the request is no longer blocked
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    },
    cookies=r1_cookie_dict
)
# Step 3: upvote
r1 = requests.post(
    url='https://dig.chouti.com/link/vote?linksId=22900531',  # URL alone may be blocked; add a User-Agent header
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    },
    cookies=r1_cookie_dict  # this cookie is now authorized
)
print(r1.text)
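
The same three-step flow can also be written with requests.Session, which stores cookies across requests automatically, so no cookie dict needs to be passed around by hand. A minimal sketch, reusing the same endpoints and the example linksId from above:

import requests

session = requests.Session()
session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'

# Step 1: first visit; the session records the unauthorized cookie
session.get('https://dig.chouti.com/all/hot/recent/1')
# Step 2: log in; the site authorizes the cookie the session already holds
session.post('https://dig.chouti.com/login', data={
    'phone': '8613125397685',
    'password': '478324asd',
    'oneMonth': '1'
})
# Step 3: upvote with the now-authorized session cookie
r = session.post('https://dig.chouti.com/link/vote?linksId=22900531')
print(r.text)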
To comment out multiple lines in PyCharm:
1. Select the lines.
2. Press Ctrl + /.
To uncomment multiple lines:
1. Select the commented lines.
2. Press Ctrl + / again.
When working in PyCharm you often need to indent or unindent several lines at once; PyCharm has shortcuts for both:

1. Indent multiple lines: select the lines and press Tab; each press indents by four spaces.

2. Unindent multiple lines: select the lines and press Shift + Tab; each press moves them left by four spaces.
# When a bookmark marker appears in the left gutter, press F11 to remove it.
import requests  # impersonate a browser and send a request to a URL
from bs4 import BeautifulSoup  # parse HTML strings
# Step 1: download the page
ret = requests.get(url='https://www.autohome.com.cn/news/')
# print(ret)  # ret is a Response object
# print(ret.content)  # raw bytes
# print(ret.text)  # a string, but it may come out garbled
# ret.encoding = 'gbk'  # you can set the encoding yourself
# print(ret.text)  # with the right encoding there is no mojibake
# print(ret.apparent_encoding)  # the encoding detected from the page itself
ret.encoding = ret.apparent_encoding  # just use the page's own encoding
# print(ret.text)
# Step 2: extract the content you want with BeautifulSoup
soup = BeautifulSoup(ret.text, 'html.parser')  # the parser name must be exactly 'html.parser', with no stray spaces
# print(type(soup))  # soup is a BeautifulSoup object
# div = soup.find(name='div', id='focus-1')
div = soup.find(name='div', attrs={'id': 'focus-1', 'class': 'focusimg focusimg02'})
print(div)
li_list = div.find_all('li')  # a list of tags
# print(li_list)
for li in li_list:
    h2 = li.find('h2')
    a = li.find('a')
    p = li.find('p')  # the first positional argument is name
    img = li.find('img')
    src = img.get('src')
    file_name = src.rsplit('__', maxsplit=1)[1]
    ret_img = requests.get(url='https:' + src)
    with open(file_name, 'wb') as f:
        f.write(ret_img.content)
    print(h2.text, a.get('href'))  # hrefs are scheme-relative; the browser address bar adds http/https automatically
    print(p.text)
    print('=' * 15)
# print(a.attrs)  # all attributes of a
# print(a.get('href'))  # one specific attribute of a
# print(h2)
# print(h2.text)
# print(a.text)
# p = li.find('p')  # the first positional argument is name
# print(p.text)
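
For bigger images it may be worth streaming the download instead of holding the whole body in memory. A sketch of just the saving step, reusing the src and file_name variables from the loop above:

ret_img = requests.get(url='https:' + src, stream=True)
with open(file_name, 'wb') as f:
    for chunk in ret_img.iter_content(chunk_size=8192):  # write the body in 8 KB pieces
        f.write(chunk)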
# Speed up package downloads by using a domestic mirror:
pip install requests -i https://pypi.douban.com/simple
# If you forget the URL, search Baidu for "python 豆瓣下载源";
# the blog post 使用douban源下载python包 - 中国陆特 - 博客园 has the details.
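
If you would rather not pass -i every time, recent pip versions (10+) can store the mirror in their config:

pip config set global.index-url https://pypi.douban.com/simple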
# Upvote across multiple pages
import requests
from bs4 import BeautifulSoup
for page_num in range(8, 9):
    r1 = requests.get(
        url='https://dig.chouti.com/all/hot/recent/%s' % page_num,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
        }
    )
    # print(r1.text)
    r1_cookie_dict = r1.cookies.get_dict()
    response_login = requests.post(
        url='https://dig.chouti.com/login',
        data={
            'phone': '8613125397685',
            'password': '478324asd',
            'oneMonth': '1'
        },
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
        },
        cookies=r1_cookie_dict
    )
    # response_index = requests.get(
    #     url='https://dig.chouti.com/',
    #     headers={
    #         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    #     }
    # )
    soup = BeautifulSoup(r1.text, 'html.parser')  # parse r1.text here, not response_index
    div = soup.find(attrs={'id': 'content-list'})
    items = div.find_all(attrs={'class': 'item'})
    for item in items:
        tag = item.find(attrs={'class': 'part2'})
        nid = tag.get('share-linkid')
        print(nid)
        r1 = requests.post(  # reusing the name r1 from above is harmless here
            url='https://dig.chouti.com/link/vote?linksId=%s' % nid,
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
            },
            cookies=r1_cookie_dict
        )
        print(r1.text)
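
Logging in inside the page loop repeats work, since the credentials never change between pages. A sketch that logs in a single time and then walks the pages, assuming the same endpoints and selectors as above:

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}

# log in once, using the cookie from the first visit
r1 = requests.get(url='https://dig.chouti.com/all/hot/recent/1', headers=headers)
cookie_dict = r1.cookies.get_dict()
requests.post(url='https://dig.chouti.com/login',
              data={'phone': '8613125397685', 'password': '478324asd', 'oneMonth': '1'},
              headers=headers, cookies=cookie_dict)

# then upvote page by page with the already-authorized cookie
for page_num in range(8, 9):
    page = requests.get(url='https://dig.chouti.com/all/hot/recent/%s' % page_num, headers=headers)
    soup = BeautifulSoup(page.text, 'html.parser')
    for item in soup.find(attrs={'id': 'content-list'}).find_all(attrs={'class': 'item'}):
        nid = item.find(attrs={'class': 'part2'}).get('share-linkid')
        requests.post(url='https://dig.chouti.com/link/vote?linksId=%s' % nid,
                      headers=headers, cookies=cookie_dict)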
import requests
from bs4 import BeautifulSoup
# Find the value of the authenticity_token (CSRF token) hidden in the login form
r1 = requests.get(
    url='https://github.com/login'
)
s1 = BeautifulSoup(r1.text, 'html.parser')
token = s1.find(name='input', attrs={'name': 'authenticity_token'}).get('value')
# print(r1.text)
# print(token)
r1_cookie_dict = r1.cookies.get_dict()
r2 = requests.post(
    url='https://github.com/session',
    data={
        'commit': 'Sign in',
        'utf8': '✓',
        'authenticity_token': token,
        'login': 'clttyou',
        'password': '9430'
    },
    cookies=r1_cookie_dict
)
print(r2.text)  # inspect the login response; printing r1.text would just show the login page again
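
To see whether the login actually worked, the final URL after requests follows the redirect is usually more telling than the body. A quick check on the r2 from above (the success/failure URLs are assumptions about GitHub's redirect behavior):

print(r2.status_code)  # 200 once requests has followed the redirect
print(r2.url)          # the home page on success; back on the login/session page on failure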
# A template for fetching a home page
import requests

def getHtml(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # raise HTTPError for 4xx/5xx responses
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return 'an exception occurred'

if __name__ == '__main__':
    url = 'https://www.taobao.com/'
    print(getHtml(url))
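
Given the anti-crawler notes earlier, a variant of the template that sends the same browser User-Agent may help on sites that block the default requests one; getHtmlWithUA is a made-up name for this sketch:

UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'

def getHtmlWithUA(url):
    try:
        # same template as above, plus a browser User-Agent header
        r = requests.get(url, timeout=30, headers={'User-Agent': UA})
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return 'an exception occurred'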
str = "00000003210Runoob01230000000";
print(str.strip('0')); # 去除首尾字符 0
str2 = " Runoob "; # 去除首尾空格
print(str2.strip());
'''
3210Runoob0123
Runoob
'''
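
strip has one-sided relatives: lstrip removes only from the left end and rstrip only from the right, taking the same optional character argument:

s = '00000003210Runoob01230000000'
print(s.lstrip('0'))  # 3210Runoob01230000000
print(s.rstrip('0'))  # 00000003210Runoob0123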