Python requests + bs4 入门（一）：获取猫眼 TOP100 榜电影名字和主演并写入数据库
Requests
获取猫眼TOP100榜电影名字和主演
"""Scrape the Maoyan TOP100 board and store each film in the database.

For every board page the script extracts the film name, the lead actors and
the release date, then inserts a row through ``SQLsession`` unless a row with
the same film name and source remark already exists.
"""
import time

import requests
from bs4 import BeautifulSoup

# Explicit names instead of ``from model import *`` so it is visible where the
# session and the mapped class come from.
from model import Infos, SQLsession

HEADERS = {
    "Content-Type": "text/html; charset=utf-8",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36",
}

# Landing page of the board; only used to read the pager.
FIRST_PAGE_URL = "https://www.maoyan.com/board/4?timeStamp=1637291300330&channelId=40011&index=2&signKey=c286a92c2bb667036254185fde905f09&sVersion=1"
# Paged URL; ``offset`` is the index of the first film on the page.
PAGE_URL_TEMPLATE = "https://www.maoyan.com/board/4?timeStamp=1637053092611&sVersion=1&index=7&signKey=52df1051b8c1478e914905882e09e10a&channelId=40011&requestCode=c30fbaba9d9f7b73a53f83fe71ac0ec13ztmp&offset={offset}"

PAGE_SIZE = 10        # offset step between consecutive pages
REQUEST_TIMEOUT = 3   # seconds, passed to requests.get
REQUEST_DELAY = 3     # seconds to pause after every page request
SOURCE_REMARK = '猫眼'  # value stored in the ``remark`` column for these rows


def fetch_soup(url):
    """Download *url* and return its HTML parsed with html5lib.

    The URL and the HTTP status code are printed as progress output.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT, headers=HEADERS)
    print(url)
    print(response.status_code)
    return BeautifulSoup(response.text, "html5lib")


def page_count(soup):
    """Return the number of board pages read from the pager of *soup*.

    Raises:
        RuntimeError: if the pager is missing or cannot be interpreted,
            which is what happens when the site answers with its
            slider-verification page instead of the board.
    """
    pager = soup.find('ul', class_="list-pager")
    if pager is None:
        raise RuntimeError(
            "Pager <ul class='list-pager'> not found; the site probably "
            "returned its verification page. Open the URL in a browser, "
            "complete the slider verification, then run the script again."
        )
    tokens = [tok for tok in pager.text.replace("\n", "").split(" ") if tok != '']
    # The second-to-last token is used as the last page number; presumably the
    # final token is the "next page" link -- verify against the live markup.
    try:
        return int(tokens[-2])
    except (IndexError, ValueError) as err:
        raise RuntimeError(f"Unexpected pager content: {tokens!r}") from err


def _paragraph_text(item, css_class):
    """Return the text of ``<p class=css_class>`` in *item* without newlines.

    Returns an empty string when the tag is absent.
    """
    tag = item.find('p', class_=css_class)
    if tag is None:
        return ""
    return tag.getText().replace("\n", "")


def parse_movie(item):
    """Build the row dict for one board entry, or ``None`` if it has no name.

    The cleaning rules are kept exactly as before so that names already
    stored in the database still match during the duplicate check.
    """
    movie_name = _paragraph_text(item, "name")
    if not movie_name:
        return None
    art_name = _paragraph_text(item, "star").replace(" ", "").replace("主演:", "")
    movie_time = _paragraph_text(item, "releasetime").replace(" ", "").replace("上映时间:", "")
    return dict(movie_name=movie_name, art_name=art_name,
                movie_time=movie_time, remark=SOURCE_REMARK)


def save_movie(record):
    """Insert *record* unless a row with the same name and remark exists."""
    existing = SQLsession.query(Infos).filter_by(
        movie_name=record['movie_name'], remark=SOURCE_REMARK).first()
    if not existing:
        SQLsession.add(Infos(**record))
        SQLsession.commit()


def main():
    """Walk every page of the board and store the films found on it."""
    total_pages = page_count(fetch_soup(FIRST_PAGE_URL))
    print(total_pages)
    for page_index in range(total_pages):
        print(page_index)
        soup = fetch_soup(PAGE_URL_TEMPLATE.format(offset=page_index * PAGE_SIZE))
        # Pause between requests so the site is not hit in a tight loop.
        time.sleep(REQUEST_DELAY)
        items = soup.find_all('div', class_="board-item-content")
        if not items:
            print(f"No board items found on page {page_index}; skipping")
            continue
        for item in items:
            record = parse_movie(item)
            if record is None:
                continue
            print(record['movie_name'])
            print(record['art_name'])
            print(record['movie_time'])
            save_movie(record)


if __name__ == "__main__":
    main()
如果出现以下情况，点进页面完成滑动验证后重新运行脚本，就可以正常获取数据。
豆瓣电影
Requests-HTML
json解析
参考:













浙公网安备 33010602011771号