#导入网页结构解析的包
from bs4 import BeautifulSoup
#导入请求的包
import requests
import html5lib
import csv
# Build request headers: many sites reject requests that lack a
# browser-like User-Agent, and Douban additionally needs a logged-in
# session cookie to serve the comment pages.
headers = {
"User-Agent":'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36 SE 2.X MetaSr 1.0',
"Cookie":'douban-fav-remind=1; _vwo_uuid_v2=D6C532C778CBFC308A7B3BD02661E83E0|91a5bda9895769b4b43965b28781cab8; gr_user_id=eb35c836-a95b-48ea-940d-16253bde92f9; __utmv=30149280.22329; _ga=GA1.2.233925341.1587723778; _vwo_uuid_v2=D6C532C778CBFC308A7B3BD02661E83E0|91a5bda9895769b4b43965b28781cab8; __yadk_uid=0KwbUUrHHR04zBWsc11XX3f1FZEb0jjd; bid=Jlhtm7lPP2I; viewed="4913064_35430455_26358448_30443579"; push_noty_num=0; push_doumail_num=0; ll="108303"; dbcl2="223290636:g+JRMQRbmMg"; ck=VMEG; __utmc=30149280; __utmc=81379588; ap_v=0,6.0; __utma=30149280.233925341.1587723778.1626047696.1626055931.59; __utmz=30149280.1626055931.59.22.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt=1; __utmt_douban=1; __utma=81379588.233925341.1587723778.1626047696.1626055988.32; __utmz=81379588.1626055988.32.6.utmcsr=movie.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03=d7b1ff61-8bc9-42bc-9cdf-f2968a7493ea; gr_cs1_d7b1ff61-8bc9-42bc-9cdf-f2968a7493ea=user_id%3A1; _pk_ref.100001.3ac3=%5B%22%22%2C%22%22%2C1626055988%2C%22https%3A%2F%2Fmovie.douban.com%2F%22%5D; _pk_ses.100001.3ac3=*; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03_d7b1ff61-8bc9-42bc-9cdf-f2968a7493ea=true; __gads=ID=867b5743964304b6-22afb38145ca00ca:T=1626055988:RT=1626055988:S=ALNI_MZ9f7PhPo95UTXMt5Zji6LR-cnbQA; __utmb=30149280.17.10.1626055931; __utmb=81379588.5.10.1626055988; _pk_id.100001.3ac3=a2f8aa796fff4445.1618412408.32.1626056047.1626047696.; ct=y'
}
url = ''  # concrete target URL deliberately left blank
# Issue the GET request with the custom headers attached.
response = requests.get(url, headers=headers)
# A 200 status code means the remote connection is working normally.
status_code = response.status_code
print(status_code)
# Commented-out example kept verbatim from the original author: the first
# line (Chinese) says JD.com, unlike Douban, does NOT require custom headers.
'''
京东继续不需要构造头部
url = 'https://www.jd.com/?cu=true&utm_source=www.jiegeng.com&utm_medium=tuiguang&utm_campaign=t_1000159524_&utm_term=aa80efab990e45aaa713081609f8abce'
response = requests.get(url)
status_code = response.status_code;
print(status_code)
print(type(response.headers),response.headers)
'''
# The status code is 200 when the remote connection is healthy.
# NOTE(review): this duplicates the status check/print performed right after
# the request above — likely copy-paste leftover; harmless but redundant.
status_code = response.status_code;
print(status_code)
if status_code == 200:
    # Fetch succeeded: parse the page and collect one dict per review
    # with keys 'author', 'rating', 'comment'.
    content = response.content
    #print(content.decode("utf-8"))
    # Parse the raw HTML with the html5lib backend (lenient parser).
    bs = BeautifulSoup(content, "html5lib")
    # Each review lives in an <li class="comment-item"> element.
    commentItemList = bs.find_all("li", attrs={"class": "comment-item"})
    #print(commentItemList)
    # Walk the <li> elements and extract the review fields.
    listInfor = []
    for item in commentItemList:
        dict_infor = {}
        # Author name: the <a> inside the comment-info span.
        # Guard every find(): Douban allows reviews without a rating, and a
        # missing span would otherwise raise AttributeError on None.
        author_span = item.find('span', attrs={"class": 'comment-info'})
        author_link = author_span.find('a') if author_span is not None else None
        author = author_link.text if author_link is not None else ''
        # Review text body.
        comment_span = item.find('span', attrs={"class": "short"})
        commentInfor = comment_span.text if comment_span is not None else ''
        # Star rating is stored in the span's title attribute; may be absent.
        rating_span = item.find('span', attrs={"class": "rating"})
        rating = rating_span.get('title') if rating_span is not None else ''
        dict_infor['author'] = author
        dict_infor['rating'] = rating
        # Strip embedded newlines so each comment is a single line.
        dict_infor['comment'] = commentInfor.replace('\n', '')
        listInfor.append(dict_infor)
    print(listInfor)