IP代理访问豆瓣读书

# 豆瓣读书榜单
import requests,csv, re
from bs4 import BeautifulSoup
# Douban Books Top 250 listing page.
url = "https://book.douban.com/top250?icn=index-book250-all"

# Browser-like request headers; see the 418 note next to the request below.
# NOTE(review): the Cookie value is a captured browser session hard-coded in
# source -- consider loading it from configuration instead.
headers = {
    "Accept": "application/json, text/plain, */*",
    "Cookie": "bid=aSF8G-zF4x8; douban-fav-remind=1; __gads=ID=757e0078add304f3-2296343d66c60057:T=1615638328:RT=1615638328:S=ALNI_MYs-2AZmTiKjP35ljRt9vrMrKrl9A; ll='118339'; ct=y; gr_user_id=8cebe265-4652-4837-a458-b85e17ef1797; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03=ef504ee2-e460-461d-8dee-0d26a63b690a; gr_cs1_ef504ee2-e460-461d-8dee-0d26a63b690a=user_id%3A0; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03_ef504ee2-e460-461d-8dee-0d26a63b690a=true; ap_v=0,6.0; __utma=30149280.1313679076.1615638327.1615886612.1616377606.3; __utmc=30149280; __utmz=30149280.1616377606.3.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt_douban=1; __utmb=30149280.1.10.1616377606; viewed='34942789'",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4442.4 Safari/537.36",
}

# NOTE(review): `pranas` is never passed to the request below (the name looks
# like a typo for "params"). It is kept only so the name stays defined.
pranas = {
    "subject_ids": '35005045,34942789,30441551,34988734,34933701,35025453,35005062,35132631,35023731,34928222,35093999,34896921,35083619,34933701,34897038,35223102,34799583,35089547,34998019,35080870,34937627,34940823,35083558,35216559,35114602,35017678,34907855,34973161,35081657,35093374,34842466,35006237,34868504,35016965,34983746,35028954,34799583,34949694,34978587,35081921,34970385,35005062,34911983,34889883,35081845,34933701,34950357,35018175,35032242,34902715,34910658,35133869,35010146,35085904',
    't': '1616392054350',
}

# requests chooses the proxy by the scheme of the target URL. The target is
# https://, so an 'https' entry is required -- with only an 'http' entry the
# request silently bypasses the proxy. Proxy URLs carry an explicit scheme.
proxies = {
    'http': 'http://123.55.102.193:9999',
    'https': 'http://123.55.102.193:9999',
}

# A timeout keeps the script from hanging forever on a dead proxy.
res = requests.get(url, headers=headers, proxies=proxies, timeout=10)
# Without request headers the site answers <Response [418]> (anti-scraping),
# which is why `headers` is sent.
res.encoding = "utf-8"
text = res.text

# Parse the downloaded page with the lxml backend.
soup = BeautifulSoup(text, 'lxml')

# Anchors whose href points at a book subject page (source of titles/links).
# Raw string: "\d" in a normal string literal is an invalid escape sequence;
# the dots are escaped so they only match a literal '.'.
book_name = soup.find_all(name='a', href=re.compile(r"https://book\.douban\.com/subject/\d*/"))
# Rating values (span.rating_nums). Despite the variable name, these are
# ratings, not ids or prices.
bookid = soup.find_all(name='span', attrs={"class": "rating_nums"})
# p.pl elements (labelled "author" in the original comment).
bookauthor = soup.find_all('p', attrs={"class": "pl"})
# span.inq elements (labelled "comment/quote" in the original comment).
pingyu = soup.find_all('span', attrs={"class": "inq"})

# Build the title and link lists pairwise so entry i of li_name always belongs
# to entry i of li_href. Matching anchors with no visible text (presumably
# cover-image links to the same page) are skipped. This replaces two steps:
# de-duplicating links through set(), which scrambled their order, and
# removing empty titles from the list while iterating over it.
li_name, li_href = [], []
for m in book_name:
    name = re.sub(r"\s", "", m.text)
    if not name:
        continue
    li_name.append(name)
    li_href.append(re.sub(r"\s", "", m['href']))

# Price: every "<number>元" occurrence in the p.pl text, concatenated; an
# empty string when none is present. The decimal point is escaped so it no
# longer matches an arbitrary character.
li_jiage = [
    "".join(re.findall(r"\d+(?:\.\d+)?元", author.text))
    for author in bookauthor
]

# Rating text with all whitespace removed.
li_bookid = [re.sub(r"\s", "", s.text) for s in bookid]

# CSV header: book title, rating, price, link.
bt = ["图书名称", "评分", "价格", "地址"]

# Open the output once (not once per row) and fix the encoding, because the
# data is Chinese text and the platform default encoding may not handle it.
with open('./1.csv', 'a', newline="", encoding="utf-8") as code:
    f = csv.writer(code)
    # The file is opened in append mode, so write the header only when the
    # file is still empty; otherwise each run would insert another header row.
    if code.tell() == 0:
        f.writerow(bt)
        print("表头写入完成")
    # zip() stops at the shortest list, so a page where some field is missing
    # yields fewer rows instead of raising IndexError.
    f.writerows(zip(li_name, li_bookid, li_jiage, li_href))
print("内容写入完成")

posted @ 2021-03-30 18:38  tevien  阅读(178)  评论(0)    收藏  举报