Main class:
from Dao.Requests_dao import Rqst
from Dao.Csv_dao import Csv
def paqu(x):
    print('Crawling: Zhihu topic "英文取名" (English naming) top answers:')
url='https://www.zhihu.com/topic/19561718/top-answers'
    RQST=Rqst()#initialize the crawler helper object
    CSV=Csv()#initialize the storage helper object
    #fetch the topic front page
    print('front page url: '+url)
    try:
        docx=RQST._init_(url)
        docx.raise_for_status()
        print('Front page connected successfully!')
    except Exception:
        print('error: failed to connect to the front page!')
print('------------------------------------------------------------------------------')
    #crawl the answer URLs on the topic page, up to the requested count, and return them as a list
rs=RQST.find_all_question(x)
    #iterate over all collected answer URLs
number=0
for i in rs:
number+=1
print(number)
data=RQST.get_content(i)
CSV.save(data)
if __name__ == '__main__':
paqu(1)
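The two imports at the top assume that the crawler and storage classes live in a small Dao package next to this script. A plausible layout (only Dao/Requests_dao.py and Dao/Csv_dao.py are named in the imports; the other file names are assumptions for illustration):

Dao/__init__.py      # assumed, so that Dao can be imported as a package
Dao/Requests_dao.py  # holds the Rqst crawler class shown next
Dao/Csv_dao.py       # holds the Csv storage class shown last
main.py              # the script above (file name assumed)

paqu(1) crawls a single answer; pass a larger argument, e.g. paqu(5), to collect more answers in one run.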
Crawler class:
import requests
from bs4 import BeautifulSoup
import urllib
import json
import re
import os
# import io
# import sys
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030')
class Rqst:
    #note: _init_ is used here as a plain fetch helper rather than a constructor; it sends a GET with a browser User-Agent and returns the response
    def _init_(self,url):
headers={
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
docx=requests.get(url,headers=headers)
docx.encoding='utf-8'
return docx
    #collect the answer URLs and return them as a list
def find_all_question(self,number):
        #the topic page is loaded dynamically, so we call the essence-feed API and parse its JSON to get the answer URLs (see the JSON sketch after this class)
apiurl='https://www.zhihu.com/api/v4/topics/19561718/feeds/essence?include=data%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.is_normal%2Ccomment_count%2Cvoteup_count%2Ccontent%2Crelevant_info%2Cexcerpt.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Darticle%29%5D.target.content%2Cvoteup_count%2Ccomment_count%2Cvoting%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Dpeople%29%5D.target.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Darticle%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dquestion%29%5D.target.annotation_detail%2Ccomment_count%3B&limit=10&offset='
headers={
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
        n=0#count of URLs collected so far
        rs=[]#list of answer URLs
        i=0#paging offset for the API
        #request pages of 10 items until the requested number of URLs has been collected
while True:
docx=requests.get(apiurl+str(i),headers=headers)
i+=10
            docx.encoding='utf-8'
urlst=[]
if os.path.exists(r"saveurl.csv"):
with open('saveurl.csv','r',encoding='utf-8-sig') as f:
urlst=f.readlines()
            #parse the JSON once and walk the returned items (the API may return fewer than 10)
            items=json.loads(docx.text)['data']
            for js in items:
try:
r='https://www.zhihu.com/question/'+str(js.get('target').get('question').get('id'))+'/answer/'+str(js.get('target').get('id'))
if r+'\n' in urlst:
break
else:
with open('saveurl.csv','a',encoding='utf-8-sig') as f:
f.write(r+'\n')
rs.append(r)
n+=1
                except Exception:
                    #not an answer item (e.g. a pinned module), skip it
                    pass
if n>=number:
return rs
if n>=number:
return rs
    #get the question title
def get_content_question(self,bs):
rs=bs.find('div',{'class':'QuestionHeader'}).find('h1')
question=rs.text
return question
    #get the answer body
def get_content_answerContent(self,bs):
rs=bs.find('div',{'class':'QuestionAnswer-content'}).find_all('p')
content=''
for i in rs:
content=content+i.text
        respond=content.replace('/','or').replace('\n','').replace(',',',')#drop newlines and swap half-width commas for full-width ones so they do not break the CSV columns
return respond
    #get the author information
def get_content_authorInfo(self,bs):
b1=bs.find('div',{'class':'ContentItem AnswerItem'})
        #author name
        b1_1=b1.find('div',{'class':'AuthorInfo-content'}).find('a',{'data-za-detail-view-element_name':'User'})
        try:
            author='Author: '+b1_1.text
        except Exception:
            author='Author: anonymous user'
        #author bio
        b1_2=b1.find('div',{'class':'ztext AuthorInfo-badgeText'})
        try:
            author_describe='Author bio: '+b1_2.text.replace('\n',',')
        except Exception:
            author_describe='Author bio: no info'
        #number of upvotes on the answer
        b1_3=b1.find('meta',{'itemprop':'upvoteCount'})
        try:
            author_agreeNumber='Upvotes: '+b1_3.get('content')
        except Exception:
            author_agreeNumber='Upvotes: no info'
        #date the answer was posted
        b1_4=b1.find('meta',{'itemprop':'dateCreated'})
        try:
            author_dateCreated='Posted: '+b1_4.get('content').replace('T','*').replace('.000Z','')
        except Exception:
            author_dateCreated='Posted: no info'
        #date the answer was last edited
        b1_5=b1.find('meta',{'itemprop':'dateModified'})
        try:
            author_dateModified='Edited: '+b1_5.get('content').replace('T','*').replace('.000Z','')
        except Exception:
            author_dateModified='Edited: no info'
data=[
author,
author_describe,
author_agreeNumber,
author_dateCreated,
author_dateModified
]
return data
    #get the comments on the answer
def get_content_discuss(self,bs):
        #build the comments API URL from the answer id
rs=bs.find('div',{'class':'Card AnswerCard'}).find('div',{'class':'ContentItem AnswerItem'})
url='https://www.zhihu.com/api/v4/answers/'+rs.get('name')+'/root_comments?order=normal&limit=20&offset=0&status=open'
        #request the API and parse the returned JSON
docx=self._init_(url)
docx.encoding='utf-8'
discusses=[]
for i in json.loads(docx.text).get('data'):
user=i['author']['member']['name']
discuss=i['content'].replace('<p>','').replace('</p>','').replace('</br>','').replace('<br>','').replace(',',',').replace('\n','')
disc=re.sub("<.*?>",'','#'+user+'#:'+discuss)
discusses.append(disc)
return discusses
    #scrape everything for one answer URL
def get_content(self,url):
docx=self._init_(url)
        try:
            print('link:',url)
            docx.raise_for_status()
            print('Connected successfully!')
        except Exception:
            print('error: connection failed!')
        try:
            print('Scraping data...')
            bs=BeautifulSoup(docx.text,'html.parser')
            #question title
            question=self.get_content_question(bs)
            #answer body
            content=self.get_content_answerContent(bs)
            #author information
            author_data=self.get_content_authorInfo(bs)
            #comments
            discusses=self.get_content_discuss(bs)
            data={
                'question':question,
                'url':url,
                'content':content,
                'discusses':discusses,
                'author_data':author_data
            }
            print('Data scraped successfully!')
            return data
        except Exception:
            print('error: failed to scrape data!')
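Both find_all_question and get_content_discuss depend on the shape of the JSON returned by the two Zhihu endpoints. A trimmed sketch of the structure the code expects, showing only the keys it actually reads (all values are placeholders, not real data):

#shape assumed for each element of ['data'] in the essence-feed response (used by find_all_question)
feed_item={
    'target':{
        'id':'<answer id>',                #becomes the /answer/<id> part of the URL
        'question':{'id':'<question id>'}  #becomes the /question/<id> part of the URL
    }
}
#shape assumed for each element of ['data'] in the root_comments response (used by get_content_discuss)
comment_item={
    'author':{'member':{'name':'<commenter name>'}},  #used for the '#name#:' prefix
    'content':'<p><comment HTML></p>'                  #remaining tags are stripped with the regex above
}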
Storage class:
import os
class Csv():
def save(self,data):
        print('Saving data...')
        try:
            #write the header row only when the file does not exist yet
            write_header=not os.path.exists(r"save.csv")
            with open('save.csv','a',encoding='utf-8-sig') as f:
                #header row
                if write_header:
                    head=['question','url','answer','author_info','comments']
                    f.write(','.join(head)+'\n')
                #first row: question, url, answer, first author field, first comment
                row1=[data['question'],data['url'],data['content'],data['author_data'][0],data['discusses'][0]]
                f.write(','.join(row1)+'\n')
                #remaining rows: leftover author fields and comments, one per line
                nrows=max(len(data['discusses']),len(data['author_data']))
                for i in range(1,nrows):
                    row=['','','','','']
                    try:
                        row[3]=data['author_data'][i]
                    except IndexError:
                        row[3]=''
                    try:
                        row[4]=data['discusses'][i]
                    except IndexError:
                        row[4]=''
                    f.write(','.join(row)+'\n')
                f.write('\n')
            print('Saved successfully!')
        except Exception:
            print('error: failed to save!')
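With this writer, each crawled answer becomes one block in save.csv: a header row (only when the file is first created), a first row carrying the question, URL, answer text, the first author field and the first comment, then continuation rows that fill only the last two columns with the remaining author fields and comments, and finally a blank separator line. Schematically (placeholders, not real output):

question,url,answer,author_info,comments
<question>,<answer url>,<answer text>,Author: <name>,#<user>#: <comment 1>
,,,Author bio: <bio>,#<user>#: <comment 2>
,,,Upvotes: <count>,#<user>#: <comment 3>
,,,Posted: <date>*<time>,
,,,Edited: <date>*<time>,
<blank line before the next answer block>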