import requests
import re
import pandas as pd
def get_all_date_url(pages=61, timeout=10):
    """Collect the per-date listing URLs linked from the site's index pages.

    Requests ``http://club.xywy.com/keshi/1.html`` through
    ``http://club.xywy.com/keshi/<pages>.html`` and pulls out every link
    shaped like ``http://club.xywy.com/keshi/<4 digits>-<2 digits>-<digits>/<digits>.html``.

    Args:
        pages: How many index pages to fetch, counting from page 1. The
            default of 61 is the value that used to be hard-coded.
        timeout: Seconds to wait for each HTTP request before ``requests``
            raises a timeout error, so a stalled connection cannot hang
            the crawl indefinitely.

    Returns:
        A list of the distinct matching URLs, in the order they were first
        seen. An empty list when ``pages`` is zero or negative.
    """
    # Dots are escaped so they only match a literal "." in the host name
    # and in the ".html" suffix.
    date_url_pattern = re.compile(
        r"http://club\.xywy\.com/keshi/\d{4}-\d{2}-\d+/\d+\.html"
    )
    found = []
    for page_number in range(1, pages + 1):
        index_url = 'http://club.xywy.com/keshi/{}.html'.format(page_number)
        res = requests.get(index_url, timeout=timeout)
        found.extend(date_url_pattern.findall(res.text))
    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # result is reproducible from run to run (a set would not be).
    return list(dict.fromkeys(found))
def get_QA_url(url, timeout=10):
    """Build the URL of every page in the listing that ``url`` belongs to.

    Downloads ``url``, decodes it as gb2312, reads the total page count
    from the text ``共 N 页`` and returns one URL per page, formed as
    ``http://club.xywy.com/keshi/<segment>/<page>.html`` where ``<segment>``
    is the second-to-last ``/``-separated component of ``url``.

    Args:
        url: A listing URL; its second-to-last path component is reused
            for every generated page URL.
        timeout: Seconds to wait for the HTTP request before ``requests``
            raises a timeout error.

    Returns:
        A list of page URLs in ascending page order.
    """
    res = requests.get(url, timeout=timeout)
    res.encoding = 'gb2312'
    page_counts = re.findall(r'共 (\d+) 页', res.text)
    # NOTE(review): when the page-count marker is missing this used to raise
    # IndexError and abort the whole crawl. Falling back to a single page
    # assumes the site omits the marker for one-page listings - confirm.
    total_pages = int(page_counts[0]) if page_counts else 1
    segment = url.split('/')[-2]
    # Page numbers are distinct, so the generated URLs need no
    # de-duplication and stay in page order.
    return [
        'http://club.xywy.com/keshi/' + segment + '/' + str(page_number) + '.html'
        for page_number in range(1, total_pages + 1)
    ]
def main():
    """Crawl the listing URLs and write one spreadsheet row per URL.

    Gathers every URL returned by ``get_QA_url`` for each URL from
    ``get_all_date_url``, builds one record per gathered URL, and saves
    all records to ``xunyiwenyao.xlsx`` without the index column.
    """
    all_url_data = []
    for date_url in get_all_date_url():
        all_url_data.extend(get_QA_url(date_url))

    info_list = []
    for detail_url in all_url_data:
        # NOTE(review): `xx` is not defined anywhere in the visible source.
        # It looks like a placeholder for values still to be extracted from
        # each page; unless it is defined elsewhere, this raises NameError
        # on the first record. Replace each `xx` with the real extraction.
        final_dic_data = {
            'url': detail_url,
            '患者标题': xx,
            '患者姓名': xx,
            '患者性别': xx,
            '提问日期': xx,
            '患者描述': xx,
            '医生姓名': xx,
            '医生职称': xx,
            '医生科室': xx,
            '问题分析': xx,
            '回答时间': xx,
        }
        info_list.append(final_dic_data)

    # Build the table once, after every record has been collected.
    df = pd.DataFrame(info_list)
    df.to_excel('xunyiwenyao.xlsx', index=False)
# Run the crawl only when this file is executed as a script.
if __name__ == '__main__':
    main()