1. Crawling WeiYi (guahao.com) with requests and XPath
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: tom
import requests
from lxml import etree
import pymongo


# Spider for the WeiYi (guahao.com) doctor listings
class DoctorSpider():
    # Set up the request headers, base URL, page counter and MongoDB connection
    def __init__(self):
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'}
        self.base_url = 'https://www.guahao.com/expert/all/%E5%85%A8%E5%9B%BD/all/%E4%B8%8D%E9%99%90/p'
        self.page_num = 1
        self.info_list = []
        self.client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        self.db = self.client['test']
    # Fetch the HTML of the current page
    def crawl(self):
        print('Crawling page {}'.format(self.page_num))
        url = self.base_url + str(self.page_num)
        res = requests.get(url=url, headers=self.headers).text
        # The nationwide listing has 38 pages
        if self.page_num <= 38:
            self.page_num += 1
        return res
    # Parse the doctor entries out of one page
    def parse(self, res):
        page_text = res
        tree = etree.HTML(page_text)
        li_list = tree.xpath('//div[@class="g-doctor-items to-margin"]/ul/li')
        for li in li_list:
            name = li.xpath("./div[2]/a/text()")[0]
            skill = li.xpath("./div[2]/div[1]/p/text()")[0]
            # The extracted text is full of spaces, newlines and tabs
            skill = skill.replace('\n', '').replace('\r', '').replace(' ', '').strip()
            position = li.xpath("./div[1]/dl/dt/text()")[1]
            position = position.replace('\n', '').strip()
            score = li.xpath("./div[1]/dl/dd/p[3]/span/em/text()")[0]
            num = li.xpath("./div[1]/dl/dd/p[3]/span/i/text()")[0]
            office = li.xpath("./div[1]/dl/dd/p[1]/text()")[0]
            hospital = li.xpath("./div[1]/dl/dd/p[2]/span/text()")[0]
            dic = {
                'name': name,
                'skill': skill,
                'position': position,
                'score': score,
                'num': num,
                'office': office,
                'hospital': hospital,
            }
            self.save(dic)
    # Save one record to MongoDB (collection.save() was removed in pymongo 4; use insert_one)
    def save(self, dic):
        collection = self.db['weiyiwang']
        collection.insert_one(dic)
    # Entry point: crawl and parse all 38 pages
    def run(self):
        while self.page_num <= 38:
            response = self.crawl()
            self.parse(response)


if __name__ == '__main__':
    doctor = DoctorSpider()
    doctor.run()