Python 新人尝试爬取大众点评的齿科商家信息,获取评分、经纬度、团单销量等数据。
新人初次尝试:访问次数多了会触发大众点评的反爬机制,需要在浏览器中完成滑动验证,暂时还没有学会怎么破解。
初次尝试

import requests
import re
import csv
import time
# Module-level accumulator: marse_page() appends one {'mt': [...], 'cx': [...]}
# dict per listing entry, and main() reads it to build the CSV rows.
mts = []
def _fetch_html(url, headers, timeout):
    """Return the body of ``url`` decoded as UTF-8.

    ``timeout`` is forwarded to ``requests.get`` so that a stalled connection
    raises instead of blocking the crawl indefinitely.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.content.decode('utf-8')


def marse_page(url, timeout=10, delay=0):
    """Scrape one Dianping search-result page and record its shops in ``mts``.

    For every ``<li class="">`` entry found on the listing page this:

    1. collects the group-buy deals advertised in the entry, fetches each
       deal page and extracts its "已售" (sold) figure;
    2. fetches the detail page of every shop id found in the entry and
       extracts name, ``<span title=...>`` values, review count, average
       price, the three score items, street address and latitude/longitude;
    3. appends ``{'mt': [shop_dict, ...], 'cx': [(deal_title, sold), ...]}``
       to the module-level ``mts`` list.

    Every extracted field is the raw ``re.findall`` result, i.e. a list.

    Args:
        url: Address of the search-result page to scrape.
        timeout: Seconds allowed for each HTTP request before ``requests``
            raises. The original code passed no timeout and could hang.
        delay: Seconds to sleep after each shop detail request. The default
            of 0 matches the original ``time.sleep(0)``; raise it to throttle
            the crawl.

    Returns:
        None. Results are appended to ``mts``.

    Raises:
        requests.exceptions.RequestException: On connection errors or timeouts.
        UnicodeDecodeError: If a response body is not valid UTF-8.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
        # NOTE(review): looks like a cookie copied from a browser session;
        # it presumably expires - confirm before relying on it.
        'cookie':'navCtgScroll=100; navCtgScroll=200; _lxsdk_cuid=16d7bde3e45c8-0b491cbf188485-67e1b3f-1fa400-16d7bde3e46c8; _lxsdk=16d7bde3e45c8-0b491cbf188485-67e1b3f-1fa400-16d7bde3e46c8; _hc.v="\"ab6667ff-ff89-4c88-9924-2865edbe01ee.1569741222\""; s_ViewType=10; mpmerchant_portal_shopid=18189287; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; cy=24; cye=shijiazhuang; _lxsdk_s=16dd2e5facb-327-0e0-88a%7C%7C190',
    }
    shop_base = 'http://www.dianping.com/shop/'
    deal_base = 'http://t.dianping.com/deal'

    # Fetch the listing page and split it into one chunk per shop entry.
    text = _fetch_html(url, headers, timeout)
    lis = re.findall(r'li class=""(.*?)</li>', text, re.DOTALL)

    for li in lis:
        # Shop ids used to build the detail-page URLs.
        shop_ids = re.findall(r'<div class="pic" >.*?data-shopid="(.*?)".*?', li, re.DOTALL)
        # Promotion blocks attached to this entry.
        promo_blocks = re.findall(r'<div class="svr-info">(.*?)</div>', li, re.DOTALL)

        # (deal title, sold figures) pairs for this entry.
        listcx = []
        for promo in promo_blocks:
            deal_titles = re.findall(r'>团购:</span>(.*?)\n', promo, re.DOTALL)
            # Only the path after ".../deal" is captured; the prefix is re-added below.
            deal_paths = re.findall(r'<a target="_blank" href="http://t.dianping.com/deal(.*?)"', promo, re.DOTALL)
            # zip() pairs links with titles by position and stops at the shorter list.
            for deal_path, deal_title in zip(deal_paths, deal_titles):
                deal_html = _fetch_html(deal_base + deal_path, headers, timeout)
                sold = re.findall(r'<span>已售(.*?)<', deal_html, re.DOTALL)
                listcx.append((deal_title, sold))

        # Detail dicts for every shop id found in this entry.
        mt1 = []
        for shop_id in shop_ids:
            durl = shop_base + shop_id
            t = _fetch_html(durl, headers, timeout)
            name = re.findall(r'<h1 class="shop-name">(.*?) <a', t, re.DOTALL)
            title = re.findall(r'<span title="(.*?)"', t, re.DOTALL)
            reviewCount = re.findall(r'<span id="reviewCount" class="item">(.*?)<', t, re.DOTALL)
            avg = re.findall(r'<span id="avgPriceTitle".*?>(.*?)</', t, re.DOTALL)
            score = re.findall(r'<span id="comment_score">.*?"item">(.*?)</.*?"item">(.*?)</.*?"item">(.*?)</', t, re.DOTALL)
            address = re.findall(r'itemprop="street-address" title="(.*?)">', t, re.DOTALL)
            xy = re.findall(r'shopGlat: "(.*?)", shopGlng:"(.*?)",', t, re.DOTALL)
            print(durl)
            # Optional pause between shop requests; 0 keeps the original pace.
            time.sleep(delay)
            mt2 = {
                'name': name,
                'title': title,
                'reviewCount': reviewCount,
                'avg': avg,
                'score': score,
                'address': address,
                'xy': xy,
            }
            print(mt2)
            mt1.append(mt2)

        mts.append({
            'mt': mt1,
            'cx': listcx,
        })
def main():
    """Crawl search-result pages 1-10 and write the collected shops to ``美团.csv``.

    Each page is handed to ``marse_page``, which fills the module-level
    ``mts`` list. Afterwards one CSV row is produced per ``mts`` record from
    the first shop dict in its ``'mt'`` list plus the record's ``'cx'`` deal
    list. Finally ``mts`` is printed.

    Returns:
        None.
    """
    # range() excludes its stop value, so 11 is required to reach page 10.
    for page in range(1, 11):
        url = 'http://www.dianping.com/search/keyword/24/0_%E9%BD%BF%E7%A7%91/p{}'.format(page)
        print(url)
        marse_page(url)

    rows = []
    for record in mts:
        shops = record['mt']
        if not shops:
            # A listing entry with no shop id yields an empty 'mt' list;
            # skip it instead of failing with IndexError on shops[0].
            continue
        shop = shops[0]
        rows.append((
            shop['name'],
            shop['title'],
            shop['reviewCount'],
            shop['avg'],
            shop['score'],
            shop['address'],
            shop['xy'],
            record['cx'],
        ))

    tou = ['医院名', '星级', '评论数', '人均', '评分','地址','经纬度','团单']
    # An explicit encoding keeps the Chinese text independent of the platform
    # locale; the BOM written by utf-8-sig lets spreadsheet tools detect UTF-8.
    with open('美团.csv', 'w', newline='', encoding='utf-8-sig') as fp:
        writer = csv.writer(fp)
        writer.writerow(tou)
        writer.writerows(rows)
    print(mts)
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

浙公网安备 33010602011771号