A Python newbie tries scraping Dianping dental-clinic info: ratings, coordinates, group-deal sales, etc.

A beginner's first attempt. Once the request count gets high, Dianping's anti-scraping kicks in and demands a browser slider captcha; I haven't learned how to get past it yet.
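Since the block seems to be triggered purely by request volume, the first thing worth trying is slowing the crawl down. A minimal sketch of what I mean (not part of the script below; polite_get and the delay values are my own guesses) that reuses one session and sleeps a random interval between requests:

import random
import time
import requests

session = requests.Session()  # one session: connection reuse plus a shared cookie jar

def polite_get(url, headers, min_delay=1.0, max_delay=3.0):
    # fetch, then sleep a random 1-3 s so requests don't arrive in a burst;
    # this only lowers the odds of hitting the slider captcha, it does not bypass it
    response = session.get(url, headers=headers, timeout=10)
    time.sleep(random.uniform(min_delay, max_delay))
    return response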

import requests
import re
import csv
import time
mts = []  # accumulated results across all pages

def first(lst):
    # re.findall returns a list; keep the first match, or '' when nothing matched
    return lst[0] if lst else ''

def parse_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
        'cookie':'navCtgScroll=100; navCtgScroll=200; _lxsdk_cuid=16d7bde3e45c8-0b491cbf188485-67e1b3f-1fa400-16d7bde3e46c8; _lxsdk=16d7bde3e45c8-0b491cbf188485-67e1b3f-1fa400-16d7bde3e46c8; _hc.v="\"ab6667ff-ff89-4c88-9924-2865edbe01ee.1569741222\""; s_ViewType=10; mpmerchant_portal_shopid=18189287; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; cy=24; cye=shijiazhuang; _lxsdk_s=16dd2e5facb-327-0e0-88a%7C%7C190'
    }
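    # NOTE: the cookie string above was copied from my own logged-in browser
    # session; it expires, so paste in a fresh one before running the script.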
    surl = 'http://www.dianping.com/shop/'

    # Fetch the Dianping listing (search-result) page
    response = requests.get(url,headers=headers)
    text = response.content.decode('utf-8')
    lis = re.findall(r'li class=""(.*?)</li>',text,re.DOTALL)


    # Loop over the shop entries on the listing page
    for li in lis:

        # Extract the shop id (appended to surl to form the detail-page URL)
        urls = re.findall(r'<div class="pic" >.*?data-shopid="(.*?)".*?',li,re.DOTALL)
        # Extract the shop's promotion (group-deal) blocks
        cxl= re.findall(r'<div class="svr-info">(.*?)</div>',li,re.DOTALL)

        # Collect (deal title, units sold) pairs for this shop
        listcx=[]
        # Loop over the promotion blocks
        for cxs in cxl:
            cxss = re.findall(r'>团购:</span>(.*?)\n',cxs,re.DOTALL)  # deal titles
            cxurl = re.findall(r'<a target="_blank" href="http://t.dianping.com/deal(.*?)"',cxs,re.DOTALL)  # deal URL suffixes, fetched below for sales counts
            # Visit each deal page to read its sales count
            for scxurl,c in zip(cxurl,cxss):
                href = 'http://t.dianping.com/deal'
                scxurl = href+scxurl
                cxre = requests.get(scxurl,headers=headers)
                cxre = cxre.content.decode('utf-8')
                yishou = re.findall(r'<span>已售(.*?)<',cxre,re.DOTALL)  # "已售N" = N units sold
                tuandan = (c, first(yishou))
                listcx.append(tuandan)
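        # NOTE: each deal page above costs one extra request, so shops with many
        # deals quickly inflate the request count and trip the anti-scraping check.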

        # Fetch each shop's detail page
        mt1 = []
        # For every shop id, pull star rating, name, scores, address and coordinates
        for ur in urls:
            durl =surl+ur
            res = requests.get(durl, headers=headers)
            t = res.content.decode('utf-8')
            name = re.findall(r'<h1 class="shop-name">(.*?) <a',t,re.DOTALL)
            title = re.findall(r'<span title="(.*?)"', t, re.DOTALL)
            reviewCount = re.findall(r'<span id="reviewCount" class="item">(.*?)<', t, re.DOTALL)
            avg = re.findall(r'<span id="avgPriceTitle".*?>(.*?)</', t, re.DOTALL)
            score = re.findall(r'<span id="comment_score">.*?"item">(.*?)</.*?"item">(.*?)</.*?"item">(.*?)</', t, re.DOTALL)
            address = re.findall(r'itemprop="street-address" title="(.*?)">', t, re.DOTALL)
            xy = re.findall(r'shopGlat: "(.*?)", shopGlng:"(.*?)",', t, re.DOTALL)
            print(durl)
            time.sleep(2)  # was sleep(0), i.e. no pause at all; a real delay makes the slider captcha less likely

            mt2 = {
                'name': first(name),
                'title': first(title),
                'reviewCount': first(reviewCount),
                'avg': first(avg),
                'score': first(score),      # tuple of the three sub-scores
                'address': first(address),
                'xy': first(xy)             # (lat, lng) pair
            }
            print(mt2)
            mt1.append(mt2)

        mt = {
            'mt':mt1,
            'cx':listcx
        }
        mts.append(mt)




def main():
    lll=[]
    # Crawl listing pages 1 through 9 (range(1, 10) stops before 10)
    for i in range(1,10):
        url = 'http://www.dianping.com/search/keyword/24/0_%E9%BD%BF%E7%A7%91/p{}'.format(i)
        print (url)
        parse_page(url)


    for xx in mts:
        if not xx['mt']:
            continue  # the shop page yielded no data (likely blocked), skip it
        shop = xx['mt'][0]
        name = shop['name']
        title = shop['title']
        reviewCount = shop['reviewCount']
        avg = shop['avg']
        address = shop['address']
        score = shop['score']
        xy = shop['xy']
        cx = xx['cx']
        ll = (name,title,reviewCount,avg,score,address,xy,cx)
        lll.append(ll)
    tou = ['医院名', '星级', '评论数', '人均', '评分', '地址', '经纬度', '团单']  # name, stars, reviews, avg price, scores, address, lat/lng, deals
    # note: the output file is named 美团.csv although the data comes from Dianping
    with open('美团.csv', 'w', newline='', encoding='utf-8-sig') as fp:  # utf-8-sig so Excel shows the Chinese headers correctly
        writer = csv.writer(fp)
        writer.writerow(tou)
        writer.writerows(lll)
    print(mts)



if __name__ == '__main__':
    main()
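A note on the approach: the script parses HTML with regular expressions, which breaks as soon as Dianping tweaks its markup; a real parser is usually sturdier. Below is a minimal sketch of the same shop-id extraction using BeautifulSoup (not in the original script; the div.pic / data-shopid selector is assumed from the regex above, based on the 2019 page structure):

from bs4 import BeautifulSoup

def extract_shop_ids(html):
    # pull data-shopid from each result card on the listing page;
    # the selector mirrors the regex in parse_page and is an assumption
    soup = BeautifulSoup(html, 'html.parser')
    return [div['data-shopid'] for div in soup.select('div.pic[data-shopid]')]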

posted @ 2019-10-17 08:54  伟茂