Dianping (大众点评) crawler

import requests
from lxml import etree
import csv

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}

# The site requires a logged-in session: paste the Cookie header from your own browser here.
# requests' `cookies=` parameter expects name/value pairs, so split the raw header string into a dict.
cookie_str = 'fspop=test; cy=70; cye=changchun; __guid=169583271.1176092058052156700.1618064807707.5415; _lxsdk_cuid=178bc2d991bc8-06f82d2a1ad0c8-3e604809-1fa400-178bc2d991ec8; _lxsdk=178bc2d991bc8-06f82d2a1ad0c8-3e604809-1fa400-178bc2d991ec8; _hc.v=8e6ff184-ecf9-beda-8556-f21cac38d549.1618064809; s_ViewType=10; ctu=a39fa7b43d5011077a7a6a13b07f7eab2586a77330045fd09fb1ec9fcd4ecbef; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; Hm_lvt_602b80cf8079ae6591966cc70a3940e7=1618064809,1618108223; dplet=18e634c44bc3d5ff4dc4d2377c0348ce; dper=e18ff3f28e86ce6d07b46b29a43464b7a9697e4b309dc739fb65478c72a0a4e1ac4eeb1e4858e57828c84156f0e7221b89ce58e7174f2e6bf336e124ae5c277bbd1b72b6716c024fccd8bbd09c27536eb08f23c8a6e50a5b20884368c4b64588; ll=7fd06e815b796be3df069dec7836c3df; ua=dpuser_7353802477; monitor_count=40; _lxsdk_s=178bec40f52-ba4-7b6-bfe%7C%7C173; Hm_lpvt_602b80cf8079ae6591966cc70a3940e7=1618111329'
cookies = dict(item.split('=', 1) for item in cookie_str.split('; '))

# SVG/webfont glyph-to-digit mapping; the site's codes rotate periodically, so look up
# your own values. (The original mapping has no entry for '1'.)
num_map = {
    '\ue36f': '9',
    '\uea1a': '8',
    '\ue13b': '7',
    '\uf680': '6',
    '\uea7a': '5',
    '\uf6db': '4',
    '\ue1a7': '3',
    '\ueef3': '2',
    '\ue9cd': '0',
}
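
# A minimal helper sketch (not from the original post) for rebuilding num_map when the
# glyph codes rotate: it lists every private-use-area character found in a page's HTML
# so you can compare them against the digits rendered in the browser.
def find_pua_glyphs(html):
    import re
    # the custom-font glyphs live in the Unicode private-use area U+E000..U+F8FF
    return sorted(set(re.findall(r'[\ue000-\uf8ff]', html)))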






def gethtml(url):
    """Fetch a page with the logged-in headers and cookies and return its HTML text."""
    r = requests.get(url=url, headers=headers, cookies=cookies)
    r.encoding = 'utf-8'
    return r.text
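
# Rough sanity check (an assumption, not in the original script): with stale cookies the
# site may serve a verification or login page instead of the listing, in which case the
# element this scraper's XPath depends on (id="shop-all-list") will be missing.
def looks_like_listing(html):
    return 'shop-all-list' in html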


def shiftnumber(num_list):
    """Translate obfuscated webfont glyphs back to digits and join them into one string."""
    count = ''
    for num in num_list:
        # glyphs present in the map become digits; anything else passes through unchanged
        count += num_map.get(num, num)
    return count
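
# Usage example with the glyph codes from num_map above:
#   shiftnumber(['\ue36f', '\uea1a'])  ->  '98'
# Characters that are not in the map pass through unchanged.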
# name_list=[]
# total_score_list=[]
# evaluation_num_list =[]
# per_capita_list=[]
# taste_score_list=[]
# environment_score_list=[]
# service_score_list=[]

for i in range(1, 5):
    print('Crawling page {}'.format(i))
    url = 'http://www.dianping.com/changchun/ch10/g110p{}'.format(i)
    html = gethtml(url)
    tree = etree.HTML(html)
    #name = tree.xpath('//*[@id="shop-all-list"]/ul/li[1]/div[2]/div[1]/a/h4/text()')[0]
    li_list=tree.xpath('//*[@id="shop-all-list"]/ul/li')
    for li in li_list:
        name = li.xpath('.//div[@class="tit"]/a/h4/text()')[0]

        total_score = li.xpath('./div[2]/div[2]/div/div[2]/text()')[0]

        evaluation_num = li.xpath('./div[2]/div[2]/a[1]/b//text()')
        evaluation_num = shiftnumber(evaluation_num)

        per_capita = li.xpath('./div[2]/div[2]/a[2]/b//text()')
        per_capita = shiftnumber(per_capita)

        taste_score = li.xpath('./div[2]/span/span[1]/b//text()')
        taste_score = shiftnumber(taste_score)

        environment_score = li.xpath('./div[2]/span/span[2]/b//text()')
        environment_score = shiftnumber(environment_score)

        service_score = li.xpath('./div[2]/span/span[3]/b//text()')
        service_score = shiftnumber(service_score)

        print('Saving row...')
        # newline='' prevents the blank rows csv.writer otherwise produces on Windows
        with open('长春火锅店.csv', 'a', encoding='utf-8', newline='') as csvfile:
            writer = csv.writer(csvfile)
            #writer.writerow(['火锅店名称','总评分','评价人数','人均消费','口味','环境','服务'])
            writer.writerow([name, total_score, evaluation_num, per_capita, taste_score, environment_score, service_score])
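
# Variant sketch (not the original author's code): open the CSV once, write the header row
# a single time, and append all rows, instead of reopening the file for every shop.
def save_rows(rows, path='长春火锅店.csv'):
    with open(path, 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['火锅店名称', '总评分', '评价人数', '人均消费', '口味', '环境', '服务'])
        # each row: [name, total_score, evaluation_num, per_capita, taste_score, environment_score, service_score]
        writer.writerows(rows)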

 
