网页爬虫之路 x点评

 

import requests
from lxml import etree

'''
一、目标网页: http://www.dianping.com/shop/77335766

二、反爬虫:
1.UA检测
2.Cookie检测
3.字体加密

'''

# Request headers sent with the page fetch. The notes at the top of this
# script list UA and Cookie checks among the site's anti-scraping measures,
# so both headers are supplied explicitly.
# NOTE(review): the Cookie value appears to be a captured logged-in browser
# session (it seems to embed account identifiers). It will expire, and it
# should not live in source control -- consider loading it from an
# environment variable or a local, untracked config file.
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
    'Cookie': 'fspop=test; _lxsdk_cuid=171e5083736c8-0cf3cf05dda8b-c373667-e1000-171e5083737c8; _lxsdk=171e5083736c8-0cf3cf05dda8b-c373667-e1000-171e5083737c8; _hc.v=38d6cc50-27ac-072e-52d8-3632b4e191ba.1588685454; t_lxid=171e5083a49c8-07b2bbebce1436-c373667-e1000-171e5083a49c8-tid; s_ViewType=10; cy=143; cye=jiangxifuzhou; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _dp.ac.v=3feecfad-7f29-49e1-b007-a2ee1f1cb733; lgtoken=0c5c53d0f-c9d8-40d0-9c76-affb4cb4415b; dplet=27544c8252d97b1f6c2aeb7a481b4735; dper=d3ccd39686ce46f20cf42af37cce81448e8c9056ecc7d42db96faab64eff52a39cae246d30684917187f6ad7f59a223ca1372059d418a9297694848a7d21352bb5fc293f538e9ff502d66b5260b75a9a512443432e58201990e12317eb1d420a; ll=7fd06e815b796be3df069dec7836c3df; ua=dpuser_5381154977; ctu=4fbb3e3eaa8d5826f4aea3e6d127ee37a92b87677cf1e3bd99096cd41c7481c8; uamo=18879404697; _lxsdk_s=171f811b380-ee0-d98-00d%7C%7C191'
}

url = 'http://www.dianping.com/shop/77335766'

# requests applies no timeout by default; bound the wait so a stalled
# connection cannot hang the script forever.
resp = requests.get(url, headers=headers, timeout=10)
# Fail loudly on 4xx/5xx rather than parsing an error page as if it were
# the shop page.
resp.raise_for_status()

# etree.HTML may yield None for an empty/unparseable body; treat that the
# same as "expected node not found" below.
html = etree.HTML(resp.text)
shop_name_nodes = (
    html.xpath("//h1[@class='shop-name']/text()") if html is not None else []
)
if not shop_name_nodes:
    # Without this guard the script died with a bare IndexError, which gave
    # no hint that the page served was not the expected shop page.
    raise RuntimeError(
        'shop-name element not found in the response from %s; the request '
        'may have been blocked or the Cookie may have expired' % url
    )
res_shopName = shop_name_nodes[0]

# All text fragments under the phone-number paragraph. As the sample output
# below shows, most digits come back as private-use code points (e.g.
# '\ue47c'); the notes above attribute this to the site's font encryption.
# Decoding them is not handled here.
res_shopNumber = html.xpath("//div[@id='basic-info']/p[@class='expand-info tel']//text()")

print(res_shopName, '\n', res_shopNumber)
# Sample output:
#  麦当劳(罗宾森购物中心店)
#  [' ', '电话:', ' ', '\ue47c', '\ue0c9', '1-', '\ue556', '\ue135', '\ue135', '\ue135', '1', '\ue0c9', '** ']
v1

 

#  麦当劳(罗宾森购物中心店)
# [' ', '电话:', ' ', '\ue47c', '\ue0c9', '1-', '\ue556', '\ue135', '\ue135', '\ue135', '1', '\ue0c9', '** ']

 

 

 

posted @ 2020-05-05 21:47  沐风先生  阅读(96)  评论(0)    收藏  举报