Python 爬虫:抓取美团某个板块的商家信息
美团的爬虫首先是在主页用关键词搜索
但是搜到的和看到的有差异
今天尝试用 selenium 写代码
由于每次爬取的时候不需要把页面全部加载出来,所以使用下面两行代码:
desired_capabilities = DesiredCapabilities.CHROME
desired_capabilities["pageLoadStrategy"] = "eager"
这两行代码会让 driver.get() 在页面加载一会儿之后就返回,于是可以马上跳转到下一个页面,而不必等到页面完全加载出来
from selenium import webdriver
from lxml import etree
from fake_useragent import UserAgent
import re
import csv
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
def get_urls(first_page=19, last_page=24):
    """Append Meituan listing-page URLs to the module-level ``urls`` list.

    One URL is built per page number by inserting the number into the
    ``https://sz.meituan.com/jiankangliren/pn<N>/`` address.

    Args:
        first_page: First page number to include. Defaults to 19.
        last_page: Last page number to include (inclusive). Defaults to 24.

    Returns:
        The list of URLs added by this call, in ascending page order. The
        list is empty when ``last_page`` is smaller than ``first_page``.
    """
    base = "https://sz.meituan.com/jiankangliren/pn"
    new_urls = [base + str(page) + "/" for page in range(first_page, last_page + 1)]
    # The rest of the script iterates the shared ``urls`` list, so keep
    # filling it in addition to returning the new entries.
    urls.extend(new_urls)
    return new_urls
def parser_url(url):
    """Load ``url`` in the shared Selenium ``driver`` and return its HTML.

    Up to five attempts are made. When an attempt raises, a Meituan
    verification-page URL is printed and the function blocks on ``input()``
    (presumably so the operator can pass the anti-crawler check by hand);
    pressing Enter starts the next attempt.

    Args:
        url: Address of the page to load.

    Returns:
        The page source as a string, or ``None`` when all five attempts
        failed.
    """
    verify_url = "https://verify.meituan.com/v2/web/general_page?action=spiderindefence&requestCode=6816451805cd4bcca0a90b6c0e159536&platform=1000&adaptor=auto&succCallbackUrl=https%3A%2F%2Foptimus-mtsi.meituan.com%2Foptimus%2FverifyResult%3ForiginUrl%3Dhttp%253A%252F%252Fapi.mobile.meituan.com%252Fapi%252Fv4%252Fpoi%252Fpcsearch%252F30%253Fuuid%253D1e88feaf32a04f738452.1602773577.1.0.0%2526userid%253D-1%2526limit%253D32%2526offset%253D32%2526cateId%253D-1%2526q%253D%2525E5%252581%2525A5%2525E5%2525BA%2525B7%2525E4%2525B8%2525BD%2525E4%2525BA%2525BA"
    for _ in range(5):
        try:
            driver.get(url)
            driver.implicitly_wait(2)
            return driver.page_source
        except Exception:
            # Catch Exception rather than using a bare ``except`` so that
            # Ctrl+C (KeyboardInterrupt) can still stop the crawl.
            print(verify_url)
            input()
    # Every attempt failed; callers must be prepared for None.
    return None
def get_poiId(html):
    """Visit each shop linked from a listing page and write its details to CSV.

    Shop links are pulled out of ``html`` with a regular expression and
    printed. Each link is then opened in the shared ``driver``; the shop's
    name, address and phone are extracted from the detail page source,
    printed, and written through the module-level ``writer`` as
    ``[name, phone, address]``.

    If loading or parsing a detail page raises, the exception is printed and
    the function blocks on ``input()`` until Enter is pressed, then moves on
    to the next shop.

    Args:
        html: Page source of a listing page, or ``None`` when the page could
            not be loaded (in which case nothing is done).

    Returns:
        None.
    """
    if html is None:
        # parser_url yields None after all of its retries fail; there is
        # nothing to parse in that case.
        return

    poiIds = re.findall(r'<a class=\"abstract-pic grey\" href=\"(.*?)\" style=\"width:220px;height:126px\" target="_blank\"\>', html)
    print(poiIds)
    for poiId in poiIds:
        # The captured hrefs are prefixed with the scheme before being opened.
        link = "https:" + poiId
        try:
            # Navigation is inside the try so that one detail page that
            # fails to load does not abort the remaining shops.
            driver.get(link)
            htm = driver.page_source
            # [0] raises IndexError when a field is missing from the page;
            # that is handled below like any other per-shop failure.
            name = re.findall(r"name\":\"(.*?)\"\,\"score", htm)[0]
            address = re.findall(r"address\"\:\"(.*?)\"\,\"p", htm)[0]
            phone = re.findall(r"phone\"\:\"(.*?)\"\,\"openTime", htm)[0]
            print(name, address, phone)
            writer.writerow([name, phone, address])
        except Exception as e:
            print(e)
            # Pause until the operator presses Enter before continuing.
            input()
def run():
    """Crawl every listing page and export the shops found on each one.

    Fills the module-level ``urls`` list via ``get_urls()``, then loads each
    URL with ``parser_url`` and hands the resulting HTML to ``get_poiId``.
    Pages that could not be loaded are reported and skipped.
    """
    get_urls()
    for url in urls:
        html = parser_url(url)
        if html is None:
            # parser_url returns None once its retries are exhausted;
            # skip the page instead of passing None on to the regex parser.
            print("failed to load, skipped:", url)
            continue
        get_poiId(html)
# Script entry point: set up the shared globals (urls, writer, driver) used by
# the functions above, run the crawl, and release the file and browser.
if __name__ == '__main__':
    urls = []

    # Make driver.get() return early instead of waiting for the page to
    # finish loading. Work on a copy so the shared DesiredCapabilities.CHROME
    # dict is not modified.
    desired_capabilities = DesiredCapabilities.CHROME.copy()
    desired_capabilities["pageLoadStrategy"] = "eager"
    chromedriver = 'D:/浏览器/chromedriver.exe'

    # ``with`` guarantees the CSV is flushed and closed even if the crawl fails.
    with open('sz丽人.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["名称", "电话", "地址"])
        # The capabilities must be handed to the driver; otherwise the
        # "eager" page-load strategy configured above never takes effect.
        driver = webdriver.Chrome(chromedriver, desired_capabilities=desired_capabilities)
        try:
            run()
        finally:
            # Always shut the browser down so no chromedriver process is left behind.
            driver.quit()
posted on 2020-11-04 19:47 donghaoqian 阅读(389) 评论(0) 收藏 举报
浙公网安备 33010602011771号