python 美团在某个模板爬虫

美团的爬虫首先是在主页按关键词搜索，
但是搜索得到的结果和页面上实际看到的内容有差异。
今天尝试用 selenium 写代码。
由于每次爬取时候不需要把页面全部加载出来，所以使用
desired_capabilities = DesiredCapabilities.CHROME
desired_capabilities["pageLoadStrategy"] = "eager"
这两行代码会让 driver.get 在页面加载一会儿之后就直接返回，程序可以马上跳转到下一个页面，而不必等到当前页面完全加载出来。

from selenium import webdriver
from lxml import etree
from fake_useragent import UserAgent
import re
import csv
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

def get_urls(first_page=19, last_page=24):
    """Build the listing-page URLs and queue them in the module-level ``urls``.

    Args:
        first_page: First page number to generate (inclusive).
        last_page: Last page number to generate (inclusive).

    Returns:
        The list of URLs generated by this call, in page order. The same
        URLs are also appended to the module-level ``urls`` list, which is
        what ``run()`` iterates over.
    """
    new_urls = [
        "https://sz.meituan.com/jiankangliren/pn" + str(page) + "/"
        for page in range(first_page, last_page + 1)
    ]
    urls.extend(new_urls)
    return new_urls
def parser_url(url, retries=5):
    """Load ``url`` in the shared ``driver`` and return the page source.

    On any error the verification-page URL is printed and the function
    blocks on ``input()`` until the operator presses Enter, then tries
    again (presumably so the anti-crawler check can be solved by hand
    first).

    Args:
        url: Address of the page to load.
        retries: Maximum number of load attempts.

    Returns:
        The page source as a string, or an empty string when every attempt
        failed, so callers always receive a ``str``.
    """
    for _ in range(retries):
        try:
            driver.get(url)
            # NOTE(review): implicit waits normally affect element lookups,
            # not page_source; kept from the original -- confirm it is needed.
            driver.implicitly_wait(2)
            return driver.page_source
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
            # SystemExit still stop the crawler.
            print("https://verify.meituan.com/v2/web/general_page?action=spiderindefence&requestCode=6816451805cd4bcca0a90b6c0e159536&platform=1000&adaptor=auto&succCallbackUrl=https%3A%2F%2Foptimus-mtsi.meituan.com%2Foptimus%2FverifyResult%3ForiginUrl%3Dhttp%253A%252F%252Fapi.mobile.meituan.com%252Fapi%252Fv4%252Fpoi%252Fpcsearch%252F30%253Fuuid%253D1e88feaf32a04f738452.1602773577.1.0.0%2526userid%253D-1%2526limit%253D32%2526offset%253D32%2526cateId%253D-1%2526q%253D%2525E5%252581%2525A5%2525E5%2525BA%2525B7%2525E4%2525B8%2525BD%2525E4%2525BA%2525BA")
            input()
    return ""

# Patterns are compiled once; the pattern text is unchanged from the original.
_POI_LINK_RE = re.compile(r'<a class=\"abstract-pic grey\" href=\"(.*?)\" style=\"width:220px;height:126px\" target="_blank\"\>')
_SHOP_NAME_RE = re.compile(r"name\":\"(.*?)\"\,\"score")
_SHOP_ADDRESS_RE = re.compile(r"address\"\:\"(.*?)\"\,\"p")
_SHOP_PHONE_RE = re.compile(r"phone\"\:\"(.*?)\"\,\"openTime")


def get_poiId(html):
    """Visit every shop linked from a listing page and write one CSV row each.

    The shop links are pulled out of ``html`` with a regex, each one is
    opened in the shared ``driver``, and the name, phone and address found
    in the detail page source are written through the module-level
    ``writer`` in the order name, phone, address.

    Args:
        html: Page source of a listing page. ``None`` or an empty string
            (for example when the page could not be loaded) is treated as
            "no shops" instead of raising ``TypeError``.

    Returns:
        None.
    """
    if not html:
        return

    poiIds = _POI_LINK_RE.findall(html)
    print(poiIds)
    for poiId in poiIds:
        # The captured href is prefixed with the scheme; presumably the
        # site emits protocol-relative links ("//host/path").
        link = "https:" + poiId
        driver.get(link)
        htm = driver.page_source
        try:
            name = _SHOP_NAME_RE.findall(htm)[0]
            address = _SHOP_ADDRESS_RE.findall(htm)[0]
            phone = _SHOP_PHONE_RE.findall(htm)[0]
            print(name, address, phone)
            writer.writerow([name, phone, address])
        except Exception as e:
            # A missing field raises IndexError above. Print the error and
            # pause until the operator presses Enter, then move on to the
            # next shop.
            print(e)
            input()

def run():
    """Drive one crawl: queue the listing URLs, then scrape each page.

    Calls ``get_urls()`` to fill the module-level ``urls`` list, then for
    each queued URL fetches the page source with ``parser_url`` and passes
    it to ``get_poiId``.
    """
    get_urls()
    for page_url in urls:
        get_poiId(parser_url(page_url))

# Script entry point. The guard must compare __name__ with '__main__'; the
# previous `name == 'main'` referenced an undefined variable.
if __name__ == '__main__':
    # Module-level state shared by get_urls(), parser_url(), get_poiId()
    # and run(): `urls`, `writer` and `driver`.
    urls = []

    # Make get() return early instead of waiting for the page to finish
    # loading.
    # NOTE(review): this dict is never passed to webdriver.Chrome below, so
    # the setting presumably takes effect only because the shared
    # DesiredCapabilities.CHROME dict is modified in place -- confirm
    # against the installed Selenium version.
    desired_capabilities = DesiredCapabilities.CHROME
    desired_capabilities["pageLoadStrategy"] = "eager"
    chromedriver = 'D:/浏览器/chromedriver.exe'

    # `with` guarantees the CSV is flushed and closed even if the crawl
    # fails part-way through.
    with open('sz丽人.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["名称", "电话", "地址"])
        driver = webdriver.Chrome(chromedriver)
        try:
            run()
        finally:
            # Always shut the browser down so no chromedriver process is
            # left behind.
            driver.quit()

posted on 2020-11-04 19:47 donghaoqian 阅读(389) 评论(0) 收藏举报

刷新页面返回顶部

donghaoqian

导航

公告

python 美团在某个模板爬虫