原来用的 go + chromedp 采集pdd商家后台订单信息,结果登录页面的时候说环境异常不给扫码

采集web端的时候验证登录莫名不能发送短信验证码,遇到安全验证弹不出对话框,应该是哪个地方没配置好,没头绪。换python+selenium试试

安装使用教程系列:https://blog.csdn.net/u011541946/category_6788788_1.html

 

于是 Python 也遇到了同样的问题:想直接访问 API,却被 anti_content 加密参数难倒;用 browsermob 捕获 network 请求粗略地试了一下也没成功,而且它还需要 Java 环境。

那换个思路吧。让selenium接管一个已经打开了的页面

先将chrome的路径放在path下,cmd运行

chrome.exe --remote-debugging-port=9222 --user-data-dir="C:\selenum\AutomationProfile"

此时会打开一个chrome浏览器,试运行

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
 
# Attach to the Chrome instance that was started with
# --remote-debugging-port=9222 instead of launching a new browser.
chrome_options = Options()
chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
# Raw string: the backslashes in the Windows path must not be parsed as
# escape sequences (the non-raw form triggers invalid-escape warnings).
chrome_driver = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
driver = webdriver.Chrome(chrome_driver, chrome_options=chrome_options)
# Printing the title of the attached tab confirms the takeover worked.
print(driver.title)

如果能打印出该tab下的网页title就成功接管了。

 

于是便开始了漫长的面向过程单线程之旅:

import time
import re
import xlwt
import math
import win32api,win32con
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait       #WebDriverWait注意大小写
from selenium.webdriver.common.by import By

def before():
    """Create a WebDriver attached to an already-running Chrome instance.

    The driver connects to the remote-debugging endpoint at 127.0.0.1:9222
    rather than starting a new browser.

    Returns:
        The ``webdriver.Chrome`` object connected through the debugger address.
    """
    chrome_options = Options()
    chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
    # Raw string: the backslashes in the Windows path must not be parsed as
    # escape sequences (the non-raw form triggers invalid-escape warnings).
    chrome_driver = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
    driver = webdriver.Chrome(chrome_driver, chrome_options=chrome_options)
    return driver

def get_msg(driver):
    """Reveal and read the recipient details on an order-detail page.

    For the recipient row and then the address row, the "view" link is
    clicked, the code pauses for the response, and the row text is inspected.
    If the text still contains the asterisk mask, a blocking message box asks
    the operator to pass the site's verification before continuing. Both rows
    are read again at the end so the returned text reflects the state after
    any verification.

    Args:
        driver: WebDriver whose current page is the order-detail page.

    Returns:
        The recipient row text and the address row text joined by one space.
    """
    info_root = '//*[@id="mf-mms-orders-container"]/div/div/div[5]/div[2]/div[1]'
    name_xpath = info_root + '/div[1]/div[2]/div'
    address_xpath = info_root + '/div[2]/div[2]/div'
    wait = WebDriverWait(driver, 15)

    # (row xpath, mask that means "still hidden", message-box title)
    rows = (
        (name_xpath, "*******", "收件人"),
        (address_xpath, "****", "联系地址"),
    )
    for row_xpath, mask, title in rows:
        # The wait returns the located element, so click it directly instead
        # of looking it up a second time.
        reveal_link = wait.until(
            EC.visibility_of_element_located((By.XPATH, row_xpath + '/a/span'))
        )
        reveal_link.click()
        # Pause 1.5 s so the Ajax response has time to fill in the row.
        time.sleep(1.5)
        if mask in driver.find_element(By.XPATH, row_xpath).text:
            # Blocks until the operator has solved the verification manually.
            win32api.MessageBox(0, "请先通过验证再关闭此对话框", title, win32con.MB_OK)

    # Re-read both rows: their content may have changed after verification.
    name = driver.find_element(By.XPATH, name_xpath).text
    address = driver.find_element(By.XPATH, address_xpath).text
    return name + " " + address


def choose_50(driver):
    """Switch the order list to 50 rows per page.

    Opens the page-size dropdown in the pager and clicks its fourth option,
    which the original author's comment identifies as the "50" entry.

    Args:
        driver: WebDriver whose current page is the order list.
    """
    wait = WebDriverWait(driver, 15)
    # Open the page-size dropdown; the wait returns the element to click.
    wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="mf-mms-orders-container"]/div/form/div[3]/div[2]/ul/li[2]/div/div/div/div/div/div/div/div[1]/input'))).click()
    # Pick the fourth entry of the popup list once it is visible.
    wait.until(EC.visibility_of_element_located((By.XPATH, '/html/body/div[6]/div/div/div/div/ul/li[4]/span'))).click()

def get_count(driver,page):
    """Return how many times "next page" must be clicked to visit every page.

    Args:
        driver: WebDriver passed through to get_total to read the order total.
        page: Number of orders shown per page.

    Returns:
        The number of pages needed for all orders, minus one for the page
        that is already displayed.
    """
    total = int(get_total(driver))
    full_pages = math.floor(total / page)
    if total % page != 0:
        # A trailing partial page adds one page, which cancels the "- 1"
        # for the page already on screen.
        return full_pages
    return full_pages - 1

def get_msg_by_orderid(order_id,driver):
    """Open an order's detail page in a new tab and return its recipient text.

    Args:
        order_id: Order number appended to the detail URL as the ``sn`` value.
        driver: WebDriver used to run the ``window.open`` script.

    Returns:
        The text produced by get_msg for the opened detail page.
    """
    js = 'window.open("https://mms.pinduoduo.com/orders/detail?type=4399&sn=' + order_id + '");'
    driver.execute_script(js)
    # A second session is attached through before() to drive the detail page.
    # NOTE(review): presumably this session lands on the newly opened tab - confirm.
    driver_new = before()
    try:
        return get_msg(driver_new)
    finally:
        # Always close the detail tab, even when reading it failed, so tabs
        # do not pile up across orders.
        driver_new.close()

def get_orders(driver):
    """Return every order number found in the current page's HTML.

    Order numbers are taken from the ``data-clipboard-text`` attribute values
    that are immediately followed by a ``class`` attribute in the page source.

    Args:
        driver: Object exposing the page HTML as ``page_source``.

    Returns:
        A list of the captured attribute values, in document order.
    """
    return re.findall(r"data-clipboard-text=\"(.*?)\" class=", driver.page_source)

def get_total(driver):
    """Return the total order count shown in the pager, as a digit string.

    Args:
        driver: WebDriver whose current page is the order list.

    Returns:
        The first run of digits found in the pager's summary text.

    Raises:
        IndexError: If the summary text contains no digits.
    """
    summary_text = driver.find_element(By.XPATH, '//*[@id="mf-mms-orders-container"]/div/form/div[3]/div[2]/ul/li[1]').text
    # Raw string so that \d is a regex class rather than an invalid str escape.
    return re.findall(r"\d+", summary_text)[0]
    
def excel_write(di, path='orders.xls'):
    """Write the collected orders to an Excel workbook.

    The sheet has a header row followed by one row per order: the order
    number in the first column and its shipping information in the second.

    Args:
        di: Mapping of order number to shipping-information text.
        path: Output file name. xlwt produces the legacy ``.xls`` format, so
            the default uses that extension; saving the same bytes under
            ``.xlsx`` yields a file whose extension does not match its format.
    """
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('订单')

    # Header row; write() takes (row, column, value).
    worksheet.write(0, 0, label='订单编号')
    worksheet.write(0, 1, label='发货信息')

    # Data rows start at row 1, directly below the header.
    for row, (order_id, shipping_info) in enumerate(di.items(), start=1):
        worksheet.write(row, 0, label=order_id)
        worksheet.write(row, 1, label=shipping_info)

    workbook.save(path)

def action_thispage(driver):
    """Collect the recipient text for every order on the current list page.

    Each order number returned by get_orders is looked up through
    get_msg_by_orderid, and the result is stored in the module-level ``di``
    mapping under that order number.

    Args:
        driver: WebDriver whose current page is the order list.
    """
    for order_id in get_orders(driver):
        di[order_id] = get_msg_by_orderid(order_id, driver)
    

def nextpage_click(driver):
    """Click the pager's "next page" control once it is visible.

    Args:
        driver: WebDriver whose current page is the order list.
    """
    next_button = WebDriverWait(driver, 15).until(
        EC.visibility_of_element_located((By.XPATH, '//*[@id="mf-mms-orders-container"]/div/form/div[3]/div[2]/ul/li[6]/i'))
    )
    # The wait already returns the located element; no second lookup needed.
    next_button.click()
    
# Module-level accumulator: action_thispage stores each order number with the
# text returned by get_msg_by_orderid, and main passes it to excel_write.
di = {}

def main():
    """Scrape every page of the order list and export the result to Excel.

    The first page is processed as-is; the "next page" control is then
    clicked get_count(driver, page) times, processing each page after a
    short pause, and finally the accumulated ``di`` mapping is written out.
    """
    driver = before()
    action_thispage(driver)
    # Page size used to work out how many "next page" clicks are needed.
    # NOTE(review): choose_50 is not called here - confirm that 5 matches the
    # page size the list actually displays.
    page = 5
    for _ in range(get_count(driver, page)):
        nextpage_click(driver)
        # Give the next page a moment to render before reading it.
        time.sleep(1)
        action_thispage(driver)
    excel_write(di)


if __name__ == '__main__':
    main()

可中途遇到的安全验证实在太频繁了,每个验证都需要人脑思考参与,暂时没法做到机器过验证。迫使用户使用官方工具...web采集放弃了

posted on 2020-11-08 12:30  longzhankunlun  阅读(621)  评论(0)  编辑  收藏  举报