原来用 Go + chromedp 采集拼多多(pdd)商家后台的订单信息,结果在登录页面被提示“环境异常”,不允许扫码登录。
改为采集 web 端时,登录验证环节莫名无法发送短信验证码,遇到安全验证时也弹不出对话框;应该是哪个地方没配置好,但没有头绪。于是换 Python + Selenium 试试。
安装使用教程系列:https://blog.csdn.net/u011541946/category_6788788_1.html
结果 Python 也遇到了同样的问题;想直接访问 API,又被 anti_content 加密参数难倒。用 browsermob 捕获 network 请求,粗略地试了一下没成功,而且这个工具还需要 Java 环境。
那换个思路吧。让selenium接管一个已经打开了的页面
先将 chrome.exe 所在目录加入系统 PATH 环境变量,然后在 cmd 中运行:
chrome.exe --remote-debugging-port=9222 --user-data-dir="C:\selenum\AutomationProfile"
此时会打开一个chrome浏览器,试运行
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Attach to the Chrome instance that was started with
# --remote-debugging-port=9222 instead of launching a new browser.
chrome_options = Options()
chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")

# Raw string: the Windows path contains backslashes that would otherwise be
# parsed as (invalid) escape sequences such as "\P" and "\G".
chrome_driver = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
driver = webdriver.Chrome(chrome_driver, chrome_options=chrome_options)

# Prints the title of the page the attached session is currently on.
print(driver.title)
如果能打印出该tab下的网页title就成功接管了。
于是便开始了漫长的面向过程单线程之旅:
import math
import re
import time

import win32api
import win32con
import xlwt
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait  # mind the capitalisation of WebDriverWait

# Address of the Chrome instance started with --remote-debugging-port=9222.
DEBUGGER_ADDRESS = "127.0.0.1:9222"
# Raw string: the backslashes of the Windows path must not be read as escapes.
CHROME_DRIVER_PATH = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"

# Seconds to wait for an element to become visible before giving up.
WAIT_TIMEOUT = 15
# Pause after clicking a "reveal" link so the Ajax response can be rendered.
AJAX_SETTLE_SECONDS = 1.5

# Order-detail page: each container holds the text; its "/a/span" child is the
# link that reveals the masked value.
_DETAIL_BASE = '//*[@id="mf-mms-orders-container"]/div/div/div[5]/div[2]/div[1]'
RECIPIENT_XPATH = _DETAIL_BASE + '/div[1]/div[2]/div'
PHONE_LINK_XPATH = RECIPIENT_XPATH + '/a/span'
ADDRESS_XPATH = _DETAIL_BASE + '/div[2]/div[2]/div'
ADDRESS_LINK_XPATH = ADDRESS_XPATH + '/a/span'

# Order-list page: pager elements.
_PAGER_BASE = '//*[@id="mf-mms-orders-container"]/div/form/div[3]/div[2]/ul'
TOTAL_XPATH = _PAGER_BASE + '/li[1]'
PAGE_SIZE_INPUT_XPATH = _PAGER_BASE + '/li[2]/div/div/div/div/div/div/div/div[1]/input'
NEXT_PAGE_XPATH = _PAGER_BASE + '/li[6]/i'
PAGE_SIZE_50_XPATH = '/html/body/div[6]/div/div/div/div/ul/li[4]/span'

ORDER_DETAIL_URL = "https://mms.pinduoduo.com/orders/detail?type=4399&sn="
# Captures the value of every data-clipboard-text attribute in the page source.
ORDER_ID_PATTERN = re.compile(r"data-clipboard-text=\"(.*?)\" class=")


def before():
    """Return a WebDriver attached to the already-running Chrome instance.

    NOTE(review): the positional driver path and the ``chrome_options``
    keyword are the Selenium 3-era constructor arguments; newer Selenium
    releases expect ``service=`` / ``options=`` instead -- confirm against
    the installed version before upgrading.
    """
    chrome_options = Options()
    chrome_options.add_experimental_option("debuggerAddress", DEBUGGER_ADDRESS)
    return webdriver.Chrome(CHROME_DRIVER_PATH, chrome_options=chrome_options)


def _click_when_visible(driver, xpath):
    """Wait up to WAIT_TIMEOUT seconds for *xpath* to be visible, then click it."""
    WebDriverWait(driver, WAIT_TIMEOUT).until(
        EC.visibility_of_element_located((By.XPATH, xpath))
    ).click()


def _text_of(driver, xpath):
    """Return the visible text of the element located by *xpath*."""
    return driver.find_element(By.XPATH, xpath).text


def get_msg(driver):
    """Reveal and return the recipient and address text of an order-detail page.

    Clicks the two "reveal" links; whenever the text still contains the mask
    characters afterwards, a blocking message box asks the operator to pass
    the site's verification by hand before the values are read again.

    Args:
        driver: WebDriver whose current page is an order-detail page.

    Returns:
        The recipient text and the address text joined by a single space.
    """
    # Click "view phone number".
    _click_when_visible(driver, PHONE_LINK_XPATH)
    time.sleep(AJAX_SETTLE_SECONDS)
    # Still masked: block until the operator has passed the verification.
    if "*******" in _text_of(driver, RECIPIENT_XPATH):
        win32api.MessageBox(0, "请先通过验证再关闭此对话框", "收件人", win32con.MB_OK)

    # Click "view name and address".
    _click_when_visible(driver, ADDRESS_LINK_XPATH)
    time.sleep(AJAX_SETTLE_SECONDS)
    if "****" in _text_of(driver, ADDRESS_XPATH):
        win32api.MessageBox(0, "请先通过验证再关闭此对话框", "联系地址", win32con.MB_OK)

    # Read both fields again: they may have changed while a dialog was open.
    name = _text_of(driver, RECIPIENT_XPATH)
    address = _text_of(driver, ADDRESS_XPATH)
    return name + " " + address


def choose_50(driver):
    """Open the page-size drop-down of the order list and pick its 4th entry (50 per page)."""
    _click_when_visible(driver, PAGE_SIZE_INPUT_XPATH)
    _click_when_visible(driver, PAGE_SIZE_50_XPATH)


def get_count(driver, page):
    """Return how many times "next page" must be clicked to visit every page.

    Args:
        driver: WebDriver showing the order list.
        page: Number of orders displayed per page.

    Returns:
        ``ceil(total / page) - 1``; this is -1 when there are no orders, which
        still yields an empty ``range()`` in the caller.
    """
    total = int(get_total(driver))
    return math.ceil(total / page) - 1


def get_msg_by_orderid(order_id, driver):
    """Open the detail page of *order_id* in a new window and return its shipping text.

    Args:
        order_id: Order number appended to the detail-page URL.
        driver: WebDriver used to run the ``window.open`` script.

    Returns:
        The string produced by ``get_msg`` for that order.
    """
    js = 'window.open("' + ORDER_DETAIL_URL + order_id + '");'
    driver.execute_script(js)
    # A fresh session is attached to the browser for reading the detail page.
    # NOTE(review): presumably the new session lands on the window that was
    # just opened -- confirm, otherwise close() below closes the wrong window.
    driver_new = before()
    msg = get_msg(driver_new)
    driver_new.close()
    return msg


def get_orders(driver):
    """Return every ``data-clipboard-text`` value found in the current page source."""
    return ORDER_ID_PATTERN.findall(driver.page_source)


def get_total(driver):
    """Return the first run of digits in the pager's summary item, as a string.

    Raises:
        IndexError: If the summary text contains no digits.
    """
    summary = _text_of(driver, TOTAL_XPATH)
    return re.findall(r"\d+", summary)[0]


def excel_write(di, path="orders.xls"):
    """Write the collected orders to an Excel workbook.

    Args:
        di: Mapping of order number to shipping-information text.
        path: Output file name. xlwt produces the legacy ``.xls`` format, so
            the default uses that extension; a ``.xlsx`` name would give a
            file that Excel rejects as having a mismatched format.
    """
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('订单')
    # Header row; write() takes row, column, value.
    worksheet.write(0, 0, label='订单编号')
    worksheet.write(0, 1, label='发货信息')
    for row, (order_id, msg) in enumerate(di.items(), start=1):
        worksheet.write(row, 0, label=order_id)
        worksheet.write(row, 1, label=msg)
    workbook.save(path)


def action_thispage(driver):
    """Collect the shipping text of every order on the current list page into ``di``."""
    for order_id in get_orders(driver):
        di[order_id] = get_msg_by_orderid(order_id, driver)


def nextpage_click(driver):
    """Click the pager's "next page" arrow once it is visible."""
    _click_when_visible(driver, NEXT_PAGE_XPATH)


# Order number -> shipping text, filled by action_thispage().
di = {}


def main():
    """Walk through every page of the order list and export the results."""
    driver = before()
    action_thispage(driver)
    # NOTE(review): presumably this must equal the page size currently shown
    # in the browser; choose_50() is never called here -- confirm.
    page = 5
    for _ in range(get_count(driver, page)):
        nextpage_click(driver)
        time.sleep(1)  # give the next page a moment to load
        action_thispage(driver)
    excel_write(di)


if __name__ == '__main__':
    main()
可是中途遇到的安全验证实在太频繁了,每次验证都需要人脑思考参与,暂时没法做到由机器自动通过验证。平台这是在迫使用户使用官方工具……web 采集只好放弃了。