import datetime
import time
import json
import self as self
import MessageToSlack
import os
import pymysql.cursors
import random
import sys
import undetected_chromedriver.v2 as uc
from decouple import config
from functools import reduce
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from dateutil.relativedelta import relativedelta
# 用来驱动浏览器的
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# from decouple import config
class Cainiao_Scrape:
    """Scrape the Cainiao sign-approve detail list through an attached Chrome.

    The scraper attaches to an already running Chrome via its remote
    debugging address, drives the customer search page, reads the JSON
    responses out of Chrome's performance log and stores them in MySQL.
    Progress and errors are reported through ``MessageToSlack``.
    """

    # Class-level defaults are kept for backward compatibility; __init__ gives
    # every instance its own lists so state is not shared between instances.
    need_customer_data = []
    discount_rate_data = []
    parameter_setting_data = []
    page_count = 0

    _SCREENSHOT_DIR = "ScreenCap"
    _SLIDER_RETRIES = 6
    _APPROVE_DETAIL_URL = 'https://merchant.finance.cainiao.com/funds/signApproveDetail/signApproveDetailData.do'
    _DAILYLOG_TABLE = 'cai_niao_company_discountrate_dailylogs'
    # (table column, key in the scraped record, optional converter), in the
    # column order used by both the INSERT and the UPDATE statement.
    _DAILYLOG_VALUE_COLUMNS = (
        ('all_assets_value', 'allAssetsValue', None),
        ('assets_receivable_value', 'assetsReceivableValue', None),
        ('central_good_value', 'centralGoodValue', None),
        ('central_pledge_rate', 'centralPledgeRate', int),
        ('currency', 'currency', None),
        ('discount_assets_receivable_value', 'discountAssetsReceivableValue', None),
        ('goods_owner_name', 'goodsOwnerName', None),
        ('goods_value', 'goodsValue', None),
        ('head_trip_pledge_rate', 'headTripPledgeRate', int),
        ('head_trip_value', 'headTripValue', None),
        ('pledge_goods_value', 'pledgeGoodsValue', None),
        ('pledge_rate', 'pledgeRate', int),
        ('receivable_pledge_rate', 'receivablePledgeRate', int),
        ('store_name', 'storeName', None),
        ('store_t_b_id', 'storeTBId', None),
    )

    def __init__(self, web_url, pro_db_host, pro_db_user, pro_db_pwd, pro_db_name, u02_db_host, u02_db_user, u02_db_pwd,
                 u02_db_name, u01_db_host, u01_db_user, u01_db_pwd, u01_db_name, cainiao_ac, cainiao_pwd):
        """Store the target URL, the three database credential sets and the login.

        No browser or database connection is opened here; ``driver`` stays
        ``None`` until ``setup_method`` runs.
        """
        self.vars = None
        self.driver = None
        self.web_url = web_url
        # production
        self.pro_db_host = pro_db_host
        self.pro_db_user = pro_db_user
        self.pro_db_pwd = pro_db_pwd
        self.pro_db_name = pro_db_name
        # u02
        self.u02_db_host = u02_db_host
        self.u02_db_user = u02_db_user
        self.u02_db_pwd = u02_db_pwd
        self.u02_db_name = u02_db_name
        # u01
        self.u01_db_host = u01_db_host
        self.u01_db_user = u01_db_user
        self.u01_db_pwd = u01_db_pwd
        self.u01_db_name = u01_db_name
        self.cainiao_ac = cainiao_ac
        self.cainiao_pwd = cainiao_pwd
        # Per-instance result buffers (shadow the shared class-level lists).
        self.need_customer_data = []
        self.discount_rate_data = []
        self.parameter_setting_data = []

    # ------------------------------------------------------------------
    # Browser handling
    # ------------------------------------------------------------------
    def setup_method(self, method):
        """Attach a Chrome driver to the browser listening on 127.0.0.1:9222.

        Creates the screenshot folder, enables browser and performance
        logging and stores the driver on ``self.driver``. ``method`` is
        accepted for interface compatibility and is not used. On any failure
        the error is reported to Slack and the process exits with status 1.
        """
        debugger_address = "127.0.0.1:9222"
        try:
            os.makedirs(self._SCREENSHOT_DIR, exist_ok=True)
            chrome_options = webdriver.ChromeOptions()
            chrome_options.add_experimental_option(
                "debuggerAddress", debugger_address)
            # Copy first: DesiredCapabilities.CHROME is a shared class-level
            # dict and must not be mutated in place.
            capabilities = DesiredCapabilities.CHROME.copy()
            capabilities["goog:loggingPrefs"] = {
                'browser': 'ALL', "performance": "ALL"}
            capabilities["goog:perfLoggingPrefs"] = {'enableNetwork': True}
            # '--no-sandbox', '--headless' and '--disable-gpu' are left disabled.
            for argument in (
                "--disable-extensions",
                "--disable-popup-blocking",
                "--ignore-certificate-errors",
                "--disable-plugins-discovery",
                '--no-first-run',
                '--no-service-autorun',
                '--no-default-browser-check',
                'no-startup-window',
                '--disable-dev-shm-usage',
            ):
                chrome_options.add_argument(argument)
            self.driver = webdriver.Chrome(
                options=chrome_options, desired_capabilities=capabilities)
            self.vars = {}
            MessageToSlack.post_message_to_slack("Start Chrome Driver")
        except Exception as r:
            self.teardown_method()
            MessageToSlack.post_message_to_slack(
                "浏览器配置setup_method抛出异常:%s" % str(r))
            sys.exit(1)

    def teardown_method(self):
        """Report the current page title (when a driver exists) and the shutdown.

        The browser itself is not closed; this only posts the two status
        messages. It is safe to call before a driver was created.
        """
        if self.driver is not None:
            try:
                title = self.driver.title
            except Exception as title_error:
                # Never let a dead browser hide the error being reported.
                title = "<unavailable: %s>" % title_error
            MessageToSlack.post_message_to_slack(
                "浏览器要关闭时打开的页面的标题: %s" % title)
        MessageToSlack.post_message_to_slack("Browser Close")

    def _capture_screenshot(self):
        """Save a timestamped screenshot into the screenshot folder, best effort."""
        if self.driver is None:
            return
        try:
            self.driver.get_screenshot_as_file(
                "./ScreenCap/" + datetime.datetime.now().strftime("%Y%m%d_%H%M%p") + ".png")
        except Exception as shot_error:
            MessageToSlack.post_message_to_slack(
                "Failed to capture screenshot: %s" % str(shot_error))

    def _open_data_page(self):
        """Load ``web_url``, retrying once if the first attempt raises."""
        try:
            self.driver.get(self.web_url)
        except Exception as get_error:
            MessageToSlack.post_message_to_slack(str(get_error))
            MessageToSlack.post_message_to_slack('get that page again')
            self.driver.get(self.web_url)

    def _click(self, by, locator):
        """Find one element and click it."""
        self.driver.find_element(by, locator).click()

    # ------------------------------------------------------------------
    # Login flow
    # ------------------------------------------------------------------
    def login_cainiao(self, perform_login=False):
        """Open the data page, scrape the customer list and report the result.

        Args:
            perform_login: When true, fill in the login form before loading
                the data page and log out after scraping. The default skips
                both steps, matching the flow that attaches to a local,
                already authenticated browser.

        On any failure a screenshot is taken, the error is reported to Slack
        and the process exits with status 1.
        """
        try:
            MessageToSlack.post_message_to_slack("Go to cainiao website")
            self._open_data_page()
            time.sleep(random.randint(4, 6))
            if perform_login:
                self._login_with_credentials()
            self._open_data_page()
            time.sleep(random.randint(4, 6))
            MessageToSlack.post_message_to_slack("Get Data Page")
            MessageToSlack.post_message_to_slack(
                "此时浏览器打开的页面的标题:%s" % self.driver.title)
            # Query every company inside the date range.
            self.query_all_customer()
            time.sleep(2)
            MessageToSlack.post_message_to_slack(self.parameter_setting_data)
            MessageToSlack.post_message_to_slack("Data Scrape finished")
            if perform_login:
                self._logout()
        except Exception as tc:
            self._capture_screenshot()
            MessageToSlack.post_message_to_slack(
                "菜鸟登录的函数login_cainiao抛出异常:%s" % str(tc))
            sys.exit(1)

    def _login_with_credentials(self):
        """Fill in account and password inside the login frame and submit."""
        self.driver.switch_to.frame(0)
        time.sleep(random.randint(4, 6))
        # Account field.
        self._click(By.XPATH, "//input[@id='fm-login-id']")
        time.sleep(2)
        self.driver.find_element(
            By.XPATH, "//input[@id='fm-login-id']").send_keys(self.cainiao_ac)
        # Password field.
        self._click(By.XPATH, "//input[@id='fm-login-password']")
        time.sleep(2)
        self.driver.find_element(
            By.XPATH, "//input[@id='fm-login-password']").send_keys(self.cainiao_pwd)
        # Slider verification, when shown.
        self.slide_auth()
        self._click(By.XPATH, "//button[@type='submit']")
        MessageToSlack.post_message_to_slack('pressed login button')
        # A quick-entry confirmation may appear; it is optional.
        try:
            self.driver.switch_to.frame(0)
            self._click(By.XPATH, "//button[@type='submit']")
            MessageToSlack.post_message_to_slack('pressed quick entry button')
        except Exception as gin1:
            MessageToSlack.post_message_to_slack(str(gin1))
            MessageToSlack.post_message_to_slack(
                'pressed quick entry button Unable to locate element')
        time.sleep(random.randint(4, 6))

    def _logout(self):
        """Open the account menu and press the log-out link."""
        self.driver.switch_to.default_content()
        self._click(By.CSS_SELECTOR, ".cct-nickname")
        time.sleep(1)
        self._click(By.LINK_TEXT, "退出")
        MessageToSlack.post_message_to_slack("logged out")

    def _slider_challenge_visible(self):
        """Return True when the slider verification container is displayed."""
        # find_elements returns an empty list instead of raising when the
        # element is absent, so "no challenge" is not treated as an error.
        matches = self.driver.find_elements(By.ID, "baxia-password")
        return bool(matches) and matches[0].is_displayed()

    def _drag_slider(self, retry=False):
        """Drag the slider handle across its track inside the dialog frame.

        On a retry the captcha wrapper is clicked first, and the drag is
        released without the extra pause used on the first attempt.
        """
        self.driver.switch_to.frame('baxia-dialog-content')
        if retry:
            self._click(By.ID, "nocaptcha")
            time.sleep(random.randint(2, 3))
        handle = self.driver.find_element(By.ID, "nc_1_n1z")
        track = self.driver.find_element(By.ID, "nc_1__scale_text")
        distance = track.size['width'] - handle.size['width']
        action = ActionChains(self.driver)
        action.move_to_element(handle).pause(1).click_and_hold(
            handle).pause(1).move_by_offset(distance, 0)
        if not retry:
            action.pause(1)
        action.release()
        action.perform()
        MessageToSlack.post_message_to_slack("Slide the verification bar")
        self.driver.switch_to.parent_frame()

    def slide_auth(self):
        """Solve the slider verification if it is shown, retrying a few times.

        Does nothing when no slider is displayed. If the slider is still
        displayed after the retries, or anything else fails, a screenshot is
        taken, the error is reported and the process exits with status 1.
        """
        try:
            time.sleep(random.randint(4, 6))
            if self._slider_challenge_visible():
                self._drag_slider()
                time.sleep(2)
                retries = 0
                while self._slider_challenge_visible() and retries < self._SLIDER_RETRIES:
                    self._drag_slider(retry=True)
                    time.sleep(random.randint(2, 3))
                    retries += 1
                if self._slider_challenge_visible():
                    raise Exception(
                        "Still cannot pass the slide certification after %d retries"
                        % self._SLIDER_RETRIES)
        except Exception as s:
            self._capture_screenshot()
            MessageToSlack.post_message_to_slack(
                "获取拖动按钮位置并拖动slide_auth抛出异常:%s" % str(s))
            self.teardown_method()
            sys.exit(1)

    # ------------------------------------------------------------------
    # Scraping
    # ------------------------------------------------------------------
    def query_all_customer(self):
        """Search from two years ago and analyse every result page.

        Picks the start date in the calendar widget, runs the search, raises
        the page size when more than 10 rows are reported and feeds the
        performance log of each result page to
        ``log_analysis_customer_query``.

        Raises:
            Exception: when the search form is missing or not displayed.
        """
        search_forms = self.driver.find_elements(
            By.XPATH, '//form[@id="J_Form"]')
        if not (search_forms and search_forms[0].is_displayed()):
            raise Exception(
                "cannot find this search form in customer list page. Please check is login problem or other")
        MessageToSlack.post_message_to_slack(
            'customer name search form exists')

        # Start of the date range: today two years ago.
        two_yrs_ago = datetime.datetime.now() - relativedelta(years=2)
        year_selector = '.c-cal-year[data-year="%s"]' % two_yrs_ago.year
        month_selector = '.c-cal-month[data-month="%s"]' % two_yrs_ago.month
        day_selector = '.c-cal-day[data-day="%s"]' % two_yrs_ago.day

        # Open the date picker, then choose year, month and day.
        self._click(By.XPATH, "//div[@id='c-calendar-select-66']")
        time.sleep(1)
        self._click(By.XPATH, '//*[@id="c-calendar-87"]/div/div[1]/div[1]')
        time.sleep(1)
        self._click(By.CSS_SELECTOR, year_selector)
        self._click(By.XPATH, '//*[@id="c-calendar-87"]/div/div[1]/div[2]')
        time.sleep(1)
        self._click(By.CSS_SELECTOR, month_selector)
        self._click(By.CSS_SELECTOR, day_selector)
        # Confirm the date, then run the search.
        self._click(By.XPATH, '//*[@id="c-calendar-81"]/div[3]/button')
        self._click(By.XPATH, "//div[@id='J_search']")
        MessageToSlack.post_message_to_slack('press the search div!')
        time.sleep(random.randint(2, 4))

        paging_total_num = int(self.driver.find_element(
            By.CSS_SELECTOR, '#J_Paging > div.paging-total-num > span').text)
        MessageToSlack.post_message_to_slack(
            'paging_total_num是:%s' % paging_total_num)
        if paging_total_num > 10:
            # Open the page-size selector and pick its last (largest) entry.
            self._click(By.XPATH, '//*[@id="c-select-226"]')
            time.sleep(1)
            self._click(By.XPATH, '//ul[@id="c-list-251"]/li[last()]/span')
            time.sleep(random.randint(2, 4))

        # Per the original note, log entries are removed once read, so each
        # get_log call returns only what happened since the previous call.
        # NOTE(review): when the page size was changed above, this first read
        # may also contain the response of the initial search - confirm
        # whether that produces duplicate page-1 rows.
        self.log_analysis_customer_query(
            self.driver.get_log("performance"), 1)

        paging_total_page = int(self.driver.find_element(
            By.CSS_SELECTOR, '#J_Paging > div.paging-total-page > span').text)
        MessageToSlack.post_message_to_slack(
            'paging_total_page是%s' % paging_total_page)
        for page_number in range(2, paging_total_page + 1):
            next_button = self.driver.find_element(
                By.CSS_SELECTOR, '#J_Paging > button.btn.btn-default.btn-sm.paging-next-btn.c-icon.c-icon-arrow-right')
            if not next_button.is_enabled():
                break
            next_button.click()
            time.sleep(random.randint(2, 4))
            self.log_analysis_customer_query(
                self.driver.get_log("performance"), page_number)

    def log_analysis_customer_query(self, performance_log_customer_query, page):
        """Extract the customer list responses from a performance log.

        For every ``Network.responseReceived`` entry of the approve-detail
        endpoint the response body is fetched, tagged with ``page``, stored
        as a snapshot, and each approved row is appended to
        ``parameter_setting_data``. A failure on one response is reported and
        does not stop the remaining entries.
        """
        for entry in performance_log_customer_query:
            event = json.loads(entry['message'])['message']
            if str(event['method']) != 'Network.responseReceived':
                continue
            if str(event['params']['response']['url']) != self._APPROVE_DETAIL_URL:
                continue
            request_id = str(event['params']['requestId'])
            try:
                body = self.driver.execute_cdp_cmd(
                    'Network.getResponseBody', {'requestId': request_id})['body']
                cqd = json.loads(body)
                cqd['data']['page'] = page
                # Keep the raw response in every configured database.
                self.save_to_discountrate_snapshots(cqd)
                if cqd['data']['total'] > 0:
                    for item in cqd['data']['resultList']:
                        if item['status'] != 'FUND_APPROVE_PASS':
                            continue
                        self.parameter_setting_data.append({
                            'approveNo': item['approveNo'],
                            'customer_name': item['companyName']
                        })
                else:
                    MessageToSlack.post_message_to_slack('No data was found')
            except Exception as customer_query_e:
                MessageToSlack.post_message_to_slack(
                    'Problem after press the search button! ' + str(customer_query_e))
        MessageToSlack.post_message_to_slack('数据已存入parameter_setting_data中')

    # ------------------------------------------------------------------
    # Database access
    # ------------------------------------------------------------------
    def _active_databases(self):
        """Return (host, user, password, name) for each database written to.

        The u02 database is configured but currently not written to.
        """
        return [
            (self.pro_db_host, self.pro_db_user, self.pro_db_pwd, self.pro_db_name),
            (self.u01_db_host, self.u01_db_user, self.u01_db_pwd, self.u01_db_name),
        ]

    @staticmethod
    def _connect(db_host, db_user, db_pwd, db_name):
        """Open a PyMySQL connection that returns rows as dicts."""
        return pymysql.connect(
            host=db_host, user=db_user, password=db_pwd, database=db_name,
            cursorclass=pymysql.cursors.DictCursor)

    @staticmethod
    def _close_connection(connection):
        """Close ``connection`` if it was created and is still open."""
        if connection is not None and connection.open:
            connection.close()

    def save_to_discountrate_snapshots(self, data):
        """Store one raw response in the snapshot table of every active database."""
        for db_host, db_user, db_pwd, db_name in self._active_databases():
            self.save_discountrate_snapshots(
                data, db_host, db_user, db_pwd, db_name)

    def save_discountrate_snapshots(self, data, db_host, db_user, db_pwd, db_name):
        """Insert ``data`` as JSON into ``cai_niao_company_discountrate_snapshots``.

        Errors are reported to Slack and not raised.
        """
        if not data:
            MessageToSlack.post_message_to_slack(
                'No data need to store to cai_niao_company_discountrate_snapshots as no data grabbed')
            return
        connection = None
        try:
            connection = self._connect(db_host, db_user, db_pwd, db_name)
            now = datetime.datetime.now()
            # Parameterized so quotes and backslash escapes inside the JSON
            # text reach the database unchanged.
            insert_sql = ("INSERT INTO `cai_niao_company_discountrate_snapshots` "
                          "(`download_date`,`created_at`,`raw_data`) VALUES (%s, %s, %s)")
            with connection.cursor() as cursor:
                cursor.execute(insert_sql, (
                    now.strftime('%Y-%m-%d'),
                    now.strftime('%Y-%m-%d %H:%M:%S'),
                    json.dumps(data),
                ))
            # PyMySQL does not autocommit by default.
            connection.commit()
        except Exception as ds:
            MessageToSlack.post_message_to_slack(
                "存储数据到cai_niao_company_discountrate_snapshots抛出异常:%s" % str(ds))
        finally:
            self._close_connection(connection)

    def get_db_customer_data(self):
        """Load distinct customers with ``stock_central_amt > 0`` from production.

        Reads ``cai_niao_watermark_dailylogs`` rows created between two years
        ago and today and stores the de-duplicated rows in
        ``need_customer_data``; the attribute is left untouched when nothing
        matches. Errors are reported to Slack and not raised.
        """
        now = datetime.datetime.now()
        start_date = (now - relativedelta(years=2)).strftime('%Y-%m-%d')
        end_date = now.strftime('%Y-%m-%d')
        connection = None
        try:
            connection = self._connect(
                self.pro_db_host, self.pro_db_user, self.pro_db_pwd, self.pro_db_name)
            MessageToSlack.post_message_to_slack(
                "The database is connected to query data")
            select_sql = "SELECT customer_id,customer_name FROM `cai_niao_watermark_dailylogs` WHERE `stock_central_amt` > 0 AND `created_at` BETWEEN %s AND %s"
            with connection.cursor() as cursor:
                cursor.execute(select_sql, (start_date, end_date))
                # list(): the cursor may hand back a tuple for an empty result.
                rows = list(cursor.fetchall())
            if rows:
                self.need_customer_data = list_dict_duplicate_removal(rows)
        except Exception as ds:
            MessageToSlack.post_message_to_slack(
                "get_db_customer_data抛出异常:%s" % str(ds))
        finally:
            self._close_connection(connection)
        print(self.need_customer_data)

    def save_to_db(self):
        """Store ``discount_rate_data`` in every active database."""
        for db_host, db_user, db_pwd, db_name in self._active_databases():
            self.db_save_discount_rate(db_host, db_user, db_pwd, db_name)

    @classmethod
    def _dailylog_values(cls, data):
        """Return ordered ``(column, value)`` pairs for one scraped record.

        Raises ``KeyError`` for a missing key and ``ValueError``/``TypeError``
        when a pledge rate cannot be converted to ``int``.
        """
        pairs = []
        for column, key, convert in cls._DAILYLOG_VALUE_COLUMNS:
            value = data[key]
            pairs.append((column, convert(value) if convert else value))
        return pairs

    def _upsert_discount_rate(self, cursor, data):
        """Insert today's row for one record, or update it when it exists.

        A row is identified by download date, customer id and store id.
        """
        now = datetime.datetime.now()
        download_date = now.strftime('%Y-%m-%d')
        timestamp = now.strftime('%Y-%m-%d %H:%M:%S')
        customer_id = data['customer_id']
        customer_name = data['customer_name']
        store_t_b_id = data['storeTBId']
        value_pairs = self._dailylog_values(data)
        columns = [column for column, _ in value_pairs]
        values = [value for _, value in value_pairs]

        check_sql = "SELECT EXISTS ( SELECT `download_date` FROM `cai_niao_company_discountrate_dailylogs` WHERE `download_date` = %s and `customer_id` = %s and `store_t_b_id` = %s)"
        cursor.execute(check_sql, (download_date, customer_id, store_t_b_id))
        record_exists = list(cursor.fetchone().values())[0] == 1

        if not record_exists:
            MessageToSlack.post_message_to_slack(
                "Not found today record, insert that record")
            insert_columns = ['download_date', 'created_at',
                              'customer_id', 'customer_name'] + columns
            sql = "INSERT INTO `{}` ({}) VALUES ({})".format(
                self._DAILYLOG_TABLE,
                ",".join("`{}`".format(column) for column in insert_columns),
                ", ".join(["%s"] * len(insert_columns)))
            params = [download_date, timestamp,
                      customer_id, customer_name] + values
            label = "insert_sql"
        else:
            MessageToSlack.post_message_to_slack(
                "Today record is existed, update that record")
            assignments = ",".join(
                "`{}` = %s".format(column) for column in ['updated_at'] + columns)
            sql = ("UPDATE `{}` SET {} WHERE `download_date` = %s "
                   "and `customer_id` = %s and `store_t_b_id` = %s").format(
                self._DAILYLOG_TABLE, assignments)
            params = [timestamp] + values + \
                [download_date, customer_id, store_t_b_id]
            label = "update_sql"
        # mogrify renders the statement exactly as it is sent, for the log.
        MessageToSlack.post_message_to_slack(
            "%s: %s" % (label, cursor.mogrify(sql, params)))
        cursor.execute(sql, params)

    def db_save_discount_rate(self, db_host, db_user, db_pwd, db_name):
        """Write every record of ``discount_rate_data`` to one database.

        All records are committed together after the last one. Errors are
        reported to Slack and not raised.
        """
        records = self.discount_rate_data
        if records:
            connection = None
            try:
                connection = self._connect(db_host, db_user, db_pwd, db_name)
                MessageToSlack.post_message_to_slack(
                    "connect with db successfully")
                for data in records:
                    with connection.cursor() as cursor:
                        self._upsert_discount_rate(cursor, data)
                connection.commit()
                MessageToSlack.post_message_to_slack(
                    "Inserted into %s" % db_name)
            except Exception as ds:
                MessageToSlack.post_message_to_slack(
                    "db_save_discount_rate抛出异常:%s" % str(ds))
            finally:
                self._close_connection(connection)
        else:
            MessageToSlack.post_message_to_slack(
                'No data need to store as no data grabbed')
        MessageToSlack.post_message_to_slack("Bye")
def list_dict_duplicate_removal(data_list):
    """Return the items of *data_list* without duplicates, keeping first occurrences.

    Items are compared with ``==``, so unhashable items such as dicts are
    supported. Any iterable is accepted (a list, a tuple such as an empty
    cursor result, ...); the result is always a new list.
    """
    unique_items = []
    for item in data_list:
        if item not in unique_items:
            unique_items.append(item)
    return unique_items
if __name__ == '__main__':
    # Setting names, listed in the positional order of the constructor
    # arguments that follow the page URL.
    setting_names = (
        'PRO_DB_HOST',
        'PRO_DB_USERNAME',
        'PRO_DB_PASSWORD',
        'PRO_DB_DATABASE',
        'U02_DB_HOST',
        'U02_DB_USERNAME',
        'U02_DB_PASSWORD',
        'U02_DB_DATABASE',
        'U01_DB_HOST',
        'U01_DB_USERNAME',
        'U01_DB_PASSWORD',
        'U01_DB_DATABASE',
        'CAINIAO_AC',
        'CAINIAO_PWD',
    )
    scrape_cainiao = Cainiao_Scrape(
        'https://merchant.finance.cainiao.com/funds/signApproveDetail/SignApproveDetailManage.htm',
        *[config(setting_name) for setting_name in setting_names]
    )
    # Loading customers from cai_niao_watermark_dailylogs
    # (scrape_cainiao.get_db_customer_data()) is currently disabled.
    # Configure the browser driver.
    scrape_cainiao.setup_method('')
    # Open the Cainiao back office page and scrape the data.
    scrape_cainiao.login_cainiao()