Customs Data Crawler, Generation 3 (Mature Edition)

Disclaimer: This code is provided for learning and exchange only. Neither the author nor the person sharing it accepts any liability for malicious use by others. Do not loosen the rate-limiting parameters, and do not attack the target site. Readers are expected to observe public morals and the law; any damage caused by running this crawler, such as crashing the target site, is entirely the operator's responsibility, and serious consequences may carry criminal liability.

from lxml import etree
from time import sleep
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import threading
import re
year = [2017,2018,2019]
month = list(range(1, 13))
country_code_name_dataform = pd.read_csv(r'C:\Users\Admin\PycharmProjects\untitled\贸易伙伴参数导出.csv',encoding='GBK')
province_code_name_dataform = pd.read_csv(r'C:\Users\Admin\PycharmProjects\untitled\收发货人注册地参数导出.csv',encoding='GBK')
trade_code_type_dataform = pd.read_csv(r'C:\Users\Admin\PycharmProjects\untitled\贸易方式参数导出.csv',encoding='GBK')
country_code = country_code_name_dataform.iloc[:,0].tolist()  # take the first column as a list
province_code = province_code_name_dataform.iloc[:,0].tolist()
trade_type_code = trade_code_type_dataform.iloc[:,0].tolist()
print(country_code)
print(province_code)
print(trade_type_code)

country_t_code = ['101%2c102%2c103%2c104%2c105%2c106%2c107%2c108%2c109%2c110%2c111%2c112%2c113%2c114%2c115%2c116%2c117%2c118%2c119%2c120%2c121%2c122%2c123%2c124%2c125%2c126%2c127%2c128%2c129',
                '130%2c131%2c132%2c133%2c134%2c135%2c136%2c137%2c138%2c139%2c140%2c141%2c142%2c143%2c144%2c145%2c146%2c147%2c148%2c149%2c199',

                '201%2c202%2c203%2c204%2c205%2c206%2c207%2c208%2c209%2c210%2c211%2c212%2c213%2c214%2c215%2c216%2c217%2c218%2c219%2c220%2c221%2c222%2c223%2c224%2c225%2c226%2c227%2c228%2c229',
                '230%2c231%2c232%2c233%2c234%2c235%2c236%2c237%2c238%2c239%2c240%2c241%2c242%2c243%2c244%2c245%2c246%2c247%2c248%2c249%2c299%2c250%2c251%2c252%2c253%2c254%2c255%2c256%2c257%2c258%2c259%2c260',

                '301%2c302%2c303%2c304%2c305%2c306%2c307%2c308%2c309%2c310%2c311%2c312%2c313%2c314%2c315%2c316%2c317%2c318%2c319',
                '320%2c321%2c322%2c323%2c324%2c325%2c326%2c327%2c328%2c329%2c330%2c331%2c332%2c333%2c334%2c335%2c336%2c337%2c338%2c339',
                '340%2c341%2c342%2c343%2c344%2c345%2c346%2c347%2c348%2c349%2c350%2c351%2c352%2c353%2c354%2c355%2c356%2c357%2c358%2c359%2c399',

                '401%2c402%2c403%2c404%2c405%2c406%2c407%2c408%2c409%2c410%2c411%2c412%2c413%2c414%2c415%2c416%2c417%2c418%2c419',
                '420%2c421%2c422%2c423%2c424%2c425%2c426%2c427%2c428%2c429%2c430%2c431%2c432%2c433%2c434%2c435%2c436%2c437%2c438%2c439',
                '440%2c441%2c442%2c443%2c444%2c445%2c446%2c447%2c448%2c449%2c499',

                '501%2c502%2c503%2c504%2c599',

                '601%2c602%2c603%2c604%2c605%2c606%2c607%2c608%2c609',
                '610%2c611%2c612%2c613%2c614%2c615%2c616%2c617%2c618%2c619%2c620%2c621%2c622%2c623%2c624%2c625%2c699%2c701%2c702%2c999']
url_base = 'http://43.248.49.97/queryData/queryDataList?pageSize=20000&pageNum=1&iEType=1&currencyType=rmb&year={year}&startMonth={month}&endMonth={month}&monthFlag=1&unitFlag=true&codeLength=8&outerField1=CODE_TS&outerField2=ORIGIN_COUNTRY&outerField3=TRADE_MODE&outerField4=TRADE_CO_PORT&outerValue1=&outerValue2={country}&outerValue3=&outerValue4={province}&orderType=CODE+ASC+DEFAULT&selectTableState=2&currentStartTime=201903'
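# Aside (added note, not in the original script): the '%2c' runs in
# country_t_code are URL-encoded commas, so each entry batches a range of
# customs country codes into one query parameter. A hedged sketch of building
# such a batch programmatically instead of typing it by hand:
def encode_country_batch(codes):
    # Join codes with a literal '%2c' (URL-encoded comma). Joining directly,
    # rather than via urllib.parse.quote, keeps the lowercase form used above
    # so the exact string comparisons in web_engine() still match.
    return '%2c'.join(str(c) for c in codes)
# Example: encode_country_batch(range(101, 130)) reproduces the first entry.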
def url_manger(year,month,province_code,country_t_code,url_base):
    request_url = []
    for y in year:
        for m in month:
            for p in province_code:
                for c_url in country_t_code:
                    request_url.append(url_base.format(year=y, month=m, province=p, country=c_url))
    f_link = {'爬取链接汇总':request_url}
    f_link_df = pd.DataFrame(f_link)
    f_link_df.to_csv('爬取链接汇总.csv', encoding='GBK')
    return request_url
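# Rough size check (hedged: the real counts depend on the parameter CSVs):
# url_manger builds len(year) * len(month) * len(province_code) *
# len(country_t_code) URLs, i.e. 3 * 12 * 12 = 432 per province. Printing the
# total before scraping helps estimate the run time:
# print(len(year) * len(month) * len(province_code) * len(country_t_code))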


def web_engine():
    # Worker threads share one URL list; pop() takes the next pending URL.
    global request_url
    global html_response
    global h
    if not request_url:  # nothing left to scrape
        return
    url_send = request_url.pop()
    url_txt_info = re.findall(r'.+ype=rmb&year=(.+)&startMonth=(.+)&endMonth=.+&monthFlag=1&unitFlag=true&codeLength=8&outerField1=CODE_TS&outerField2=ORIGIN_COUNTRY&outerField3=TRADE_MODE&outerField4=TRADE_CO_PORT&outerValue1=&outerValue2=(.+)&outerValue3=&outerValue4=(.+)&orderType=CODE.+',url_send)
    y = url_txt_info[0][0]
    m = url_txt_info[0][1]
    cs_code = url_txt_info[0][2]
    p = url_txt_info[0][3]
    c = country_t_code.index(cs_code)  # which country batch this URL uses
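    # Hedged aside (hypothetical alternative, not in the original script):
    # urllib.parse could pull these query fields out more robustly than the
    # long regex above, e.g.
    #   from urllib.parse import urlsplit, parse_qs
    #   q = parse_qs(urlsplit(url_send).query)
    # Note that parse_qs unquotes '%2c' back to ',', so the country_t_code
    # lookup above would then need the decoded form.
    # Launch Edge and use CDP to mask navigator.webdriver before any page
    # script runs, a common workaround for naive Selenium detection.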
    edge = webdriver.Edge()
    edge.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument",{
        "source": """
        Object.defineProperty(navigator, 'webdriver',{
        get: () => undefined
        })
        """
        })
    edge.implicitly_wait(100)
    sleep(2)
    edge.get(url_send)
    sleep(3)
    print('page requested, waiting for the data table to render...')
    try:
        # Block (up to 180 s) until at least one result cell is visible.
        WebDriverWait(edge, timeout=180, poll_frequency=1.5, ignored_exceptions=None).until(
            EC.visibility_of_any_elements_located((By.XPATH, '''//*[@id="div1"]/div/div/div/div[1]/p/span''')))
        html_response = edge.page_source
        print('成功', y, '\t', m, '\t', p, '\t', c)
        with open('爬取成功链接列表.txt', 'a') as f_success:
            # Bug fix: the original wrote url_send[0], i.e. only the first
            # character. Write the whole URL plus a newline so the resume
            # logic in __main__ can split the file on '\n'.
            f_success.write(url_send + '\n')
        e = etree.HTML(html_response)
        data_date = e.xpath('''//tbody[@id = 'div1']/tr/td[1]/div/text()''')
        goods_code = e.xpath('''//tbody[@id = 'div1']/tr/td[2]/div/text()''')
        goods_name = e.xpath('''//tbody[@id = 'div1']/tr/td[3]/div/text()''')
        partner_code = e.xpath('''//tbody[@id = 'div1']/tr/td[4]/div/text()''')
        partner_name = e.xpath('''//tbody[@id = 'div1']/tr/td[5]/div/text()''')
        trade_code = e.xpath('''//tbody[@id = 'div1']/tr/td[6]/div/text()''')
        trade_name = e.xpath('''//tbody[@id = 'div1']/tr/td[7]/div/text()''')
        in_province_code = e.xpath('''//tbody[@id = 'div1']/tr/td[8]/div/text()''')
        in_province_name = e.xpath('''//tbody[@id = 'div1']/tr/td[9]/div/text()''')
        first_unit_num = e.xpath('''//tbody[@id = 'div1']/tr/td[10]/div/text()''')
        first_unit_name = e.xpath('''//tbody[@id = 'div1']/tr/td[11]/div/text()''')
        second_unit_num = e.xpath('''//tbody[@id = 'div1']/tr/td[12]/div/text()''')
        second_unit_name = e.xpath('''//tbody[@id = 'div1']/tr/td[13]/div/text()''')
        rmb_value = e.xpath('''//tbody[@id = 'div1']/tr/td[14]/div/text()''')
        all_info = {
            '数据年月': data_date,
            '商品编码': goods_code,
            '商品名称': goods_name,
            '贸易伙伴编码': partner_code,
            '贸易伙伴名称': partner_name,
            '贸易方式编码': trade_code,
            '贸易方式名称': trade_name,
            '注册地编码': in_province_code,
            '注册地名称': in_province_name,
            '第一数量': first_unit_num,
            '第一计量单位': first_unit_name,
            '第二数量': second_unit_num,
            '第二计量单位': second_unit_name,
            '人民币': rmb_value
        }
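        # Note: pd.DataFrame needs all of these lists to have equal length;
        # if any column's XPath misses rows, this raises ValueError and the
        # URL is re-queued by the except block below.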
        outdata = pd.DataFrame(all_info)
        outdata.to_csv('{0}年{1}月{2}省市{3}国家进口数据.csv'.format(y, m, p, c), encoding='GBK')
        edge.quit()
        h += 1
    except Exception:
        print('失败', y, '\t', m, '\t', p, '\t', c)
        with open('爬取失败链接列表.txt', 'a') as f_fail:
            f_fail.write(url_send + '\n')
        request_url.insert(0, url_send)  # re-queue the failed URL for a later retry
        edge.quit()


if __name__ == '__main__':
    request_url = url_manger(year, month, province_code, country_t_code, url_base)
    # Resume support: drop any URL already recorded as successfully scraped.
    try:
        with open('爬取成功链接列表.txt', 'r') as f_set_success:
            used_request_url = f_set_success.read()
    except FileNotFoundError:  # first run: no success log yet
        used_request_url = ''
    used_request_url_set = set(used_request_url.split('\n'))  # one URL per line
    request_url = list(set(request_url) - used_request_url_set)  # dedupe against past successes
    kill_num = len(request_url)  # how many URLs must succeed before we stop
    html_response = ''
    h = 0  # count of URLs finished successfully
    while True:
        # Four workers per round; the staggered sleeps keep Edge launches and
        # requests from hitting the server at the same instant.
        w1 = threading.Thread(target=web_engine)
        w2 = threading.Thread(target=web_engine)
        w3 = threading.Thread(target=web_engine)
        w4 = threading.Thread(target=web_engine)
        w1.start()
        sleep(2)
        w1.join()
        w2.start()
        sleep(4)
        w3.start()
        w4.start()
        w4.join()
        if h >= kill_num:  # every pending URL has succeeded
            break
    print('爬虫完成')
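
A hedged alternative to the manual start/join choreography above (run_pool is a hypothetical helper, not part of the original script): a ThreadPoolExecutor drains the shared URL list with a fixed worker count and surfaces worker exceptions instead of discarding them.

from concurrent.futures import ThreadPoolExecutor, as_completed

def run_pool(n_tasks, workers=4):
    # Each task calls web_engine(), which pops one URL from the shared list.
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = [pool.submit(web_engine) for _ in range(n_tasks)]
        for f in as_completed(futures):
            f.result()  # re-raise any exception from inside a worker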



# Revised version with all bugs fixed: the explicit wait is flipped to
# until_not on a loading element, the success log now records the full URL,
# and a fifth worker thread is added.
from lxml import etree
from time import sleep
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import threading
import re
year = [2017,2018,2019]
month = list(range(1, 13))
country_code_name_dataform = pd.read_csv(r'C:\Users\Admin\PycharmProjects\untitled\贸易伙伴参数导出.csv',encoding='GBK')
province_code_name_dataform = pd.read_csv(r'C:\Users\Admin\PycharmProjects\untitled\收发货人注册地参数导出.csv',encoding='GBK')
trade_code_type_dataform = pd.read_csv(r'C:\Users\Admin\PycharmProjects\untitled\贸易方式参数导出.csv',encoding='GBK')
country_code = country_code_name_dataform.iloc[:,0].tolist()  # take the first column as a list
province_code = province_code_name_dataform.iloc[:,0].tolist()
trade_type_code = trade_code_type_dataform.iloc[:,0].tolist()
print(country_code)
print(province_code)
print(trade_type_code)

country_t_code = ['101%2c102%2c103%2c104%2c105%2c106%2c107%2c108%2c109%2c110%2c111%2c112%2c113%2c114%2c115%2c116%2c117%2c118%2c119%2c120%2c121%2c122%2c123%2c124%2c125%2c126%2c127%2c128%2c129',
                '130%2c131%2c132%2c133%2c134%2c135%2c136%2c137%2c138%2c139%2c140%2c141%2c142%2c143%2c144%2c145%2c146%2c147%2c148%2c149%2c199',

                '201%2c202%2c203%2c204%2c205%2c206%2c207%2c208%2c209%2c210%2c211%2c212%2c213%2c214%2c215%2c216%2c217%2c218%2c219%2c220%2c221%2c222%2c223%2c224%2c225%2c226%2c227%2c228%2c229',
                '230%2c231%2c232%2c233%2c234%2c235%2c236%2c237%2c238%2c239%2c240%2c241%2c242%2c243%2c244%2c245%2c246%2c247%2c248%2c249%2c299%2c250%2c251%2c252%2c253%2c254%2c255%2c256%2c257%2c258%2c259%2c260',

                '301%2c302%2c303%2c304%2c305%2c306%2c307%2c308%2c309%2c310%2c311%2c312%2c313%2c314%2c315%2c316%2c317%2c318%2c319',
                '320%2c321%2c322%2c323%2c324%2c325%2c326%2c327%2c328%2c329%2c330%2c331%2c332%2c333%2c334%2c335%2c336%2c337%2c338%2c339',
                '340%2c341%2c342%2c343%2c344%2c345%2c346%2c347%2c348%2c349%2c350%2c351%2c352%2c353%2c354%2c355%2c356%2c357%2c358%2c359%2c399',

                '401%2c402%2c403%2c404%2c405%2c406%2c407%2c408%2c409%2c410%2c411%2c412%2c413%2c414%2c415%2c416%2c417%2c418%2c419',
                '420%2c421%2c422%2c423%2c424%2c425%2c426%2c427%2c428%2c429%2c430%2c431%2c432%2c433%2c434%2c435%2c436%2c437%2c438%2c439',
                '440%2c441%2c442%2c443%2c444%2c445%2c446%2c447%2c448%2c449%2c499',

                '501%2c502%2c503%2c504%2c599',

                '601%2c602%2c603%2c604%2c605%2c606%2c607%2c608%2c609',
                '610%2c611%2c612%2c613%2c614%2c615%2c616%2c617%2c618%2c619%2c620%2c621%2c622%2c623%2c624%2c625%2c699%2c701%2c702%2c999']
url_base = 'http://43.248.49.97/queryData/queryDataList?pageSize=20000&pageNum=1&iEType=1&currencyType=rmb&year={year}&startMonth={month}&endMonth={month}&monthFlag=1&unitFlag=true&codeLength=8&outerField1=CODE_TS&outerField2=ORIGIN_COUNTRY&outerField3=TRADE_MODE&outerField4=TRADE_CO_PORT&outerValue1=&outerValue2={country}&outerValue3=&outerValue4={province}&orderType=CODE+ASC+DEFAULT&selectTableState=2&currentStartTime=201903'
def url_manger(year,month,province_code,country_t_code,url_base):
    request_url = []
    for y in year:
        for m in month:
            for p in province_code:
                for c_url in country_t_code:
                    request_url.append(url_base.format(year=y, month=m, province=p, country=c_url))
    f_link = {'爬取链接汇总':request_url}
    f_link_df = pd.DataFrame(f_link)
    f_link_df.to_csv('爬取链接汇总.csv', encoding='GBK')
    return request_url


def web_engine():
    # Worker threads share one URL list; pop() takes the next pending URL.
    global request_url
    global html_response
    global h
    if not request_url:  # nothing left to scrape
        return
    url_send = request_url.pop()
    url_txt_info = re.findall(r'.+ype=rmb&year=(.+)&startMonth=(.+)&endMonth=.+&monthFlag=1&unitFlag=true&codeLength=8&outerField1=CODE_TS&outerField2=ORIGIN_COUNTRY&outerField3=TRADE_MODE&outerField4=TRADE_CO_PORT&outerValue1=&outerValue2=(.+)&outerValue3=&outerValue4=(.+)&orderType=CODE.+',url_send)
    y = url_txt_info[0][0]
    m = url_txt_info[0][1]
    cs_code = url_txt_info[0][2]
    p = url_txt_info[0][3]
    c = country_t_code.index(cs_code)  # which country batch this URL uses
    edge = webdriver.Edge()
    edge.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument",{
        "source": """
        Object.defineProperty(navigator, 'webdriver',{
        get: () => undefined
        })
        """
        })
    edge.implicitly_wait(100)
    sleep(2)
    edge.get(url_send)
    sleep(3)
    print('page requested, waiting for the data table to render...')
    try:
        # Wait (up to 180 s) until the loading element is no longer visible,
        # i.e. the page has finished rendering the results table.
        WebDriverWait(edge, timeout=180, poll_frequency=1.5, ignored_exceptions=None).until_not(
            EC.visibility_of_any_elements_located((By.XPATH, '''//*[@id="test"]''')))
        html_response = edge.page_source
        print('成功', y, '\t', m, '\t', p, '\t', c)
        with open('爬取成功链接列表.txt', 'a') as f_success:
            f_success.write(url_send + '\n')  # newline so the resume logic can split on '\n'
        e = etree.HTML(html_response)
        data_date = e.xpath('''//tbody[@id = 'div1']/tr/td[1]/div/text()''')
        goods_code = e.xpath('''//tbody[@id = 'div1']/tr/td[2]/div/text()''')
        goods_name = e.xpath('''//tbody[@id = 'div1']/tr/td[3]/div/text()''')
        partner_code = e.xpath('''//tbody[@id = 'div1']/tr/td[4]/div/text()''')
        partner_name = e.xpath('''//tbody[@id = 'div1']/tr/td[5]/div/text()''')
        trade_code = e.xpath('''//tbody[@id = 'div1']/tr/td[6]/div/text()''')
        trade_name = e.xpath('''//tbody[@id = 'div1']/tr/td[7]/div/text()''')
        in_province_code = e.xpath('''//tbody[@id = 'div1']/tr/td[8]/div/text()''')
        in_province_name = e.xpath('''//tbody[@id = 'div1']/tr/td[9]/div/text()''')
        first_unit_num = e.xpath('''//tbody[@id = 'div1']/tr/td[10]/div/text()''')
        first_unit_name = e.xpath('''//tbody[@id = 'div1']/tr/td[11]/div/text()''')
        second_unit_num = e.xpath('''//tbody[@id = 'div1']/tr/td[12]/div/text()''')
        second_unit_name = e.xpath('''//tbody[@id = 'div1']/tr/td[13]/div/text()''')
        rmb_value = e.xpath('''//tbody[@id = 'div1']/tr/td[14]/div/text()''')
        all_info = {
            '数据年月': data_date,
            '商品编码': goods_code,
            '商品名称': goods_name,
            '贸易伙伴编码': partner_code,
            '贸易伙伴名称': partner_name,
            '贸易方式编码': trade_code,
            '贸易方式名称': trade_name,
            '注册地编码': in_province_code,
            '注册地名称': in_province_name,
            '第一数量': first_unit_num,
            '第一计量单位': first_unit_name,
            '第二数量': second_unit_num,
            '第二计量单位': second_unit_name,
            '人民币': rmb_value
        }
        outdata = pd.DataFrame(all_info)
        outdata.to_csv('{0}年{1}月{2}省市{3}国家进口数据.csv'.format(y, m, p, c), encoding='GBK')
        edge.quit()
        h += 1
    except Exception:
        print('失败', y, '\t', m, '\t', p, '\t', c)
        with open('爬取失败链接列表.txt', 'a') as f_fail:
            f_fail.write(url_send + '\n')
        request_url.insert(0, url_send)  # re-queue the failed URL for a later retry
        edge.quit()
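
Hedged aside (hypothetical, not in the original script): a queue.Queue would make the URL handoff between worker threads safe, replacing the shared-list pop() and insert() calls:

import queue

def fill_queue(urls):
    # Load every pending URL into a thread-safe FIFO queue.
    url_queue = queue.Queue()
    for u in urls:
        url_queue.put(u)
    return url_queue

Workers would then call url_queue.get(timeout=5) instead of request_url.pop(), and url_queue.put(url_send) to re-queue a failure.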


if __name__ == '__main__':
    request_url = url_manger(year, month, province_code, country_t_code, url_base)
    # Resume support: drop any URL already recorded as successfully scraped.
    try:
        with open('爬取成功链接列表.txt', 'r') as f_set_success:
            used_request_url = f_set_success.read()
    except FileNotFoundError:  # first run: no success log yet
        used_request_url = ''
    used_request_url_set = set(used_request_url.split('\n'))  # one URL per line
    request_url = list(set(request_url) - used_request_url_set)  # dedupe against past successes
    kill_num = len(request_url)  # how many URLs must succeed before we stop
    html_response = ''
    h = 0  # count of URLs finished successfully
    while True:
        # Five workers per round; the staggered sleeps keep Edge launches and
        # requests from hitting the server at the same instant.
        w1 = threading.Thread(target=web_engine)
        w2 = threading.Thread(target=web_engine)
        w3 = threading.Thread(target=web_engine)
        w4 = threading.Thread(target=web_engine)
        w5 = threading.Thread(target=web_engine)
        w1.start()
        sleep(2)
        w1.join()
        w2.start()
        sleep(1)
        w3.start()
        w4.start()
        w4.join()
        w5.start()
        if h >= kill_num:  # every pending URL has succeeded
            break
    print('爬虫完成')
posted @ 2020-11-17 19:48 kuanleung