Python requests爬虫

# coding=utf-8
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from pyquery import PyQuery as pg
import datetime
import time

# browser = webdriver.Chrome()
# browser.maximize_window()  # maximize the browser window
# clear_cache(browser)
# browser.get('http://www.customs.go.th/statistic_report.php?show_search=1')  # open the Thai customs statistics search page in the current browser
# wait = WebDriverWait(browser, 2)  # maximum time to wait
# browser.implicitly_wait(10)
#
# # print(browser.current_url)
# # Open a new window by executing JavaScript
# js = 'window.open("http://www.baidu.com");'
# browser.execute_script(js)
# handles = browser.window_handles
# for item in handles:
#     print(item)

# Comma-separated two-letter country codes that the crawl iterates over, in order.
# 'SD','TJ','CN' do not appear in the list below.
# NOTE(review): presumably those three were already scraped or deliberately
# excluded -- confirm before re-running a full crawl.
country_codes = '''CC,CX,CO,KM,CG,CD,CK,CR,CI,HR,CU,CW,CY,CZ,DD,DK,DJ,DM,DO,TP,EC,EG,SV,GQ,ER,EE,ET,FK,FO,FJ,FI,YY,FR,FX,GF,PF,TF,GA,GM,GE,DE,GH,GI,GR,GL,GD,GP,GU,GT,GG,GN,GW,GY,HT,HM,HN,HK,HU,IS,IN,ID,IR,IQ,IE,IM,IL,IT,JM,JP,JE,JO,KZ,KE,KI,KP,KR,KW,KG,LA,LV,LB,LS,LR,LY,LI,LT,LU,MO,MK,MG,MW,MY,MV,ML,MT,MH,MQ,MR,MU,YT,MX,FM,MD,MC,MN,ME,MS,MA,MZ,MM,NA,NR,NP,NL,AN,NC,NZ,NI,NE,NG,NU,NF,MP,NO,OM,OT,PK,PW,PS,PA,PG,PY,PE,PH,PN,PL,PT,PR,QA,RE,RO,RU,RW,BL,SH,KN,LC,MF,PM,VC,WS,SM,ST,SA,SN,RS,CS,SC,SL,SG,SX,SK,SI,SB,SO,ZA,GS,SS,ES,LK,SR,SJ,SZ,SE,CH,SY,TW,TZ,TH,TL,TG,TK,TO,TT,TN,TR,TM,TC,TV,UG,UA,AE,GB,US,UM,UY,UZ,VU,VA,VE,VN,VG,VI,WF,EH,YE,YU,ZR,ZM,ZW,ZZ,FZ,EZ,AX,AF,AL,DZ,AS,AD,AO,AI,AQ,AG,AR,AM,AW,AU,AT,AZ,BS,BH,BD,BB,BY,BE,BZ,BJ,BM,BT,BO,BQ,BA,BW,BV,BR,IO,BN,BG,BF,BI,KH,CM,CA,CV,KY,CF,TD,CL'''

import requests


def getEqual11(hscode=None, imtype=None, year_parm=None, month_parm=None, country='',
               output_path=r'27071000001.txt', timeout=30):
    """Fetch one statistics report from customs.go.th and append its rows to a file.

    Posts the search form for a single tariff code / trade direction /
    country / month combination, parses the result table out of the returned
    HTML, and appends one tab-separated line per data row to ``output_path``.

    Args:
        hscode: Tariff code, sent as the ``tariff_code`` form field.
        imtype: Trade direction, sent as ``imex_type`` (the caller in this
            file passes ``'import'`` or ``'export'``).
        year_parm: Year, sent as the ``year`` form field.
        month_parm: Month, sent as the ``month`` form field.
        country: Country code, sent as the ``country_code`` form field.
        output_path: File the rows are appended to. Defaults to the file name
            this function has always written to.
        timeout: Seconds to wait for the server before ``requests`` gives up,
            so that one stalled connection cannot hang the whole crawl.

    Returns:
        None. Results are written to ``output_path`` and echoed to stdout.

    Raises:
        requests.exceptions.RequestException: If the HTTP request fails or
            times out.
    """
    info = dict(imex_type=imtype, tariff_code=hscode, country_code=country, month=month_parm, year=year_parm)
    print(info)
    datas = requests.post('http://www.customs.go.th/statistic_report.php?show_search=1', data=info,
                          timeout=timeout)
    doc = pg(datas.text)
    table = doc('.table-responsive .table')
    print(table.text())

    # The data rows are read from the table's second child element. When the
    # response carries no result table (or a table without that element) there
    # is nothing to record, so return instead of raising IndexError.
    sections = table.children()
    if len(sections) < 2:
        return

    # Explicit encoding keeps the output identical across platforms and avoids
    # UnicodeEncodeError when a cell contains non-ASCII text.
    with open(output_path, 'a', encoding='utf-8') as myfile:
        # Walk every row of the result table.
        for child in sections[1]:
            if isinstance(child, str):
                continue
            tds = child.getchildren()
            # Rows with fewer than six cells cannot fill a record; skip them.
            if len(tds) < 6:
                continue
            item = []
            # Walk every column of the row.
            for index, td in enumerate(tds):
                # lxml reports an empty cell as None; treat it as empty text so
                # the clean-up below cannot fail and "None" is never written.
                text = td.text or ''
                if index > 1:
                    # Numeric columns come wrapped in layout whitespace.
                    text = text.replace('\r', '').replace('\n', '').replace('\t', '')
                item.append(text)
            print(item)
            # Columns: country code abbreviation, country code, import/export,
            # tariff code, year, month, monthly quantity, monthly CIF,
            # cumulative quantity, cumulative CIF.
            # NOTE(review): the original comment said "quality"; presumably
            # "quantity" was meant -- confirm against the site's column headers.
            record = (item[0], item[1], imtype, hscode, year_parm, month_parm, str(item[2]).replace(",", ""),
                      item[3].replace(",", ""), item[4].replace(",", ""), item[5].replace(",", ""))
            print(record)
            myfile.write(
                "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\n".format(*record))
            # yield record


def scraw_tailand_Data_11():
    """Crawl tariff code 27071000001 for every country, month and trade direction.

    Iterates over each code in ``country_codes``, each year from 2007 through
    2018 (2018 only up to August), each month, and both ``'import'`` and
    ``'export'``, calling ``getEqual11`` once per combination. A five-second
    pause precedes every request.
    """
    for code in country_codes.split(','):
        for yr in range(2007, 2019):
            # The crawl stops after August in the final year, 2018.
            final_month = 8 if yr == 2018 else 12
            for mon in range(1, final_month + 1):
                for direction in ('import', 'export'):
                    time.sleep(5)
                    getEqual11('27071000001', direction, str(yr), str(mon), str(code))


# Start the full crawl as soon as this module is loaded; there is no
# __main__ guard, so importing the file also triggers it.
scraw_tailand_Data_11()
posted on 2018-09-21 16:51 永远爱学习阅读(287) 评论(0) 收藏举报
刷新页面返回顶部
永远爱学习

导航

公告

Python requests爬虫