# -*- coding: UTF-8 -*-
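# Scrapy spider for administrative-penalty disclosures on the Chuzhou municipal
# government site (www.chuzhou.gov.cn). The site is guarded by a WZWS JavaScript
# challenge, so the spider first solves the challenge to earn a session cookie,
# then walks the paginated disclosure list and its detail pages.
# Typical invocation with the standard Scrapy CLI (incremental mode crawls only
# the first two list pages): scrapy crawl ah_market_gov_chuzhou_xinzhen -a increment=1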
import os
import sys
import re
import copy
import base64
import datetime
import traceback
import scrapy
from spiders.market_supervision_penalty.govement_penalty_base_spider import govement_penalty_base_spider
from utils.common_util import *
from bs4 import BeautifulSoup
from spiders.base_spiders.base_spider import *
from urllib.parse import urlencode
from config.proxy.config import *
from utils.date_util import current_datetime
class ah_market_gov_chuzhou_xinzhen(govement_penalty_base_spider):
    name = "ah_market_gov_chuzhou_xinzhen"
    custom_settings = {
        'CONCURRENT_REQUESTS': '10',
        'CONCURRENT_REQUESTS_PER_DOMAIN': '10',
        # 'DOWNLOAD_DELAY': 0.2,
        'DOWNLOAD_TIMEOUT': 90,
        'RETRY_TIMES': 30,
        # 302 must reach the spider unredirected: the WZWS challenge answer comes
        # back as a redirect whose Set-Cookie header we read ourselves, hence
        # REDIRECT_ENABLED is off and 302 is in the allowed codes.
        'HTTPERROR_ALLOWED_CODES': [407, 302],
        'RETRY_HTTP_CODES': [504, 408, 500, 502, 503, 533, 407, 401, 403, 404, 400, 478],
        'REDIRECT_ENABLED': False,
        'COOKIES_ENABLED': False,
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            'extensions.proxy.retry.RetryMiddleware': 550,
            'extensions.proxy.ip_utils.ProxyMiddleware': 555,
            'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 700,
            'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
            'scrapy_splash.SplashCookiesMiddleware': 800,
            'scrapy_splash.SplashMiddleware': 850,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 900,
        },
    }
    # Proxy flags consumed by the project's proxy middleware (extensions.proxy.*).
    is_not_change_proxy = True  # stick to a single proxy for the whole run
    is_proxy = True
    proxy_type = PROXY_TYPE_WD
    proxy_count = 50
    def __init__(self, increment=None, *args, **kwargs):
        super(ah_market_gov_chuzhou_xinzhen, self).__init__(*args, **kwargs)
        self.increment = increment
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Host': 'www.chuzhou.gov.cn',
            'Upgrade-Insecure-Requests': '1',
            'Referer': 'https://www.chuzhou.gov.cn/public/column/108578180?type=4&catId=161735210&action=list&nav=3',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        }
        # Template of the site's WZWS anti-bot challenge script; the three %s
        # placeholders are filled with values scraped from the challenge page,
        # and _0x13698a() is the function evaluated to produce the answer.
        self.js_file = """
        var _0x500dd8 = '%s';
        var _0x14e579 = '%s';
        var _0x351708 = '%s';
        var _0x41f35b = 'WZWS_METHOD';
        var _0x349042 = 'WZWS_PARAMS';
        var btoa = function (str_){
            return new Buffer.from(str_, "binary").toString("base64")
        }
        function _0x13698a() {
            var _0x338d15 = 0x0;
            var _0xbe152f = 0x0;
            for (_0xbe152f = 0x0; _0xbe152f < _0x14e579.length; _0xbe152f++) {
                _0x338d15 += _0x14e579.charCodeAt(_0xbe152f);
            }
            _0x338d15 *= _0x351708;
            _0x338d15 += 0x1b207;
            return "WZWS_CONFIRM_PREFIX_LABEL" + _0x338d15;
        }
        """
        self.index_url = "https://www.chuzhou.gov.cn/chuzhou/site/label/8888?IsAjax=1&dataType=html&_=0.6834244290108127&labelName=publicInfoList&siteId=2653861&pageSize=20&pageIndex=1&action=list&isDate=true&dateFormat=yyyy-MM-dd&length=50&organId=108578180&type=4&catId=161735210&cId=&result=%E6%9A%82%E6%97%A0%E7%9B%B8%E5%85%B3%E4%BF%A1%E6%81%AF&title=&fileNum=&keyWords=&file=%2Fc1%2Fchuzhou%2FpublicInfoList_newest"
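    # start_requests seeds the crawl with page 1 of the disclosure list; parse()
    # then reads pageCount from the response and fans out the remaining pages.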
    def start_requests(self):
        yield scrapy.Request(url=self.index_url, method='GET', headers=self.headers,
                             encoding="utf-8", dont_filter=True)
    def parse(self, response):
        resp_url = response.url
        resp_meta = copy.deepcopy(response.meta)
        try:
            if "jsjiami.com.v6" in response.text:
                # Challenge page: solve it, then request the confirmation URL
                # carrying the cookies the challenge response just set.
                cookie_url = self.parse_cookie(response.text)
                header = deepCopy(self.headers)
                cookie_str, cookie_dict = getSetcookie2Str(response)
                header["Cookie"] = cookie_str
                yield scrapy.Request(url=cookie_url, method='GET', headers=header,
                                     encoding='UTF-8', dont_filter=True, meta=resp_meta, callback=self.parse)
            elif response.status == 302:
                # Confirmation accepted: the 302 carries the session cookie, so
                # retry the original request with it.
                header = deepCopy(self.headers)
                cookie_str, cookie_dict = getSetcookie2Str(response)
                header["Cookie"] = cookie_str
                if "list_url" in resp_meta:
                    yield scrapy.Request(url=resp_meta['list_url'], method='GET', headers=header,
                                         encoding='UTF-8', dont_filter=True, meta=resp_meta,
                                         callback=self.parse_list)
                else:
                    yield scrapy.Request(url=self.index_url, method='GET', headers=header,
                                         encoding='UTF-8', dont_filter=True, meta=resp_meta, callback=self.parse)
            else:
                # Normal index response: read the page count and fan out one
                # request per list page (only two pages in incremental mode).
                page_number = re.findall(r'pageCount:(.*?),', response.text)[0]
                search_number = 2 if self.increment else int(page_number)
                for index in range(1, search_number + 1):
                    send_url = 'https://www.chuzhou.gov.cn/chuzhou/site/label/8888?IsAjax=1&dataType=html&_=0.062391026092820656&labelName=publicInfoList&siteId=2653861&pageSize=20&pageIndex={}&action=list&isDate=true&dateFormat=yyyy-MM-dd&length=50&organId=108578180&type=4&catId=161735210&cId=&result=%E6%9A%82%E6%97%A0%E7%9B%B8%E5%85%B3%E4%BF%A1%E6%81%AF&title=&fileNum=&keyWords=&file=%2Fc1%2Fchuzhou%2FpublicInfoList_newest'.format(index)
                    yield scrapy.Request(url=send_url, method='GET', headers=self.headers, meta=resp_meta,
                                         encoding="utf-8", dont_filter=True, callback=self.parse_list)
        except Exception:
            traceback.print_exc()
            self.logger.info(f"parse error url: {resp_url}")
    def parse_list(self, response):
        resp_url = response.url
        resp_meta = copy.deepcopy(response.meta)
        try:
            if "jsjiami.com.v6" in response.text:
                # Hit the anti-bot challenge: route back through parse(),
                # remembering which list page to retry once the cookie is set.
                yield scrapy.Request(url=resp_url, method='GET', headers=self.headers,
                                     meta={**resp_meta, 'list_url': resp_url},
                                     encoding="utf-8", dont_filter=True, callback=self.parse)
            else:
                resp_soup = BeautifulSoup(response.text, 'html5lib')
                detail_list = resp_soup.select('ul.xxgk_navli2')
                for detail in detail_list:
                    if "href" in str(detail):
                        detail_url = response.urljoin(detail.select_one('a')['href'])
                        yield scrapy.Request(url=detail_url, method='GET', headers=self.headers,
                                             encoding="utf-8", dont_filter=True, callback=self.parse_detail)
        except Exception:
            traceback.print_exc()
            self.logger.info(f"parse error url: {resp_url}")
    def parse_cookie(self, resp_body):
        # Pull the three dynamic challenge values out of the obfuscated script,
        # evaluate it in the JS engine, and build the wzwschallenge confirmation URL.
        _0x500dd8 = re.findall(r"_0x500dd8='(.*?)'", resp_body, re.DOTALL)[0]
        _0x14e579 = re.findall(r"_0x14e579='(.*?)'", resp_body, re.DOTALL)[0]
        _0x351708 = re.findall(r"_0x351708='(.*?)'", resp_body, re.DOTALL)[0]
        data = pyv8_engine_service(self.js_file % (_0x500dd8, _0x14e579, _0x351708), "_0x13698a")
        cookie_url = "http://www.chuzhou.gov.cn" + _0x500dd8 + "?wzwschallenge=" + base64.b64encode(data.encode()).decode()
        return cookie_url
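    # A minimal pure-Python sketch of the same challenge computation, useful when
    # no JS engine is available. compute_wzws_cookie_url is a hypothetical helper,
    # not part of the original spider; it assumes ASCII challenge parameters and
    # that _0x351708 is a decimal integer string (the JS coerces it to a number).
    @staticmethod
    def compute_wzws_cookie_url(path, seed, factor):
        # Sum the character codes of the seed, scale, add the fixed offset,
        # then base64-encode the prefixed result exactly as _0x13698a() does.
        total = sum(ord(ch) for ch in seed) * int(factor) + 0x1b207
        token = base64.b64encode(("WZWS_CONFIRM_PREFIX_LABEL" + str(total)).encode()).decode()
        return "http://www.chuzhou.gov.cn" + path + "?wzwschallenge=" + token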