Python Web Scraper: Procurement Data from the China Government Procurement Network (ccgp.gov.cn)
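This script searches ccgp.gov.cn for procurement notices matching a keyword within a date range, follows the detail link of every hit, and records each notice's title and URL in an Excel workbook via openpyxl, fetching the detail pages concurrently with one thread per link.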

import datetime
import json
import re
import threading
import time
import math

import requests
from lxml import etree
import openpyxl


class ZhenfucaigouSpider:
    url = 'http://search.ccgp.gov.cn/bxsearch'  # searchtype is already passed via params
    keyword = '福建师范大学'    # search keyword (Fujian Normal University)
    start_time = '2020:01:01'  # the search API expects YYYY:MM:DD with colons
    end_time = '2020:10:09'
    page_num = 1
    Tag = 2  # next worksheet row to write to (row 1 is left for a header)
    lock = threading.Lock()  # guards Tag and worksheet writes across threads

    params = {
        'searchtype': '2',
        'page_index': page_num,
        'bidSort': '0',
        'pinMu': '0',
        'bidType': '7',
        'kw': keyword,
        'start_time': start_time,
        'end_time': end_time,
        'timeType': '6'
    }
    headers = {
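        # NOTE: this Cookie was copied from a browser session and will expire;
        # replace it with a fresh one if requests start failing.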
        'Cookie': 'JSESSIONID=EgPd86-6id_etA2QDV31Kks3FrNs-4gwHMoSmEZvnEktWIakHbV3!354619916; Hm_lvt_9f8bda7a6bb3d1d7a9c7196bfed609b5=1602214804; Hm_lpvt_9f8bda7a6bb3d1d7a9c7196bfed609b5=1602214892; JSESSIONID=OBoLczbR_k89lC8sOuKF4W-46DVqKEd5u7isUpSyOjE6D0nBP94c!1675672049; Hm_lvt_9459d8c503dd3c37b526898ff5aacadd=1602214902,1602214928,1602214932,1602214937; Hm_lpvt_9459d8c503dd3c37b526898ff5aacadd=1602214937',
        'Host': 'search.ccgp.gov.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    }

    def get_page(self, url, headers, params):
        try:
            response = requests.get(url, headers=headers, params=params)
            if response.status_code == 200:
                # Decode manually and drop stray copyright signs (\xa9)
                html = response.content.decode('utf-8', 'ignore').replace('\xa9', '')
                return html
            print(response.status_code)
            return None
        except requests.ConnectionError:
            return None

    def get_detail_page(self, url):
        try:
            response = requests.get(url)
            if response.status_code == 200:
                html = response.content.decode('utf-8', 'ignore').replace('\xa9', '')
                return html
            return None
        except requests.ConnectionError:
            return None

    def get_all_url(self, html):
        # Collect every href on the results page that points to an .htm detail page
        url_list = re.findall(r'href="(.*?htm)"', html, re.I)
        return url_list

    # Alternate parser (kept for reference): extracts the full key/value table
    # from a detail page rather than just the title.
    # def parse_detail_page(self, html):
    #     table_list = html.xpath('//div[@class="table"]//tr')
    #     print("table_list",table_list)
    #     all_info = {}
    #     for table in table_list:
    #         if len(table.xpath('td[@class="title"]/text()'))>0:
    #             #print(''.join(table.xpath('td[@class="title"]/text()'))+":"+''.join(table.xpath('td[@colspan="3"]/text()')))
    #             title = ''.join(table.xpath('td[@class="title"]/text()'))
    #             value = ''.join(table.xpath('td[@colspan="3"]/text()'))
    #             if (title.find('附件')==0):
    #                 value = 'http://www.ccgp.gov.cn/oss/download?uuid='+''.join(table.xpath('td[@colspan="3"]/a/@id'))
    #                 #print(title+value)
    #             if ('公告时间' in title):
    #                 title = '公告时间'
    #                 value = table.xpath('td[@width="168"]/text()')[1]
    #                 district_key = '行政区域'
    #                 district_value = (table.xpath('td[@width="168"]/text()'))[0]
    #                 all_info[district_key]=district_value
    #             if '本项目招标公告日期中标日期' in title :
    #                 title = '本项目招标公告日期'
    #                 value = table.xpath('td[@width="168"]/text()')[0]
    #                 zhongbiaoriqi_key = '中标日期'
    #                 zhongbiaoriqi_value = table.xpath('td[@width="168"]/text()')[1]
    #                 all_info[zhongbiaoriqi_key]=zhongbiaoriqi_value
    #                 #print('中标日期'+zhongbiaoriqi_value)
    #             if '本项目招标公告日期成交日期' in title:
    #                 title = '本项目招标公告日期'
    #                 value = table.xpath('td[@width="168"]/text()')[0]
    #                 zhongbiaoriqi_key = '中标日期'
    #                 zhongbiaoriqi_value = ''.join(table.xpath('td[@width="168"]/text()'))[11:]
    #                 #print('zhongbiaoriqi_value:'+zhongbiaoriqi_value)
    #                 all_info[zhongbiaoriqi_key] = zhongbiaoriqi_value
    #             all_info[title] = value
    #             all_info['插入时间']= datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    #     return all_info
    #     #return json.dumps(all_info,ensure_ascii=False)
    def parse_detail_page(self, html, url, table):
        # Record the page title and URL in the next free worksheet row
        title = ''.join(html.xpath('//title//text()')).strip()
        with self.lock:  # openpyxl worksheets are not thread-safe
            table.cell(row=self.Tag, column=1).value = title
            table.cell(row=self.Tag, column=2).value = url
            self.Tag += 1

    def start(self, url, table):
        time.sleep(0.01)  # small delay to go easy on the server
        html = self.get_detail_page(url)
        if html is None:  # request failed; skip this detail page
            return
        html = etree.HTML(html)
        self.parse_detail_page(html, url, table)

    def pages_num(self, html):
        # Total hit count, read from the pager summary on the results page.
        # NOTE: this absolute XPath is brittle and breaks if the layout changes.
        num_list = html.xpath('/html/body/div[5]/div[1]/div/p/span[2]/text()')
        num = int(num_list[0])
        return num


    def run(self):
        excel_path = 'D:\\2020年.xlsx'  # the workbook must already exist; see the setup snippet below
        wb = openpyxl.load_workbook(excel_path)
        table = wb['Sheet1']

        # Fetch page 1 to learn how many hits the search returns
        html1 = self.get_page(url=self.url, headers=self.headers, params=self.params)
        html1 = etree.HTML(html1)
        total = self.pages_num(html1)
        # 20 results per page; round up to get the page count
        page_count = math.ceil(total / 20)
        print('page_count ==', page_count)

        for page in range(1, page_count + 1):
            print('Crawling page {}'.format(page))
            self.params['page_index'] = page
            html = self.get_page(url=self.url, headers=self.headers, params=self.params)
            if html is None:  # request failed; skip this results page
                continue
            url_list = self.get_all_url(html)

            # One thread per detail link; pass the callable and its arguments
            # separately so the work actually runs inside the thread.
            threads = []
            for url in url_list:
                t = threading.Thread(target=self.start, args=(url, table))
                threads.append(t)
            for t in threads:
                t.start()
            for t in threads:
                t.join()
        wb.save(excel_path)


if __name__ == '__main__':
    zhenfucaigouSpider = ZhenfucaigouSpider()
    zhenfucaigouSpider.run()
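
run() expects D:\2020年.xlsx to already exist with a sheet named Sheet1, so here is a minimal one-off setup script (a sketch; the header labels in row 1 are an assumption, since the crawler starts writing at row 2):

import openpyxl

# Create the workbook the crawler expects; run this once before the first crawl.
wb = openpyxl.Workbook()
ws = wb.active
ws.title = 'Sheet1'                       # run() opens the sheet by this name
ws.cell(row=1, column=1).value = 'Title'  # assumed header row; data starts at row 2
ws.cell(row=1, column=2).value = 'URL'
wb.save('D:\\2020年.xlsx')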

 
