sunPro

This project uses the following techniques (a minimal launch sketch follows the list):

  1. Random User-Agent headers via fake_useragent
  2. Dynamic proxy IPs from http://www.goubanjia.com/
  3. A random time.sleep(delay) between requests to lower the risk of triggering anti-crawling defences
  4. Scraped data stored in a MySQL database
  5. In my own tests the spider crawled 2000 pages of http://wz.sun0769.com/political/index/politicsNewest on the 阳光政务 (Sunshine Government) site, so crawling the full listing is no problem
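Once the MySQL credentials in pipelines.py match a local database, the spider can be started with "scrapy crawl sun" from the project root; an equivalent Python entry point (a hypothetical run.py placed in the outer sunPro directory, not part of the original project) would be:

    # run.py - hypothetical launcher for the project
    from scrapy.cmdline import execute

    if __name__ == '__main__':
        # Equivalent to running "scrapy crawl sun" on the command line
        execute(['scrapy', 'crawl', 'sun'])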

Project structure

sunPro
    sunPro
        spiders
            sun.py
        items.py
        middlewares.py
        pipelines.py
        settings.py

sun.py

# -*- coding: utf-8 -*-
import scrapy
import os
import random
# The current base path would be:
# base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# print(base_path)
import sys
import logging

from sunPro.items import SunproItem


# sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath('__file__'))))

class SunSpider(scrapy.Spider):
    name = 'sun'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/political/index/politicsNewest', ]
    # start_urls = ['http://wz.sun0769.com/political/index/politicsNewest',
    #               'http://wz.sun0769.com/political/index/politicsNewest?id=1&page=2',
    #               'http://wz.sun0769.com/political/index/politicsNewest?id=1&page=3',
    #               'http://wz.sun0769.com/political/index/politicsNewest?id=1&page=4',
    #               'http://wz.sun0769.com/political/index/politicsNewest?id=1&page=5']

    # URLs collected for pagination
    page_urls = []

    # http://wz.sun0769.com/political/index/politicsNewest?id=1&page=2

    # "Next page" link: <a href="/political/index/politicsNewest?id=1&amp;page=3" class="arrow-page prov_rota"></a>
    def parse(self, response):
        # Parse the listing page

        # for i in range(1, 3):
        #     page = i+1
        #     print('-----> current page:', page)
        #     yield scrapy.Request(f'http://wz.sun0769.com/political/index/politicsNewest?id=1&page={page}',
        #                          callback=self.parse)

        # if 200 <= response.status <= 300:

        # new_urls = 'http://wz.sun0769.com' + response.xpath('//div[@class="mr-three paging-box"]/a[2]/@href').extract_first()
        # print('-----> URL of the new page:', new_urls)
        # if new_urls:
        #     self.page_urls.append(new_urls)
        # Request listing pages 1-999 up front; raise the upper bound to crawl more pages
        for page in range(1, 1000):
            url = 'http://wz.sun0769.com/political/index/politicsNewest?id=1&page=%d' % page
            print('------> url', url)
            yield scrapy.Request(url=url, callback=self.parse_item)
        # logging.info('Currently visiting url %s' % response.url)

        # Alternative: paginate by following the "next page" link
        # new_urls = []
        # try:
        #     new_urls = response.xpath('/html/body/div[2]/div[3]/div[3]/div//a/@href').extract()
        #     # '/html/body/div[2]/div[3]/div[3]/div/a[2]'
        #     print('-----> new url list:', new_urls)
        # except Exception as e:
        #     print('Failed to parse the next-page link | already on the last page, no next-page link', e)
        # if len(new_urls) > 0:
        #     for i in range(len(new_urls)):
        #         page_url = new_urls[i]
        #         print('-----> page_url', page_url)
        #         yield scrapy.Request('http://wz.sun0769.com' + page_url, callback=self.parse)

        # 'http://wz.sun0769.com/political/index/politicsNewest?id=1&page=1'
        # for url in response.xpath('/html/body/div[2]/div[3]/div[3]/div//a/@href').extract():
        #     print('-----> each url:', url)
        #     yield scrapy.Request('http://wz.sun0769.com'+url, callback=self.parse)

        # Extract the next-page link, join it into an absolute URL and follow it

    def parse_item(self, response):
        if 200 <= response.status <= 300:
            print('-----> currently parsing page:', response.url)
            li_lists = response.xpath('/html/body/div[2]/div[3]/ul[2]//li')
            for li in li_lists:
                item = SunproItem()
                sid = li.xpath('./span[1]/text()').extract_first()
                status = li.xpath('./span[2]/text()').extract_first()
                ask_title = li.xpath('./span[3]/a/text()').extract_first()
                rep_time = li.xpath('./span[4]/text()').extract_first()
                ask_time = li.xpath('./span[5]/text()').extract_first()


                item["sid"] = sid
                item["status"] = status
                item["title"] = ask_title
                item["rep_time"] = rep_time
                item["ask_time"] = ask_time
                print('-----> id:', sid)
                print('-----> status:', status)
                print('-----> title:', ask_title)
                print('-----> ask time:', ask_time)
                print('-----> reply time:', rep_time)
                # Follow the link to the detail page
                detail_url = 'http://wz.sun0769.com' + li.xpath('./span[3]/a/@href').extract_first()
                print('-----> detail page url:', detail_url)
                yield scrapy.Request(url=detail_url, meta={'item': item},
                                     callback=self.parse_detail)
        else:
            print('-----> unexpected status code:', response.status)

    def parse_detail(self, response):
        item = response.meta['item']
        details = response.xpath('//div[@class="details-box"]/pre/text()').extract_first()
        # extract_first() returns None if the node is missing, so guard before using it
        content = details.strip() if details else '未知'
        print('-----> detail content:', content)
        # Responding department (the text after the colon)
        department = response.xpath('//div[@class="mr-three clear"]/div[@class="fl politics-fl"]/text()').extract_first()
        # '/html/body/div[3]/div[2]/div[2]/div[3]/div[1]'
        if department:
            department = department.split(':')[-1].strip() or '未知'
        else:
            department = '未知'
        print('-----> department:', department)
        item['content'] = content
        item['department'] = department
        yield item
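The commented-out blocks in parse() experiment with following the site's "next page" arrow instead of hard-coding the page range. A cleaned-up sketch of that approach, reusing the XPath from those experiments (the selector may need adjusting, and response.follow requires Scrapy >= 1.4):

    # Alternative parse() for SunSpider: follow the pagination arrow instead of generating page URLs up front
    def parse(self, response):
        # Parse the complaints listed on the current page
        yield from self.parse_item(response)
        # XPath taken from the commented-out experiments above
        next_href = response.xpath('//div[@class="mr-three paging-box"]/a[2]/@href').extract_first()
        if next_href:
            yield response.follow(next_href, callback=self.parse)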

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class SunproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    sid = scrapy.Field()         # complaint ID
    status = scrapy.Field()      # processing status
    title = scrapy.Field()       # complaint title
    rep_time = scrapy.Field()    # reply time
    ask_time = scrapy.Field()    # ask time
    content = scrapy.Field()     # complaint content
    department = scrapy.Field()  # responding department

middlewares.py

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import time
import urllib3
from scrapy import signals
import requests
import random
from fake_useragent import UserAgent


class SunproSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


# http = ["123.179.161.230:41128",
#         '171.80.187.163:38367',
#         '112.240.182.233:44590',
#         '113.226.97.93:55387',
#         '117.26.221.137:52067',
#         '182.244.168.247:38904',
#         '1.199.193.249:35786',
#         '183.141.154.179:46691',
#         '183.141.154.179:46691',
#         ]


# https = ['49.70.17.204:9999',
#          '49.89.87.85:9999',
#          '49.70.85.154:9999',
#          '49.70.85.53:9999',
#          '120.83.121.172:9999',
#          '113.195.156.158:9999']

class SunproDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.
    def __init__(self):
        '''
        self.user_agent_list = [
            'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
            'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) '
            'Chrome/10.0.648.133 Safari/534.16',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER) ',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
            'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36)'
        ]
        '''

        # fake_useragent gives random UAs that are easy to fetch and kept up to date
        self.ua = UserAgent(verify_ssl=False)

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    # Downloader middleware: modify every outgoing request
    def process_request(self, request, spider):
        # Set a random User-Agent
        # request.headers['User-Agent'] = random.choice(self.user_agent_list)
        request.headers['User-Agent'] = self.ua.random

        # Random time.sleep delay to slow down crawling and reduce the risk of an IP ban
        delay = random.randint(0, 3)
        time.sleep(delay)

        # Dynamic proxy IPs: this API endpoint returns fresh proxy IPs on every call
        apiUrl = 'http://api.goubanjia.com/dynamic/get/9ec90437c17a332fb83067f5cb7539e4.html?sep=3'
        # The proxies come from http://www.goubanjia.com/ ("全网代理ip")
        # targetUrl = "http://1212.ip138.com/ic.asp";
        # Fetch the IP list
        res = requests.get(apiUrl).text.strip("\n")
        # Split the returned IPs on newlines
        ips = res.split("\n")
        # Pick one IP at random
        proxy_ip = random.choice(ips)

        # Point the request at the dynamically fetched proxy
        if request.url.startswith("http://"):
            request.meta['proxy'] = "http://%s" % proxy_ip  # HTTP proxy
        elif request.url.startswith("https://"):
            request.meta['proxy'] = "https://%s" % proxy_ip  # HTTPS proxy
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
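Calling the proxy API once for every request adds a blocking HTTP round-trip on top of the time.sleep delay and can burn through the API quota quickly. One possible refinement, not part of the original project, is to cache the proxy list and refresh it periodically; a minimal sketch (the class name, refresh interval and error handling are my assumptions):

    import random
    import time

    import requests


    class ProxyPool:
        """Cache the proxy list and refresh it at most once per interval."""

        def __init__(self, api_url, refresh_seconds=60):
            self.api_url = api_url
            self.refresh_seconds = refresh_seconds
            self.ips = []
            self.last_refresh = 0.0

        def get(self):
            now = time.time()
            if not self.ips or now - self.last_refresh > self.refresh_seconds:
                try:
                    text = requests.get(self.api_url, timeout=5).text.strip()
                    self.ips = [ip for ip in text.split("\n") if ip]
                    self.last_refresh = now
                except requests.RequestException:
                    # Keep whatever proxies we already have if the API call fails
                    pass
            return random.choice(self.ips) if self.ips else None

process_request could then call pool.get() once per request and simply skip setting request.meta['proxy'] when the pool happens to be empty.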

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
from scrapy.exceptions import DropItem
import xlrd
import xlwt
import xlutils

from xlutils.copy import copy


class SunproPipeline:
    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        if item['sid'] in self.ids_seen:
            raise DropItem('Duplicate item found: %s' % item)
        else:
            self.ids_seen.add(item['sid'])
        # Handle the scraped data:
        # 1. write it to the database
        # 2. save it to a spreadsheet
        print(item)

        return item
        # index = len(value)  # number of rows to write
        '''
        try:
            workbook = xlrd.open_workbook('详情.xls')  # open the workbook
            sheets = workbook.sheet_names()  # all sheet names in the workbook
            if len(sheets) == 0:
                sheet = workbook.add_sheet('相关信息')
            else:
                worksheet = workbook.sheet_by_name(sheets[0])  # first sheet in the workbook
            rows_old = worksheet.nrows  # number of rows already in the sheet
            new_workbook = copy(workbook)  # convert the xlrd object into an xlwt one
            new_worksheet = new_workbook.get_sheet(0)  # first sheet of the converted workbook
            value = list(item.values())
            for i in range(len(value)):
                new_worksheet.write(rows_old, i, value[i])  # append the new row after the existing rows
            new_workbook.save('详情.xls')  # save the workbook
            print("Appended the row to the xls spreadsheet!")
        except Exception as e:
            workbook = xlwt.Workbook()
            sheet = workbook.add_sheet('相关信息')
            # header row
            for i in range(len(item.keys())):
                sheet.write(0, i, list(item.keys())[i])
            # data row
            for i in range(len(item.values())):
                sheet.write(1, i, list(item.values())[i])
            workbook.save('详情.xls')

        return item
        '''


class mysqlPipeline(object):
    def __init__(self):
        # self.ids_seen = set()
        self.db = pymysql.connect(
            host='localhost',  # local database
            user='root',  # your MySQL username
            passwd='rootpwd',  # your MySQL password
            db='sun',  # database name
            charset='utf8mb4',  # character set
            cursorclass=pymysql.cursors.DictCursor)
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        '''
        :param item: the scraped item
        :param spider: the spider instance
        :return:
        '''
        # if item['sid'] in self.ids_seen:
        #     raise DropItem('Duplicate item found: %s' % item)
        # else:
        #     self.ids_seen.add(item['sid'])
        # Save the scraped data into MySQL
        # Pull the fields out of the item
        # title = item['title']
        # link = item['link']
        # posttime = item['posttime']

        sid = int(item["sid"])
        status = item["status"]
        title = item["title"]
        rep_time = item["rep_time"]
        ask_time = item["ask_time"]
        content = item['content']
        department = item['department']

        # The connection to the local database is created in __init__

        try:
            # The cursor was created in __init__
            # cursor = self.db.cursor()
            # SQL insert statement, parameterized so quotes in the scraped content cannot break the query
            sql = "INSERT INTO sun(sid, status, title, content, department, ask_time, rep_time) \
                  VALUES (%s, %s, %s, %s, %s, %s, %s)"
            print('-----> executing SQL:', sql)
            self.cursor.execute(sql, (sid, status, title, content, department, ask_time, rep_time))
            # Commit the insert
            self.db.commit()
        except Exception as e:
            print('-----> failed to store the item:', e)
            # Roll back instead of closing the connection, so later items can still be saved
            self.db.rollback()

        return item

    def close_spider(self, spider):
        # Close the cursor and the database connection when the spider finishes
        self.cursor.close()
        self.db.close()
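The INSERT above assumes a table named sun already exists in the sun database; the project does not include its schema. A one-off setup sketch that matches the columns used by the pipeline (the column types and sizes are my assumptions) could look like this:

    import pymysql

    # Hypothetical setup script; adjust credentials and column types as needed
    connection = pymysql.connect(host='localhost', user='root', passwd='rootpwd',
                                 db='sun', charset='utf8mb4')
    try:
        with connection.cursor() as cursor:
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS sun (
                    sid        INT PRIMARY KEY,   -- complaint ID
                    status     VARCHAR(32),       -- processing status
                    title      VARCHAR(255),      -- complaint title
                    content    TEXT,              -- complaint content
                    department VARCHAR(128),      -- responding department
                    ask_time   VARCHAR(32),       -- ask time
                    rep_time   VARCHAR(32)        -- reply time
                ) DEFAULT CHARSET=utf8mb4
            """)
        connection.commit()
    finally:
        connection.close()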

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for sunPro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'sunPro'

SPIDER_MODULES = ['sunPro.spiders']
NEWSPIDER_MODULE = 'sunPro.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False


LOG_LEVEL = 'ERROR'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 8
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'sunPro.middlewares.SunproSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'sunPro.middlewares.SunproDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'sunPro.pipelines.SunproPipeline': 300,
    'sunPro.pipelines.mysqlPipeline': 302,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'