- Pick the target website you want to crawl
- Install the XPath Helper extension in the Chrome browser; it makes it much easier to pick out the content you want to crawl
Create the spider project in PyCharm
- scrapy startproject PHIC (the project name)  # create the project
- scrapy genspider lpp (the spider name) www.phic.org.cn (the site's domain)  # create the spider file
- The generated project mainly uses the following parts (the layout is sketched below):
  - spiders/: where the spider files are written
  - items.py: behaves like a dictionary; every field has the single type scrapy.Field(), e.g. title = scrapy.Field(), and this is where the crawled fields are defined
  - pipelines.py: stores the crawled data
  - settings.py: the project's configuration
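For reference, the layout generated by scrapy startproject PHIC plus scrapy genspider looks roughly like this (exact files may differ slightly between Scrapy versions):

```
PHIC/
├── scrapy.cfg                # project entry point / deploy config
└── PHIC/
    ├── items.py              # field definitions
    ├── middlewares.py        # downloader/spider middlewares (not touched here)
    ├── pipelines.py          # item storage
    ├── settings.py           # project configuration
    └── spiders/
        └── lpp.py            # the generated spider file
```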
- Work out the XPath expression for each piece of content you want to crawl (an XPath that locates the element in the browser sometimes still extracts nothing, so you may need to analyze the page again and adjust it)
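One way to verify an XPath before putting it into the spider is scrapy shell; a quick sketch using the same page and the cause-of-death column XPath from the spider below:

```
scrapy shell 'http://www.phic.org.cn/tjsj/wssjzy/yljgjwsfwqk/202104/t20210409_295837.html'
>>> response.xpath('//div[@class="TRS_Editor"]//table//tr//td[2]//span/text()').getall()
```

An empty list back from getall() means the XPath needs another look.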
lpp.py in spiders/ (the spider file):
```python
import scrapy
from PHIC.items import PhicItem


class Lpp2Spider(scrapy.Spider):
    name = 'lpp2'
    allowed_domains = ['www.phic.org.cn']
    start_urls = ['http://www.phic.org.cn/tjsj/wssjzy/yljgjwsfwqk/202104/t20210409_295837.html']

    def parse(self, response):
        selectors = response.xpath('//div[@class="TRS_Editor"]')
        for selector in selectors:
            item = PhicItem()
            # .getall() is the same as the older .extract()
            item['numbers'] = selector.xpath('./table//tr//td[1]//span/text()').getall()           # rank
            item['Deathname'] = selector.xpath('./table//tr//td[2]//span/text()').getall()         # cause of death
            item['Mortality'] = selector.xpath('./table//tr//td[3]//span/text()').getall()         # mortality rate
            item['constitute'] = selector.xpath('./table//tr//td[4]//span/text()').getall()        # proportion
            item['MaleDeathname'] = selector.xpath('./table//tr//td[5]//span/text()').getall()     # male cause of death
            item['MaleMortality'] = selector.xpath('./table//tr//td[6]//span/text()').getall()     # male mortality rate
            item['Maleconstitute'] = selector.xpath('./table//tr//td[7]//span/text()').getall()    # male proportion
            item['FemaleDeathname'] = selector.xpath('./table//tr//td[8]//span/text()').getall()   # female cause of death
            item['FemaleMortality'] = selector.xpath('./table//tr//td[9]//span/text()').getall()   # female mortality rate
            item['Femaleconstitute'] = selector.xpath('./table//tr//td[10]//span/text()').getall() # female proportion
            print(item)
            yield item
```
Writing the items file (items.py):
```python
import scrapy


class PhicItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    numbers = scrapy.Field()
    Deathname = scrapy.Field()
    Mortality = scrapy.Field()
    constitute = scrapy.Field()
    MaleDeathname = scrapy.Field()
    MaleMortality = scrapy.Field()
    Maleconstitute = scrapy.Field()
    FemaleDeathname = scrapy.Field()
    FemaleMortality = scrapy.Field()
    Femaleconstitute = scrapy.Field()
```
Writing the pipeline file (pipelines.py):
```python
import pymysql


class PHICPipeline:
    def __init__(self):
        # connect to the MySQL database
        self.connect = pymysql.connect(
            host='localhost',   # host name
            user='root',        # user name
            password='',        # password
            database='',        # database name
            charset='utf8',     # character set
        )
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        sql = ("insert into phic(Deathname, Mortality, constitute, "
               "MaleDeathname, MaleMortality, Maleconstitute, "
               "FemaleDeathname, FemaleMortality, Femaleconstitute) "
               "values(%s, %s, %s, %s, %s, %s, %s, %s, %s)")
        self.cursor.execute(sql, (item['Deathname'],
                                  item['Mortality'],
                                  item['constitute'],
                                  item['MaleDeathname'],
                                  item['MaleMortality'],
                                  item['Maleconstitute'],
                                  item['FemaleDeathname'],
                                  item['FemaleMortality'],
                                  item['Femaleconstitute']))
        self.connect.commit()
        return item

    # called when the spider closes
    def close_spider(self, spider):
        self.cursor.close()     # close the cursor
        self.connect.close()    # close the connection
```
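One step the notes do not show explicitly: Scrapy only calls this pipeline if it is enabled in settings.py. The entry below uses the same path and priority (300) that appear, commented out, in the scrapy-redis settings later on:

```python
# settings.py
ITEM_PIPELINES = {
    'PHIC.pipelines.PHICPipeline': 300,
}
```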
- cd into the project's directory and run the spider: scrapy crawl lpp2 (the spider name)
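Before relying on the MySQL pipeline, a quick way to sanity-check what the spider yields is Scrapy's -o feed export (the output file name here is arbitrary):

```
scrapy crawl lpp2 -o phic_items.json
```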
Problems encountered
Problem 1: at run time, No module named 'xxx'
Fix: right-click the project directory in PyCharm and choose Mark Directory as Sources Root
Problem 2: a data-type mismatch when inserting into the database
The crawled data contained empty and garbled values, so I trimmed the fields; and since each field is a list, I pop one value off each list per row at commit time (in the same order as the SQL columns):
```python
# inside process_item, replacing the single execute above:
# each field is a list, so insert one row at a time, popping one value
# per column in the same order as the SQL columns
flag = True
while flag:
    if item['Deathname'] != []:
        self.cursor.execute(sql, (item['Deathname'].pop(0),
                                  item['Mortality'].pop(0),
                                  item['constitute'].pop(0),
                                  item['MaleDeathname'].pop(0),
                                  item['MaleMortality'].pop(0),
                                  item['Maleconstitute'].pop(0),
                                  item['FemaleDeathname'].pop(0),
                                  item['FemaleMortality'].pop(0),
                                  item['Femaleconstitute'].pop(0)))
        self.connect.commit()
    else:
        break
```
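An alternative sketch of the same row-wise insert, assuming all nine field lists stay the same length (i.e. the table columns are aligned): zip the lists into rows and let executemany do the looping.

```python
# inside process_item, as a drop-in replacement for the while/pop loop
rows = list(zip(item['Deathname'], item['Mortality'], item['constitute'],
                item['MaleDeathname'], item['MaleMortality'], item['Maleconstitute'],
                item['FemaleDeathname'], item['FemaleMortality'], item['Femaleconstitute']))
self.cursor.executemany(sql, rows)  # one insert per row, columns in the same order as the SQL
self.connect.commit()
```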
With that, the data finally made it into the MySQL database.
scrapy-redis: file changes and running the distributed crawler
Changes needed to convert the Scrapy-based spider to scrapy-redis
Changes to the spider file (XX.py)
- from scrapy_redis.spiders import RedisSpider  (import the scrapy-redis spider class)
- Change class HospitalgetSpider(scrapy.Spider): to class HospitalgetSpider(RedisSpider):
- Comment out the domain list (allowed_domains) and the start URLs (start_urls); a sketch of the modified spider follows this list
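Putting the three changes together, the top of the PHIC2 spider ends up looking roughly like this; the redis_key value is my assumption, chosen to match the lpush lpp2 command used later:

```python
from scrapy_redis.spiders import RedisSpider
from PHIC.items import PhicItem


class Lpp2Spider(RedisSpider):                # was: scrapy.Spider
    name = 'lpp2'
    redis_key = 'lpp2'                        # assumed: the Redis list that lpush fills
    # allowed_domains = ['www.phic.org.cn']   # commented out
    # start_urls = ['http://...']             # commented out; the start URL now comes from Redis

    def parse(self, response):
        # parsing logic unchanged from the original spider
        ...
```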
Changes to the settings file (settings.py):
```python
SPIDER_MODULES = ['PHIC.spiders']
NEWSPIDER_MODULE = 'PHIC.spiders'

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36'

ROBOTSTXT_OBEY = False

# use scrapy-redis's request-fingerprint de-duplication filter
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# use scrapy-redis's scheduler, which talks to the Redis database
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# keep the de-dup set and the request queue in Redis when the spider finishes
SCHEDULER_PERSIST = True
# optional queue types:
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"

ITEM_PIPELINES = {
    # 'PHIC.pipelines.PHICPipeline': 300,
    # when enabled, this pipeline stores the scraped items in Redis
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

# the Redis database (the address is the Redis server's IP)
REDIS_URL = "redis://192.168.153.129:6379"

# LOG_LEVEL = 'DEBUG'

# Introduce an artificial delay to make use of parallelism, to speed up the crawl.
DOWNLOAD_DELAY = 1
```
Running the scrapy-redis spider files
1. Start the spiders
- cd into each spider's directory:
1. cd ./PHICFile/PHIC/spiders
2. cd ./PHIC4/PHIC4/spiders
3. cd ./PHIC3/PHIC3/spiders
4. cd ./PHIC2/PHIC2/spiders
5. cd ./HPI/HPI/spiders
- Run the command: scrapy runspider xxx.py (the spider file name)
1. scrapy runspider PhicSpider.py
2. scrapy runspider lpp3.py
3. scrapy runspider lpp2.py
4. scrapy runspider lpp.py
5. scrapy runspider hospitalget.py
2. Push the start_url into Redis
- Start redis-server:
  go to the directory holding the Redis binaries:
  cd /usr/local/bin
  auto-start is not configured, so start the server with its config file:
  ./redis-server /etc/redis/6379.conf
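Before pushing any URLs, it is worth confirming the server is reachable (just a sanity check, not part of the original steps); the IP is the one used in REDIS_URL above:

```
redis-cli -h 192.168.153.129 ping    # should answer PONG
```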
- In redis-cli, push the start URL onto each spider's redis_key: lpush redis_key start_url
1. PHIC:  lpush lpp 'http://www.phic.org.cn/tjsj/wssjzy/jkzb/202104/t20210409_295837.html'
2. PHIC2: lpush lpp2 'http://www.phic.org.cn/tjsj/wssjzy/yljgjwsfwqk/202104/t20210409_295827.html'
3. PHIC3: lpush lpp3 'http://www.phic.org.cn/tjsj/wssjzy/yljgjwsfwqk/202104/t20210409_295828.html'
4. PHIC4: lpush lpp4 'http://www.phic.org.cn/tjsj/wssjzy/yljgjwsfwqk/202104/t20210409_295826.html'
5. HPI:   lpush lpp1 'http://www.mingyihui.net/beijingshi_hospital_avi1_1.html'
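After the lpush, the queue can be inspected from redis-cli (again a sanity check, not in the original notes). Right after the push the list holds the one URL; once a waiting spider has picked it up, the list is empty again:

```
llen lpp2    # 1 right after the push, 0 once a spider has consumed it
```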