爬虫案例

items.py

 1 # -*- coding: utf-8 -*-
 2 
 3 # Define here the models for your scraped items
 4 #
 5 # See documentation in:
 6 # http://doc.scrapy.org/en/latest/topics/items.html
 7 
 8 import scrapy
 9 
10 
class CompanyItem(scrapy.Item):
    """One company profile scraped from an itjuzi.com detail page."""

    # Company id (the numeric part of the detail-page URL).
    info_id = scrapy.Field()
    # Company name.
    company_name = scrapy.Field()
    # Company slogan.
    slogan = scrapy.Field()
    # Business category.
    scope = scrapy.Field()
    # Business sub-category.
    sub_scope = scrapy.Field()

    # City the company is located in.
    city = scrapy.Field()
    # District/area within the city.
    area = scrapy.Field()
    # Company homepage URL.
    home_page = scrapy.Field()
    # Company tags (comma-separated string).
    tags = scrapy.Field()

    # Company introduction text.
    company_intro = scrapy.Field()
    # Full registered company name.
    company_full_name = scrapy.Field()
    # Founding date.
    found_time = scrapy.Field()
    # Company size (headcount range).
    company_size = scrapy.Field()
    # Operating status.
    company_status = scrapy.Field()

    # Investment rounds: list of dicts with time, round, amount, investors.
    tz_info = scrapy.Field()
    # Team members: list of dicts with name, title, introduction.
    tm_info = scrapy.Field()
    # Products: list of dicts with product name, type, introduction.
    pdt_info = scrapy.Field()
View Code

 

juzi.py

  1 # coding:utf-8
  2 
  3 import scrapy
  4 from bs4 import BeautifulSoup
  5 from scrapy.linkextractors import LinkExtractor
  6 from scrapy.spiders import Rule, CrawlSpider
  7 
  8 from scrapy_redis.spiders import RedisCrawlSpider
  9 from itjuzi.items import CompanyItem
 10 
 11 
 12 #class ITjuziSpider(RedisCrawlSpider):
 13 class ITjuziSpider(RedisCrawlSpider):
 14     name = 'itjuzi'
 15     allowed_domains = ['www.itjuzi.com']
 16     #start_urls = ['http://www.itjuzi.com/company?page=1/']
 17     redis_key = 'itjuzispider:start_urls'
 18 
 19     headers = {
 20         "Host" : "www.itjuzi.com",
 21         "Connection" : "keep-alive",
 22         #"Upgrade-Insecure-Requests" : "1",
 23         "User-Agent" : "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
 24         "Accept" : "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
 25         "Accept-Encoding" : "gzip, deflate, sdch",
 26         "Accept-Language" : "zh-CN,zh;q=0.8,en;q=0.6",
 27         #"Referer" : "http://www.itjuzi.com/company",
 28         "Cookie" : "gr_user_id=b609a4a5-8b03-4078-85b5-0bb0e617daf5; _hp2_id.2147584538=%7B%22userId%22%3A%225361264262655533%22%2C%22pageviewId%22%3A%222812157498870176%22%2C%22sessionId%22%3A%221376560908636027%22%2C%22identity%22%3Anull%2C%22trackerVersion%22%3A%223.0%22%7D; identity=123636274%40qq.com; remember_code=3zB55lODqf; acw_tc=AQAAAMOWGBUfUwgAU3Awtjx04lI+Gr0e; acw_sc=589c0d25fa257c17aedfa3127a7c3a0eac145db7; session=4f1aec94fd99840a562f39488480a9b2798824f9; Hm_lvt_1c587ad486cdb6b962e94fc2002edf89=1486565620,1486575689,1486603765,1486603844; Hm_lpvt_1c587ad486cdb6b962e94fc2002edf89=1486624109; ",
 29         #"If-Modified-Since" : "Thu, 09 Feb 2017 03:35:25 GMT"
 30 
 31     }
 32 
 33     rules = [
 34         # 获取每一页的链接
 35         Rule(link_extractor=LinkExtractor(allow=('/company\?page=\d+'))),
 36         Rule(link_extractor=LinkExtractor(allow=('/company/foreign\?page=\d+'))),
 37         # 获取每一个公司的详情
 38         Rule(link_extractor=LinkExtractor(allow=('/company/\d+')), callback='parse_item')
 39     ]
 40 
 41 
 42     def make_requests_from_url(self, url):
 43         return scrapy.Request(url,
 44                 headers = self.headers,
 45                 dont_filter = True)
 46 
 47 
 48     def parse_item(self, response):
 49         print "======================" + response.url
 50         soup = BeautifulSoup(response.body, 'lxml')
 51 
 52         # 开头部分: //div[@class="infoheadrow-v2 ugc-block-item"]
 53         cpy1 = soup.find('div', class_='infoheadrow-v2')
 54         if cpy1:
 55             # 公司名称://span[@class="title"]/b/text()[1]
 56             company_name = cpy1.find(class_='title').b.contents[0].strip().replace('\t', '').replace('\n', '')
 57 
 58             # 口号: //div[@class="info-line"]/p
 59             slogan = cpy1.find(class_='info-line').p.get_text()
 60 
 61             # 分类:子分类//span[@class="scope c-gray-aset"]/a[1]
 62             scope_a = cpy1.find(class_='scope c-gray-aset').find_all('a')
 63             # 分类://span[@class="scope c-gray-aset"]/a[1]
 64             scope = scope_a[0].get_text().strip() if len(scope_a) > 0 else 'NULL'
 65             # 子分类:# //span[@class="scope c-gray-aset"]/a[2]
 66             sub_scope = scope_a[1].get_text().strip() if len(scope_a) > 1 else 'NULL'
 67 
 68             # 城市+区域://span[@class="loca c-gray-aset"]/a
 69             city_a = cpy1.find(class_='loca c-gray-aset').find_all('a')
 70             # 城市://span[@class="loca c-gray-aset"]/a[1]
 71             city = city_a[0].get_text().strip() if len(city_a) > 0 else 'NULL'
 72             # 区域://span[@class="loca c-gray-aset"]/a[2]
 73             area = city_a[1].get_text().strip() if len(city_a) > 1 else 'NULL'
 74 
 75             # 主页://a[@class="weblink"]/@href
 76             home_page = cpy1.find(class_='weblink')['href']
 77             # 标签://div[@class="tagset dbi c-gray-aset"]/a
 78             tags = cpy1.find(class_='tagset dbi c-gray-aset').get_text().strip().strip().replace('\n', ',')
 79 
 80         #基本信息://div[@class="block-inc-info on-edit-hide"]
 81         cpy2 = soup.find('div', class_='block-inc-info on-edit-hide')
 82         if cpy2:
 83 
 84             # 公司简介://div[@class="block-inc-info on-edit-hide"]//div[@class="des"]
 85             company_intro = cpy2.find(class_='des').get_text().strip()
 86 
 87             # 公司全称:成立时间:公司规模:运行状态://div[@class="des-more"]
 88             cpy2_content = cpy2.find(class_='des-more').contents
 89 
 90             # 公司全称://div[@class="des-more"]/div[1]
 91             company_full_name = cpy2_content[1].get_text().strip()[len('公司全称:'):] if cpy2_content[1] else 'NULL'
 92 
 93             # 成立时间://div[@class="des-more"]/div[2]/span[1]
 94             found_time = cpy2_content[3].contents[1].get_text().strip()[len('成立时间:'):] if cpy2_content[1] else 'NULL'
 95 
 96             # 公司规模://div[@class="des-more"]/div[2]/span[2]
 97             company_size = cpy2_content[3].contents[3].get_text().strip()[len('公司规模:'):] if cpy2_content[1] else 'NULL'
 98 
 99             #运营状态://div[@class="des-more"]/div[3]/span
100             company_status = cpy2_content[5].get_text().strip() if cpy2_content[3] else 'NULL'
101 
102         # 主体信息:
103         main = soup.find('div', class_='main')
104 
105         # 投资情况://table[@class="list-round-v2 need2login"]
106           # 投资情况,包含获投时间、融资阶段、融资金额、投资公司
107         tz = main.find('table', 'list-round-v2')
108         tz_list = []
109         if tz:
110             # 找出投资情况下所有的tr
111             all_tr = tz.find_all('tr')
112             for tr in all_tr:
113                 tz_dict = {}
114                 all_td = tr.find_all('td')
115                 # 投资时间
116                 tz_dict['tz_time'] = all_td[0].span.get_text().strip()
117                 # 融资阶段
118                 tz_dict['tz_round'] = all_td[1].get_text().strip()
119                 # 融资金额
120                 tz_dict['tz_finades'] = all_td[2].get_text().strip()
121                 # 投资公司
122                 tz_dict['tz_capital'] = all_td[3].get_text().strip().replace('\n', ',')
123                 tz_list.append(tz_dict)
124 
125         #tz_list = []
126         # 团队信息:成员姓名、成员职称、成员介绍
127         tm = main.find('ul', class_='list-prodcase limited-itemnum')
128         tm_list = []
129         if tm:
130             for li in tm.find_all('li'):
131                 tm_dict = {}
132                 # 成员姓名
133                 tm_dict['tm_m_name'] = li.find('span', class_='c').get_text().strip()
134                 #成员职称
135                 tm_dict['tm_m_title'] = li.find('span', class_='c-gray').get_text().strip()
136                 # 成员信息
137                 tm_dict['tm_m_intro'] = li.find('p', class_='mart10 person-des').get_text().strip()
138                 tm_list.append(tm_dict)
139 
140         # 产品信息:产品名称、产品类型、产品介绍
141         pdt = main.find('ul', class_='list-prod limited-itemnum')
142         pdt_list = []
143         if pdt:
144             for li in pdt.find_all('li'):
145                 pdt_dict = {}
146                 pdt_dict['pdt_name'] = li.find('h4').b.get_text().strip()
147                 pdt_dict['pdt_type'] = li.find('span', class_='tag yellow').get_text().strip()
148                 pdt_dict['pdt_intro'] = li.find(class_='on-edit-hide').p.get_text().strip()
149                 pdt_list.append(pdt_dict)
150 
151         item = CompanyItem()
152         #取出后面的数字编号
153         item['info_id'] = response.url.split('/')[-1:][0]
154         item['company_name'] = company_name
155         item['slogan'] = slogan
156         item['scope'] = scope
157         item['sub_scope'] = sub_scope
158         item['city'] = city
159         item['area'] = area
160         item['home_page'] = home_page
161         item['tags'] = tags
162         item['company_intro'] = company_intro
163         item['company_full_name'] = company_full_name
164         item['found_time'] = found_time
165         item['company_size'] = company_size
166         item['company_status'] = company_status
167         item['tz_info'] = tz_list
168         item['tm_info'] = tm_list
169         item['pdt_info'] = pdt_list
170 
171         yield item
View Code

 

middlewares.py

 1 # coding:utf8
 2 
 3 
 4 
 5 # Start your middleware class
 6 
 7 #from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
 8 from settings import USER_AGENTS
 9 import random
10 import time
11 import requests
12 import base64
13 
14 
15 # User-Agetn 下载中间件
# User-Agent downloader middleware
class RandomUserAgent(object):
    """Assign a random User-Agent (from settings.USER_AGENTS) per request."""

    def process_request(self, request, spider):
        # Pick a UA at random from the configured pool.
        user_agent = random.choice(USER_AGENTS)
        # setdefault keeps an explicitly-set User-Agent intact.
        request.headers.setdefault('User-Agent', user_agent)
24 
class RandomProxy(object):
    """Route each request through a random proxy from a paid proxy pool."""

    def __init__(self):
        # NOTE(review): credentials and the API order id are hard-coded;
        # they belong in settings (or the environment), not in source.
        self.proxy_auth = "mr_mao_hacker:sffqry9r"
        self.proxy_api = "http://dps.kuaidaili.com/api/getdps/?orderid=958655825381063&num=50&ut=1&sep=3"
        # Fetch the proxy pool once at startup (one "ip:port" per line).
        self.proxy_list = requests.get(self.proxy_api).text.split()

    def process_request(self, request, spider):
        proxy = random.choice(self.proxy_list)
        # b64encode requires bytes and returns bytes; decode back to str
        # so the header value concatenation works on Python 3 as well.
        base64_userpass = base64.b64encode(self.proxy_auth.encode('utf-8')).decode('ascii')
        request.meta['proxy'] = "http://" + proxy
        request.headers['Proxy-Authorization'] = "Basic " + base64_userpass
39 
40 
41 #class ProxyMiddleware(object):
42     # overwrite process request
43     #def process_request(self, request, spider):
44         # Set the location of the proxy
45     #    sql = 'select ip,port from t_proxy_ip t where t.is_valid =1'
46     #    result = SqlUtil.query_all(sql)
47     #    ip_port = random.choice(result)
48     #    logging.info(ip_port)
49     #    request.meta['proxy'] = "http://{0}:{1}".format(ip_port['ip'], ip_port['port'])
50         # # Use the following lines if your proxy requires authentication
51         # proxy_user_pass = "USERNAME:PASSWORD"
52         # # setup basic authentication for the proxy
53         # encoded_user_pass = base64.encodestring(proxy_user_pass)
54         # request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
View Code

 

settings.py

  1 # -*- coding: utf-8 -*-
  2 
  3 # Scrapy settings for itjuzi_dis project
  4 #
  5 # For simplicity, this file contains only settings considered important or
  6 # commonly used. You can find more settings consulting the documentation:
  7 #
  8 #     http://doc.scrapy.org/en/latest/topics/settings.html
  9 #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
 10 #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
 11 
BOT_NAME = 'itjuzi'

SPIDER_MODULES = ['itjuzi.spiders']
NEWSPIDER_MODULE = 'itjuzi.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'itjuzi_dis (+http://www.yourdomain.com)'


# Enables scheduling: store the request queue in redis (scrapy-redis).
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Ensure all spiders share the same duplicates filter through redis.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# Use a FIFO queue for scheduled requests (per-spider redis queue).
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"

# Allow pause/resume: keep the redis request queue and dupefilter
# instead of clearing them when the spider closes.
SCHEDULER_PERSIST = True

# REDIS_START_URLS_AS_SET = True

#COOKIES_ENABLED = True
COOKIES_DEBUG = True

DOWNLOAD_DELAY = 1.5

# Randomize the download delay between requests.
#RANDOMIZE_DOWNLOAD_DELAY = True

# Let HTTP 502 responses reach the spider callbacks instead of being dropped.
#HTTPERROR_ALLOWED_CODES = [502]
#REDIS_HOST = "192.168.199.107"
#REDIS_PORT = 6379

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
#    'itjuzi_dis.pipelines.DuplicatesPipeline': 200,
#    'itjuzi_dis.pipelines.ItjuziSpiderPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 300
}

DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 100,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'itjuzi.middlewares.RandomProxy': 300,
    #'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 100,
    'itjuzi.middlewares.RandomUserAgent': 200,
}

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs

# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)


# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Host': 'www.itjuzi.com',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6'
}

# User-Agent pool consumed by itjuzi.middlewares.RandomUserAgent.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12",
    "Mozilla/5.0 (Windows NT 5.2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1 SeaMonkey/2.7.1",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.8 (KHTML, like Gecko) Chrome/4.0.302.2 Safari/532.8",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.464.0 Safari/534.3",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.186 Safari/535.1",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.54 Safari/535.2",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
    "Mozilla/5.0 (Macintosh; U; Mac OS X Mach-O; en-US; rv:2.0a) Gecko/20040614 Firefox/3.0.0 ",
    "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.0.3) Gecko/2008092414 Firefox/3.0.3",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1) Gecko/20090624 Firefox/3.5",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.14) Gecko/20110218 AlexaToolbar/alxf-2.0 Firefox/3.6.14",
    "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"
]



# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'itjuzi_dis.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html


# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# REDIRECT_ENABLED = False



#REDIS_PARAMS = {'host':'redis','decode_responses':False}
View Code

 

pipelines.py

 1 # -*- coding: utf-8 -*-
 2 
 3 # Define your item pipelines here
 4 #
 5 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 from scrapy.exceptions import DropItem
 8 
 9 from itjuzi_dis.db_util import JuziCompany,DB_Util,JuziTeam,JuziTz,JuziProduct
10 
11 
12 # 去重复的 company
class DuplicatesPipeline(object):
    """Drop any item whose info_id was already emitted during this crawl."""

    def __init__(self):
        # info_ids of every company seen so far.
        self.ids_seen = set()

    def process_item(self, item, spider):
        info_id = item['info_id']
        if info_id not in self.ids_seen:
            self.ids_seen.add(info_id)
            return item
        raise DropItem("Duplicate item found: %s" % item)
24 
25 
class ItjuziSpiderPipeline(object):
    """Persist each CompanyItem -- and its nested investment / team /
    product records -- through the DB_Util session layer."""

    # CompanyItem keys copied one-to-one onto the JuziCompany row.
    _COMPANY_FIELDS = (
        'company_name', 'slogan', 'scope', 'sub_scope', 'city', 'area',
        'home_page', 'tags', 'company_intro', 'company_full_name',
        'found_time', 'company_size', 'company_status', 'info_id',
    )

    def open_spider(self, spider):
        # Initialize the table structure when the tables do not exist yet.
        DB_Util.init_db()

    def process_item(self, item, spider):
        if not item['info_id']:
            raise DropItem('item info_id is null.{0}'.format(item))

        session = DB_Util.get_session()

        company = JuziCompany()
        for field in self._COMPANY_FIELDS:
            setattr(company, field, item[field])
        session.add(company)

        # Investment rounds.
        for row in item['tz_info'] or []:
            invest = JuziTz()
            invest.company_id = company.info_id
            invest.tz_time = row['tz_time']
            invest.tz_finades = row['tz_finades']
            invest.tz_capital = row['tz_capital']
            invest.tz_round = row['tz_round']
            session.add(invest)

        # Team members.
        for row in item['tm_info'] or []:
            member = JuziTeam()
            member.company_id = company.info_id
            member.tm_m_name = row['tm_m_name']
            member.tm_m_title = row['tm_m_title']
            member.tm_m_intro = row['tm_m_intro']
            session.add(member)

        # Products.
        for row in item['pdt_info'] or []:
            product = JuziProduct()
            product.company_id = company.info_id
            product.pdt_name = row['pdt_name']
            product.pdt_type = row['pdt_type']
            product.pdt_intro = row['pdt_intro']
            session.add(product)

        session.commit()
        return item
View Code

 

posted on 2020-03-28 00:14  cherry_ning  阅读(147)  评论(0)    收藏  举报

导航