爬虫案例
items.py
1 # -*- coding: utf-8 -*- 2 3 # Define here the models for your scraped items 4 # 5 # See documentation in: 6 # http://doc.scrapy.org/en/latest/topics/items.html 7 8 import scrapy 9 10 11 class CompanyItem(scrapy.Item): 12 13 # 公司id (url数字部分) 14 info_id = scrapy.Field() 15 # 公司名称 16 company_name = scrapy.Field() 17 # 公司口号 18 slogan = scrapy.Field() 19 # 分类 20 scope = scrapy.Field() 21 # 子分类 22 sub_scope = scrapy.Field() 23 24 # 所在城市 25 city = scrapy.Field() 26 # 所在区域 27 area = scrapy.Field() 28 # 公司主页 29 home_page = scrapy.Field() 30 # 公司标签 31 tags = scrapy.Field() 32 33 # 公司简介 34 company_intro = scrapy.Field() 35 # 公司全称: 36 company_full_name = scrapy.Field() 37 # 成立时间: 38 found_time = scrapy.Field() 39 # 公司规模: 40 company_size = scrapy.Field() 41 # 运营状态 42 company_status = scrapy.Field() 43 44 # 投资情况列表:包含获投时间、融资阶段、融资金额、投资公司 45 tz_info = scrapy.Field() 46 # 团队信息列表:包含成员姓名、成员职称、成员介绍 47 tm_info = scrapy.Field() 48 # 产品信息列表:包含产品名称、产品类型、产品介绍 49 pdt_info = scrapy.Field()
juzi.py
# coding:utf-8

import scrapy
from bs4 import BeautifulSoup
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule, CrawlSpider

from scrapy_redis.spiders import RedisCrawlSpider
from itjuzi.items import CompanyItem


class ITjuziSpider(RedisCrawlSpider):
    """Distributed CrawlSpider for itjuzi.com company pages.

    Start URLs are pushed to the redis list named by ``redis_key``
    (scrapy-redis); the crawl rules follow list pagination and hand every
    company detail page to :meth:`parse_item`.
    """
    name = 'itjuzi'
    allowed_domains = ['www.itjuzi.com']
    redis_key = 'itjuzispider:start_urls'

    # Static headers (including a logged-in session cookie) attached to
    # every request built from a redis start URL.
    headers = {
        "Host": "www.itjuzi.com",
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
        "Cookie": "gr_user_id=b609a4a5-8b03-4078-85b5-0bb0e617daf5; _hp2_id.2147584538=%7B%22userId%22%3A%225361264262655533%22%2C%22pageviewId%22%3A%222812157498870176%22%2C%22sessionId%22%3A%221376560908636027%22%2C%22identity%22%3Anull%2C%22trackerVersion%22%3A%223.0%22%7D; identity=123636274%40qq.com; remember_code=3zB55lODqf; acw_tc=AQAAAMOWGBUfUwgAU3Awtjx04lI+Gr0e; acw_sc=589c0d25fa257c17aedfa3127a7c3a0eac145db7; session=4f1aec94fd99840a562f39488480a9b2798824f9; Hm_lvt_1c587ad486cdb6b962e94fc2002edf89=1486565620,1486575689,1486603765,1486603844; Hm_lpvt_1c587ad486cdb6b962e94fc2002edf89=1486624109; ",
    }

    rules = [
        # follow company-list pagination
        Rule(link_extractor=LinkExtractor(allow=(r'/company\?page=\d+'))),
        Rule(link_extractor=LinkExtractor(allow=(r'/company/foreign\?page=\d+'))),
        # every company detail page goes to parse_item
        Rule(link_extractor=LinkExtractor(allow=(r'/company/\d+')), callback='parse_item')
    ]

    def make_requests_from_url(self, url):
        """Build the request for a redis start URL with the custom headers.

        dont_filter=True so re-queued start URLs are never dropped by the
        shared redis dupefilter.
        """
        return scrapy.Request(url,
                              headers=self.headers,
                              dont_filter=True)

    def parse_item(self, response):
        """Parse one company detail page into a CompanyItem."""
        print("======================" + response.url)
        soup = BeautifulSoup(response.body, 'lxml')

        # Default every field first so a page missing a section never
        # raises NameError when the item is assembled below (the previous
        # version crashed whenever cpy1/cpy2/main was absent).
        company_name = slogan = scope = sub_scope = 'NULL'
        city = area = home_page = tags = 'NULL'
        company_intro = company_full_name = 'NULL'
        found_time = company_size = company_status = 'NULL'
        tz_list, tm_list, pdt_list = [], [], []

        # Header block: //div[@class="infoheadrow-v2 ugc-block-item"]
        cpy1 = soup.find('div', class_='infoheadrow-v2')
        if cpy1:
            # company name: //span[@class="title"]/b/text()[1]
            company_name = (cpy1.find(class_='title').b.contents[0]
                            .strip().replace('\t', '').replace('\n', ''))

            # slogan: //div[@class="info-line"]/p
            slogan = cpy1.find(class_='info-line').p.get_text()

            # category + sub-category: //span[@class="scope c-gray-aset"]/a
            scope_a = cpy1.find(class_='scope c-gray-aset').find_all('a')
            scope = scope_a[0].get_text().strip() if len(scope_a) > 0 else 'NULL'
            sub_scope = scope_a[1].get_text().strip() if len(scope_a) > 1 else 'NULL'

            # city + area: //span[@class="loca c-gray-aset"]/a
            city_a = cpy1.find(class_='loca c-gray-aset').find_all('a')
            city = city_a[0].get_text().strip() if len(city_a) > 0 else 'NULL'
            area = city_a[1].get_text().strip() if len(city_a) > 1 else 'NULL'

            # homepage: //a[@class="weblink"]/@href
            home_page = cpy1.find(class_='weblink')['href']
            # tags: //div[@class="tagset dbi c-gray-aset"]/a
            # (single strip -- the old double .strip().strip() was redundant)
            tags = cpy1.find(class_='tagset dbi c-gray-aset').get_text().strip().replace('\n', ',')

        # Basic-info block: //div[@class="block-inc-info on-edit-hide"]
        cpy2 = soup.find('div', class_='block-inc-info on-edit-hide')
        if cpy2:
            # introduction: //div[@class="block-inc-info on-edit-hide"]//div[@class="des"]
            company_intro = cpy2.find(class_='des').get_text().strip()

            # full name / founded / size / status all live in .des-more;
            # fixed guards: each ternary now tests the element it actually
            # dereferences (old code always tested cpy2_content[1]/[3]).
            cpy2_content = cpy2.find(class_='des-more').contents

            # full name: //div[@class="des-more"]/div[1]
            company_full_name = cpy2_content[1].get_text().strip()[len('公司全称:'):] if cpy2_content[1] else 'NULL'
            # founding time: //div[@class="des-more"]/div[2]/span[1]
            found_time = cpy2_content[3].contents[1].get_text().strip()[len('成立时间:'):] if cpy2_content[3] else 'NULL'
            # company size: //div[@class="des-more"]/div[2]/span[2]
            company_size = cpy2_content[3].contents[3].get_text().strip()[len('公司规模:'):] if cpy2_content[3] else 'NULL'
            # operating status: //div[@class="des-more"]/div[3]/span
            company_status = cpy2_content[5].get_text().strip() if cpy2_content[5] else 'NULL'

        # Main column (guarded: old code crashed with AttributeError when
        # the page had no div.main at all).
        main = soup.find('div', class_='main')
        if main:
            # investments: //table[@class="list-round-v2 need2login"]
            # rows carry time / round / amount / investors
            tz = main.find('table', 'list-round-v2')
            if tz:
                for tr in tz.find_all('tr'):
                    all_td = tr.find_all('td')
                    tz_list.append({
                        'tz_time': all_td[0].span.get_text().strip(),
                        'tz_round': all_td[1].get_text().strip(),
                        'tz_finades': all_td[2].get_text().strip(),
                        'tz_capital': all_td[3].get_text().strip().replace('\n', ','),
                    })

            # team members: name / title / introduction
            tm = main.find('ul', class_='list-prodcase limited-itemnum')
            if tm:
                for li in tm.find_all('li'):
                    tm_list.append({
                        'tm_m_name': li.find('span', class_='c').get_text().strip(),
                        'tm_m_title': li.find('span', class_='c-gray').get_text().strip(),
                        'tm_m_intro': li.find('p', class_='mart10 person-des').get_text().strip(),
                    })

            # products: name / type / introduction
            pdt = main.find('ul', class_='list-prod limited-itemnum')
            if pdt:
                for li in pdt.find_all('li'):
                    pdt_list.append({
                        'pdt_name': li.find('h4').b.get_text().strip(),
                        'pdt_type': li.find('span', class_='tag yellow').get_text().strip(),
                        'pdt_intro': li.find(class_='on-edit-hide').p.get_text().strip(),
                    })

        item = CompanyItem()
        # trailing numeric id of the detail URL
        item['info_id'] = response.url.split('/')[-1]
        item['company_name'] = company_name
        item['slogan'] = slogan
        item['scope'] = scope
        item['sub_scope'] = sub_scope
        item['city'] = city
        item['area'] = area
        item['home_page'] = home_page
        item['tags'] = tags
        item['company_intro'] = company_intro
        item['company_full_name'] = company_full_name
        item['found_time'] = found_time
        item['company_size'] = company_size
        item['company_status'] = company_status
        item['tz_info'] = tz_list
        item['tm_info'] = tm_list
        item['pdt_info'] = pdt_list

        yield item
middlewares.py
# coding:utf8


# Downloader middlewares for the itjuzi crawler.

from settings import USER_AGENTS
import random
import time
import requests
import base64


class RandomUserAgent(object):
    """Downloader middleware that sets a random User-Agent per request."""

    def process_request(self, request, spider):
        # Pick a fresh user-agent for each outgoing request; setdefault
        # keeps any UA a spider set explicitly.
        # (Removed the unused per-request `date` timestamp the old code
        # computed via time.strftime.)
        user_agent = random.choice(USER_AGENTS)
        request.headers.setdefault('User-Agent', user_agent)


class RandomProxy(object):
    """Downloader middleware that routes each request through a random
    paid proxy fetched once from the kuaidaili API at startup.
    """

    def __init__(self):
        # SECURITY NOTE(review): credentials and order id are hard-coded;
        # move them to settings or environment variables before sharing.
        self.proxy_auth = "mr_mao_hacker:sffqry9r"
        self.proxy_api = "http://dps.kuaidaili.com/api/getdps/?orderid=958655825381063&num=50&ut=1&sep=3"
        # One proxy per line (sep=3) -> list of "host:port" strings.
        # NOTE(review): fetched once at init; a long crawl may outlive
        # these proxies -- consider periodic refresh.
        self.proxy_list = requests.get(self.proxy_api).text.split()
        # The Basic auth header is request-invariant: encode it once here
        # instead of on every process_request call as before.
        self.proxy_authorization = "Basic " + base64.b64encode(self.proxy_auth)

    def process_request(self, request, spider):
        # Attach a random proxy and its (precomputed) auth header.
        proxy = random.choice(self.proxy_list)
        request.meta['proxy'] = "http://" + proxy
        request.headers['Proxy-Authorization'] = self.proxy_authorization
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for itjuzi_dis project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'itjuzi'

SPIDER_MODULES = ['itjuzi.spiders']
NEWSPIDER_MODULE = 'itjuzi.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'itjuzi_dis (+http://www.yourdomain.com)'


# Enables scheduling storing requests queue in redis.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Ensure all spiders share same duplicates filter through redis.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# Use a FIFO (queue) order for scheduled requests
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"

# Allow pause/resume: the request records kept in redis are not lost
SCHEDULER_PERSIST = True

# REDIS_START_URLS_AS_SET = True

#COOKIES_ENABLED = True
COOKIES_DEBUG = True

DOWNLOAD_DELAY = 1.5

# Enable randomized download delay
#RANDOMIZE_DOWNLOAD_DELAY = True

# Accept HTTP error 502 response pages instead of discarding them
#HTTPERROR_ALLOWED_CODES = [502]
#REDIS_HOST = "192.168.199.107"
#REDIS_PORT = 6379

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    # 'itjuzi_dis.pipelines.DuplicatesPipeline': 200,
    # 'itjuzi_dis.pipelines.ItjuziSpiderPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 300
}

DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 100,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'itjuzi.middlewares.RandomProxy': 300,
    #'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 100,
    'itjuzi.middlewares.RandomUserAgent': 200,
}

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs

# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)


# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Host': 'www.itjuzi.com',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6'
}

# Pool of user-agents sampled by itjuzi.middlewares.RandomUserAgent
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12",
    "Mozilla/5.0 (Windows NT 5.2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1 SeaMonkey/2.7.1",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.8 (KHTML, like Gecko) Chrome/4.0.302.2 Safari/532.8",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.464.0 Safari/534.3",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.186 Safari/535.1",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.54 Safari/535.2",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
    "Mozilla/5.0 (Macintosh; U; Mac OS X Mach-O; en-US; rv:2.0a) Gecko/20040614 Firefox/3.0.0 ",
    "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.0.3) Gecko/2008092414 Firefox/3.0.3",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1) Gecko/20090624 Firefox/3.5",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.14) Gecko/20110218 AlexaToolbar/alxf-2.0 Firefox/3.6.14",
    "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"
]



# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'itjuzi_dis.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html


# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# REDIRECT_ENABLED = False



#REDIS_PARAMS = {'host':'redis','decode_responses':False}
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exceptions import DropItem

from itjuzi_dis.db_util import JuziCompany, DB_Util, JuziTeam, JuziTz, JuziProduct


class DuplicatesPipeline(object):
    """Drop items whose info_id was already seen during this crawl run."""

    def __init__(self):
        # info_ids processed so far (in-memory, per-process)
        self.ids_seen = set()

    def process_item(self, item, spider):
        if item['info_id'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        self.ids_seen.add(item['info_id'])
        return item


class ItjuziSpiderPipeline(object):
    """Persist company, investment, team and product rows via DB_Util.

    One session per item: all rows for a company commit atomically;
    on failure the transaction is rolled back and the session is always
    closed (the previous version leaked sessions and never rolled back).
    """

    def open_spider(self, spider):
        DB_Util.init_db()  # create the schema when tables do not exist yet

    def process_item(self, item, spider):
        if not item['info_id']:
            raise DropItem('item info_id is null.{0}'.format(item))

        session = DB_Util.get_session()
        try:
            company = JuziCompany()
            company.info_id = item['info_id']
            company.company_name = item['company_name']
            company.slogan = item['slogan']
            company.scope = item['scope']
            company.sub_scope = item['sub_scope']
            company.city = item['city']
            company.area = item['area']
            company.home_page = item['home_page']
            company.tags = item['tags']
            company.company_intro = item['company_intro']
            company.company_full_name = item['company_full_name']
            company.found_time = item['found_time']
            company.company_size = item['company_size']
            company.company_status = item['company_status']
            session.add(company)

            # investment rounds
            for touzi in item['tz_info'] or []:
                tz = JuziTz()
                tz.company_id = company.info_id
                tz.tz_time = touzi['tz_time']
                tz.tz_finades = touzi['tz_finades']
                tz.tz_capital = touzi['tz_capital']
                tz.tz_round = touzi['tz_round']
                session.add(tz)

            # team members
            for team in item['tm_info'] or []:
                tm = JuziTeam()
                tm.company_id = company.info_id
                tm.tm_m_name = team['tm_m_name']
                tm.tm_m_title = team['tm_m_title']
                tm.tm_m_intro = team['tm_m_intro']
                session.add(tm)

            # products
            for product in item['pdt_info'] or []:
                pdt = JuziProduct()
                pdt.company_id = company.info_id
                pdt.pdt_name = product['pdt_name']
                pdt.pdt_type = product['pdt_type']
                pdt.pdt_intro = product['pdt_intro']
                session.add(pdt)

            session.commit()
        except Exception:
            # Undo the partial insert, then let the error surface to scrapy.
            session.rollback()
            raise
        finally:
            session.close()
        return item
posted on 2020-03-28 00:14 cherry_ning 阅读(147) 评论(0) 收藏 举报
浙公网安备 33010602011771号