Using Scrapy to Crawl Sites That Require a Login
Method 1: Log in with a username and password, then crawl. (If the login form uses an image CAPTCHA or similar, extra handling is needed; see the sketch after the code below.)
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
from scrapy.http import Request, FormRequest
from scrapy.spiders import CrawlSpider

from spider_test.items import SpiderTestItem
from spider_test import settings


class ScrapyTestSpider(CrawlSpider):
    name = "spider_test"
    # SPIDER_DOMAIN is a custom entry in settings.py; its value looks like www.demo.com
    allowed_domains = [settings.SPIDER_DOMAIN]

    def start_requests(self):
        """First request the login page with the cookiejar enabled, so we are
        issued a session cookie, then hand off to the login callback."""
        yield Request('http://%s/admin/account/login.html' % settings.SPIDER_DOMAIN,
                      meta={'cookiejar': 1},
                      callback=self.parse)

    def parse(self, response):
        data = dict(username="xiaoming",  # account field of the login form
                    password="888888")    # password field of the login form
        print('Logging in...')
        # Second request: POST the form, carrying the session cookie and the
        # login credentials, so the server authorizes the cookie.
        yield FormRequest(url='http://%s/admin/account/dologin.html' % settings.SPIDER_DOMAIN,  # the real POST target
                          meta={'cookiejar': 1},
                          formdata=data,
                          callback=self.jump_list)

    def jump_list(self, response):
        print('Requesting a page that requires login...')
        yield Request('http://%s/admin/office/getofficelist.html' % settings.SPIDER_DOMAIN,
                      meta={'cookiejar': 1},
                      callback=self.parser_list)

    def parser_list(self, response):
        soup = BeautifulSoup(response.body, 'html.parser')
        # Follow every pagination link (guard against pages with no pagination bar).
        pagination = soup.find(attrs={'class': 'pagination'})
        if pagination:
            for page in pagination.find_all('a'):
                page_url = 'http://%s%s' % (settings.SPIDER_DOMAIN, page.get('href'))
                yield Request(page_url, meta={'cookiejar': 1}, callback=self.parser_list)
        # Follow every link to a detail page.
        for office in soup.find_all('a', attrs={'class': 'ui-office-list'}):
            office_url = 'http://%s%s' % (settings.SPIDER_DOMAIN, office.attrs['href'])
            yield Request(office_url, meta={'cookiejar': 1}, callback=self.parse_article)

    def parse_article(self, response):
        test_item = SpiderTestItem()
        soup = BeautifulSoup(response.body, 'html.parser')
        container = soup.find('table', attrs={'class': 'index-statistics-table'})
        test_item['source_url'] = response.url
        test_item['title'] = soup.title.get_text()
        test_item['article_content'] = container.prettify()
        return test_item
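The spider reads SPIDER_DOMAIN from the project's settings.py. For reference, a minimal definition is a single constant; the placeholder value below is taken from the comment in the code above:

# spider_test/settings.py (excerpt)
SPIDER_DOMAIN = 'www.demo.com'  # host of the target site, without the http:// scheme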
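If the login form also asks for an image CAPTCHA, the simplest workaround is to download the image and type the code in by hand. The sketch below replaces parse() in the spider above; the /admin/account/captcha.html URL and the captcha form field are illustrative assumptions, not taken from a real site.

    def parse(self, response):
        # Fetch the CAPTCHA image with the same cookiejar, so the code we
        # solve belongs to our session. (URL is a hypothetical example.)
        yield Request('http://%s/admin/account/captcha.html' % settings.SPIDER_DOMAIN,
                      meta={'cookiejar': 1},
                      callback=self.solve_captcha)

    def solve_captcha(self, response):
        # Save the image to disk and ask the operator to read it off.
        # Blocking on keyboard input is acceptable for a one-off login.
        with open('captcha.jpg', 'wb') as f:
            f.write(response.body)
        code = raw_input('Enter the CAPTCHA shown in captcha.jpg: ')  # input() on Python 3
        data = dict(username="xiaoming",
                    password="888888",
                    captcha=code)  # 'captcha' is an assumed form field name
        yield FormRequest(url='http://%s/admin/account/dologin.html' % settings.SPIDER_DOMAIN,
                          meta={'cookiejar': 1},
                          formdata=data,
                          callback=self.jump_list)

Automated recognition (an OCR library or a solving service) can be swapped in for the manual prompt, but typing the code by hand is the most reliable baseline.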
Method 2: Crawl with the cookies of an already-logged-in session (this sidesteps image CAPTCHAs).
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
from scrapy.http import Request
from scrapy.spiders import CrawlSpider

from spider_test.items import SpiderTestItem
from spider_test import settings


class ScrapyTestSpider(CrawlSpider):
    name = "spider_test"
    allowed_domains = [settings.SPIDER_DOMAIN]
    cookies = dict(PHPSESSID='qwertyuiopasdfghjklzxcvbnm')  # session ID obtained after logging in

    def start_requests(self):
        print('Requesting a page that requires login...')
        yield Request('http://%s/admin/office/getofficelist.html' % settings.SPIDER_DOMAIN,
                      cookies=self.cookies,
                      callback=self.parser_list)

    def parser_list(self, response):
        soup = BeautifulSoup(response.body, 'html.parser')
        # Follow every pagination link (guard against pages with no pagination bar).
        pagination = soup.find(attrs={'class': 'pagination'})
        if pagination:
            for page in pagination.find_all('a'):
                page_url = 'http://%s%s' % (settings.SPIDER_DOMAIN, page.get('href'))
                yield Request(page_url, cookies=self.cookies, callback=self.parser_list)
        # Follow every link to a detail page.
        for office in soup.find_all('a', attrs={'class': 'ui-office-list'}):
            office_url = 'http://%s%s' % (settings.SPIDER_DOMAIN, office.attrs['href'])
            yield Request(office_url, cookies=self.cookies, callback=self.parse_article)

    def parse_article(self, response):
        test_item = SpiderTestItem()
        soup = BeautifulSoup(response.body, 'html.parser')
        container = soup.find('table', attrs={'class': 'index-statistics-table'})
        test_item['source_url'] = response.url
        test_item['title'] = soup.title.get_text()
        test_item['article_content'] = container.prettify()
        return test_item
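The PHPSESSID above has to come from a real logged-in browser session (copied from the browser's developer tools, for instance). To avoid hard-coding it, one option is to export the cookies to a small JSON file and load them at spider start. The sketch below replaces the cookies class attribute and start_requests() in the spider above; the cookies.json filename and its {"name": "value"} layout are assumptions, and import json must be added to the module's imports.

    def start_requests(self):
        # cookies.json holds e.g. {"PHPSESSID": "..."} exported from a
        # logged-in browser session (hypothetical filename and layout).
        with open('cookies.json') as f:
            self.cookies = json.load(f)
        print('Requesting a page that requires login...')
        yield Request('http://%s/admin/office/getofficelist.html' % settings.SPIDER_DOMAIN,
                      cookies=self.cookies,
                      callback=self.parser_list)

Note that once Scrapy's cookie middleware (enabled by default via COOKIES_ENABLED) has seen the cookies from the first request, it keeps sending them on later requests to the same domain, so passing cookies= on every request, as the spider above does, is mostly defensive.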