A simple crawler implementation (continuing the previous post on the Scrapy crawler framework)
Typical crawler workflow
Project layout:
data/data/spiders/ucidatasets.py
data/data/items.py
data/data/middlewares.py
data/data/pipelines.py
data/data/settings.py
data/scrapy.cfg
(The __init__.py files inside each package directory are omitted above.)
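For reference, this skeleton is what the standard Scrapy commands produce, assuming the project and spider names used in this post:
scrapy startproject data
cd data
scrapy genspider ucidatasets archive.ics.uci.edu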
scrapy.cfg:
[settings]
default = data.settings
[deploy]
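# uncomment url below to deploy to a local scrapyd instance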
#url = http://localhost:6800/
project = data
settings.py
BOT_NAME = 'data'
# Spiders live under data/spiders
SPIDER_MODULES = ['data.spiders']
NEWSPIDER_MODULE = 'data.spiders'
# Do not obey robots.txt rules
ROBOTSTXT_OBEY = False
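# MongoDB connection settings. Despite the name, MONGODB_URL holds a bare
# host address; pymongo's MongoClient accepts either a host string or a
# full mongodb:// URI.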
MONGODB_URL = "192.168.10.60"
MONGODB_DB = "ucidata"
MONGODB_COLLECTION = "info"
#
# Each item passes through the pipelines defined below in turn; the number
# determines the order when several pipelines are enabled (lower values run first).
#
ITEM_PIPELINES = {
'data.pipelines.DataPipeline': 300,
}
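items.py is not listed in the post, but the spider imports DataItem from it and fills in tmpurl and file_urls (a url field also appears in a commented-out line). A minimal sketch consistent with that usage:
import scrapy

class DataItem(scrapy.Item):
    tmpurl = scrapy.Field()     # dataset page URL, carried between callbacks
    file_urls = scrapy.Field()  # final download link found in parse_downloads
    url = scrapy.Field()        # only referenced in a commented-out line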
pipelines.py
import pymongo
from scrapy.exceptions import DropItem


class DataPipeline:
    def __init__(self, mongodb_url, mongodb_db, mongodb_collection):
        self.mongodb_url = mongodb_url
        self.mongodb_db = mongodb_db
        self.mongodb_collection = mongodb_collection
        print("---->>>", self.mongodb_url)
        print("---->>>", self.mongodb_db)

    @classmethod
    def from_crawler(cls, crawler):
        """
        crawler gives access to Scrapy's core components such as settings
        and signals, and can also be used for hooks.
        Reads the MONGODB_URL, MONGODB_DB and MONGODB_COLLECTION settings.
        :param crawler:
        :return:
        """
        return cls(
            mongodb_url=crawler.settings.get("MONGODB_URL"),
            mongodb_db=crawler.settings.get("MONGODB_DB"),
            mongodb_collection=crawler.settings.get("MONGODB_COLLECTION")
        )

    def open_spider(self, spider):
        """
        Runs when the spider opens: establish the MongoDB connection.
        :param spider:
        :return:
        """
        print("begin---------")
        self.client = pymongo.MongoClient(self.mongodb_url)
        # despite the name, this is a collection handle
        self.db = self.client[self.mongodb_db][self.mongodb_collection]

    def process_item(self, item, spider):
        """
        :param item:
        :param spider:
        :return: item or a dict, or raise DropItem to discard the item
        """
        # insert() was removed from modern pymongo; use insert_one()
        self.db.insert_one(dict(item))
        return item

    # Disconnect from MongoDB when the spider closes
    def close_spider(self, spider):
        self.client.close()
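The process_item docstring mentions DropItem but the post never raises it; as a hedged illustration (not in the original), a variant of process_item that discards items missing the file_urls field set by the spider below:
    def process_item(self, item, spider):
        data = dict(item)
        if not data.get("file_urls"):
            # Scrapy catches DropItem and logs the item as dropped
            raise DropItem("missing file_urls in %r" % data)
        self.db.insert_one(data)
        return item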
ucidatasets.py
# coding=utf-8
import os
import scrapy
from data.items import DataItem

FILENAME = "url.txt"


class UciDatasetsSpider(scrapy.Spider):
    name = "ucidatasets"
    # start_urls = [
    # ]

    def _getRootPath(self):
        comroot = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        return comroot

    def _readUrlFromFile(self):
        comroot = self._getRootPath()
        # print("file============>", comroot)
        urltxt = os.path.join(comroot, FILENAME)
        with open(urltxt, 'r') as f:
            line = f.readline()
            a = 0
            while line:
                # debug limit: only take the first few URLs
                if a > 10:
                    break
                a = a + 1
                # print("line, ---------", line)
                yield line.strip()  # strip the trailing newline
                line = f.readline()

    def start_requests(self):
        """
        Once start_requests is defined, start_urls is no longer needed;
        all URLs come from here.
        Called only once, when the spider starts.
        :return:
        """
        for url in self._readUrlFromFile():
            # print("url ===>", url)
            if url:
                yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """
        Locating elements with an absolute XPath:
        open the developer tools (F12), click the arrow icon in the top-left
        corner, select the target element, then right-click and copy its XPath.
        A page's layout keeps changing as the project evolves, and the absolute
        position of an element changes with it, so it is best not to use this
        form directly:
        xpath = '/html/body/table[2]/tbody/tr/td[2]/table[2]/tbody/tr[2]/td[1]/table/tbody/tr/td[2]/p/b/a'
        Instead, derive a relative path from the absolute one.
        :param response:
        :return:
        """
        # self._getNewFile()
        xpath = "//p//b//a//@href"
        subSelector = response.xpath(xpath)
        # print(subSelector)
        # get() returns one match at a time; getall() returns all of them:
        # subSelector.getall() -> every match, subSelector.get() -> the first
        for sub in subSelector:
            item = sub.get()
            url = 'http://archive.ics.uci.edu/ml/' + item
            # print(" url ==>>>", url)
            # ditem = DataItem()
            # ditem['url'] = url
            yield scrapy.Request(url, callback=self.parse_datasets)

    def parse_datasets(self, response):
        # absolute form: /html/body/table[2]/tbody/tr/td/table[1]/tbody/tr/td[1]/p[1]/span[2]/a[1]/font
        xpath = "//td[1]/p[1]/span[2]/a[1]/@href"
        subSelector = response.xpath(xpath)
        # print("TTTTTTTTT", subSelector)
        preurl = "http://archive.ics.uci.edu/ml"
        for sub in subSelector:
            item = sub.get()
            url = preurl + item.replace("..", "")
            ditem = DataItem()
            ditem["tmpurl"] = url
            # print("T-->url", url)
            yield scrapy.Request(url, meta={"dataitem": ditem}, callback=self.parse_downloads)

    def parse_downloads(self, response):
        # absolute form: /html/body/ul/li[3]/a
        ditem = response.meta["dataitem"]
        baseurl = ditem["tmpurl"]
        # print("DDDDDDDDDDD", baseurl)
        xpath = "//ul/li[3]/a/@href"
        subSelector = response.xpath(xpath)
        # print("D---------", subSelector)
        for sub in subSelector:
            item = sub.get()
            # print("D==================", item)
            url = baseurl + item
            ditem["file_urls"] = url
            # ditem["file"] = item
            print("=============", url)
            yield ditem
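To run the spider, the standard Scrapy command applies; note that _getRootPath walks three directories up from the spider file, so url.txt must sit at the project root next to scrapy.cfg:
scrapy crawl ucidatasets
One caveat: file_urls is also the field name Scrapy's built-in FilesPipeline expects, but that pipeline wants a list of URLs, while this spider stores a single string and the custom DataPipeline simply writes it to MongoDB. If you later enable FilesPipeline, assign a list instead, e.g. ditem["file_urls"] = [url].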
middlewares.py is left unchanged.
