1. Install the environment
http://pan.baidu.com/s/1bnAKBSz
Add c:\python27;c:\python27\Scripts; to the PATH environment variable.
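To check that everything is on the PATH, open a new command prompt and run:
python --version
scrapy version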
2. If you want to use MySQL or another database, install it yourself; the demo pipeline below writes to MySQL, and a schema sketch follows.
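The demo pipeline inserts into a table named doubanmoive inside a database named python. Below is a minimal sketch of a matching schema, driven through MySQLdb; the column names come from the pipeline's insert statement, while the column types and sizes are my assumptions:

# -*- coding: utf-8 -*-
# Sketch: create the database/table the demo pipeline expects.
# Column names match the insert statement in pipelines.py;
# the types and sizes are assumptions.
import MySQLdb

conn = MySQLdb.connect(host='localhost', user='root', passwd='root', charset='utf8')
cur = conn.cursor()
cur.execute("CREATE DATABASE IF NOT EXISTS python DEFAULT CHARACTER SET utf8")
cur.execute("""
    CREATE TABLE IF NOT EXISTS python.doubanmoive (
        id INT AUTO_INCREMENT PRIMARY KEY,
        m_name VARCHAR(100),
        m_year VARCHAR(10),
        m_score VARCHAR(10),
        m_director VARCHAR(100),
        m_classification VARCHAR(100),
        m_actor VARCHAR(255),
        m_img VARCHAR(255),
        m_local_img VARCHAR(255)
    ) DEFAULT CHARSET=utf8
""")
conn.close()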
3. After setting up the environment, open a command prompt, change to your working directory, and create the project with the following command (the project name here is doubanmoive):
scrapy startproject doubanmoive
4. After making the modifications described below, run the project with the following command:
scrapy crawl doubanmoive
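Note that scrapy crawl must be run from the project root (the directory containing scrapy.cfg). For a quick sanity check without MySQL, Scrapy versions of this era can also export the scraped items straight to a file:
scrapy crawl doubanmoive -o items.json -t json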
5. Notes on Scrapy
(1) After step 3 (plus the spider file added in the demo below), the project directory should look like this (your root directory will differ):
D:\WEB\Python\doubanmoive>tree /f
Folder PATH listing for volume Data
Volume serial number is 00000200 34EC:9CB9
D:.
│ scrapy.cfg
│
└─doubanmoive
│ items.py
│ pipelines.py
│ settings.py
│ __init__.py
│
└─spiders
__init__.py
moive_spider.py
moive_spider.pyc
(2) The main roles of these files:
- doubanmoive/items.py: defines the fields to be scraped, similar to an entity class.
- doubanmoive/pipelines.py: the item pipeline, which processes the data scraped by the spider.
- doubanmoive/settings.py: the project configuration file.
- doubanmoive/spiders/: the directory that holds the spiders; moive_spider.py below is the spider for this project.
(3) Demo
doubanmoive/items.py
from scrapy.item import Item, Field

class DoubanmoiveItem(Item):
    name = Field()            # movie title
    year = Field()            # release year
    score = Field()           # Douban rating
    director = Field()        # director
    classification = Field()  # genres
    actor = Field()           # actors
    img = Field()             # poster image
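A Scrapy Item behaves like a dict, and each field here ends up holding the list returned by extract(); that is why the pipeline later indexes fields as item['name'][0]. A quick illustration with made-up values:

# -*- coding: utf-8 -*-
from doubanmoive.items import DoubanmoiveItem

item = DoubanmoiveItem()
item['name'] = [u'The Shawshank Redemption']  # made-up sample value
item['year'] = [u'1994']
print item['name'][0]  # fields hold lists, so take element 0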
doubanmoive/pipelines.py
# -*- coding: utf-8 -*-
from scrapy import log
from twisted.enterprise import adbapi
import urllib
import MySQLdb
import MySQLdb.cursors

class DoubanmoivePipeline(object):
    def __init__(self):
        # MySQL connection pool -- db: database name, user: MySQL user, passwd: MySQL password
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
            db='python',
            user='root',
            passwd='root',
            cursorclass=MySQLdb.cursors.DictCursor,
            charset='utf8',
            use_unicode=False
        )

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)
        return item

    def _conditional_insert(self, tx, item):
        # Check whether a movie with this name is already in the database
        tx.execute("select * from doubanmoive where m_name = %s", (item['name'][0],))
        result = tx.fetchone()
        if result:
            log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
        else:
            # Join the (possibly long) genre and actor lists into '/'-separated strings
            classification = actor = ''
            lenClassification = len(item['classification'])
            lenActor = len(item['actor'])
            for n in xrange(lenClassification):
                classification += item['classification'][n]
                if n < lenClassification - 1:
                    classification += '/'
            for n in xrange(lenActor):
                actor += item['actor'][n]
                if n < lenActor - 1:
                    actor += '/'
            # Poster download URL
            site = item['img'][0]
            # Use the part after the last '/' as the local file name
            parts = site.split('/')
            path = parts[-1]
            print 'local img path %s' % path
            # Download the poster
            print '--------------------download img %s' % site
            data = urllib.urlopen(site).read()
            newfile = open(path, "wb")
            newfile.write(data)
            newfile.close()
            # Insert the record into the database
            tx.execute(
                "insert into doubanmoive (m_name,m_year,m_score,m_director,m_classification,m_actor,m_img,m_local_img) "
                "values (%s,%s,%s,%s,%s,%s,%s,%s)",
                (item['name'][0], item['year'][0], item['score'][0], item['director'][0],
                 classification, actor, site, path))

    def handle_error(self, e):
        log.err(e)
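As a side note, the two joining loops in _conditional_insert are equivalent to Python's built-in str.join; a minimal sketch with made-up values:

# -*- coding: utf-8 -*-
genres = [u'Drama', u'Crime']                 # made-up sample values
actors = [u'Tim Robbins', u'Morgan Freeman']
print u'/'.join(genres)  # Drama/Crime
print u'/'.join(actors)  # Tim Robbins/Morgan Freeman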
doubanmoive/spiders/moive_spider.py
# -*- coding: utf-8 -*-
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from doubanmoive.items import DoubanmoiveItem

class MoiveSpider(CrawlSpider):
    name = "doubanmoive"
    # Domains the spider is allowed to crawl
    allowed_domains = ["movie.douban.com"]
    # Start URL
    start_urls = ["http://movie.douban.com/top250"]
    # Rules: which URLs to follow (regexes), and the callback for detail pages
    rules = [
        Rule(SgmlLinkExtractor(allow=(r'http://movie.douban.com/top250\?start=\d+.*'))),
        Rule(SgmlLinkExtractor(allow=(r'http://movie.douban.com/subject/\d+')), callback="parse_item"),
    ]

    # Callback declared in the rules above
    def parse_item(self, response):
        sel = HtmlXPathSelector(response)
        item = DoubanmoiveItem()
        # XPath expressions that extract and assign each item field
        item['name'] = sel.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
        item['year'] = sel.xpath('//*[@id="content"]/h1/span[2]/text()').re(r'\((\d+)\)')
        item['score'] = sel.xpath('//*[@id="interest_sectl"]/div/p[1]/strong/text()').extract()
        item['director'] = sel.xpath('//*[@id="info"]/span[1]/a/text()').extract()
        item['classification'] = sel.xpath('//span[@property="v:genre"]/text()').extract()
        item['actor'] = sel.xpath('//*[@id="info"]/span[3]/a[1]/text()').extract()
        item['img'] = sel.xpath('//a/img/@src').extract()
        return item
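To sanity-check XPath expressions like these without hitting douban, you can run a selector over a hand-written response. A sketch follows; the HTML snippet and URL are made up, and it assumes a Scrapy version where HtmlXPathSelector still exists and supports .xpath, as the spider above does:

# -*- coding: utf-8 -*-
from scrapy.http import HtmlResponse
from scrapy.selector import HtmlXPathSelector

# Made-up snippet mimicking the title markup on a Douban subject page
body = '<div id="content"><h1><span>Movie Title</span><span>(1994)</span></h1></div>'
response = HtmlResponse(url='http://movie.douban.com/subject/1', body=body)
sel = HtmlXPathSelector(response)
print sel.xpath('//*[@id="content"]/h1/span[1]/text()').extract()         # [u'Movie Title']
print sel.xpath('//*[@id="content"]/h1/span[2]/text()').re(r'\((\d+)\)')  # [u'1994']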
doubanmoive/settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for doubanmoive project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
BOT_NAME = 'doubanmoive'
SPIDER_MODULES = ['doubanmoive.spiders']
NEWSPIDER_MODULE = 'doubanmoive.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'doubanmoive (+http://www.yourdomain.com)'
ITEM_PIPELINES = {
    'doubanmoive.pipelines.DoubanmoivePipeline': 400,
}
LOG_LEVEL = 'DEBUG'
# Be polite: wait about 2 seconds (randomized) between requests
DOWNLOAD_DELAY = 2
RANDOMIZE_DOWNLOAD_DELAY = True
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5'
COOKIES_ENABLED = True
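If you would rather skip MySQL entirely, Scrapy's built-in feed exports can dump items to a file instead; in versions of this era the corresponding settings are (an optional sketch, not part of the original setup):

FEED_URI = 'items.json'
FEED_FORMAT = 'json'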
After the crawl finishes, the results can be seen in the database:
[screenshot: database table contents]