scrapy电影天堂练习
movie.py
import scrapy
from movieProject.items import MovieprojectItem
class MovieSpider(scrapy.Spider):
    """Two-step spider: collect movie links from a listing page, then visit
    each detail page and pair the movie title with an image address found there.
    """

    name = 'movie'
    allowed_domains = ['www.ygdy8.net']
    start_urls = ['https://www.ygdy8.net/html/gndy/china/index.html']

    def parse(self, response):
        """Parse the listing page and schedule one request per movie link.

        For every anchor matched by the listing XPath, the link text is taken
        as the movie name and the ``href`` is resolved to an absolute URL.
        Each detail-page request carries the name in ``meta['movieName']`` and
        is handled by :meth:`second_parse`.
        """
        print("电影天堂")
        movieList = response.xpath('//table//tr[2]/td[2]/b/a[2]')
        for item in movieList:
            movieName = item.xpath('./text()').extract_first()
            href = item.xpath('./@href').extract_first()
            if not href:
                # extract_first() returns None when the anchor has no href;
                # skip it instead of crashing on string concatenation.
                continue
            # Resolve against the page URL so both root-relative and
            # document-relative links produce a valid absolute URL.
            movieUrl = response.urljoin(href)
            print(movieName, movieUrl)
            # Hand movieName over to second_parse through the request meta.
            yield scrapy.Request(
                url=movieUrl,
                callback=self.second_parse,
                meta={'movieName': movieName},
            )

    def second_parse(self, response):
        """Parse a detail page and yield one ``MovieprojectItem``.

        The item's ``movieName`` comes from the request meta set in
        :meth:`parse`; its ``movieUrl`` is the ``src`` of the first ``<img>``
        inside the element with ``id="Zoom"`` (``None`` if no image matches).
        """
        print("二次解析之前")
        # We are on the detail page now: grab the address of its first image.
        secondUrl = response.xpath('//div[@id="Zoom"]//img/@src').extract_first()
        print("第二次访问", secondUrl)
        movieName = response.meta['movieName']
        movie = MovieprojectItem(movieName=movieName, movieUrl=secondUrl)
        yield movie
items.py
class MovieprojectItem(scrapy.Item):
    """Item holding the two values scraped for each movie."""

    # Movie title (text of the link on the listing page).
    movieName = scrapy.Field()

    # Address stored alongside the title; the spider fills it with an image
    # ``src`` taken from the movie's detail page.
    movieUrl = scrapy.Field()
pipelines.py
class MovieprojectPipeline:
    """Item pipeline that stores every scraped item in ``movie.json``.

    The output is a single UTF-8 JSON array with one object per item, so the
    finished file can be loaded directly with ``json.load``.
    """

    def open_spider(self, spider):
        """Open the output file and start the JSON array."""
        self.fp = open('movie.json', 'w', encoding='utf-8')
        # Tracks whether a separating comma is needed before the next record.
        self._first_item = True
        self.fp.write('[')

    def process_item(self, item, spider):
        """Append *item* to the JSON array and pass it on unchanged.

        Returns the same *item* object so later pipelines still receive it.
        """
        import json  # local import: keeps this block self-contained

        # ensure_ascii=False keeps non-ASCII titles human-readable in the file.
        record = json.dumps(dict(item), ensure_ascii=False)
        self.fp.write('\n' if self._first_item else ',\n')
        self.fp.write(record)
        self._first_item = False
        return item

    def close_spider(self, spider):
        """Terminate the JSON array and close the output file."""
        self.fp.write('\n]\n')
        self.fp.close()
浙公网安备 33010602011771号