13每周总结
发表时间:2023年5月17日
这周学习了python爬虫,并学习了scrapy框架,对图书网进行了爬取,代码如下
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from dushu.items import DushuItem
class ReadSpider(CrawlSpider):
    """Crawl the dushu.com category-1008 book listing pages and yield one
    DushuItem (book name + cover image URL) per book.

    The Rule below follows every pagination link matching
    ``/book/1008_<n>.html`` and routes each page to :meth:`parse_item`.
    """

    name = "read"
    allowed_domains = ["www.dushu.com"]
    start_urls = ["https://www.dushu.com/book/1008_1.html"]
    # follow=True keeps discovering further pagination links on every page.
    rules = (
        Rule(
            LinkExtractor(allow=r"/book/1008_\d+\.html"),
            callback="parse_item",
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Extract book name and cover URL from one listing page.

        :param response: the downloaded listing page
        :yields: DushuItem with ``name`` (alt text) and ``src``
            (lazy-load image URL from ``data-original``)
        """
        img_list = response.xpath('//div[@class="bookslist"]//img')
        for img in img_list:
            name = img.xpath('./@alt').get()
            # The site lazy-loads covers: the real URL lives in
            # data-original, not src (src holds a placeholder).
            src = img.xpath('./@data-original').get()
            yield DushuItem(name=name, src=src)