13每周总结
发表时间:2023年5月17日
这周学习了python爬虫,并学习了scrapy框架,对图书网进行了爬取,代码如下
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from dushu.items import DushuItem
class ReadSpider(CrawlSpider):
    """Crawl the dushu.com category-1008 book listing pages and yield one
    DushuItem (book name + cover image URL) per book.

    The Rule below follows every pagination link matching
    ``/book/1008_<n>.html`` and routes each page to :meth:`parse_item`.
    """

    name = "read"
    allowed_domains = ["www.dushu.com"]
    start_urls = ["https://www.dushu.com/book/1008_1.html"]
    # follow=True keeps discovering further pagination links on every page.
    rules = (
        Rule(
            LinkExtractor(allow=r"/book/1008_\d+\.html"),
            callback="parse_item",
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Extract book name and cover URL from one listing page.

        :param response: the downloaded listing page
        :yields: DushuItem with ``name`` (alt text) and ``src``
            (lazy-load image URL from ``data-original``)
        """
        img_list = response.xpath('//div[@class="bookslist"]//img')
        for img in img_list:
            name = img.xpath('./@alt').get()
            # The site lazy-loads covers: the real URL lives in
            # data-original, not src (src holds a placeholder).
            src = img.xpath('./@data-original').get()
            yield DushuItem(name=name, src=src)