Scrapy spider --- collecting itcast (黑马) instructor information
Modifying itcast.py
import scrapy


class ItcastSpider(scrapy.Spider):
    # spider name
    name = 'itcast'
    # allowed domains; requests outside these domains are filtered out
    allowed_domains = ['itcast.cn']
    # start URL(s)
    start_urls = ['https://www.itcast.cn/channel/teacher.shtml#aweb']

    # parse the response downloaded from the start URL
    def parse(self, response):
        # select the node that wraps each teacher's details
        node_list = response.xpath('//div[@class="li_txt"]')
        print(len(node_list))
        for node in node_list:
            temp = {}
            # xpath() returns a SelectorList of Selector objects,
            # so indexing alone gives a Selector, not a string:
            # temp['name'] = node.xpath('./h3/text()')[0]
            # temp['title'] = node.xpath('./h4/text()')[0]
            # temp['desc'] = node.xpath('./p/text()')[0]
            temp['name'] = node.xpath('./h3/text()')[0].extract()
            temp['title'] = node.xpath('./h4/text()')[0].extract()
            temp['desc'] = node.xpath('./p/text()')[0].extract()
            #
            # equivalent, and safer when a node is missing:
            # temp['name'] = node.xpath('./h3/text()').extract_first()
            # temp['title'] = node.xpath('./h4/text()').extract_first()
            # temp['desc'] = node.xpath('./p/text()').extract_first()
            # print(temp)
            yield temp
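The commented-out lines differ only in how the text is pulled out of the selectors. A minimal sketch of the three access patterns, using a hypothetical HTML snippet (not the live itcast page) to keep it self-contained:

    from scrapy.selector import Selector

    # hypothetical markup mirroring one teacher node on the itcast page
    html = '<div class="li_txt"><h3>Tom</h3><h4>Lecturer</h4><p>Bio text</p></div>'
    node = Selector(text=html).xpath('//div[@class="li_txt"]')[0]

    print(node.xpath('./h3/text()')[0])               # a Selector object, not a plain string
    print(node.xpath('./h3/text()')[0].extract())     # 'Tom'; raises IndexError if the node is missing
    print(node.xpath('./h3/text()').extract_first())  # 'Tom'; returns None if the node is missing

The spider itself is run from the project root with scrapy crawl itcast.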
Saving the output: enable the pipeline in the configuration file
settings.py
ITEM_PIPELINES = {
    'myspider.pipelines.MyspiderPipeline': 300,
}
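The integer is the pipeline's priority (0-1000); when several pipelines are enabled, items flow through them in ascending order. A sketch with a second, purely hypothetical pipeline class added for illustration:

    ITEM_PIPELINES = {
        'myspider.pipelines.MyspiderPipeline': 300,    # runs first
        # 'myspider.pipelines.CleanupPipeline': 400,   # hypothetical second pipeline, would run after
    }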
Modifying the pipeline file pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import json


class MyspiderPipeline:
    def __init__(self):
        # write UTF-8 so the non-ASCII text kept by ensure_ascii=False is stored as-is
        self.file = open('itcast.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # print('itcast', item)
        # serialize the dict to a JSON string, one item per line
        json_data = json.dumps(item, ensure_ascii=False) + ',\n'
        self.file.write(json_data)
        return item

    def __del__(self):
        self.file.close()
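Relying on __del__ to close the file works, but Scrapy also calls open_spider/close_spider hooks on pipelines. A sketch of the same pipeline rewritten with those hooks (same behaviour, just explicit lifecycle methods rather than the code from the post):

    import json

    class MyspiderPipeline:
        def open_spider(self, spider):
            # called once when the spider starts
            self.file = open('itcast.json', 'w', encoding='utf-8')

        def process_item(self, item, spider):
            # serialize each yielded dict and append it on its own line
            self.file.write(json.dumps(item, ensure_ascii=False) + ',\n')
            return item

        def close_spider(self, spider):
            # called once when the spider finishes
            self.file.close()

With the pipeline enabled, running scrapy crawl itcast writes the yielded teacher dicts to itcast.json, one serialized record per line.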