使用 Node.js 抓取网页图片
const https = require('https')
const http = require('http') /* 方式二时使用*/
const fs = require('fs')
const cheerio = require('cheerio')
const request = require('request')
const path = require('path');
const imgDir = path.join(__dirname, 'img');
// Single guide page address kept as a standalone binding.
let url = 'https://www.3dmgame.com/gl/3749617.html';

// Pages to scrape. Each entry pairs a guide page address (`url`) with the
// directory its images are written to (`title`).
const list1 = [
  ['https://www.3dmgame.com/gl/3748911.html', './img/战士'],
  ['https://www.3dmgame.com/gl/3749617.html', './img/猎人'],
  ['https://www.3dmgame.com/gl/3749938.html', './img/机器人'],
].map(([pageUrl, outputDir]) => ({ url: pageUrl, title: outputDir }));
/**
 * Fetch one HTML page over HTTPS and download the images it references.
 *
 * The page is parsed with cheerio. An `<img>` is downloaded only when its
 * `src` contains the 3dmgame news-image path and does not end in ".jpg".
 * Each download is streamed into `title` as `<counter>_<6 chars>.png`.
 * The function returns immediately; progress and failures are reported on
 * the console.
 *
 * @param {string} url - Address of the HTML page to scan.
 * @param {string} title - Output directory; created recursively if missing.
 * @returns {void}
 */
const getImg = (url, title) => {
  // Create the target directory up front so every write stream can open.
  fs.mkdirSync(`${title}`, { recursive: true });

  https
    .get(url, (res) => {
      // Sanity-check the response before reading the body.
      const { statusCode } = res;
      const contentType = res.headers['content-type'];
      console.log(statusCode, contentType);

      let err = null;
      if (statusCode !== 200) {
        err = new Error('请求状态错误');
      } else if (!/^text\/html/.test(contentType)) {
        err = new Error('请求类型错误');
      }
      if (err) {
        console.log(err);
        res.resume(); // Consume the unread body so its memory is released.
        return;
      }

      // Collect raw chunks and decode once at the end: decoding each chunk on
      // its own corrupts multi-byte UTF-8 characters split across chunks.
      const chunks = [];
      res.on('data', (chunk) => {
        chunks.push(chunk);
      });

      res.on('end', () => {
        const html = Buffer.concat(chunks).toString('utf8');
        const $ = cheerio.load(html);
        const images = $('img');
        console.error(images.length);

        // Counts downloaded images only; used as the file-name prefix.
        let id = 0;
        images.each((index, el) => {
          const imgUrl = $(el).attr('src');
          if (!imgUrl) {
            return;
          }
          if (!imgUrl.includes('https://img.3dmgame.com/uploads/images/news')) {
            return;
          }

          const ext = imgUrl.slice(-4);
          console.error(`ext=${ext}`);
          if (ext === '.jpg') {
            return;
          }
          console.error(imgUrl);

          // NOTE(review): the file is always named ".png" and the name assumes
          // a 4-character extension, whatever the source actually is — confirm
          // this is intended.
          id += 1;
          const target = path.join(title, `${id}_${imgUrl.slice(-10, -4)}.png`);
          const writeStream = fs.createWriteStream(target);
          const readStream = request(imgUrl);
          readStream.pipe(writeStream);

          readStream.on('end', () => {
            console.log('文件下载成功');
          });
          readStream.on('error', (downloadErr) => {
            console.log('文件下载失败', imgUrl, downloadErr);
            // pipe() does not close the destination when the source fails;
            // destroy it so the file descriptor is not leaked.
            writeStream.destroy();
          });
          writeStream.on('finish', () => {
            console.log('文件写入成功');
          });
          // Without a listener, a write failure would be thrown as an
          // unhandled 'error' event and abort the whole run.
          writeStream.on('error', (writeErr) => {
            console.log('文件写入失败', target, writeErr);
          });
        });
        console.log('数据传输完毕');
      });
    })
    .on('error', (err) => {
      console.log('请求错误', err);
    });
};
// Start a scrape for every configured page; each call returns immediately
// and the downloads proceed asynchronously.
list1.forEach(({ url, title }) => {
  getImg(url, title);
});
这里主要是抓取网页上的所有图片,然后按图片地址和扩展名进行过滤,只下载符合条件的图片。
posted on 2021-09-08 10:08 gongzhuiau 阅读(91) 评论(0) 收藏 举报
浙公网安备 33010602011771号