C#简单爬虫实现
一、环境
.NET 6.0
Visual Studio 2022 控制台应用程序
Nuget引入:
AngleSharp 1.1.0 用于HTML解析
Downloader 3.0.6 用于下载文件
ShellProgressBar 5.2.0 用于进度条显示
二、效果

三、相关代码
1.Program.cs
using ShellProgressBar;
using Spider;
using System.Collections;
// Entry script: collects every image URL found in the article body of the
// page below and downloads them one by one into an "Images" folder next to
// the executable, rendering nested console progress bars while doing so.
var url = "https://blog.csdn.net/u011127019/article/details/124248757";
var data = await HttpHelper.GetHtmlDocument(url);
DownloadHandler downloadHandler = new DownloadHandler();

// Resolve every <img src> against the page address so that relative and
// protocol-relative sources become downloadable absolute URLs. Sources that
// are empty or not http(s) (e.g. inline "data:" images) are skipped.
var pageUri = new Uri(url);
var imageUrls = new List<string>();
foreach (var item in data.QuerySelectorAll("#article_content img"))
{
    // The selector already yields the <img> elements themselves, so the
    // attribute is read directly from the matched element.
    var src = item.GetAttribute("src");
    if (string.IsNullOrWhiteSpace(src))
    {
        continue;
    }

    if (Uri.TryCreate(pageUri, src, out var imageUri)
        && (imageUri.Scheme == Uri.UriSchemeHttp || imageUri.Scheme == Uri.UriSchemeHttps))
    {
        imageUrls.Add(imageUri.AbsoluteUri);
    }
}

ImageList imageList1 = new ImageList
{
    Name = "图片目录",
    Images = imageUrls,
    ImageCount = imageUrls.Count
};

// List of albums to download (a single album in this sample).
var list = new List<ImageList> { imageList1 };

ProgressBarOptions BarOptions = new()
{
    ProgressCharacter = '─',
    ProgressBarOnBottom = true,
    ForegroundColor = ConsoleColor.Yellow,
    ForegroundColorDone = ConsoleColor.DarkGreen,
    BackgroundColor = ConsoleColor.DarkGray,
    BackgroundCharacter = '\u2593'
};
ProgressBarOptions ChildBarOptions = new()
{
    ForegroundColor = ConsoleColor.Green,
    BackgroundColor = ConsoleColor.DarkGreen,
    ProgressCharacter = '─'
};

// Build the target folder with Path.Combine (no hard-coded separator) and
// make sure it exists before the first file is written.
var imageDirectory = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Images");
Directory.CreateDirectory(imageDirectory);

using var bar = new ProgressBar(list.Count, "正在下载所有图片", BarOptions);
foreach (var item in list)
{
    bar.Message = $"图集:{item.Name}";
    var images = item.Images ?? new List<string>();

    // One child bar per album: it advances by one tick per finished image,
    // so it reaches 100% exactly when the album is done.
    using (var childBar = bar.Spawn(images.Count, $"图集:{item.Name}", ChildBarOptions))
    {
        int i = 1;
        foreach (var imgUrl in images)
        {
            childBar.Message = $"图片:{imgUrl}";

            // Prefer the extension of the URL path (query string excluded);
            // fall back to the substring checks for URLs that only mention
            // the file type elsewhere (e.g. in the query string).
            string fileName = Uri.TryCreate(imgUrl, UriKind.Absolute, out var parsed)
                ? Path.GetExtension(parsed.AbsolutePath)
                : string.Empty;
            if (fileName.Length == 0)
            {
                if (imgUrl.Contains(".png"))
                {
                    fileName = ".png";
                }
                if (imgUrl.Contains(".jpg"))
                {
                    fileName = ".jpg";
                }
            }

            // Files are named by their 1-based position within the album.
            await downloadHandler.Download(childBar, imgUrl, Path.Combine(imageDirectory, i + fileName));
            childBar.Tick();
            i++;
        }
    }

    // The album counts as processed only after all of its images were handled.
    bar.Tick();
}
2.HttpHelper.cs
using AngleSharp.Html.Dom;
using AngleSharp.Html.Parser;
using Downloader;
using System.Net;
using System.Text;
namespace Spider
{
    /// <summary>
    /// Shared HTTP utilities: a single <see cref="HttpClient"/> for fetching and
    /// parsing pages, and a single <see cref="IDownloadService"/> for file downloads.
    /// </summary>
    public static class HttpHelper
    {
        /// <summary>User-Agent header sent by both the HTTP client and the downloader.</summary>
        public const string UserAgent =
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36";

        /// <summary>Download service instance created once in the static constructor.</summary>
        public static IDownloadService Downloader { get; }

        /// <summary>
        /// Builds a fresh download configuration on every access
        /// (the property is expression-bodied, so nothing is cached).
        /// </summary>
        public static DownloadConfiguration DownloadConf => new()
        {
            BufferBlockSize = 10240, // original note: hosts usually support at most 8000 bytes; default is 8000
            ChunkCount = 8, // number of chunks the file is split into; default is 1
            // MaximumBytesPerSecond = 1024 * 50, // download speed limit; default is zero (unlimited)
            MaxTryAgainOnFailover = 5, // maximum number of retries on failure
            ParallelDownload = true, // download chunks in parallel; default is false
            Timeout = 1000, // timeout of each stream reader in milliseconds; default is 1000
            RequestConfiguration = {
                Accept = "*/*",
                AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate,
                CookieContainer = new CookieContainer(), // Add your cookies
                Headers = new WebHeaderCollection(), // Add your custom headers
                KeepAlive = true,
                ProtocolVersion = HttpVersion.Version11, // Default value is HTTP 1.1
                UseDefaultCredentials = false,
                UserAgent = UserAgent
            }
        };

        /// <summary>Handler backing <see cref="Client"/>.</summary>
        public static HttpClientHandler Handler { get; }

        /// <summary>Process-wide HTTP client; sends <see cref="UserAgent"/> with every request.</summary>
        public static HttpClient Client { get; }

        static HttpHelper()
        {
            // Legacy code pages such as GBK/GB2312 are not available on .NET (Core)
            // until the code-pages provider is registered; without this the charset
            // overload of GetHtmlDocument throws for those names.
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

            Handler = new HttpClientHandler();
            Client = new HttpClient(Handler);
            Client.DefaultRequestHeaders.Add("User-Agent", UserAgent);
            Downloader = new DownloadService(DownloadConf);
        }

        /// <summary>
        /// Downloads <paramref name="url"/> as text and parses it into an HTML document.
        /// </summary>
        /// <param name="url">Address of the page to fetch.</param>
        /// <returns>The parsed document.</returns>
        /// <exception cref="HttpRequestException">The request failed or returned a non-success status.</exception>
        public static async Task<IHtmlDocument> GetHtmlDocument(string url)
        {
            var html = await Client.GetStringAsync(url);
            return new HtmlParser().ParseDocument(html);
        }

        /// <summary>
        /// Downloads <paramref name="url"/> as raw bytes, decodes them with the named
        /// <paramref name="charset"/> and parses the result into an HTML document.
        /// </summary>
        /// <param name="url">Address of the page to fetch.</param>
        /// <param name="charset">Encoding name understood by <see cref="Encoding.GetEncoding(string)"/>, e.g. "utf-8" or "gbk".</param>
        /// <returns>The parsed document.</returns>
        /// <exception cref="HttpRequestException">The request failed or returned a non-success status.</exception>
        /// <exception cref="ArgumentException"><paramref name="charset"/> is not a known encoding name.</exception>
        public static async Task<IHtmlDocument> GetHtmlDocument(string url, string charset)
        {
            // Resolve the encoding first so an invalid name fails before any network traffic.
            var encoding = Encoding.GetEncoding(charset);

            // GetByteArrayAsync disposes the response and throws on non-success status,
            // matching the behaviour of the overload above.
            var resBytes = await Client.GetByteArrayAsync(url);
            var resStr = encoding.GetString(resBytes);
            return new HtmlParser().ParseDocument(resStr);
        }
    }
}
3.DownloadHandler.cs
using Downloader;
using ShellProgressBar;
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Diagnostics;
using System.Linq;
using System.Runtime.InteropServices;
using System.Text;
using System.Threading.Tasks;
namespace Spider
{
    /// <summary>
    /// Downloads a single file through the shared <see cref="HttpHelper.Downloader"/>
    /// while rendering a percentage bar underneath a parent progress bar.
    /// </summary>
    public class DownloadHandler
    {
        /// <summary>
        /// Downloads <paramref name="url"/> to <paramref name="filepath"/> and shows
        /// its progress as a child of <paramref name="bar"/>.
        /// </summary>
        /// <param name="bar">Parent progress bar under which the percentage bar is spawned.</param>
        /// <param name="url">Address of the file to download.</param>
        /// <param name="filepath">Destination path of the downloaded file.</param>
        /// <returns>A task that completes when the download call has returned.</returns>
        public async Task Download(IProgressBar bar, string url, string filepath)
        {
            var barOptions = new ProgressBarOptions
            {
                ForegroundColor = ConsoleColor.Yellow,
                BackgroundColor = ConsoleColor.DarkYellow,
                ForegroundColorError = ConsoleColor.Red,
                ForegroundColorDone = ConsoleColor.Green,
                BackgroundCharacter = '\u2593',
                ProgressBarOnBottom = true,
                EnableTaskBarProgress = RuntimeInformation.IsOSPlatform(OSPlatform.Windows),
                DisplayTimeInRealTime = false,
                ShowEstimatedDuration = false
            };

            // Disposed when the method exits, including when the download throws.
            using var percentageBar = bar.Spawn(100, $"正在下载:{Path.GetFileName(url)}", barOptions);

            // The downloader is a shared static instance, so the handlers must be
            // removed again after this download; otherwise they pile up and every
            // later download would also report into the bars of earlier ones.
            var downloader = HttpHelper.Downloader;
            downloader.DownloadStarted += DownloadStarted;
            downloader.DownloadFileCompleted += DownloadFileCompleted;
            downloader.DownloadProgressChanged += DownloadProgressChanged;
            try
            {
                await downloader.DownloadFileTaskAsync(url, filepath);
            }
            finally
            {
                downloader.DownloadStarted -= DownloadStarted;
                downloader.DownloadFileCompleted -= DownloadFileCompleted;
                downloader.DownloadProgressChanged -= DownloadProgressChanged;
            }

            void DownloadStarted(object? sender, DownloadStartedEventArgs e)
            {
                Trace.WriteLine(
                    $"图片, FileName:{Path.GetFileName(e.FileName)}, TotalBytesToReceive:{e.TotalBytesToReceive}");
            }

            void DownloadFileCompleted(object? sender, AsyncCompletedEventArgs e)
            {
                // The completion event is also raised for failed or cancelled
                // downloads; log those as such instead of as a success.
                if (e.Error is not null)
                {
                    Trace.WriteLine($"下载失败, filepath:{filepath}, error:{e.Error.Message}");
                }
                else if (e.Cancelled)
                {
                    Trace.WriteLine($"下载已取消, filepath:{filepath}");
                }
                else
                {
                    Trace.WriteLine($"下载完成, filepath:{filepath}");
                }
            }

            void DownloadProgressChanged(object? sender, DownloadProgressChangedEventArgs e)
            {
                // The bar was spawned with 100 ticks, so the 0-100 percentage maps
                // directly onto the tick position.
                percentageBar.Tick((int)e.ProgressPercentage);
            }
        }
    }
}
4.Images.cs
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace Spider
{
    /// <summary>
    /// A named album: a list of image URLs together with a count of them.
    /// </summary>
    public class ImageList
    {
        /// <summary>Display name of the album; empty by default.</summary>
        public string Name { get; set; } = string.Empty;

        /// <summary>
        /// Number of images in the album. Maintained by the caller; it is not
        /// derived from <see cref="Images"/> automatically.
        /// </summary>
        public int ImageCount { get; set; }

        /// <summary>
        /// Image URLs of the album. Starts out as an empty list so that a freshly
        /// created instance can be appended to without a null check; the property
        /// stays nullable so existing callers that assign null keep compiling.
        /// </summary>
        public List<string>? Images { get; set; } = new();
    }
}
四、源码下载
链接:https://pan.baidu.com/s/1VnnH05Har9hUhxAsIfKSMw?pwd=paws
提取码:paws

浙公网安备 33010602011771号