using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
namespace MeiZi
{
/// <summary>
/// Console entry point of the image crawler.
/// </summary>
public class Program
{
    /// <summary>
    /// Creates a <see cref="GetMeiziPic"/> instance and then returns.
    /// </summary>
    /// <param name="args">Command-line arguments; not read.</param>
    static void Main(string[] args)
    {
        // The instance is not used afterwards; only its construction matters here.
        var crawler = new GetMeiziPic();
    }
}
/// <summary>
/// Downloads pictures: walks the listing pages of sepaidui.com, follows every post
/// link found inside an <c>&lt;h2&gt;</c> heading and saves the Sina-hosted images of
/// each post into an "Images" folder below the current directory.
/// </summary>
/// <remarks>
/// The whole crawl runs synchronously inside the constructor. Every network failure
/// is reported on the console and skipped so that one bad page or image does not
/// abort the rest of the run.
/// </remarks>
public class GetMeiziPic
{
    // Listing page address; the page number is appended to it.
    private const string ListPageUrl = "http://www.sepaidui.com/?sort=4&page=";

    // Listing pages FirstPage..LastPage (inclusive) are crawled.
    private const int FirstPage = 1;
    private const int LastPage = 9;

    // Image URLs containing this id are always skipped.
    private const string SkippedImageId = "60d02b59tw1eq6g7srmiwj20pv03mdg8";

    // Regular expression for <img> tags; group 1 is the src value.
    private const string ImgRegex = @"<img[^>]*?src\s*=\s*[""']?([^'"" >]+?)[ '""][^>]*?>";

    // Regular expression for a post link: an anchor wrapped in an <h2> heading.
    private const string LinkRegex = @"<h2><a\s+[^>]*?>[^<>]*?<\/a></h2>";

    // Pulls the href value out of a matched anchor, whether it is double-quoted,
    // single-quoted or unquoted, and wherever it sits among the attributes.
    private const string HrefRegex = @"\shref\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s""'>]+))";

    private const RegexOptions HtmlOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;

    // Base address used to resolve relative post links.
    private static readonly Uri SiteRoot = new Uri("http://www.sepaidui.com/");

    // Directory the downloaded images are written to.
    private readonly string _path;

    /// <summary>
    /// Ensures the output directory exists and then crawls every listing page,
    /// printing progress to the console.
    /// </summary>
    public GetMeiziPic()
    {
        _path = DealDir(Path.Combine(Environment.CurrentDirectory, "Images"));
        Console.WriteLine("=============== 开始采集 ===============");
        for (var i = FirstPage; i <= LastPage; i++)
        {
            Console.WriteLine("===============正在下载第{0}页数据===============", i);
            DoFetchStep1(i);
        }
        Console.WriteLine("=============== 采集完成 ===============");
    }

    /// <summary>
    /// Creates the directory if it does not exist yet.
    /// </summary>
    /// <param name="path">Directory path to ensure.</param>
    /// <returns>The same <paramref name="path"/>.</returns>
    private static string DealDir(string path)
    {
        // CreateDirectory does nothing when the directory is already there.
        Directory.CreateDirectory(path);
        return path;
    }

    /// <summary>
    /// Step 1: downloads one listing page and processes the post links on it.
    /// </summary>
    /// <param name="pageNum">Number of the listing page to fetch.</param>
    private void DoFetchStep1(int pageNum)
    {
        var html = DownloadHtml(new Uri(ListPageUrl + pageNum));
        if (html != null)
        {
            FetchLinksFromSource1(html);
        }
    }

    /// <summary>
    /// Finds every post link in a listing page and fetches the linked post.
    /// </summary>
    /// <param name="htmlSource">HTML of a listing page.</param>
    private void FetchLinksFromSource1(string htmlSource)
    {
        foreach (Match anchor in Regex.Matches(htmlSource, LinkRegex, HtmlOptions))
        {
            var hrefMatch = Regex.Match(anchor.Value, HrefRegex, HtmlOptions);
            if (!hrefMatch.Success)
            {
                // Anchor without an href attribute: nothing to follow.
                continue;
            }

            // Attribute values are HTML-encoded (e.g. "&amp;" for "&").
            DoFetchStep2(WebUtility.HtmlDecode(hrefMatch.Groups["url"].Value));
        }
    }

    /// <summary>
    /// Step 2: downloads one post page and saves the images found in it.
    /// </summary>
    /// <param name="href">Post link as written in the listing page; may be relative.</param>
    private void DoFetchStep2(string href)
    {
        Uri postUri;
        if (string.IsNullOrWhiteSpace(href)
            || !Uri.TryCreate(SiteRoot, href, out postUri)
            || !IsHttp(postUri))
        {
            return;
        }

        var html = DownloadHtml(postUri);
        if (html != null)
        {
            FetchLinksFromSource2(html);
        }
    }

    /// <summary>
    /// Downloads every qualifying image referenced by a post page into the output
    /// directory under a random file name that keeps the image's extension.
    /// </summary>
    /// <param name="htmlSource">HTML of a post page.</param>
    private void FetchLinksFromSource2(string htmlSource)
    {
        var matchesImgSrc = Regex.Matches(htmlSource, ImgRegex, HtmlOptions);

        // One client serves all images of the page; downloads run one after another.
        using (var myWebClient = new WebClient())
        {
            foreach (Match m in matchesImgSrc)
            {
                var src = WebUtility.HtmlDecode(m.Groups[1].Value);

                // Only pictures from the Sina album host are taken.
                if (!src.Contains("sinaimg") || src.Contains(SkippedImageId))
                {
                    continue;
                }

                Uri imageUri;
                if (!CheckIsUrlFormat(src, out imageUri))
                {
                    continue;
                }

                Console.WriteLine(src);
                try
                {
                    // The extension comes from the URL path only, so a query string
                    // can never end up in the file name.
                    var fileName = Path.GetRandomFileName() + Path.GetExtension(imageUri.AbsolutePath);
                    myWebClient.DownloadFile(imageUri, Path.Combine(_path, fileName));
                }
                catch (Exception ex)
                {
                    // Best effort: report the failure and go on with the next image.
                    Console.WriteLine(ex.Message);
                }
            }
        }
    }

    /// <summary>
    /// Fetches a page and returns its body as text.
    /// </summary>
    /// <param name="address">Absolute http/https address to request.</param>
    /// <returns>
    /// The response body, or <c>null</c> when the request fails, the status is not
    /// 200 OK or the response has no body stream.
    /// </returns>
    private static string DownloadHtml(Uri address)
    {
        try
        {
            var request = (HttpWebRequest)WebRequest.Create(address);
            // NOTE(review): the default credentials are attached to every request,
            // including requests to addresses taken from scraped links - confirm
            // that this is intended.
            request.Credentials = CredentialCache.DefaultCredentials;

            // Disposing the response releases the underlying connection.
            using (var response = (HttpWebResponse)request.GetResponse())
            {
                if (response.StatusCode != HttpStatusCode.OK)
                {
                    return null;
                }

                var stream = response.GetResponseStream();
                if (stream == null)
                {
                    return null;
                }

                using (var sr = new StreamReader(stream))
                {
                    return sr.ReadToEnd();
                }
            }
        }
        catch (WebException ex)
        {
            // GetResponse throws for network errors and for error status codes.
            Console.WriteLine(ex.Message);
            return null;
        }
        catch (IOException ex)
        {
            // The connection broke while the body was being read.
            Console.WriteLine(ex.Message);
            return null;
        }
    }

    /// <summary>
    /// Checks that a string is an absolute http or https URL.
    /// </summary>
    /// <param name="value">Text to check.</param>
    /// <param name="uri">The parsed URL when the check succeeds; otherwise unspecified.</param>
    /// <returns><c>true</c> when <paramref name="value"/> is an absolute http/https URL.</returns>
    private static bool CheckIsUrlFormat(string value, out Uri uri)
    {
        return Uri.TryCreate(value, UriKind.Absolute, out uri) && IsHttp(uri);
    }

    /// <summary>
    /// Tells whether the URI uses the http or https scheme.
    /// </summary>
    /// <param name="uri">Absolute URI to inspect.</param>
    /// <returns><c>true</c> for http and https URIs.</returns>
    private static bool IsHttp(Uri uri)
    {
        return uri.Scheme == Uri.UriSchemeHttp || uri.Scheme == Uri.UriSchemeHttps;
    }
}
}