C#程序

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;

namespace MeiZi
{
    /// <summary>
    /// Console entry point of the image crawler.
    /// </summary>
    public class Program
    {
        /// <summary>
        /// Starts the crawl. All of the work is carried out by the
        /// <see cref="GetMeiziPic"/> constructor, so creating the instance is
        /// the only step needed here.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        private static void Main(string[] args)
        {
            var crawler = new GetMeiziPic();
        }
    }
    /// <summary>
    /// Crawls the listing pages of www.sepaidui.com, follows every post link found
    /// on them, and downloads the Sina-hosted images of each post into an "Images"
    /// directory below the current working directory.
    /// </summary>
    /// <remarks>
    /// The whole crawl runs synchronously inside the constructor, so
    /// <c>new GetMeiziPic()</c> returns only after every page has been processed.
    /// Network failures on individual pages or images are logged to the console
    /// and skipped; they do not abort the crawl.
    /// </remarks>
    public class GetMeiziPic
    {
        /// <summary>Root of the crawled site; also the base for resolving relative post links.</summary>
        private const string SiteRootUrl = "http://www.sepaidui.com/";

        /// <summary>Listing URL without the page number, which is appended by <see cref="DoFetchStep1"/>.</summary>
        private const string ListUrlPrefix = SiteRootUrl + "?sort=4&page=";

        /// <summary>First listing page to fetch (inclusive).</summary>
        private const int FirstPage = 1;

        /// <summary>Last listing page to fetch (inclusive).</summary>
        private const int LastPage = 9;

        /// <summary>
        /// Image URLs containing this token are never downloaded.
        /// NOTE(review): presumably a site banner/placeholder image - confirm.
        /// </summary>
        private const string ExcludedImageToken = "60d02b59tw1eq6g7srmiwj20pv03mdg8";

        /// <summary>Matches an img tag; group 1 captures the value of its src attribute.</summary>
        private const string ImgRegex = @"<img[^>]*?src\s*=\s*[""']?([^'"" >]+?)[ '""][^>]*?>";

        /// <summary>Matches a post title anchor wrapped in an h2 element on a listing page.</summary>
        private const string LinkRegex = @"<h2><a\s+[^>]*?>[^<>]*?<\/a></h2>";

        /// <summary>
        /// Captures the value of an href attribute (double-quoted, single-quoted or
        /// unquoted) in the group named "url". The look-behind keeps attributes such
        /// as data-href from being mistaken for href.
        /// </summary>
        private static readonly Regex HrefRegex = new Regex(
            @"(?<![\w-])href\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s>]+))",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Accepts absolute http and https URLs only. Anchored at the start so that
        /// text merely containing a URL is rejected.
        /// </summary>
        private static readonly Regex UrlFormatRegex =
            new Regex(@"^https?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?");

        private static readonly Uri SiteRoot = new Uri(SiteRootUrl);

        /// <summary>Directory that receives the downloaded images.</summary>
        private readonly string _path;

        /// <summary>
        /// Creates the output directory and immediately crawls listing pages
        /// <see cref="FirstPage"/> through <see cref="LastPage"/>, reporting
        /// progress on the console.
        /// </summary>
        public GetMeiziPic()
        {
            _path = DealDir(Path.Combine(Environment.CurrentDirectory, "Images"));
            Console.WriteLine("===============    开始采集   ===============");
            for (var i = FirstPage; i <= LastPage; i++)
            {
                Console.WriteLine("===============正在下载第{0}页数据===============", i);
                DoFetchStep1(i);
            }

            Console.WriteLine("===============   采集完成   ===============");
        }

        /// <summary>
        /// Ensures that <paramref name="path"/> exists as a directory.
        /// </summary>
        /// <param name="path">Directory to create when it is missing.</param>
        /// <returns>The same <paramref name="path"/>.</returns>
        private static string DealDir(string path)
        {
            // CreateDirectory does nothing when the directory already exists.
            Directory.CreateDirectory(path);
            return path;
        }

        /// <summary>
        /// Step 1: downloads one listing page and processes the post links on it.
        /// </summary>
        /// <param name="pageNum">Page number appended to the listing URL.</param>
        private void DoFetchStep1(int pageNum)
        {
            var html = FetchHtml(new Uri(ListUrlPrefix + pageNum));
            if (html == null)
            {
                return;
            }

            FetchLinksFromSource1(html);
        }

        /// <summary>
        /// Visits every post linked from the HTML of a listing page.
        /// </summary>
        /// <param name="htmlSource">HTML of a listing page.</param>
        private void FetchLinksFromSource1(string htmlSource)
        {
            foreach (var href in ExtractPostLinks(htmlSource))
            {
                DoFetchStep2(href);
            }
        }

        /// <summary>
        /// Extracts the targets of all post title anchors from a listing page.
        /// </summary>
        /// <param name="htmlSource">HTML of a listing page.</param>
        /// <returns>
        /// Absolute http/https URLs in document order. Anchors without a usable
        /// href are skipped; relative hrefs are resolved against the site root.
        /// </returns>
        private static List<string> ExtractPostLinks(string htmlSource)
        {
            var links = new List<string>();
            var anchors = Regex.Matches(htmlSource, LinkRegex, RegexOptions.IgnoreCase | RegexOptions.Singleline);
            foreach (Match anchor in anchors)
            {
                // Read the href attribute explicitly; the first quoted attribute of
                // the tag is not necessarily the link target.
                var hrefMatch = HrefRegex.Match(anchor.Value);
                if (!hrefMatch.Success)
                {
                    continue;
                }

                // Attribute values are entity-encoded in HTML (e.g. "&amp;").
                var rawHref = WebUtility.HtmlDecode(hrefMatch.Groups["url"].Value).Trim();
                if (rawHref.Length == 0)
                {
                    continue;
                }

                Uri target;
                if (Uri.TryCreate(SiteRoot, rawHref, out target) && IsHttpScheme(target))
                {
                    links.Add(target.AbsoluteUri);
                }
            }

            return links;
        }

        /// <summary>
        /// Step 2: downloads one post page and saves the images it contains.
        /// </summary>
        /// <param name="href">Absolute URL of the post page.</param>
        private void DoFetchStep2(string href)
        {
            Uri postUri;
            if (!Uri.TryCreate(href, UriKind.Absolute, out postUri) || !IsHttpScheme(postUri))
            {
                return;
            }

            var html = FetchHtml(postUri);
            if (html == null)
            {
                return;
            }

            FetchLinksFromSource2(html);
        }

        /// <summary>
        /// Downloads every accepted image referenced by the HTML of a post page.
        /// </summary>
        /// <param name="htmlSource">HTML of a post page.</param>
        private void FetchLinksFromSource2(string htmlSource)
        {
            var matchesImgSrc = Regex.Matches(htmlSource, ImgRegex, RegexOptions.IgnoreCase | RegexOptions.Singleline);

            // One client serves all downloads of the page; they run sequentially.
            using (var myWebClient = new WebClient())
            {
                foreach (Match m in matchesImgSrc)
                {
                    var href = WebUtility.HtmlDecode(m.Groups[1].Value);

                    // Only images hosted on Sina are selected.
                    if (!href.Contains("sinaimg") || !CheckIsUrlFormat(href) || href.Contains(ExcludedImageToken))
                    {
                        continue;
                    }

                    Console.WriteLine(href);
                    try
                    {
                        var imageUri = new Uri(href);

                        // Take the extension from the URL path only, so a query
                        // string never ends up in the file name.
                        var fileName = Path.GetRandomFileName() + Path.GetExtension(imageUri.AbsolutePath);
                        myWebClient.DownloadFile(imageUri, Path.Combine(_path, fileName));
                    }
                    catch (Exception ex)
                    {
                        // Deliberately best-effort: a failed image must not stop the crawl.
                        Console.WriteLine(ex.Message);
                    }
                }
            }
        }

        /// <summary>
        /// Downloads the body of <paramref name="address"/> as text.
        /// </summary>
        /// <param name="address">Absolute http/https address to request.</param>
        /// <returns>
        /// The response body, or <c>null</c> when the request fails, the status is
        /// not 200 OK, or the response has no body stream.
        /// </returns>
        private static string FetchHtml(Uri address)
        {
            try
            {
                // No credentials are attached: the target may be a URL scraped from
                // untrusted HTML and must not receive the user's default credentials.
                var request = (HttpWebRequest)WebRequest.Create(address);
                using (var response = (HttpWebResponse)request.GetResponse())
                {
                    if (response.StatusCode != HttpStatusCode.OK)
                    {
                        return null;
                    }

                    using (var stream = response.GetResponseStream())
                    {
                        if (stream == null)
                        {
                            return null;
                        }

                        using (var reader = new StreamReader(stream))
                        {
                            return reader.ReadToEnd();
                        }
                    }
                }
            }
            catch (WebException ex)
            {
                // GetResponse throws for error statuses and connection problems.
                Console.WriteLine(ex.Message);
                return null;
            }
            catch (IOException ex)
            {
                // The connection may break while the body is being read.
                Console.WriteLine(ex.Message);
                return null;
            }
        }

        /// <summary>
        /// Tells whether <paramref name="uri"/> uses the http or https scheme.
        /// </summary>
        private static bool IsHttpScheme(Uri uri)
        {
            return uri.Scheme == Uri.UriSchemeHttp || uri.Scheme == Uri.UriSchemeHttps;
        }

        /// <summary>
        /// Checks whether <paramref name="value"/> starts with a well-formed http or
        /// https URL (scheme, "//", and a dotted host name).
        /// </summary>
        /// <param name="value">Candidate URL text.</param>
        /// <returns><c>true</c> when the text has the expected URL shape.</returns>
        private static bool CheckIsUrlFormat(string value)
        {
            return UrlFormatRegex.IsMatch(value);
        }
    }
}
posted @ 2015-03-21 00:59 talentzemin 阅读(256) 评论(0) 收藏举报
刷新页面返回顶部
talentzemin

C#程序

公告