简单的抓取网站图片

 

 

/// <summary>
/// Small console crawler: for a few 699pic.com categories it fetches the first three
/// listing pages, extracts the lazily loaded image URLs and saves the images to disk.
/// </summary>
class Program
    {
        // Captures the alt text (group 1) and the data-original image URL (group 2) of a listing <img> tag.
        // Built once and shared: Regex instances are immutable and safe to use from several threads.
        static readonly Regex ImageTagRegex = new Regex("<img alt=\"(.*?)\" data-original=\"(.*?)\"");

        // Captures a charset label (optionally quoted) from a Content-Type header or an HTML meta tag.
        static readonly Regex CharsetRegex = new Regex(@"charset\s*=\s*['""]?(?<enc>[-_a-zA-Z0-9]+)", RegexOptions.IgnoreCase);

        /// <summary>
        /// Starts one crawler thread per category and keeps the process alive until a key is pressed.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            new Thread(new ParameterizedThreadStart(Go)).Start("beauty");
            new Thread(new ParameterizedThreadStart(Go)).Start("music");
            new Thread(new ParameterizedThreadStart(Go)).Start("artdesign");
            Console.ReadKey();
        }

        /// <summary>
        /// Crawls listing pages 1 to 3 of one category and downloads every image found on them.
        /// </summary>
        /// <param name="category">Category slug used in the listing URL and as the target folder name.</param>
        static void Go(object category)
        {
            string saveDirectory = @"E:\spiderimages\" + category.ToString();

            for (int page = 1; page <= 3; page++)
            {
                string url = string.Format("http://699pic.com/{0}-{1}-0-0-0.html", category, page);
                string html = GetHttpWebRequest(new Uri(url));
                if (string.IsNullOrEmpty(html))
                {
                    continue;
                }

                // Collect the URLs sequentially: List<T> is not safe for concurrent Add calls.
                List<string> imageUrls = new List<string>();
                foreach (Match match in ImageTagRegex.Matches(html))
                {
                    imageUrls.Add(match.Groups[2].Value);
                }

                foreach (string imageUrl in imageUrls)
                {
                    try
                    {
                        if (TryGetImage(imageUrl, saveDirectory))
                        {
                            Console.WriteLine(category + " 类别下的图片地址 " + imageUrl + " 已经成功保存");
                        }
                        else
                        {
                            Console.WriteLine(category + " 类别下的图片地址 " + imageUrl + " 不是图片,已跳过");
                        }
                    }
                    catch (Exception ex)
                    {
                        // Per-image boundary of a best-effort crawl: an unhandled exception on this
                        // worker thread would terminate the whole process, so report and move on.
                        Console.WriteLine(category + " 类别下的图片地址 " + imageUrl + " 保存失败: " + ex.Message);
                    }
                }
            }
            Console.WriteLine("\r\n\r\n=============================" + category + "类别图片下载完毕====================================\r\n\r\n");
        }


        /// <summary>
        /// Downloads a page and returns its decoded text.
        /// </summary>
        /// <param name="url">Address of the page to fetch.</param>
        /// <returns>
        /// The decoded body, or an empty string when the request fails or the response
        /// is not a textual content type.
        /// </returns>
        static string GetHttpWebRequest(Uri url)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

            request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 OPR/40.0.2308.90";
            // The property sets the header value only; it must not repeat the "Accept:" header name.
            request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
            request.KeepAlive = true;
            request.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");

            try
            {
                // The using blocks release the connection on every exit path, including early returns.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    string contentType = response.ContentType ?? string.Empty;
                    if (!IsTextContentType(contentType))
                    {
                        return "";
                    }

                    byte[] body;
                    using (Stream responseStream = response.GetResponseStream())
                    {
                        body = getByte(responseStream);
                    }

                    return ResolveEncoding(contentType, body).GetString(body);
                }
            }
            catch (Exception ex)
            {
                // Callers treat "" as "nothing to parse"; keep that contract but make the failure visible.
                Console.Error.WriteLine("页面 " + url.AbsoluteUri + " 抓取失败: " + ex.Message);
                return "";
            }
        }

        /// <summary>
        /// Tells whether a Content-Type value describes text that is worth decoding.
        /// </summary>
        /// <param name="contentType">Raw Content-Type header value.</param>
        /// <returns>True when the value mentions text, javascript, json or xml.</returns>
        static bool IsTextContentType(string contentType)
        {
            // "text/html" and "jsonp" from the original check are already covered by "text" and "json".
            return contentType.IndexOf("text", StringComparison.OrdinalIgnoreCase) >= 0
                || contentType.IndexOf("javascript", StringComparison.OrdinalIgnoreCase) >= 0
                || contentType.IndexOf("json", StringComparison.OrdinalIgnoreCase) >= 0
                || contentType.IndexOf("xml", StringComparison.OrdinalIgnoreCase) >= 0;
        }

        /// <summary>
        /// Picks the text encoding of a response: the charset of the Content-Type header when present,
        /// otherwise a charset label found inside the body, otherwise UTF-8.
        /// </summary>
        /// <param name="contentType">Raw Content-Type header value; may be null or empty.</param>
        /// <param name="body">Raw response bytes that are searched for a charset label.</param>
        /// <returns>The resolved encoding; UTF-8 when no label is found or the label is not supported.</returns>
        static Encoding ResolveEncoding(string contentType, byte[] body)
        {
            Match label = CharsetRegex.Match(contentType ?? string.Empty);
            if (!label.Success)
            {
                // Charset labels are plain ASCII, and ASCII decoding maps every other byte to '?'
                // without consuming its neighbours, so the search does not depend on the machine's code page.
                label = CharsetRegex.Match(Encoding.ASCII.GetString(body ?? new byte[0]));
            }

            if (!label.Success)
            {
                return Encoding.UTF8;
            }

            try
            {
                return Encoding.GetEncoding(label.Groups["enc"].Value);
            }
            catch (ArgumentException)
            {
                // Unknown or misspelled charset name.
                return Encoding.UTF8;
            }
            catch (NotSupportedException)
            {
                // Known name, but the code page is not available on this platform.
                return Encoding.UTF8;
            }
        }

        /// <summary>
        /// Reads a stream to its end and returns everything that was read.
        /// </summary>
        /// <param name="se">Stream to drain; it is not closed by this method.</param>
        /// <returns>All bytes read, or an empty array when the stream is null or not readable.</returns>
        static byte[] getByte(Stream se)
        {
            if (se == null || !se.CanRead)
            {
                return new byte[0];
            }

            // CopyTo reuses one modest buffer instead of allocating 20 MB on every read.
            using (MemoryStream collected = new MemoryStream())
            {
                se.CopyTo(collected);
                return collected.ToArray();
            }
        }

        /// <summary>
        /// Downloads an image into a folder; responses that are not images are ignored.
        /// </summary>
        /// <param name="imgUrl">Absolute URL of the image.</param>
        /// <param name="path">Target folder; created when it does not exist.</param>
        static void GetImage(string imgUrl, string path)
        {
            TryGetImage(imgUrl, path);
        }

        /// <summary>
        /// Downloads an image into a folder and reports whether a file was written.
        /// </summary>
        /// <param name="imgUrl">Absolute URL of the image.</param>
        /// <param name="path">Target folder; created when it does not exist.</param>
        /// <returns>True when the response was an image and was saved; false when it was not an image.</returns>
        static bool TryGetImage(string imgUrl, string path)
        {
            // CreateDirectory is a no-op for an existing folder, so no Exists check is needed.
            Directory.CreateDirectory(path);

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(imgUrl);
            request.UserAgent = "Mozilla/6.0 (MSIE 6.0; Windows NT 5.1; Natas.Robot)";
            request.Timeout = 3000;

            using (WebResponse response = request.GetResponse())
            {
                string contentType = response.ContentType ?? string.Empty;
                if (!contentType.StartsWith("image/", StringComparison.OrdinalIgnoreCase))
                {
                    return false;
                }

                using (Stream imageStream = response.GetResponseStream())
                {
                    if (imageStream == null)
                    {
                        return false;
                    }

                    string filePath = Path.Combine(path, GetImageFileName(imgUrl));
                    using (FileStream file = new FileStream(filePath, FileMode.Create))
                    {
                        // Copy until end of stream rather than trusting ContentLength, which is -1 for
                        // chunked responses and would loop forever if the connection ended early.
                        imageStream.CopyTo(file);
                    }
                }

                return true;
            }
        }

        /// <summary>
        /// Derives a file name from an image URL.
        /// </summary>
        /// <param name="imgUrl">URL whose last path segment becomes the file name.</param>
        /// <returns>The last path segment without query string or fragment, with illegal characters replaced by '_'.</returns>
        /// <exception cref="ArgumentException">The URL has no last path segment to use as a name.</exception>
        static string GetImageFileName(string imgUrl)
        {
            string name = imgUrl;

            // A query string or fragment is not part of the file name and contains characters
            // such as '?' that are illegal in Windows paths.
            int cut = name.IndexOfAny(new[] { '?', '#' });
            if (cut >= 0)
            {
                name = name.Substring(0, cut);
            }

            name = name.Substring(name.LastIndexOf('/') + 1);
            foreach (char invalid in Path.GetInvalidFileNameChars())
            {
                name = name.Replace(invalid, '_');
            }

            if (name.Length == 0)
            {
                throw new ArgumentException("The image URL does not end with a file name: " + imgUrl, "imgUrl");
            }

            return name;
        }
    }

 

posted on 2017-08-09 18:09  奔游浪子  阅读(155)  评论(0)    收藏  举报

导航