爬虫技术 -- 基础学习(五)解决页面编码识别(附c#代码)

  实现从Web网页提取文本之前,首先要识别网页的编码,有时候还需要进一步识别网页所使用的语言。因为同一种编码可能对应多种语言,例如UTF-8编码可能对应英文或中文等语言。


  识别编码整体流程如下:
  (1)从WEB服务器返回的content type头信息中提取编码,如果是GB2312的编码要当GBK处理。
  (2)从网页meta标签中识别字符编码,如果与content type中的编码不一致,以meta中声明的编码为准。
  (3)如果仍然无法确定网页所使用的字符集,需要从返回流的二进制格式判断。
  (4)确定网页所使用的语言,往往采用统计的方法来估计网页的语言。

      判断编码的完整过程如下:(c#代码)

 1         /// <summary>
 2         /// 函数名称:GetDataFromUrl
 3         /// 功能说明:获取url指定的网页的源码
 4         /// 参数:string url用于指定 url
 5         /// 参数:ref Encoding encode用来获取网页中的字符集编码
 6         /// </summary>
 7         public static string GetDataFromUrl(string url, ref Encoding encode)
 8         {
 9             string str = string.Empty;
10             HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url);
11 
12             //设置http头
13             request.AllowAutoRedirect = true;
14             request.AllowWriteStreamBuffering = true;
15             request.Referer = "";
16             request.Timeout = 10 * 1000;
17             request.UserAgent = "";
18             HttpWebResponse response = null;
19             response = (HttpWebResponse)request.GetResponse();
20 
21             //根据http应答的http头来判断编码
22             string characterSet = response.CharacterSet;
23             //Encoding encode;
24             if (characterSet != "")
25             {
26                 if (characterSet == "ISO-8859-1")
27                 {
28                     characterSet = "gb2312";
29                 }
30                 encode = Encoding.GetEncoding(characterSet);
31             }
32             else
33             {
34                 encode = Encoding.Default;
35             }
36 
37             //声明一个内存流来保存http应答流
38             Stream receiveStream = response.GetResponseStream();
39             MemoryStream mStream = new MemoryStream();
40 
41             byte[] bf = new byte[255];
42             int count = receiveStream.Read(bf, 0, 255);
43             while (count > 0)
44             {
45                 mStream.Write(bf, 0, count);
46                 count = receiveStream.Read(bf, 0, 255);
47             }
48             receiveStream.Close();
49 
50             mStream.Seek(0, SeekOrigin.Begin);
51 
52             //从内存流里读取字符串
53             StreamReader reader = new StreamReader(mStream, encode);
54             char[] buffer = new char[1024];
55             count = reader.Read(buffer, 0, 1024);
56             while (count > 0)
57             {
58                 str += new String(buffer, 0, count);
59                 count = reader.Read(buffer, 0, 1024);
60             }
61 
62             //从解析出的字符串里判断charset,如果和http应答的编码不一直
63             //那么以页面声明的为准,再次从内存流里重新读取文本
64             Regex reg =
65                new Regex(@"<meta[\s\S]+?charset=(.*?)""[\s\S]+?>",
66                           RegexOptions.Multiline | RegexOptions.IgnoreCase);
67             MatchCollection mc = reg.Matches(str);
68             if (mc.Count > 0)
69             {
70                 string tempCharSet = mc[0].Result("$1");
71                 if (string.Compare(tempCharSet, characterSet, true) != 0)
72                 {
73                     encode = Encoding.GetEncoding(tempCharSet);
74                     str = string.Empty;
75                     mStream.Seek(0, SeekOrigin.Begin);
76                     reader = new StreamReader(mStream, encode);
77                     buffer = new char[255];
78                     count = reader.Read(buffer, 0, 255);
79                     while (count > 0)
80                     {
81                         str += new String(buffer, 0, count);
82                         count = reader.Read(buffer, 0, 255);
83                     }
84                 }
85             }
86             reader.Close();
87             mStream.Close();
88             if (response != null)
89                 response.Close();
90 
91             return str;
92 
93         }

 

posted @ 2013-12-14 15:53  lmei  阅读(1927)  评论(1编辑  收藏  举报