最近歪酷的改动很让人抓狂,
虽然在歪酷两年多,感情也很深了。
这次还是受不了歪酷的折腾搬家来博客园了。
为了把以前歪酷上的东西能够尽量多地转移过来,
就用空闲时间写了一个小程序,
用于把自己在歪酷博客上的日志和评论抓成RSS,
这样就可以利用博客园的“从Rss导入”功能把日志转移过来了。
网页内容分析上很多地方采取了比较偷懒和投机的方法,
因为每个人的模板不一样,这个程序只保证对自己的博客有效,
对其它人的歪酷博客,就很可能出错了。
练习之作,不好意思发到技术区,所以就放在生活区啦。
整个程序包含三个源文件。
YculReader.cs用于从歪酷博客上抓取日志和评论内容。
RssWriter.cs用于生成RSS的Xml文档。
Main.cs主程序,生成数页每份20篇日志的Xml。
开发环境:Ubuntu 8.04,mono 1.2.6/C# 2.0,MonoDevelop 1.0
测试对本人的歪酷博客http://runandthink.yculblog.com工作正常。

YculReader.cs
1 // YculReader.cs created with MonoDevelop
2 // User: cuterabbit at 上12:20 08-4-28
3 //
4 // To change standard headers go to Edit->Preferences->Coding->Standard Headers
5 //
6
7 using System;
8 using System.Collections.Generic;
9 using System.Globalization;
10 using System.IO;
11 using System.Text;
12 using System.Text.RegularExpressions;
13 using System.Net;
14
namespace YculRss
{
    /// <summary>
    /// One blog entry scraped from a ycool.com blog, ready to be written as an RSS item.
    /// </summary>
    public class BlogPost
    {
        // RFC 1123 dates always use English day and month names, whatever the machine locale is.
        private static readonly CultureInfo RssCulture=new CultureInfo("en-US");

        /// <summary>Blog sub-domain the post belongs to (the part in front of ".ycool.com").</summary>
        public string Domain;
        /// <summary>Numeric post identifier as it appears in the post URL.</summary>
        public string PostID;
        /// <summary>Post title text taken from the page.</summary>
        public string Title;
        /// <summary>HTML fragment holding the post body.</summary>
        public string Content;
        /// <summary>HTML fragment holding the comment list.</summary>
        public string Comments;
        /// <summary>Publication time; <see cref="FormatedPubDate"/> labels it as GMT without converting.</summary>
        public DateTime PubDate;

        /// <summary>
        /// Gets or sets <see cref="PubDate"/> in the RFC 1123 form used by RSS,
        /// e.g. "Sun, 04 May 2008 15:52:00 GMT".
        /// </summary>
        /// <exception cref="FormatException">The assigned text is not an RFC 1123 date.</exception>
        public string FormatedPubDate
        {
            get
            {
                return PubDate.ToString("ddd, dd MMM yyyy HH:mm:ss",RssCulture)+" GMT";
            }
            set
            {
                PubDate=DateTime.ParseExact(value,"R",RssCulture);
            }
        }
    }

    /// <summary>
    /// Downloads archive and post pages of one ycool.com blog and cuts the interesting
    /// parts out of the HTML. The cutting relies on fixed markers and character offsets
    /// of one specific blog template, so other templates may not parse.
    /// </summary>
    public class YculReader
    {
        private static string proxy="";
        private string domain;
        public static readonly string ProxyCai="http://10.85.37.43:808";

        // The page shows local time; eight hours are subtracted before the value is labelled GMT.
        // NOTE(review): presumably the blog displays China Standard Time (UTC+8) - confirm.
        private static readonly TimeSpan PageTimeOffset=new TimeSpan(8,0,0);

        // Compiled once instead of on every call. The dots of "/post.NNNNNNN.html" are escaped,
        // and the link text may contain '/' so posts with a slash in the title are not skipped.
        private static readonly Regex PostLinkRegex=new Regex(
            "<a href=\"/post\\.(?<postnum>\\d{7})\\.html\">[^<>]*</a>",RegexOptions.Compiled);

        // Any whitespace is accepted between the wrapper div and the heading; titles may contain '/'.
        private static readonly Regex TitleRegex=new Regex(
            "<div class=\"postEntry\">\\s*<h3 class=\"title\">"+
            "<a href=\"#\" rel=\"follow\">(?<title>[^<>]*)</a></h3>",RegexOptions.Compiled);

        private static readonly Regex PostTimeRegex=new Regex(
            "<span class=\"postTime\">(?<time>[^<>/]*)</span>",RegexOptions.Compiled);

        /// <summary>Creates a reader for the blog at http://{Domain}.ycool.com/.</summary>
        /// <param name="Domain">Blog sub-domain.</param>
        public YculReader(string Domain)
        {
            domain=Domain;
        }

        /// <summary>
        /// Proxy address used by <see cref="GetURL"/>; null or empty means the default proxy
        /// settings of the request are left untouched.
        /// </summary>
        public static string Proxy
        {
            get{return proxy;}
            set{proxy=value;}
        }

        /// <summary>Downloads a page over HTTP and returns its body decoded as UTF-8.</summary>
        /// <param name="adress">Absolute http/https URL.</param>
        /// <returns>The full response body.</returns>
        public static string GetURL(string adress)
        {
            HttpWebRequest requ=(HttpWebRequest)WebRequest.Create(adress);
            if(!string.IsNullOrEmpty(proxy))
                requ.Proxy=new WebProxy(proxy);
            // Dispose response and reader even when reading fails, so connections are not leaked.
            using(HttpWebResponse resp=(HttpWebResponse)requ.GetResponse())
            using(StreamReader sr=new StreamReader(resp.GetResponseStream(),Encoding.UTF8))
            {
                return sr.ReadToEnd();
            }
        }

        /// <summary>Downloads archive page number <paramref name="ListID"/> and returns the post ids on it.</summary>
        /// <param name="ListID">Index of the archive page, as used in "archive/index{ListID}.html".</param>
        /// <returns>Post ids in page order.</returns>
        /// <exception cref="FormatException">The page does not contain the expected markers.</exception>
        public List<string> GetList(int ListID)
        {
            return ParseList(GetURL("http://"+domain+".ycool.com/archive/index"+ListID+".html"));
        }

        /// <summary>Extracts the seven-digit post ids from the HTML of an archive page.</summary>
        /// <param name="archivePage">Complete HTML of an archive page.</param>
        /// <returns>Post ids in page order; empty when the list section holds no post links.</returns>
        /// <exception cref="FormatException">The page does not contain the expected markers.</exception>
        public static List<string> ParseList(string archivePage)
        {
            // Offsets 34 and -15 skip the template markup surrounding the list section.
            string listSection=Slice(archivePage,"component",34,"pageNav",-15);
            List<string> pl=new List<string>();
            foreach(Match m in PostLinkRegex.Matches(listSection))
                pl.Add(m.Groups["postnum"].Value);
            return pl;
        }

        /// <summary>Downloads one post page and extracts title, body, comments and time.</summary>
        /// <param name="PostID">Post id as returned by <see cref="GetList"/>.</param>
        /// <returns>The populated post.</returns>
        /// <exception cref="FormatException">The page layout or the post time is not recognised.</exception>
        public BlogPost GetPost(string PostID)
        {
            return ParsePost(PostID,GetURL("http://"+domain+".ycool.com/post."+PostID+".html"));
        }

        /// <summary>Extracts title, body, comments and time from the HTML of a post page.</summary>
        /// <param name="PostID">Id stored in the result; it is not looked up in the page.</param>
        /// <param name="full">Complete HTML of the post page.</param>
        /// <returns>The populated post; Title is empty when the title markup is not found.</returns>
        /// <exception cref="FormatException">The page layout or the post time is not recognised.</exception>
        public BlogPost ParsePost(string PostID,string full)
        {
            BlogPost bp=new BlogPost();
            bp.Domain=domain;
            bp.PostID=PostID;
            bp.Title=TitleRegex.Match(full).Groups["title"].Value;
            // 21 is the length of the opening tag; 91 is the template markup that sits between
            // the end of the body and the tags block.
            bp.Content=Slice(full,"<div class=\"content\">",21,"<div class=\"tags\">",-91);
            bp.Comments=Slice(full,"<div class=\"componentBody\">\n <ul>",0,
                "</div><script type=\"text/javascript\" src=\""+
                "http://s.ycul.com/blog.public/common.js\"></script>",0);
            // Parsed with the current culture, as before; throws FormatException when no time is found.
            bp.PubDate=DateTime.Parse(PostTimeRegex.Match(full).Groups["time"].Value)-PageTimeOffset;
            return bp;
        }

        // Returns the text between two markers, each position adjusted by a fixed offset.
        // Both markers are searched from the start of the page with ordinal comparison.
        private static string Slice(string page,string startMarker,int startOffset,string endMarker,int endOffset)
        {
            if(page==null)
                throw new ArgumentNullException("page");
            int startAt=page.IndexOf(startMarker,StringComparison.Ordinal);
            if(startAt<0)
                throw new FormatException("Marker '"+startMarker+"' not found; page layout not recognised.");
            int endAt=page.IndexOf(endMarker,StringComparison.Ordinal);
            if(endAt<0)
                throw new FormatException("Marker '"+endMarker+"' not found; page layout not recognised.");
            int from=startAt+startOffset;
            int to=endAt+endOffset;
            if(to<from)
                throw new FormatException("Markers '"+startMarker+"' and '"+endMarker+
                    "' are not laid out as expected; page layout not recognised.");
            return page.Substring(from,to-from);
        }
    }
}

RssWriter.cs
1 // RssWriter.cs created with MonoDevelop
2 // User: cuterabbit at 上12:23 08-4-28
3 //
4 // To change standard headers go to Edit->Preferences->Coding->Standard Headers
5 //
6
7 using System;
8 using System.IO;
9 using System.Text;
10
11 namespace YculRss
12 {
/// <summary>
/// Writes an RSS 2.0 document for one blog to a UTF-8 file. Call <see cref="AddHead"/> once,
/// <see cref="AddItem"/> per post, then <see cref="AddFoot"/>; every call flushes to disk.
/// </summary>
public class RssWriter : IDisposable
{
    private string domain;
    private string path;
    private StreamWriter rw;

    /// <summary>Creates (or overwrites) the output file.</summary>
    /// <param name="Domain">Blog sub-domain, used to build the channel link.</param>
    /// <param name="Path">Path of the XML file to write.</param>
    public RssWriter(string Domain,string Path)
    {
        domain=Domain;
        path=Path;
        rw=new StreamWriter(path,false,Encoding.UTF8);
    }

    /// <summary>Writes the XML declaration and opens the rss/channel elements.</summary>
    /// <param name="BlogTitle">Plain-text channel title; XML special characters are escaped.</param>
    public void AddHead(string BlogTitle)
    {
        // An unescaped '&' or '<' in the title would make the whole document malformed.
        string safeTitle=System.Security.SecurityElement.Escape(BlogTitle);
        rw.WriteLine("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n"+
            "<rss version=\"2.0\">\n<channel>\n<title>"+safeTitle+"</title>\n"+
            "<link>http://"+domain+".ycool.com/</link>");
        rw.Flush();
    }

    /// <summary>Closes the channel/rss elements and the file. No further writes are possible.</summary>
    public void AddFoot()
    {
        rw.WriteLine("</channel></rss>");
        rw.Flush();
        rw.Close();
    }

    /// <summary>Appends one item; body and comments go into a single CDATA description.</summary>
    /// <param name="bp">Post to write.</param>
    public void AddItem(BlogPost bp)
    {
        rw.WriteLine("<item>");
        // NOTE(review): the title is written verbatim; it is assumed to be markup-escaped already
        // because it is lifted from page source - confirm before feeding plain-text titles.
        rw.WriteLine("<title>"+bp.Title+"</title>");
        rw.WriteLine("<link>http://"+bp.Domain+".ycool.com/post."+bp.PostID+".html</link>");
        rw.WriteLine("<description><![CDATA["+CDataSafe(bp.Content)+"<br/><hr/>"+CDataSafe(bp.Comments)+"]]></description>");
        rw.WriteLine("<pubDate>"+bp.FormatedPubDate+"</pubDate>");
        rw.WriteLine("</item>");
        rw.Flush();
    }

    /// <summary>
    /// Releases the file handle without writing the closing tags. Safe to call after
    /// <see cref="AddFoot"/> or more than once.
    /// </summary>
    public void Dispose()
    {
        rw.Close();
    }

    // A literal "]]>" would terminate the CDATA section early; split it across two sections.
    private static string CDataSafe(string text)
    {
        if(text==null)
            return "";
        return text.Replace("]]>","]]]]><![CDATA[>");
    }
}
48 }
49

Main.cs
1 // Main.cs created with MonoDevelop
2 // User: cuterabbit at 下11:33 08-4-27
3 //
4 // To change standard headers go to Edit->Preferences->Coding->Standard Headers
5 //
6 using System;
7
8 namespace YculRss
9 {
/// <summary>
/// Console entry point: writes one RSS file per archive page of a ycool.com blog.
/// </summary>
class MainClass
{
    /// <summary>
    /// Optional arguments, each falling back to the original hard-coded value:
    /// [0] blog sub-domain, [1] channel title, [2] output directory, [3] number of archive pages.
    /// </summary>
    public static void Main(string[] args)
    {
        // Template greeting kept so console output is unchanged.
        Console.WriteLine("Hello World!");

        string blogDomain=args.Length>0 ? args[0] : "runandthink";
        string blogTitle=args.Length>1 ? args[1] : "长跑人生";
        string outputPrefix=args.Length>2
            ? System.IO.Path.Combine(args[2],"YculRss-")
            : "/home/cuterabbit/桌面/YculRss-";
        int pageCount=8;
        if(args.Length>3 && (!int.TryParse(args[3],out pageCount) || pageCount<1))
        {
            Console.Error.WriteLine("Page count must be a positive integer: "+args[3]);
            return;
        }

        YculReader yr=new YculReader(blogDomain);
        for(int i=1;i<=pageCount;i++)
        {
            RssWriter rw=new RssWriter(blogDomain,outputPrefix+i.ToString()+".xml");
            try
            {
                rw.AddHead(blogTitle);
                foreach(string postnum in yr.GetList(i))
                    rw.AddItem(yr.GetPost(postnum));
            }
            finally
            {
                // Always terminate and close the file, even when a download or parse fails
                // part-way through the page; the exception still propagates.
                rw.AddFoot();
            }
        }
    }
}
27 }
posted on 2008-05-04 23:52
CuteRabbit 阅读(223)
评论(2) 编辑 收藏 网摘 所属分类:
兔爪/技术探讨