
This is an alternative to Offline Explorer for backing up a cnblogs blog.

 

Thanks to the article[1] listed in the Reference section; I modified several lines to adapt it to my blog. Here is the change list:

1. L193: change "homepage1_BottomPager" to "homepage1_HomePageDays_BottomPager", because "homepage1_BottomPager" does not appear anywhere in the HTML source of my cnblogs pages. If you are unsure which id your own theme uses, see the sketch after this change list.

2. L394: set url to a listing page of your blog that contains a link to the last page.

3. L396: set output to a directory on your local disk.
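
For change 1, the id of the bottom pager depends on your blog theme. Below is a minimal sketch (not part of the original script, Python 2) that fetches one listing page of your blog and prints every element id containing "BottomPager", so you can see which value to put at L193. The URL is a placeholder and the User-Agent string is simply the one the script already uses:

import urllib2
import re

url = "http://www.cnblogs.com/yaoyansi/default.html?page=1"  # placeholder: one of your listing pages
request = urllib2.Request(url, None, {"User-Agent": "Mozilla-Firefox5.0"})
html = urllib2.urlopen(request).read()

# print every id attribute that mentions "BottomPager"
for div_id in re.findall(r'id="([^"]*BottomPager[^"]*)"', html):
  print div_id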

Enjoy it!

  1 #! encoding=utf-8
  2 
  3 # cnblogs blog backup. Usage: edit the url and output at the bottom of the file, then run the script.
  4 
  5 import urllib2
  6 import re
  7 import os
  8 import sys
  9 # from HTMLParser import HTMLParser
 10 import html5lib
 11 # from xml.etree.ElementTree import ElementTree
 12 from urlparse import urlparse
 13 import xml
 14 import codecs
 15 import traceback
 16 import time
 17 
 18 # class MyHTMLParser(HTMLParser):
 19 
 20 #     def handle_starttag(self, tag, attrs):
 21 #         # if tag.lower() == "img":
 22 #             print "Encountered the beginning of a %s tag,attrs size %d" % (tag ,len(attrs))
 23 #             for x in attrs:
 24 #                 print "name %s,value %s" % (x[0],x[1])
 25 #     def handle_endtag(self, tag):
 26 #         print "Encountered the end of a %s tag" % tag
 27 
 28 #     def handle_startendtag(self, tag, attrs):
 29 #         print "Encountered the beginning of a %s tag,attrs size %d" % (tag ,len(attrs))
 30 #         for x in attrs:
 31 #             print "name %s,value %s" % (x[0],x[1])
 32 
 33 # number of attempts for each resource
 34 gTestTime = 5
 35 
 36 def DownloadFile(url,output):
 37   responseText = None
 38   dirssPath = None
 39   try:
 40     res = urlparse(url)
 41     url = res.scheme+"://"+res.netloc+res.path
 42     path = res.path
 43     index = path.rfind('/')
 44     dirss = "/"
 45     if index != -1:
 46       dirss =  output + "/" + res.netloc.encode("utf-8") + path[0:index].encode("utf-8")
 47       dirssPath = output + "/" + res.netloc.encode("utf-8") + path.encode("utf-8")
 48       dirss_ansi = dirss.decode('utf-8')
 49       if not os.path.exists(dirss_ansi):
 50         os.makedirs(dirss_ansi)
 51     global gTestTime
 52     count = gTestTime    
 53     while True:
 54       if count < 0:
 55         break
 56       count = count - 1
 57       header={"User-Agent": "Mozilla-Firefox5.0"}
 58       if not url.startswith("http://"):
 59         break
 60       try:
 61         # print "url: %s:%d" % (url,count)
 62         time.sleep(0.5)
 63         request = urllib2.Request(url,None,header)
 64         response = urllib2.urlopen(request)
 65         dirssPath_ansi = dirssPath.decode("utf-8")
 66         if not os.path.exists(dirssPath_ansi):
 67           resourceFile = open(dirssPath_ansi,"wb")
 68           responseText = response.read()
 69           if url.endswith(".js"):
 70             responseText = responseText.replace("http://","")
 71             responseText = responseText.replace("https://","")
 72           resourceFile.write(responseText)
 73           resourceFile.close()
 74         break         
 75       except Exception,e:
 76         print "DownloadFile: %s:%s:%d" % (e,url,count)
 77         # pass
 78         # exstr = traceback.format_exc()
 79         # print exstr
 80 
 81   except Exception,e:
 82       pass
 83       # exstr = traceback.format_exc()
 84       # print exstr
 85   
 86   return (responseText,url,output)
 87 
 88 def ReadCss(css):
 89   # print "ReadCss"
 90   mode = 'url\(\"?([^)]+)\"?\)'
 91   pattern = re.compile(mode)
 92   try:
 93     text = css[0]
 94     if css[0] == None:
 95       return
 96     strMatch = pattern.findall(text)
 97     size = len(strMatch)
 98     # print "size: ",size
 99     for i in range(0,size,1):
100       one = strMatch[i]
101       newurl = GetConcatUrl(css[1],one)
102       DownloadFile(newurl,css[2])
103   except Exception,e:
104       pass
105       # exstr = traceback.format_exc()
106       # print exstr 
107 
108 def Download(url,output):
109   # try:
110   header={"User-Agent": "Mozilla-Firefox5.0"}
111   namespace = "{http://www.w3.org/1999/xhtml}"
112   request = urllib2.Request(url,None,header)
113   response = urllib2.urlopen(request)
114 
115   data = response.read()
116   document = html5lib.parse(data)
117   imgElements = document.findall('.//{0}img'.format(namespace))
118   # print "imgElements %d" % len(imgElements)
119   for img in imgElements:
120     src = img.attrib["src"]
121     # print "src %s" % src
122     try:
123       res = urlparse(src)
124       # skip images that are not hosted on cnblogs
125       if not res.netloc.endswith(".cnblogs.com"):
126         print "image not download: %s:%s" % (src,res.netloc)
127         continue
128     except Exception,e:
129       pass
130     DownloadFile(src,output)
131 
132   linkElements = document.findall('.//{0}link'.format(namespace))
133   # print "linkElements %d" % len(linkElements)
134   for link in linkElements:
135     href = link.attrib["href"]
136     # print "href %s" % href
137     text = DownloadFile(href,output)
138     if link.attrib.has_key("rel") and link.attrib["rel"].lower() == "stylesheet":
139       ReadCss(text)
140 
141   scriptElements = document.findall('.//{0}script'.format(namespace))
142   # print "scriptElements %d" % len(scriptElements)
143   for script in scriptElements:
144     if script.attrib.has_key("src"):
145       src = script.attrib["src"]
146       # print "src %s" % src
147       DownloadFile(src,output)
148     
149   htmlNameIndex = url.rfind("/");
150   urlLen = len(url)
151   htmlName = GetHtmlName(url)
152   output = output.decode("utf-8") + "/"+htmlName+".htm"
153   data = data.replace("http://","")
154   data = data.replace("https://","")
155   data = data.replace("www.w3.org/1999/xhtml","http://www.w3.org/1999/xhtml")
156 
157   resourceFile = open(output,"wb")
158   resourceFile.write(data)
159   resourceFile.close()
160 
161 def GetConcatUrl(url,png):
162   # one: "../images/f_icon.png" -- url http://static.csdn.net/public/common/toolbar/css/index.css
163   count = 0
164   index = png.find("..")
165   startindex = None
166   while index != -1:
167     count = count + 1;
168     startindex = index + 2
169     index = png.find("..",startindex)
170 
171   second = png[startindex:]
172   length = len(url)
173   index = url.rfind("/")
174   endindex = 0
175   while count >= 0 and index != -1:
176     endindex = index
177     index = url.rfind("/",0, endindex)
178     count = count - 1
179   first = url[0:endindex]
180   return first+second
181 
182 def getAllListUrl(url):
183   header={"User-Agent": "Mozilla-Firefox5.0"}
184   request = urllib2.Request(url,None,header)
185   response = urllib2.urlopen(request)
186   data = response.read()
187   
 188   # By default, the document will be an xml.etree element instance. Whenever possible, html5lib chooses the accelerated ElementTree implementation (i.e. xml.etree.cElementTree on Python 2.x).
189   document = html5lib.parse(data)
190   namespace = "{http://www.w3.org/1999/xhtml}"
191 
192   # get <div id="homepage1_BottomPager" class="topicListFooter">
193   pageList = document.findall('.//{0}div[@id=\'homepage1_HomePageDays_BottomPager\']'.format(namespace))
194   print( "Debug>len(pageList)=%d"%len(pageList) );
195   # get <div class="pager">
196   alinks = list(pageList[0])
197   # get content in <div class="pager">, like:<a href="http://www.cnblogs.com/GnagWang/default.html?page=1">
198   alinks1 = list(alinks[0])
199   lastArticle = alinks1[len(alinks1)-1]
200
201   # lastArticleHref = u'http://www.cnblogs.com/GnagWang/default.html?page=20'
202   lastArticleHref = lastArticle.attrib["href"]
203   lastPageIndex = lastArticleHref.rfind("=")
204   lastPageNum = int(lastArticleHref[lastPageIndex+1:])
205   urlInfo = lastArticleHref[0:lastPageIndex]
206
207   urlList = []
208   for x in xrange(1,lastPageNum+1):
209     listUrl = urlInfo+"="+str(x)
210     urlList.append(listUrl)
211
212   return urlList
213
214
215 def getArticleList(url):
216   # get the urls of all articles
217   # <div id="article_toplist" class="list"></div>
218   # <div id="article_list" class="list"
219
220   # <div class="list_item article_item"
221
222   # <div class="article_title">
223   # <span class="ico ico_type_Original"></span>
224   # <h1>
225   # <span class="link_title">
226   # <a href="/infoworld/article/details/18984183">
227
228   # <div class="article_manage">
229   # <span class="link_postdate"></span>
230
231   urlList = getAllListUrl(url)
232   print "文章页数(number of pages) ",len(urlList)
233   header={"User-Agent": "Mozilla-Firefox5.0"}
234
235   allLists = []
236
237   strPage = "分析 第 {0} 页 ".decode("utf-8").encode("utf-8")
238   pageNum = 0
239   global gTestTime
240   for one in urlList:
241     tryCount = gTestTime # try count
242     pageNum = pageNum + 1
243     pageNumStr = strPage.format(pageNum)
244     print pageNumStr
245
246     while tryCount > 0:
247       try:
248         tryCount = tryCount - 1
249         time.sleep(0.5) # no response if requests are sent too fast
250         request = urllib2.Request(one,None,header)
251         response = urllib2.urlopen(request)
252
253         data = response.read()
254         document = html5lib.parse(data,encoding="utf-8")
255         namespace = "{http://www.w3.org/1999/xhtml}"
256         # .//{0}div[@id=\'article_toplist\']
257         #topLists = document.findall('.//{0}div[@id=\'article_toplist\']/{0}div[@class=\'list_item article_item\']'.format(namespace))
258         #articleLists = document.findall('.//{0}div[@id=\'article_list\']/{0}div[@class=\'list_item article_item\']'.format(namespace))
259         articleLists = document.findall('.//{0}div[@class=\'postTitle\']'.format(namespace))
260         allLists = allLists + articleLists
261         break
262       except Exception, e:
263         print "getArticleList %s:%s:%d" % (e,one,tryCount)
264
265
266   count = 0 # article count
267   artices = []
268   for article in allLists:
269     count = count+1
270     alink = article.find(".//{0}a".format(namespace))
271     # href = u'http://www.cnblogs.com/GnagWang/archive/2010/04/02/1702721.html'
272     href = alink.attrib["href"]
273     #oneHref = "http://blog.csdn.net"+href
274     oneHref = href
275
276     childElement = list(alink)
277     linkIter = alink.itertext()
278     title = "".encode("utf-8")
279     for x in linkIter:
280       title = title+x.strip().encode("utf-8")
281     artices.append([oneHref,title])
282
283   return artices
284
285 def GetUserName(url):
286   htmlNameIndex = url.rfind("/");
287   urlLen = len(url)
288   htmlName = ""
289   htmlNameIndex1 = url.rfind("/",0,htmlNameIndex)
290   htmlName = url[htmlNameIndex1+1:htmlNameIndex]
291   # if htmlNameIndex+1 == urlLen:
292   #   htmlNameIndex = url.rfind("/",0,htmlNameIndex)
293   #   htmlName = url[htmlNameIndex+1:urlLen-1]
294   # else:
295   #   htmlName = url[htmlNameIndex+1:]
296   return htmlName
297
298
299 def GetHtmlName(url):
300   htmlNameIndex = url.rfind("/");
301   urlLen = len(url)
302   htmlName = ""
303   if htmlNameIndex+1 == urlLen:
304     htmlNameIndex = url.rfind("/",0,htmlNameIndex)
305     htmlName = url[htmlNameIndex+1:urlLen-1]
306   else:
307     htmlName = url[htmlNameIndex+1:]
308   return htmlName
309
310
311
312 # url must be something like http://www.cnblogs.com/GnagWang/default.html?page=19, and that page must contain a link to the last page. For example, if GnagWang has 20 pages in total, a URL like the one above is recommended.
313 def Start(url,output):
314
315   print "备份开始"
316   lists = getArticleList(url)
317   username = GetUserName(url)
318   output_username = output+"/"+username
319   output_username.replace("\\","/")
320   if not os.path.exists(output_username.decode("utf-8")):
321     os.mkdir(output_username.decode("utf-8"))
322
323   totalNum = len(lists)
324   print "总文章数(number of articles): %d" % totalNum
325
326   # generate the index (frameset) page
327   doctype = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n'
328   charset = '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />'
329   indexHtml = output_username + ".htm"
330   f = open(indexHtml.decode("utf-8"),"w")
331   print >> f,doctype
332   print >> f,'<html>'
333   print >> f,'<head>'
334   print >> f,charset
335   print >> f,'</head>'
336   print >> f,'<frameset cols=\"20%,*\">'
337   navigationHtmlName = username+'-navigation.htm'
338   print >> f,'<frame src=\"'+navigationHtmlName+'\" />'
339   firstHtmlName = GetHtmlName(lists[0][0])
340   print >> f,'<frame src=\"'+username+'/'+firstHtmlName+'.htm\" name=\"showframe\">'
341   print >> f,'</frameset>'
342   print >> f,'</html>'
343   f.close()
344
345   # generate the navigation page
346   navigationHtml = output+"/"+navigationHtmlName
347   # f = open(navigationHtml.decode("utf-8"),"w")
348   f = codecs.open(navigationHtml.decode("utf-8"),"w","utf-8-sig")
349   print >> f,doctype
350   print >> f,'<html>'
351   print >> f,'<head>'
352   print >> f,charset
353   print >> f,'<style> body{font: 12px Verdana, Arial, Helvetica, sans-serif;}a{color: #808080;}</style>'
354   print >> f,'</head>'
355   print >> f,'<body>'
356   count = 0
357   for x in lists:
358     count = count + 1
359     articleIdHtml = username+"/"+GetHtmlName(x[0])+".htm"
360     print >> f,'<a href=\"'+articleIdHtml + '\" target=\"showframe\">'+str(count)+'.'+x[1].decode("utf-8")+'</a><br /><br />'
361   print >> f,'</body>'
362   print >> f,'</html>'
363   f.close()
364
365   print "开始下载文章"
366   currentNum = 0
367   strPage = "{0}:{1}.".decode("utf-8").encode("utf-8")
368   global gTestTime
369   for x in lists:
370     count = gTestTime
371     currentNum = currentNum+1
372     while True:
373       if count < 0:
374         break
375       count = count - 1
376       try:
377         time.sleep(1) # requests that come too fast make csdn return a 503 error
378         strPageTemp = strPage.format(totalNum,currentNum)
379         strPageTemp = strPageTemp+x[1]
380         print strPageTemp # sometimes this print fails with an "output is not utf-8" error when run standalone
381
382         print x[0]
383         print "\n"
384         Download(x[0],output_username)
385         break
386       except Exception, e:
387         # exstr = traceback.format_exc()
388         # print exstr
389         pass
390
391
392 # url must be something like http://www.cnblogs.com/GnagWang/default.html?page=21, and that page must contain a link to the last page. For example, if GnagWang has 20 pages in total, a URL like the one above is recommended.
393 if __name__=='__main__':
394   url = "http://www.cnblogs.com/yaoyansi/default.html?page=4"
395   #output = "C:/Users/apple/Desktop/新建文件夹"
396   output = "/tmp/my_tmp/cnblogs"
397   Start(url,output)
398   # Download("http://blog.csdn.net/dcraw/article/details/6858820",
399   #   "C:/Users/apple/Desktop/新建文件夹/infoworld")
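
If you prefer to drive the backup from another script instead of editing the url and output in __main__, a call like the following should work. The module name cnblogs_backup is only an assumption about the file name you saved the listing under; the URL and output folder are placeholders:

# hypothetical driver, assuming the listing above is saved as cnblogs_backup.py
import cnblogs_backup

url = "http://www.cnblogs.com/GnagWang/default.html?page=1"  # placeholder listing page that links to the last page
output = "/tmp/my_tmp/cnblogs"                               # placeholder output folder
cnblogs_backup.Start(url, output)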

 

Reference:

[1] http://blog.csdn.net/llrraa2010/article/details/35540845

posted on 2014-10-24 18:58 by yys