#!/usr/bin/python
# -*- coding: utf-8 -*-
import urllib2
import os
import re
from urlparse import urlparse
from urlparse import urljoin
import logging
import sys
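# Python 2 hack: reload(sys) restores setdefaultencoding (hidden by site.py)
# so the implicit str/unicode conversions below use UTF-8 instead of ASCII.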
reload(sys)
sys.setdefaultencoding("utf-8")
logging.basicConfig(level=logging.DEBUG)
class BlogSpider(object):
"""net spider for blog"""
def __init__(self, url):
super(BlogSpider, self).__init__()
self._url = url
class CSDNSpider(BlogSpider):
"""a spider for csdn"""
def __init__(self, url, reg):
super(CSDNSpider, self).__init__()
self.url = url
self.reg = reg
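# The spider classes above are not yet wired to Downloader; a minimal usage
# sketch of the intended pairing might look like this (the link-extracting
# regex is an illustrative assumption, not confirmed by the source):
#
#   spider = CSDNSpider('http://blog.csdn.net/someone',
#                       r'href="(/someone/article/details/\d+)"')
#   index = Downloader(spider._url)
#   for link in re.findall(spider.reg, index.html):
#       Downloader(urljoin(spider._url, link)).start()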
class Downloader(object):
"""download the page"""
def __init__(self, url, path = u'./'):
super(Downloader, self).__init__()
self.__url = url
response = self._urlopen(url)
self.__html = response.read()
        m = re.search(r'<title.*?>(.*?)</title>', self.__html, re.S)
        self.__title = m.group(1).strip() if m else u'untitled'
        # treat `path` as a directory when it exists as one or ends with a slash
        if os.path.isdir(path) or path.endswith(u'/'):
            if not os.path.exists(path):
                os.mkdir(path)
            self.__dir = path if path.endswith(u'/') else path + u'/'
            # transform the page title into a legal file name
            self.__filename = re.sub(r'[\s\\/\?\|\*<>":]', u'_', self.__title) + u'.htm'
        else:
            head, tail = os.path.split(path)
            self.__dir = head + u'/' if head else u'./'
            self.__filename = tail
        # resource dir is derived from the html file name: foo.htm -> foo_files/
        self.__resdir = self.__dir + re.search(r'(.*)\.html?$', self.__filename).group(1) + u'_files/'
# root url of the site
        uparse = urlparse(url)
        scheme = uparse.scheme or 'http'  # urlparse yields '' (not None) for a missing scheme
        self.__rooturl = scheme + '://' + uparse.netloc  # net location without trailing slash
        m = re.search(r'(.*/)', uparse.path)
        self.__parenturl = self.__rooturl + (m.group(1) if m else '/')  # url up to the last slash
        self.__img_matchinfo = []
        self.__css_matchinfo = []
        self.__limit = 0  # default so the `limit` getter never raises AttributeError
def start(self):
try:
            if not os.path.exists(self.__resdir):
                os.makedirs(self.__resdir)
self.save_css()
self.save_img()
self.save_html()
except Exception, e:
print e
raise
@property
def limit(self):
return self.__limit
@limit.setter
def limit(self, val):
        assert val > 0, 'Downloader: limit must be positive.'
self.__limit = val
@property
def url(self):
return self.__url
@url.setter
    def url(self, val):
        self.__url = val
@property
def html(self):
return self.__html
@property
def title(self):
return self.__title
@property
def dir(self):
return self.__dir
def save_css(self):
logging.debug('---------- Downloading css ----------')
        for url in self.abs_url(self.css_urls(self.__html)):
            rsp = self._urlopen(url)
            with open(self._to_local(url), 'w') as f:  # close the handle promptly
                f.write(rsp.read())
            logging.debug(url + '\tsaved successfully.')
def save_img(self):
logging.debug('---------- Downloading img ----------')
        for url in self.abs_url(self.img_urls(self.__html)):
            rsp = self._urlopen(url)
            with open(self._to_local(url), 'wb') as f:  # binary mode for image data
                f.write(rsp.read())
            # alternative: urllib.urlretrieve(url, self._to_local(url)) (requires `import urllib`)
            logging.debug(url + '\tsaved successfully.')
def save_html(self):
        localhtml = ''
        # walk all css/img matches in document order, copying untouched spans
        # verbatim and swapping each remote url for the path of its local copy
        subinfos = self.__css_matchinfo + self.__img_matchinfo
        subinfos.sort(key=lambda x: x.start)
        i = 0
        for info in subinfos:
            localhtml += self.__html[i:info.start]
            localhtml += self._to_local(self.abs_url(info.content))
            i = info.end
        localhtml += self.__html[i:]
        with open(self.__dir + self.__filename, 'w') as fp:
            fp.write(localhtml)
    def _abs_url(self, url):
        # resolve a (possibly relative) url against the page url
        return urljoin(self.__url, url)
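    # abs_url (below) accepts either a single url or a list of urls, e.g.
    #   d.abs_url('/img/a.png')       -> 'http://host/img/a.png'
    #   d.abs_url(['a.png', 'b.css']) -> both resolved against the page url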
def abs_url(self, url):
retlist = []
if isinstance(url, list):
for u in url:
retlist.append(self._abs_url(u))
return retlist
else:
return self._abs_url(url)
    def _to_local(self, url):
        # map a remote url to its path under the resource dir,
        # e.g. http://host/a/b/pic.png -> <resdir>pic.png
        assert self.__resdir is not None
        return self.__resdir + re.search(r'.*/([^\*\s\?]+)', u'/' + url).group(1)
def img_urls(self, html):
ret = []
        if not self.__img_matchinfo:  # scan once, then serve from the cache
            for m in re.finditer(r'<img.*?src="(.*?)".*?>', html, re.S | re.I):
                self.__img_matchinfo.append(MatchInfo(m.start(1), m.end(1), m.group(1)))
for i in self.__img_matchinfo:
ret.append(i.content)
return ret
def css_urls(self, html):
ret = []
        if not self.__css_matchinfo:  # scan once, then serve from the cache
            for m in re.finditer(r'<link.*?type="text/css".*?>', html, re.S | re.I):
                href = re.search(r'href="(.*?)"', m.group())
                if href:
                    self.__css_matchinfo.append(MatchInfo(m.start() + href.start(1), m.start() + href.end(1), href.group(1)))
for i in self.__css_matchinfo:
            ret.append(i.content)
return ret
def _urlopen(self, url):
        # spoof a desktop browser user-agent; some blog hosts block the default urllib2 agent
        user_agent = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36 SE 2.X MetaSr 1.0')
        headers = {'User-Agent': user_agent}
req = urllib2.Request(url, headers = headers)
return urllib2.urlopen(req)
class MatchInfo(object):
"""match result data"""
    def __init__(self, start, end, content):
        super(MatchInfo, self).__init__()
        self.start = start
        self.end = end
        self.content = content
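# Quick smoke test against a sample CSDN article (network access required).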
if __name__ == '__main__':
    d = Downloader('http://blog.csdn.net/pleasecallmewhy/article/details/8932310')
    print d.title
    print d.abs_url(d.img_urls(d.html))
    print d.img_urls(d.html)
    print d.css_urls(d.html)
    print d._to_local(d.img_urls(d.html)[0])
    d.start()