Saving a blog page with its CSS and images for offline reading

#!/usr/bin/python
# -*- coding: utf-8 -*-
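"""Save a blog page for offline reading: download the page itself plus the
CSS and images it references, then rewrite the HTML so those references
point at local copies stored in a sibling `<title>_files/` directory.

Targets Python 2 (urllib2, print statements).
"""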
import urllib2
import urllib
import os
import re
from urlparse import urlparse, urljoin
import logging
import sys
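# Python 2 hack: force utf-8 as the default codec so implicit str/unicode
# conversions (e.g. on Chinese page titles) don't raise UnicodeDecodeError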
reload(sys)
sys.setdefaultencoding( "utf-8" )
logging.basicConfig(level=logging.DEBUG)


class BlogSpider(object):
	"""net spider for blog"""
	def __init__(self, url):
		super(BlogSpider, self).__init__()
		self._url = url
		

class CSDNSpider(BlogSpider):
	"""a spider for csdn"""
	def __init__(self, url, reg):
		super(CSDNSpider, self).__init__(url)
		self.url = url
		self.reg = reg
		
class Downloader(object):
	"""download the page"""

	def __init__(self, url, path=u'./'):
		super(Downloader, self).__init__()
		self.__url = url
		response = self._urlopen(url)
		self.__html = response.read()
		self.__title = re.search(r'<title.*?>(.*?)</title>', self.__html, re.S).group(1)

		if os.path.isdir(path) or path.endswith(u'/'):
			# path names a directory: normalize the trailing slash, create the
			# directory if needed, and build the file name from the page title
			# with characters illegal in file names replaced by underscores
			if not path.endswith(u'/'):
				path += u'/'
			if not os.path.exists(path):
				os.mkdir(path)
			self.__dir = path
			self.__filename = re.sub(r'[\s\\/\?\|\*<>"\:]', u'_', self.__title) + u'.htm'
		else:
			# path names a file: split off the directory part, keeping a
			# trailing slash so later concatenations produce valid paths
			d, self.__filename = os.path.split(path)
			self.__dir = (d or u'.') + u'/'
		
		# directory for the page's resources, derived from the file name
		self.__resdir = self.__dir + re.search(r'(.*)\.html?$', self.__filename).group(1) + u'_files/'
		# root url of the site
		uparse = urlparse(url)
		scheme = uparse.scheme or 'http'	# urlparse yields '' (not None) for a missing scheme
		self.__rooturl = scheme + '://' + uparse.netloc		# net location without trailing slash
		self.__parenturl = self.__rooturl + re.search(r'(.*/)', uparse.path).group(1)	# url up to the last slash
		self.__img_matchinfo = []
		self.__css_matchinfo = []

	def start(self):
		try:
			if not os.path.exists(self.__resdir):
				os.mkdir(self.__resdir)
			self.save_css()
			self.save_img()
			self.save_html()
		except Exception, e:
			print e
			raise

	@property
	def limit(self):
		return self.__limit
	@limit.setter
	def limit(self, val):
		assert val > 0, 'Downloader: limit must be positive.'
		self.__limit = val

	@property
	def url(self):
		return self.__url
	@url.setter
	def url(self, val):
		self.__url = val

	@property
	def html(self):
		return self.__html

	@property
	def title(self):
		return self.__title

	@property
	def dir(self):
		return self.__dir

	def save_css(self):
		logging.debug('---------- Downloading css ----------')
		for url in self.abs_url(self.css_urls(self.__html)):
			rsp = self._urlopen(url)
			with open(self._to_local(url), 'w') as f:
				f.write(rsp.read())
			logging.debug(url + '\tsaved successfully.')

	def save_img(self):
		logging.debug('---------- Downloading img ----------')
		for url in self.abs_url(self.img_urls(self.__html)):
			rsp = self._urlopen(url)
			with open(self._to_local(url), 'wb') as f:
				f.write(rsp.read())
			# urllib.urlretrieve(url, self._to_local(url))	# simpler alternative
			logging.debug(url + '\tsaved successfully.')

	def save_html(self):
		# splice the local page together: walk the recorded match spans in
		# document order, copy the html between them verbatim, and substitute
		# each remote css/img url with the path of its local copy
		localhtml = ''
		subinfos = self.__css_matchinfo + self.__img_matchinfo
		subinfos.sort(key=lambda m: m.start)
		i = 0
		for info in subinfos:
			localhtml += self.__html[i:info.start]
			localhtml += self._to_local(self.abs_url(info.content))
			i = info.end
		localhtml += self.__html[i:]
		with open(self.__dir + self.__filename, 'w') as fp:
			fp.write(localhtml)

	def _abs_url(self, url):
		# urljoin resolves a relative url (e.g. img/a.png or /css/b.css)
		# against the page url and leaves absolute urls untouched
		return urljoin(self.__url, url)

	def abs_url(self, url):
		retlist = []
		if isinstance(url, list):
			for u in url:
				retlist.append(self._abs_url(u))
			return retlist
		else:
			return self._abs_url(url)

	def _to_local(self, url):
		# map a remote url to a file inside the resource directory, keeping
		# only the last path segment as the file name
		assert self.__resdir is not None
		return self.__resdir + re.search(r'.*/([^\*\s\?]+)', u'/' + url).group(1)

	def img_urls(self, html):
		ret = []
		if self.__img_matchinfo == []:
			# scan once, caching the span and content of every src attribute
			for m in re.finditer(r'<img.*?src="(.*?)".*?>', html):
				self.__img_matchinfo.append(MatchInfo(m.start(1), m.end(1), m.group(1)))
		for i in self.__img_matchinfo:
			ret.append(i.content)
		return ret

	def css_urls(self, html):
		ret = []
		if self.__css_matchinfo == []:
			# scan once, caching the span of every stylesheet href attribute
			for i in re.finditer(r'<link.*?type="text/css".*?>', html):
				j = re.search(r'href="(.*?)"', i.group())
				self.__css_matchinfo.append(MatchInfo(i.start() + j.start(1), i.start() + j.end(1), j.group(1)))
		for i in self.__css_matchinfo:
			ret.append(i.content)
		return ret

	def _urlopen(self, url):
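		# spoof a desktop browser UA; some sites (CSDN included, reportedly)
		# reject requests carrying the default urllib2 User-Agent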
		user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36 SE 2.X MetaSr 1.0' 
		headers = { 'User-Agent' : user_agent } 
		req = urllib2.Request(url, headers = headers) 
		return urllib2.urlopen(req)

class MatchInfo(object):
	"""the span (start, end) of a matched url in the html, plus the url"""
	def __init__(self, start, end, content):
		super(MatchInfo, self).__init__()
		self.start = start
		self.end = end
		self.content = content

	

if __name__ == '__main__':
	d = Downloader('http://blog.csdn.net/pleasecallmewhy/article/details/8932310')
	print d.title
	print d.abs_url(d.img_urls(d.html))
	print d.img_urls(d.html)
	print d.css_urls(d.html)
	print d._to_local(d.img_urls(d.html)[0])

	d.start()
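
	# CSDNSpider above is only a stub; a minimal sketch (hypothetical listing
	# url and regex) of how it might drive Downloader to grab a whole blog:
	#
	#   spider = CSDNSpider('http://blog.csdn.net/pleasecallmewhy',
	#                       r'href="(/pleasecallmewhy/article/details/\d+)"')
	#   for link in set(re.findall(spider.reg, Downloader(spider.url).html)):
	#       Downloader(urljoin(spider.url, link)).start()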
