from sys import argv
from os import makedirs, unlink, sep
from os.path import dirname, exists, isdir, splitext
from string import replace, find, lower
from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse, urljoin
from formatter import DumbWriter, AbstractFormatter
from cStringIO import StringIO


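# Retriever: downloads a single page to a local file and extracts the
# anchors (links) it contains.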
class Retriever(object):
    def __init__(self, url):
        self.url = url
        # hard-coded download root; filename() below creates the sub-directories
        # relative to the current working directory, so the script is expected
        # to be run from this folder
        self.file = 'E:\\install\\Python27\\' + self.filename(url)

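    # Map a URL to a local file path: default to index.htm when the URL has no
    # file extension, and create any directories that are missing.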
    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)                    # separate the extension
        if ext[1] == '':                        # no extension: append the default file
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile

        ldir = dirname(path)                    # local directory
        if sep != '/':                          # use the OS path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):                     # create the directory if necessary
            if exists(ldir): unlink(ldir)
            makedirs(ldir)
        return path

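    # Fetch the page at self.url into self.file; on failure return an error
    # string that starts with '***' so the caller can detect it.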
    def download(self):
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url)
        return retval

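    # Parse the downloaded file with htmllib.HTMLParser; the DumbWriter/StringIO
    # formatter discards the rendered text, while anchorlist collects the hrefs.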
    def parseAndGetLinks(self):
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist

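# Crawler: manages the queue of URLs to visit and the list of pages already
# seen, and only follows links that stay on the starting URL's domain.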
class Crawler(object):
    count = 0                           # static downloaded-page counter

    def __init__(self, url):
        self.q = [url]                  # queue of links to download
        self.seen = []                  # URLs already processed
        self.dom = urlparse(url)[1]     # domain of the starting URL

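    # Download one URL, record it as seen, then queue every unseen link on the
    # same domain for a later visit.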
    def getPage(self, url):
        r = Retriever(url)
        retval = r.download()
        if retval[0] == '*':            # error: skip parsing
            print retval, '... skipping parse'
            return
        Crawler.count += 1
        print '\n(', Crawler.count, ')'
        print 'URL:', url
        print 'FILE:', retval[0]
        self.seen.append(url)

        links = r.parseAndGetLinks()
        for eachLink in links:
            # make relative links absolute
            if eachLink[:4] != 'http' and find(eachLink, '://') == -1:
                eachLink = urljoin(url, eachLink)

            if find(lower(eachLink), 'mailto:') != -1:
                print '... discarded, mailto link'
                continue
            if eachLink not in self.seen:
                if find(eachLink, self.dom) == -1:
                    print '... discarded, not in domain'
                else:
                    if eachLink not in self.q:
                        self.q.append(eachLink)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'
            else:
                print '... discarded, already processed'

    def go(self):                       # process links in the queue
        while self.q:
            url = self.q.pop()
            self.getPage(url)

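# main(): take the starting URL from the command line if given, otherwise
# prompt for one, then start the crawl.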
def main():
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url: return
    robot = Crawler(url)
    robot.go()

if __name__ == '__main__':
    main()
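
# Example run (assuming the script is saved as crawl.py; the URL is only an
# illustration):
#   python crawl.py http://www.example.com
# Each fetched page is written beneath the hard-coded E:\install\Python27\
# download root, and only links on the starting domain are followed.
# Note: this is Python 2 code -- print statements, raw_input, and the htmllib,
# urlparse, formatter, and cStringIO modules are not available in Python 3.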