1 # -*- coding: UTF-8-*-
2 import urllib2,re
3
# URLs that geturl() has already followed, in the order they were discovered.
mylist = []

# Matches one whole anchor element that carries an href attribute. All
# quantifiers are non-greedy and "." does not cross newlines (no re.DOTALL),
# so an anchor split over several lines is not matched.
p = re.compile( r"<a.+?href=.+?>.+?</a>")
# Matches the text that follows a ">" up to the next "</a>" via lookarounds.
# NOTE(review): not referenced by any function visible in this file.
pname = re.compile( r"(?<=>).*?(?=</a>)" )
# Matches a double-quoted href value that starts with "http"; the lookarounds
# keep the leading href=" and the closing quote out of the match.
phref = re.compile( r"(?<=href\=\")http.*?(?=\")")

# Downloads the page once, at import time.
# NOTE(review): geturltest() and geturl() each assign their own local html_c,
# so nothing visible here reads this global -- confirm before removing, since
# it costs a network round trip on every import.
html_c = urllib2.urlopen("http://www.baidu.com/?vit=1").read()
11
def geturltest(str):
    """Print the first absolute ``http`` link of every anchor on one page.

    Fetches the page at ``str``, scans it with the module-level ``p`` pattern
    for anchor elements and, for each anchor in which ``phref`` finds a
    double-quoted ``href`` value starting with ``http``, prints the first such
    value. Anchors without one are skipped. Links are not followed.

    :param str: URL of the page to fetch. (The name shadows the builtin; it is
        kept so existing keyword callers keep working.)
    :raises: whatever ``urllib2.urlopen`` or ``read`` raise -- nothing is
        caught here.
    """
    response = urllib2.urlopen(str)
    try:
        html_c = response.read()
    finally:
        # Release the connection even when the read fails.
        response.close()

    for anchor in p.findall(html_c):
        hrefs = phref.findall(anchor)
        if hrefs:
            # Single-argument parenthesised form prints the same text under
            # the Python 2 print statement.
            print(hrefs[0])
19
def _page_links(url):
    """Fetch ``url`` and return the first ``http`` href of each anchor, in page order."""
    response = urllib2.urlopen(url)
    try:
        html_c = response.read()
    finally:
        # Release the connection even when the read fails.
        response.close()

    links = []
    for anchor in p.findall(html_c):
        hrefs = phref.findall(anchor)
        if hrefs:
            links.append(hrefs[0])
    return links


def geturl(str):
    """Crawl depth-first from ``str``, printing every link encountered.

    Each page is fetched and its anchors are scanned in order. Every link is
    printed; a link not yet present in the module-level ``mylist`` is appended
    to it and its page is crawled immediately, before the remaining links of
    the current page are handled.

    The traversal uses an explicit stack instead of recursion, so a long chain
    of pages cannot exceed the interpreter's recursion limit. The order of
    fetches, prints and ``mylist`` appends is the same as a recursive walk.

    There is no depth or domain limit: the crawl ends only when no unseen link
    remains. The start URL itself is not recorded in ``mylist``, so it is
    fetched a second time if some page links back to it.

    :param str: URL of the page to start from. (The name shadows the builtin;
        it is kept so existing keyword callers keep working.)
    :raises: whatever ``urllib2.urlopen`` or ``read`` raise -- nothing is
        caught here, so one failing page aborts the crawl.
    """
    # One link iterator per page currently being walked; innermost page last.
    pending = [iter(_page_links(str))]
    while pending:
        for link in pending[-1]:
            print(link)
            if link not in mylist:
                mylist.append(link)
                # Descend into the new page right away; the iterator left on
                # the stack resumes the current page afterwards.
                pending.append(iter(_page_links(link)))
                break
        else:
            # Current page exhausted: return to the page that linked to it.
            pending.pop()
def main():
    """Script entry point: crawl starting from the hard-coded Baidu URL."""
    start_url = "http://www.baidu.com/?vit=1"
    geturl(start_url)
36
# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()