
# Python 正则表达式 (Python regular expressions)
import urllib
import re
class getHead:
    """Download a web page and pull image URLs and list links out of it.

    The page address is given at construction time. The extraction methods
    are plain regular-expression searches over the HTML text handed to them.
    """

    # Patterns are compiled once for the class instead of on every call.
    #
    # Captures the value of a src="..." attribute that ends in ".jpg". The
    # search is unanchored, so it also matches inside data-src="...", and the
    # lazy ".+?" may run across several attributes until the first '.jpg"'.
    _IMAGE_RE = re.compile(r'src="(.+?\.jpg)"')
    # A complete <li><a ...>...</a></li> element. "." does not match a
    # newline with these flags, so the element must sit on a single line.
    _LIST_LINK_RE = re.compile(r'<li><a.+?</a></li>')

    def __init__(self, url):
        """Store the address of the page to download."""
        self.url = url

    def getContent(self):
        """Download ``self.url`` and return the page source as text.

        Prints ``begin to download`` before the request is made. The
        response is always closed, even when reading fails.

        Returns:
            The page source as ``str``. On Python 2 this is exactly what
            ``read()`` returned. On Python 3 the bytes are decoded with the
            charset announced in the response headers, falling back to
            UTF-8; undecodable bytes are replaced rather than raising.
        """
        # Local import so the method runs on both Python 3 (urllib.request)
        # and Python 2 (urllib) without touching the module's import block.
        try:
            from urllib.request import urlopen
        except ImportError:
            from urllib import urlopen

        # Single parenthesised argument: prints identically on Python 2 and 3.
        print("begin to download")
        page = urlopen(self.url)
        try:
            raw = page.read()
            if isinstance(raw, str):
                # Python 2: read() already yields str; hand it back untouched.
                return raw
            # Python 3: read() yields bytes, which the str patterns used by
            # getImage/getALI cannot search, so decode before returning.
            charset = page.headers.get_content_charset() or "utf-8"
            try:
                return raw.decode(charset, "replace")
            except LookupError:
                # The server announced a codec Python does not know.
                return raw.decode("utf-8", "replace")
        finally:
            page.close()

    def getImage(self, pagestr):
        """Return the ``.jpg`` ``src`` attribute values found in ``pagestr``.

        Args:
            pagestr: HTML text to search.

        Returns:
            A list of captured strings in document order; empty when nothing
            matches. A capture can contain more than one attribute when a
            non-jpg ``src`` is followed by a jpg one (e.g. ``data-src``).
        """
        return self._IMAGE_RE.findall(pagestr)

    def getALI(self, pagestr):
        """Return every single-line ``<li><a ...>...</a></li>`` element.

        Args:
            pagestr: HTML text to search.

        Returns:
            A list of the matched HTML fragments in document order; empty
            when nothing matches.
        """
        return self._LIST_LINK_RE.findall(pagestr)
# Matches any HTML tag. The negated class [^>] already spans newlines, so no
# extra flags are required. Compiled once instead of once per list item.
_TAG_RE = re.compile(r'<[^>]+>')


def strip_tags(fragment):
    """Return ``fragment`` with every ``<...>`` tag removed."""
    return _TAG_RE.sub('', fragment)


def extract_image_url(match):
    """Return the ``.jpg`` URL contained in one ``getImage`` result.

    ``getImage`` uses a lazy pattern that can overrun from a non-jpg ``src``
    into a later attribute, producing e.g. ``x.png" data-src="y.jpg``. The
    real URL is then whatever follows the last double quote. Taking that tail
    works wherever the jpg attribute sits, instead of assuming it is the
    second space-separated token. A clean match without quotes is returned
    unchanged.
    """
    return match.rsplit('"', 1)[-1]


def main():
    """Fetch the Sina front page and print its list-link texts and jpg URLs."""
    get = getHead("http://www.sina.com.cn/")
    # Download once and reuse the text for both passes; fetching per loop
    # doubled the traffic and could search two different page snapshots.
    html = get.getContent()
    for item in get.getALI(html):
        print(strip_tags(item))
    for image in get.getImage(html):
        print(extract_image_url(image))


if __name__ == "__main__":
    main()
# 浙公网安备 33010602011771号