正则表达式匹配(python)
获取图片的python代码
#coding=utf-8 import urllib import re def getHtml(url): page = urllib.urlopen(url) html = page.read() return html def getImg(html): reg = r'src="(.+?\.jpg).+"' imgre = re.compile(reg) imglist = re.findall(imgre,html) x = 0 for imgurl in imglist: urllib.urlretrieve(imgurl,'%s.jpg' % x) x+=1 html = getHtml("http://www.csdn.net/article/2015-01-15/2823564") print getImg(html)
findall和group的用法
import re
reg = r'www\.(.*)\..{3}'
imgre = re.compile(reg)
imglist = re.findall(imgre,'www.python.org')
#for imgurl in imglist:
print imglist
import re
reg = r'(.+):"(.+a)"'
imgre = re.compile(reg)
imglist = re.findall(imgre,'name:"wangjian"ok')
#for imgurl in imglist:
print imglist
import re
reg = r'src="(.+?\.jpg)"'
imgre = re.compile(reg)
html='<img src="http://cms.csdnimg.cn/article/201501/15/54b70da54b668_middle.jpg?_=48735" style="float: none; margin: 0px;" alt="">'
imglist = re.findall(imgre,html)
print imglist
参考https://docs.python.org/2/library/re.html#re.findall
python的search和match的区别
精通正则表达式第三版
解释了在正则表达式中\b元字符的使用的参考文档如下:
http://www.cnblogs.com/85538649/archive/2011/07/26/wtq0705.html
http://www.cnblogs.com/deerchao/archive/2006/08/24/zhengzhe30fengzhongjiaocheng.html

浙公网安备 33010602011771号