python模块--re

就其本质而言,正则表达式是一种小型的、高度专业化的编程语言,它内嵌在 Python 中,并通过 re 模块实现。正则表达式模式被编译成一系列的字节码,然后由用 C 编写的匹配引擎执行。

字符匹配(普通字符,元字符):

普通字符:大多数字符和字母都会和自身匹配

元字符:. ^ $ * + ? { } [ ] | ( ) \

一 元字符

image_thumb[4]

1.1  . ^ $

import re

# The same subject string is searched with three different patterns.
subject = 'abcdefgaab'

# '.' stands for one arbitrary character, so every 'a' plus its successor matches.
res1 = re.findall('a.', subject)        # ['ab', 'aa']
# '^' only lets the pattern match at the start of the string.
res2 = re.findall('^a.', subject)       # ['ab']
# '$' only lets the pattern match at the end of the string.
res3 = re.findall('a.$', subject)       # ['ab']

1.2 * + { } ?

import re

# '*' repeats zero or more times, '+' one or more times, '{1,2}' once or twice.
res4 = re.findall('a.*', 'abcdefgaab')       # ['abcdefgaab']
res5 = re.findall('a.+', 'abcdefgaab')       # ['abcdefgaab']
res6 = re.findall('a.{1,2}', 'abcdefgaab')   # ['abc', 'aab']

# Match a number, either an integer or a float.
# The raw string hands the backslashes to the regex engine untouched; in a
# plain literal '\d' and '\.' are invalid string escapes.
res7 = re.findall(r'\d+\.?\d*', '12.34,12,0.06,100')      # ['12.34', '12', '0.06', '100']

注意:前面的 *、+、{}、? 等量词默认都是贪婪匹配,也就是尽可能多地匹配;在量词后面再加一个 ? 号可使其变成惰性(非贪婪)匹配,后面会详细介绍贪婪匹配

import re

# A '?' placed after a quantifier makes it lazy: it takes as few characters
# as the rest of the pattern allows.
subject = 'abcdefgaab'
res8 = re.findall('a.*?', subject)       # ['a', 'a', 'a']
res9 = re.findall('a.+?', subject)       # ['ab', 'aa']
res6 = re.findall('a.{1,2}?', subject)   # ['ab', 'aa']

1.3 转义符 \

1. 反斜杠后边跟元字符去除特殊功能,比如\.

2. 反斜杠后边跟普通字符实现特殊功能,比如\d

image_thumb[1]

# Using \b (word boundary)
res10 = re.findall(r'I\b', 'I am joe')
print(res10)        # ['I']

# How to match the 'c\d' inside abc\de
# Note: the r prefix means "raw": backslashes inside the literal are kept as
# they are, with no string-level escaping. The subject is written raw as well,
# because '\d' is not a valid string escape in a plain literal.
res11 = re.findall(r'c\\d', r'abc\de')      # ['c\\d']
# Without the r prefix, each pair of backslashes is first collapsed by Python
# into one backslash; the regex engine then reads the remaining two
# backslashes as one literal backslash.
res12 = re.findall('c\\\\d', r'abc\de')     # ['c\\d']

1.4 分组

res13 = re.findall(r'(ad)+', 'add')        # ['ad']

# When a capturing group is repeated, findall reports only the last
# repetition the group captured.
res14 = re.findall(r'(ad)+joe', 'adadjoe')  # ['ad']
res15 = re.findall(r'(\d)+joe', 'ad123joe') # ['3']

# '(?:...)' makes the group non-capturing, so findall returns the whole match.
res16 = re.findall(r'(?:ad)+joe', 'adadjoe')    # ['adadjoe']
res17 = re.findall(r'(?:\d)+joe', 'ad123joe')   # ['123joe']

# Named groups: '(?P<name>...)' lets the captured text be fetched by name.
# Raw string so that '\d' and '\w' reach the regex engine unchanged.
ret = re.search(r'(?P<id>\d{3})/(?P<name>\w{3})', '123/com')
print(ret.group())        # 123/com
print(ret.group('id'))    # 123
print(ret.group('name'))  # com

1.5  |

# '|' tries the left alternative first; here '(ad)' matches at position 0.
ret = re.search(r'(ad)|\d', 'ad123joe')
print(ret.group())      # ad
# '(ade)' does not occur in the subject, so the first digit is what matches.
ret = re.search(r'(ade)|\d', 'ad123joe')
print(ret.group())      # 1

1.6 字符集[]

# A set matches exactly one character out of those listed.
res18 = re.findall('a[bc]d', 'abdacd')      # ['abd', 'acd']
res19 = re.findall('[a-z]', 'abdacd')       # ['a', 'b', 'd', 'a', 'c', 'd']
# Inside a set '.', '*' and '+' are ordinary characters.
res20 = re.findall('[.*+]', 'a.bc+')        # ['.', '+']

# Characters that keep a special meaning inside a set: - ^ \
res21 = re.findall('[1-9]', '12abc3')       # ['1', '2', '3']
res22 = re.findall('[^ab]', '12abc3')       # ['1', '2', 'c', '3']
# Raw string so that '\d' reaches the regex engine unchanged.
res23 = re.findall(r'[\d]', '12abc3')       # ['1', '2', '3']

二 贪婪匹配

贪婪匹配:在满足匹配时,匹配尽可能长的字符串,默认情况下,采用贪婪匹配
subject = 'abcabc'

# Greedy: '.*' runs as far as it can, so the match ends at the last 'c'.
ret = re.findall('a.*c', subject)     # ['abcabc']
print(ret)

# Non-greedy: '.*?' stops at the first 'c' it reaches.
ret = re.findall('a.*?c', subject)    # ['abc', 'abc']
print(ret)
几组常用的非贪婪匹配组合
image_thumb[9]
.*?的用法
# '.' matches any character
# '*' repeats it zero or more times
# '?' after the quantifier switches it to non-greedy mode
# Together they take as few characters as possible. The combination is rarely
# used on its own; the usual form is '.*?x', meaning "everything up to the
# first x".
lazy_until_c = '.*?c'
ret = re.findall(lazy_until_c, 'abbbcabc')    # ['abbbc', 'abc']

三 常用方法

3.1 findall(pattern, string, flags=0)

参数说明:

  • 正则表达式
  • 要匹配的字符串
  • 标志位,用于控制正则表达式的匹配方式
# Flag constants with their one-letter aliases.
# NOTE(review): `sre_compile` is not imported anywhere in this document, so
# these lines look like an excerpt quoted for reference rather than runnable
# code — confirm before executing them.
I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE # ignore case
L = LOCALE = sre_compile.SRE_FLAG_LOCALE # assume current 8-bit locale
U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode locale
M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE # make anchors look for newline
S = DOTALL = sre_compile.SRE_FLAG_DOTALL # make dot match newline
X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments
flags

返回所有满足匹配条件的结果,放在列表里,该方法上面已使用

注意:findall 的分组优先——当模式中含有捕获分组时,findall 优先返回分组内匹配到的内容,而不是整个匹配结果

import re

# The dots are escaped so they match a literal '.'; an unescaped '.' would
# match any character (for example 'wwwXbaidu.com').
# With a capturing group in the pattern, findall returns the group's content
# instead of the whole match.
ret = re.findall(r'www\.(baidu|qq)\.com', 'www.baidu.com')
print(ret)  # ['baidu']

# Make the group non-capturing with '(?:...)' to get the whole match back.
ret = re.findall(r'www\.(?:baidu|qq)\.com', 'www.baidu.com')
print(ret)  # ['www.baidu.com']

3.2 search(pattern, string, flags=0)

在字符串内查找模式匹配,找到第一个匹配然后返回一个包含匹配信息的对象,该对象可以通过调用group()方法得到匹配的字符串,如果字符串没有匹配,则返回None

import re

# search scans the string and returns the first match, or None if there is none.
# Raw string so that '\d' reaches the regex engine unchanged.
obj = re.search(r'\d+', 'abc123')
if obj:
    print(obj.group())      # 123

3.3 match(pattern, string, flags=0)

同search,不过只从字符串的起始位置开始按模式匹配;起始位置匹配不上则返回None,且只返回一个匹配结果

import re

# match only succeeds when the pattern matches at the very start of the string.
# Raw string so that '\d' reaches the regex engine unchanged.
obj = re.match(r'\d+', '123abc')
if obj:
    print(obj.group())    # 123

3.4 sub(pattern, repl, string, count=0, flags=0)

用于替换匹配的字符串

old_data = "123abc456"
# Replace every run of digits.
new_data = re.sub(r'\d+', 'joe', old_data)
print(new_data)     # joeabcjoe
# count limits the number of replacements; it is passed by keyword because
# passing it positionally is deprecated in recent Python versions.
new_data = re.sub(r'\d+', 'joe', old_data, count=1)
print(new_data)     # joeabc456

3.5 subn(pattern, repl, string, count=0, flags=0):

与sub类似,返回一个包含(新字符串, 替换次数)的2元组

old_data = "123abc456"
# subn returns the new string together with the number of replacements made.
ret = re.subn(r'\d+', 'joe', old_data)
print(ret)      # ('joeabcjoe', 2)

3.6 split(pattern, string, maxsplit=0, flags=0)

根据指定的匹配模式对字符串进行分割

old_data = "1*2*(3+4)*(9-3)"
# The asterisk is escaped so it is a literal '*', not a quantifier.
new_data = re.split(r'\*', old_data)
print(new_data)         # ['1', '2', '(3+4)', '(9-3)']
# maxsplit caps the number of splits; the remainder stays in the last item.
new_data = re.split(r'\*', old_data, maxsplit=2)
print(new_data)         # ['1', '2', '(3+4)*(9-3)']

# '[ac]' splits at every 'a' or 'c'; the leading 'a' produces the empty
# first item.
ret = re.split('[ac]', 'abcd')
print(ret)              # ['', 'b', 'd']

注:split 的分组优先——当分隔模式中含有捕获分组时,被分组匹配到的分隔符也会保留在结果列表中

# Without a group the separators are dropped from the result.
ret = re.split(r"\d+", "abc12def34g")
print(ret)      # ['abc', 'def', 'g']

# With a capturing group the separators are kept in the result.
ret = re.split(r"(\d+)", "abc12def34g")
print(ret)      # ['abc', '12', 'def', '34', 'g']

3.7 finditer(pattern, string, flags=0)

返回字符串中所有非重叠匹配的迭代器。对于每个匹配,迭代器返回一个匹配对象。

# finditer returns an iterator; each next() produces the following match object.
ret = re.finditer(r'\d+', 'abc123def456g')
print(ret)          # <callable_iterator object at 0x0000021C50201978>
print(next(ret).group())    # 123
print(next(ret).group())    # 456

3.8 compile(pattern, flags=0)

编译一个正则表达式模式,返回一个模式对象。

# The compiled pattern object can be reused for any number of searches.
obj = re.compile(r'\d{3}')
ret = obj.search('abc123defg')
print(ret.group())   # 123

3.9 group&groups

a = "123abc456"
# Run the search once and reuse the match object for every lookup.
matched = re.search("([0-9]*)([a-z]*)([0-9]*)", a)
# group() and group(0) both return the whole match; group(n) returns the
# n-th capturing group; groups() returns all groups as a tuple.
print(matched.group())       # 123abc456
print(matched.group(0))      # 123abc456
print(matched.group(1))      # 123
print(matched.group(2))      # abc
print(matched.group(3))      # 456
print(matched.groups())      # ('123', 'abc', '456')

四 练习

4.1 计算式中的整数匹配

import re

expression = "3-2*(12.2+(-30.15/5)-(-5*6))"

# Approach 1: use a capturing group.
# Floats are consumed by the first alternative, which has no group, so findall
# records '' for them; only integers end up in the group.
ret = re.findall(r"-?\d+\.\d*|(-?\d+)", expression)
ret = [item for item in ret if item != '']
print(ret)          # ['3', '-2', '5', '-5', '6']

# Approach 2: use sub.
import json
ret = re.findall(r"-?\d+\.?\d*", expression)
# Serialize the list to text so that sub can blank out the floats, then parse
# the text back into a list.
ret = re.sub(r"-?\d+\.\d*", "", json.dumps(ret))
ret = [item for item in json.loads(ret) if item != '']
print(ret)          # ['3', '-2', '5', '-5', '6']
View Code

4.2 标签匹配

import re

# '(?P=tag_name)' requires the closing tag to repeat whatever the named group
# captured; '\1' does the same for the first numbered group.
# Raw strings so that '\w' reaches the regex engine unchanged.
print(re.findall(r"<(?P<tag_name>\w+)>\w+</(?P=tag_name)>", "<h1>hello</h1>"))   # ['h1']
print(re.search(r"<(?P<tag_name>\w+)>\w+</(?P=tag_name)>", "<h1>hello</h1>").group()) # <h1>hello</h1>
print(re.search(r"<(\w+)>\w+</\1>", "<h1>hello</h1>").group())       # <h1>hello</h1>
View Code

4.3 爬虫

import requests
import re
import json


def getPage(url, timeout=10):
    """Download a page and return its body as text.

    Args:
        url: Address to fetch with an HTTP GET request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on an
            unresponsive server.

    Returns:
        The response body as a string (``response.text``).
    """
    response = requests.get(url, timeout=timeout)
    return response.text

def parsePage(response_html):
    """Extract entries from the page HTML with a regular expression.

    Args:
        response_html: HTML text of the page.

    Yields:
        One dict per matched ``<div class="item">`` entry, with the string
        values of the named groups ``id``, ``title``, ``rating_num`` and
        ``comment_num``. ``comment_num`` is the text that precedes "评价"
        inside its ``<span>``.
    """
    # Raw strings keep '\d' intact for the regex engine. re.S lets '.' match
    # newlines as well, so one entry may span several lines of HTML.
    com = re.compile(
        r'<div class="item">.*?<div class="pic">.*?<em .*?>(?P<id>\d+).*?<span class="title">(?P<title>.*?)</span>'
        r'.*?<span class="rating_num" .*?>(?P<rating_num>.*?)</span>.*?<span>(?P<comment_num>.*?)评价</span>', re.S)
    for found in com.finditer(response_html):
        yield {
            "id": found.group("id"),
            "title": found.group("title"),
            "rating_num": found.group("rating_num"),
            "comment_num": found.group("comment_num"),
        }

def main(num):
    """Fetch one listing page and append its entries to ``move_info7``.

    Args:
        num: Value substituted for the ``start`` query parameter of the URL.

    Each parsed entry is printed and written as one JSON line.
    """
    url = 'https://movie.douban.com/top250?start=%s&filter=' % num
    response_html = getPage(url)
    ret = parsePage(response_html)

    # The context manager closes the file even if parsing or writing raises;
    # previously the handle was never closed.
    with open("move_info7", "a", encoding="utf8") as f:
        for obj in ret:
            print(obj)
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            data = json.dumps(obj, ensure_ascii=False)
            f.write(data + "\n")


if __name__ == '__main__':
    # Ten pages in steps of 25: start offsets 0, 25, ..., 225.
    for start_page in range(0, 250, 25):
        main(start_page)
View Code
posted @ 2018-07-03 18:57  Joe1991  阅读(140)  评论(0)    收藏  举报