import re
# Demonstrations of the core `re` functions. Every pattern is a raw string so
# that backslash classes such as \w and \d reach the regex engine unchanged.

# Sample markup shared by the examples below.
s = "<a>wahaha</a><b>banana</b><h1>qqxing</h1>"

# Compile the pattern once; (?P<name>...) gives a group a name.
obj = re.compile(r'(<\w+>)(?P<nb>\w+)(</\w+>)')

# With one capturing group, findall returns the group's text, not the match.
print(re.findall(r"<\w+>(\w+)</\w+>", s))
# (?:...) is non-capturing, so findall returns the whole match again.
print(re.findall(r"<\w+>(?:\w+)</\w+>", s))

# search returns the first match; group() with no argument is the whole match,
# group(n) is the n-th parenthesised group.
first_match = obj.search(s)
print(first_match.group())
print(first_match.group(1))
print(first_match.group(2))  # group(2) is the same group as group("nb")
print(first_match.group("nb"))
print(first_match.group(3))

# finditer yields match objects one at a time; collect each full match.
print([el.group() for el in obj.finditer(s)])

# split cuts the string at every match of the pattern ...
print(re.split(r"\d+", "ewrih1hkhh3jhkj4jhj2kh"))
# ... and keeps the separators too when the pattern is wrapped in a group.
print(re.split(r"(\d+)", "ewrih1hkhh3jhkj4jhj2kh"))

# sub replaces every match unless a count is given.
print(re.sub(r"\d", "666", s))
# count limits the number of replacements; passed by keyword because the
# positional form is deprecated.
print(re.sub(r'\d', '666', s, count=1))
# subn returns a tuple: (new string, number of replacements made).
print(re.subn(r'\d', '666', s))

# Extract the integers from "1-2*(60+(-40.35/5)-(-4*3))".
# No groups: every integer and decimal is returned.
print(re.findall(r"\d+\.\d+|\d+", "1-2*(60+(-40.35/5)-(-4*3))"))
# Two groups: each result is a (decimal, integer) tuple with one side empty.
print(re.findall(r"(\d+\.\d+)|(\d+)", "1-2*(60+(-40.35/5)-(-4*3))"))
# Only the integer branch is captured, so a decimal match yields "".
ri = re.findall(r"\d+\.\d+|(\d+)", "1-2*(60+(-40.35/5)-(-4*3))")
print(ri)
# Drop every empty entry; list.remove("") would delete only the first one and
# raise ValueError when there is none.
ri = [number for number in ri if number]
print(ri)

# String replacement: str.replace can strip newlines without a regex.
print("line one\nline two".replace("\n", ""))
import re
from urllib.request import urlopen
# Regular-expression exercises. Patterns are raw strings so that \s and \. are
# passed to the regex engine instead of being treated as string escapes.

# 1. Match the capitalised words of an English title such as
#    "The Voice Of China" (each hit keeps its trailing whitespace).
cm = re.compile(r"[A-Z][a-z]+(?:\s+)?")
print(cm.findall("The Voice Of China djfshafj fa saf asfd asf "))

# 2. Match a web address such as https://www.baidu.com or
#    http://www.cnblogs.com.
cc = re.compile(r"http(?:s)?://[wW]{3}\.[a-zA-Z]+\.[a-zA-Z]+")
print(cc.findall("https://www.baidu.com http://www.cnblogs.com"))

# 3. Match a year-month-day date written with "-", "/" or "." separators,
#    e.g. 2018-12-06, 2018/12/06, 2018.12.06. Month is 1-12, day is 1-31.
dd = re.compile(r"[1-9][0-9]{0,3}[-/\.](?:0?[1-9]|1[0-2])[-/\.](?:0?[1-9]|[1-2][0-9]|3[0-1])")
print(dd.findall("2018-12-06 2018/12/06 2018.12.06"))

# 4. Match a 15-digit or 18-character ID-card number; the 18-character form
#    may end in "X".
ee = re.compile(r"[1-9][0-9]{14}(?:[0-9]{2}X|[0-9]{3})?")
print(ee.findall("13112619921015002X131126199210150032g131126199210142"))
# 5. From lianjia.html, extract each listing's title, layout and area; the expected result is:
# [('金台路交通部部委楼南北大三居带客厅 单位自持物业', '3室1厅', '91.22平米'), ('西山枫林 高楼层南向两居 户型方正 采光好', '2室1厅', '94.14平米')]
def getPage(url):
    """Fetch *url* and return its body decoded as UTF-8 text.

    Args:
        url: Any URL that ``urllib.request.urlopen`` accepts.

    Returns:
        The full response body as a ``str``.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the response even when read() or decode()
    # raises, so the underlying handle is never leaked.
    with urlopen(url) as response:
        return response.read().decode('utf-8')
# Exercise 5: load the saved listing page and extract one
# (title, layout, area) tuple per listing.
url = "file:///C:/Users/Mr%20Chu/Desktop/ziliao/day17/code/lianjia.html"
response_html = getPage(url)
# Newlines are removed so that each listing sits on a single line of text.
response_htm = response_html.replace("\n", "")
# The pattern describes ONE listing. findall applies it repeatedly, so the
# result is one 3-tuple per listing however many listings the page holds,
# instead of a single 6-tuple hard-wired to exactly two listings.
com = re.compile(
    'data-sl="">(?P<dizhi>.*?)</a>'
    '.*?<span class=.*?>/</span>(?P<huxing>.*?)'
    '<span class=.*?>/</span>(?P<daxiao>.*?)'
    '<span class=.*?>/</span.*?>/</span>',
    re.S,
)
ret = com.findall(response_htm)
print("查询结果为:")
print(ret)
import re
from urllib.request import urlopen
def getPage(url):
    """Download the page at *url* and return its source as a string.

    Args:
        url: Any URL that ``urllib.request.urlopen`` accepts.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Using the response as a context manager guarantees it is closed, even
    # if reading or decoding fails part-way through.
    with urlopen(url) as response:
        return response.read().decode('utf-8')
def parsePage(s):
    """Yield one dict per match of the module-level ``com`` pattern in *s*.

    Args:
        s: Page source to scan.

    Yields:
        A dict with the keys ``id``, ``title``, ``rating_num`` and
        ``comment_num``, each holding the text of the named group of the
        same name.
    """
    fields = ("id", "title", "rating_num", "comment_num")
    # finditer returns an iterator of match objects rather than a full list.
    for match in com.finditer(s):
        yield {name: match.group(name) for name in fields}
def main(num):
    """Fetch one page of the Douban Top 250 list and append its entries to a file.

    Args:
        num: Value substituted for the ``start`` query parameter of the list
            URL (the caller passes 0, 25, 50, ...).

    Each parsed entry is printed and appended, as the ``str`` of its dict
    followed by a newline, to the file ``move_info7`` in the current directory.
    """
    url = 'https://movie.douban.com/top250?start=%s&filter=' % num
    response_html = getPage(url)  # page source as a str
    ret = parsePage(response_html)  # generator of per-movie dicts
    print(ret)  # shows the generator object itself, not its contents
    # The with-block closes the file even if fetching the next match or
    # writing raises part-way through the loop.
    with open("move_info7", "a", encoding="utf8") as f:
        for obj in ret:
            print(obj)
            f.write(str(obj) + "\n")
# Pattern for one movie entry; named groups capture the rank (id), title,
# rating and number of ratings. Raw strings keep \d from being treated as a
# string escape, and re.S lets "." span the newlines inside one entry.
com = re.compile(
    r'<div class="item">.*?<div class="pic">.*?<em .*?>(?P<id>\d+).*?<span class="title">(?P<title>.*?)</span>'
    r'.*?<span class="rating_num" .*?>(?P<rating_num>.*?)</span>.*?<span>(?P<comment_num>.*?)评价</span>', re.S)

# Ten pages, stepping the start offset by 25 each time: 0, 25, ..., 225.
for start in range(0, 250, 25):
    main(start)