【python练习】正则表达式练习
import re
def process(input_data):
    """Extract IMSI/MSISDN for subscribers that have both a 0 and a 33 OPTGPRS entry.

    The input file is a sequence of subscriber records, for example::

        <SUBBEGIN
        IMSI=1243560615528273;
        MSISDN=986768559232;
        VLRLIST=10;
        OPTGPRS=3-33-504-241-33-NONE-0-NONE-00000000-65535-0-0-PS_APN-NONE-65535-1;
        OPTGPRS=1-0-504-241-33-NONE-0-NONE-00000000-65535-0-0-PS_APN-NONE-65535-1;
        CHARGE_GLOBAL=3;
        <SUBEND

    A record is selected when it contains at least one ``OPTGPRS=<n>-33``
    entry and at least one ``OPTGPRS=<n>-0`` entry, where the second field
    must be exactly ``33`` / ``0`` (``-05`` or ``-330`` do not count).

    :param input_data: path of the subscriber information file.
    :return: list of strings such as
        ``"IMSI=1243560615528273;MSISDN=986768559232"``, one per selected
        record, in file order. Records without an IMSI/MSISDN pair are skipped.
    """
    # Raw strings keep ``\d`` / ``\b`` as regex escapes; ``\b`` stops the
    # second field from matching as a mere prefix (e.g. "-0" inside "-05").
    has_33 = re.compile(r'OPTGPRS=\d+-33\b')
    has_0 = re.compile(r'OPTGPRS=\d+-0\b')
    # Non-greedy so the match ends at the first MSISDN of the record.
    identity = re.compile(r'IMSI=.+?MSISDN=\d+')

    # Strip all surrounding whitespace so the marker comparison below is not
    # defeated by stray spaces or tabs around "<SUBBEGIN" / "<SUBEND".
    with open(input_data) as f:
        lines = [raw.strip() for raw in f]

    result = []
    record = None  # lines of the record being read, or None when outside one
    for line in lines:
        if line == "<SUBBEGIN":
            # A new begin marker always starts a fresh record; an unterminated
            # previous record is dropped rather than merged into this one.
            record = []
        elif line == "<SUBEND":
            if record is not None:
                # Join without a separator so "IMSI=...;" and "MSISDN=...;"
                # on adjacent lines become "IMSI=...;MSISDN=...;".
                text = ''.join(record)
                if has_33.search(text) and has_0.search(text):
                    found = identity.search(text)
                    if found:
                        result.append(found.group(0))
            record = None
        elif record is not None:
            record.append(line)
    return result
if __name__ == '__main__':
    # Script entry point: parse the sample file; the returned list is discarded.
    process(input_data='input_data.txt')
2、代码优化:改用类来实现
import re
class apnInfoFinder():
    """Find subscribers that have both a 0 and a 33 OPTGPRS entry.

    The input file is a sequence of records delimited by ``<SUBBEGIN`` and
    ``<SUBEND`` lines. A record is selected when it contains at least one
    ``OPTGPRS=<n>-33`` entry and at least one ``OPTGPRS=<n>-0`` entry (the
    second field must be exactly ``33`` / ``0``). For each selected record the
    ``IMSI=...;MSISDN=...`` text is collected.

    Attributes:
        file: path of the input file given to the constructor.
        msglist: stripped lines of the input file (filled by ``getMsgList``).
        start_index: indices of ``<SUBBEGIN`` lines (filled by ``getNewList``).
        stop_index: indices of ``<SUBEND`` lines (filled by ``getNewList``).
        result: matched ``IMSI=...;MSISDN=...`` strings (filled by ``getFinder``).
    """

    # Raw strings keep ``\d`` / ``\b`` as regex escapes; ``\b`` stops the
    # second field from matching as a mere prefix (e.g. "-0" inside "-05").
    _HAS_33 = re.compile(r'OPTGPRS=\d+-33\b')
    _HAS_0 = re.compile(r'OPTGPRS=\d+-0\b')
    # Non-greedy so the match ends at the first MSISDN of the record.
    _IDENTITY = re.compile(r'IMSI=.+?MSISDN=\d+')

    def __init__(self, input_data):
        """Remember the input path and start with empty working lists."""
        self.file = input_data
        self.msglist = []
        self.start_index = []
        self.stop_index = []
        self.result = []

    # Step 1: read the file into a list of stripped lines.
    def getMsgList(self):
        """Read ``self.file`` and return its lines with whitespace stripped.

        The list replaces any previous ``self.msglist`` so that calling this
        method again does not duplicate lines.
        """
        with open(self.file) as f:
            self.msglist = [raw.strip() for raw in f]
        return self.msglist

    # Step 2: locate the begin/end markers of every record.
    def getNewList(self, list):
        """Record the indices of ``<SUBBEGIN`` and ``<SUBEND`` lines in *list*.

        :param list: sequence of already-stripped lines.
        :return: tuple ``(start_index, stop_index)``; both are recomputed from
            scratch on every call.
        """
        self.start_index = [i for i, line in enumerate(list) if line == "<SUBBEGIN"]
        self.stop_index = [i for i, line in enumerate(list) if line == "<SUBEND"]
        return self.start_index, self.stop_index

    # Step 3: rebuild each record and match it.
    def getFinder(self, lenlist, list):
        """Collect ``IMSI=...;MSISDN=...`` for every record having both entries.

        :param lenlist: only its length is used; it limits how many
            ``start_index`` entries are processed.
        :param list: the lines that ``start_index`` / ``stop_index`` refer to.
        :return: ``self.result``, rebuilt from scratch on every call.
        """
        self.result = []
        starts = self.start_index[:len(lenlist)]
        # zip stops at the shorter list, so a missing <SUBEND no longer raises
        # IndexError; begin/end markers are still paired by position.
        for start, stop in zip(starts, self.stop_index):
            # Join without a separator so "IMSI=...;" and "MSISDN=...;" on
            # adjacent lines become "IMSI=...;MSISDN=...;".
            text = ''.join(list[start:stop])
            if self._HAS_33.search(text) and self._HAS_0.search(text):
                found = self._IDENTITY.search(text)
                # Skip records without an IMSI/MSISDN pair instead of
                # recording an empty string.
                if found:
                    self.result.append(found.group(0))
        return self.result

    # Step 4: run the whole pipeline and write the result file.
    def outPut(self, output_path='output.csv'):
        """Run all steps and write one result per line to *output_path*.

        :param output_path: destination file; defaults to ``output.csv`` in
            the current working directory. The file is plain text with one
            ``IMSI=...;MSISDN=...`` entry per line.
        """
        self.getMsgList()
        self.getNewList(self.msglist)
        self.getFinder(self.start_index, self.msglist)
        with open(output_path, 'w') as out_result:
            out_result.writelines(line + '\n' for line in self.result)
if __name__ == '__main__':
    # Script entry point: run the full pipeline on the sample input file.
    finder = apnInfoFinder('input_data.txt')
    finder.outPut()
浙公网安备 33010602011771号