解析邮件
发邮件:https://www.cnblogs.com/hudieren/p/16792041.html
收邮件:https://www.cnblogs.com/hudieren/p/16792045.html
解析邮件:https://www.cnblogs.com/hudieren/p/16792096.html
删除邮件:https://www.cnblogs.com/hudieren/p/16798017.html
class ParseEmailContent:
    """Parse raw e-mail files into headers, text bodies and saved attachments.

    Each call to :meth:`parse` appends one entry per input file to
    ``self.result``: ``["ok", {...}]`` on success or ``["error", args]`` when
    the file could not be processed.
    """

    def __init__(self, files, accessory_path):
        """
        :param files: list of paths of the raw e-mail files to parse
        :param accessory_path: directory in which attachments are stored
        """
        self.files = files
        self.accessory_path = accessory_path
        self.result = []
        self.index = 0

    @staticmethod
    def _decode_bytes(raw, charset=None):
        """Decode *raw* bytes, trying *charset*, then UTF-8, then GB18030.

        :param raw: bytes to decode
        :param charset: declared charset name, or None when unknown
        :return: decoded text; undecodable bytes are replaced as a last resort
        """
        candidates = ([charset] if charset else []) + ['utf-8', 'gb18030']
        for name in candidates:
            try:
                return raw.decode(name)
            except (LookupError, ValueError):
                # Unknown codec name or bytes that do not fit it: try the next.
                continue
        return raw.decode('gb18030', errors='replace')

    def _decode_header_attr(self, attr):
        """Decode one (possibly RFC 2047 encoded) header value to text.

        :param attr: raw header value as returned by ``msg.get(...)``
        :return: decoded string, or None when the header is missing or
                 cannot be parsed
        """
        if attr is None:
            return None
        try:
            chunks = email.header.decode_header(attr)
        except Exception:
            # Best effort by contract: a malformed header yields None
            # instead of aborting the whole message.
            return None
        values = []
        for value, charset in chunks:
            if isinstance(value, bytes):
                values.append(self._decode_bytes(value, charset))
            else:
                values.append(value)
        return ''.join(values)

    def _decode_header(self, msg):
        """Extract the main headers of a message.

        :param msg: parsed ``email.message.Message``
        :return: tuple ``(subject, from, to, date)``; ``from`` is the first
                 address found in angle brackets (or the raw header text),
                 ``to`` is a list of addresses (None when the header is
                 missing), ``date`` is formatted ``YYYY-MM-DD HH:MM:SS`` when
                 the Date header can be parsed
        """
        from email.utils import getaddresses, parsedate

        _subject = self._decode_header_attr(msg.get('Subject'))
        if not _subject:
            _subject = '(无主题)'

        _from = self._decode_header_attr(msg.get('From'))
        if _from:
            bracketed = re.findall(r"<([\s\S]*?)>", _from)
            if bracketed:
                _from = bracketed[0]

        _to = self._decode_header_attr(msg.get('To'))
        if _to is not None:
            bracketed = re.findall(r"<([\s\S]*?)>", _to)
            if bracketed:
                _to = bracketed
            else:
                # Bare addresses ("a@b.com, c@d.com") have no angle brackets;
                # parse them instead of returning an empty list.
                _to = [addr for _, addr in getaddresses([_to]) if addr]

        _date = self._decode_header_attr(msg.get('Date'))
        if _date:
            # parsedate copes with 1- or 2-digit days and a missing weekday;
            # the time is kept as written in the header (no timezone shift).
            parsed = parsedate(_date)
            try:
                _date = time.strftime("%Y-%m-%d %H:%M:%S", parsed) if parsed else _date.strip()
            except (ValueError, TypeError, OverflowError):
                _date = _date.strip()
        else:
            # No Date header: fall back to the timestamp that follows the
            # last "; " of the Received header, when there is one.
            _received = self._decode_header_attr(msg.get('Received'))
            _date = _received.split('; ')[-1] if _received else None
        return _subject, _from, _to, _date

    def _decode_text(self, content_type, part):
        """Decode the payload of a text part using its declared charset.

        :param content_type: Content-Type header text of the part
        :param part: ``email.message.Message`` holding the payload
        :return: decoded text ('' when the part has no decodable payload)
        """
        payload = part.get_payload(decode=True)
        if payload is None:
            return ''
        # Accept quoted/unquoted values and spaces around "=", e.g.
        # charset="gb18030", charset=utf-8, charset = "gb18030".
        declared = re.search(r'''charset\s*=\s*["']?([^"';\s]+)''',
                             str(content_type), re.IGNORECASE)
        if declared:
            charset = declared.group(1).lower()
        else:
            charset = part.get_content_charset()
        return self._decode_bytes(payload, charset)

    def _decode_plain(self, content_type, part):
        """
        Content-Type: text/plain;
        charset="gb18030"
        :return: message body as plain text
        """
        return self._decode_text(content_type, part)

    def _decode_html(self, content_type, part):
        """
        Content-Type: text/html;
        charset = "gb18030"
        :return: message body as HTML
        """
        return self._decode_text(content_type, part)

    def _decode_file(self, part, _from, file_name):
        """Save an attachment part below ``self.accessory_path``.

        :param part: message part carrying the attachment
        :param _from: sender, used in the stored file name
        :param file_name: name of the mail file; '.txt' is removed from it
        :return: dict with the stored ``path`` and the decoded ``filename``
        """
        filename = self._decode_header_attr(part.get_filename())
        stem = file_name.replace('.txt', '')
        sender = str(_from or '').replace('"', "").replace(">", "").replace("<", "")
        # Attachment names and senders come from the mail itself: neutralise
        # path separators and control characters so the file cannot land
        # outside (or in a missing sub-directory of) the attachment directory.
        leaf = re.sub(r'[\\/\x00-\x1f]', '_', f"{stem}-{sender}-{filename}")
        path = os.path.join(self.accessory_path, leaf)
        data = part.get_payload(decode=True)
        if data is None:
            # Container parts (e.g. message/rfc822) have no transfer-decoded
            # payload; store the serialized sub-messages instead.
            subs = part.get_payload()
            data = b"".join(s.as_bytes() for s in subs) if isinstance(subs, list) else b""
        with open(path, 'wb') as w:
            w.write(data)
        return {"path": path, "filename": filename}

    def _collect_text(self, part, plains, htmls):
        """Append the decoded body of a text/plain or text/html leaf part.

        :return: True when the part was a text body, False otherwise
        """
        content_type = str(part.get('Content-Type') or '')
        # get_content_type() is lower-cased and defaults to text/plain when
        # the header is missing.
        kind = part.get_content_type()
        if kind == "text/plain":
            plains.append(self._decode_plain(content_type, part))
            return True
        if kind == "text/html":
            htmls.append(self._decode_html(content_type, part))
            return True
        return False

    def _collect_parts(self, container, _from, file_name, plains, htmls, files):
        """Walk the sub-parts of a multipart container, depth first."""
        for part in container.get_payload():
            content_disposition = str(part.get('Content-Disposition'))
            if "filename" in content_disposition:
                files.append(self._decode_file(part, _from, file_name))
            elif part.is_multipart():
                self._collect_parts(part, _from, file_name, plains, htmls, files)
            else:
                self._collect_text(part, plains, htmls)

    def _decode_more_type(self, content_type, part_string):
        """Extract the text bodies of a serialized part holding nested parts.

        :param content_type: unused; kept for backward compatibility
        :param part_string: the part serialized as a string (headers + body)
        :return: tuple ``(plains, htmls)`` of decoded text bodies
        """
        plains = []
        htmls = []
        # Let the MIME parser find the boundaries instead of splitting the
        # text by hand.
        nested = email.message_from_string(part_string)
        for sub_part in nested.walk():
            if not sub_part.is_multipart():
                self._collect_text(sub_part, plains, htmls)
        return plains, htmls

    def _decode_body(self, msg, _from, file_name):
        """Split a message into plain-text bodies, HTML bodies and attachments.

        :param msg: parsed ``email.message.Message``
        :param _from: sender, used to name stored attachments
        :param file_name: name of the mail file, used to name attachments
        :return: tuple ``(plains, htmls, files)``
        """
        plains = []
        htmls = []
        files = []
        # is_multipart() is True when the payload is a list of sub-messages
        # and False when it is a single string payload.
        if msg.is_multipart():
            self._collect_parts(msg, _from, file_name, plains, htmls, files)
        elif not self._collect_text(msg, plains, htmls):
            # A single-part message that is not text is itself the attachment.
            files.append(self._decode_file(msg, _from, file_name))
        return plains, htmls, files

    def parse(self):
        """Parse every file in ``self.files``.

        :return: ``self.result`` with one ``["ok", dict]`` or
                 ``["error", args]`` entry appended per file
        """
        for file_path in self.files:
            try:
                file_name = os.path.basename(file_path)
                print(f"解析文件名:{file_name}")
                with open(file_path, "rb") as r:
                    # Parse the raw bytes: mail is not guaranteed to be UTF-8
                    # and 8-bit bodies must keep their original bytes.
                    msg = email.message_from_bytes(r.read())
                _subject, _from, _to, _date = self._decode_header(msg)
                print(f"[主题:{_subject}],[发件人:{_from}],[收件人:{_to}],[日期:{_date}]")
                plain, html, files = self._decode_body(msg, _from, file_name)
                sin_result = {
                    "subject": _subject,
                    "from": _from,
                    "to": _to,
                    "date": _date,
                    "plain": plain,
                    "html": html,
                    "files": files,
                }
                self.result.append(["ok", sin_result])
            except Exception as e:
                # Top-level boundary: record the failure and keep going with
                # the remaining files.
                self.result.append(["error", e.args])
        return self.result

浙公网安备 33010602011771号