Mapper 以及 Reducer 清洗数据

#!/usr/bin/env python3
import sys
import json
import re
from datetime import datetime
from html import unescape


# Characters/sequences removed from text after entity decoding:
#   <...>    any HTML tag
#   &nbsp;   a literal entity that survives decoding (e.g. double-escaped input)
#   \xa0     NO-BREAK SPACE, which is what unescape() turns "&nbsp;" into
#   \u3000   IDEOGRAPHIC SPACE (full-width space)
_CLEAN_PATTERN = re.compile(r'<[^>]+>|&nbsp;|\xa0|\u3000')


def clean_html(raw_html):
    """Strip HTML tags and space-like filler characters from a string.

    The input is first passed through ``html.unescape`` so that character
    references become real characters, then every match of ``_CLEAN_PATTERN``
    is deleted, and finally leading/trailing whitespace is trimmed.

    Because decoding happens first, ``&nbsp;`` has already become U+00A0 by
    the time the pattern runs; the pattern therefore matches U+00A0 directly
    so that non-breaking spaces inside the text are removed as well.

    Args:
        raw_html: The raw string, possibly containing HTML markup/entities.

    Returns:
        The cleaned string.
    """
    return _CLEAN_PATTERN.sub('', unescape(raw_html)).strip()


def calc_days(start, end):
    """Return the number of days from ``start`` to ``end``.

    Both arguments are expected to be ``YYYY-MM-DD`` strings. The result is
    ``end - start`` in whole days, so it is negative when ``end`` precedes
    ``start``.

    Args:
        start: Start date as a ``YYYY-MM-DD`` string.
        end: End date as a ``YYYY-MM-DD`` string.

    Returns:
        The day difference as an int, or ``-1`` when either value cannot be
        parsed (wrong format or not a string). Note that ``-1`` is also the
        genuine result when ``end`` is exactly one day before ``start``.
    """
    fmt = "%Y-%m-%d"
    try:
        start_date = datetime.strptime(start, fmt)
        end_date = datetime.strptime(end, fmt)
    except (ValueError, TypeError):
        # ValueError: string does not match the format.
        # TypeError: argument is not a string (e.g. None).
        # Anything else (KeyboardInterrupt, SystemExit, ...) must propagate.
        return -1  # sentinel marking an invalid value
    return (end_date - start_date).days


def _date_part(value):
    """Return the first whitespace-separated token of ``value``, or ''."""
    tokens = value.split() if isinstance(value, str) else []
    return tokens[0] if tokens else ""


def process_record(record):
    """Clean a single raw record.

    Reads the ``title``, ``content``, ``org``, ``letterTime`` and
    ``replyTime`` keys of ``record``, cleans the text fields with
    ``clean_html``, keeps only the first token of each timestamp, and
    computes the handling duration with ``calc_days``.

    A missing, null or blank ``letterTime``/``replyTime`` no longer aborts
    the record: the date is kept as an empty string and ``calc_days`` then
    flags the duration with its ``-1`` sentinel.

    Args:
        record: The decoded JSON object for one record (expected to be a dict).

    Returns:
        A dict with the keys ``title``, ``content``, ``org``,
        ``letter_date``, ``reply_date`` and ``duration_days``, or ``None``
        if processing raised; the error is then written to stderr.
    """
    try:
        # Clean title and body text.
        title = clean_html(record.get('title', ''))
        content = clean_html(record.get('content', ''))

        # Department name: fall back to the placeholder when the key is
        # absent and also when its value is null or empty.
        org = record.get('org') or '未知部门'

        # Keep only the leading token of each timestamp.
        letter_date = _date_part(record.get('letterTime', ''))
        reply_date = _date_part(record.get('replyTime', ''))

        # Handling duration in days (-1 when either date is unusable).
        duration_days = calc_days(letter_date, reply_date)

        return {
            "title": title,
            "content": content,
            "org": org,
            "letter_date": letter_date,
            "reply_date": reply_date,
            "duration_days": duration_days
        }
    except Exception as e:
        sys.stderr.write(f"记录处理失败: {str(e)}\n")
        return None


def main():
    """Mapper entry point.

    Consumes stdin line by line, decodes each line as JSON, cleans it with
    ``process_record`` and prints the cleaned record as one JSON line on
    stdout. Records for which ``process_record`` returns a falsy value are
    skipped. Any exception raised while handling a line is reported on
    stderr and the loop moves on to the next line.
    """
    for raw_line in sys.stdin:
        try:
            parsed = json.loads(raw_line.strip())
            result = process_record(parsed)
            if not result:
                # Nothing to emit for this line.
                continue
            print(json.dumps(result))
        except Exception as e:
            sys.stderr.write(f"JSON解析失败: {str(e)}\n")


if __name__ == "__main__":
    main()
——————————————————————————————————————————————————————————————————————————————————————
#!/usr/bin/env python3
import sys

def main():
    """Reducer entry point: pass cleaned records through.

    Each stdin line is stripped of surrounding whitespace and printed to
    stdout. If handling a line raises, the error is written to stderr and
    processing continues with the next line.
    """
    for incoming in sys.stdin:
        try:
            record_text = incoming.strip()
            print(record_text)
        except Exception as e:
            sys.stderr.write(f"Reducer 处理失败: {str(e)}\n")


if __name__ == "__main__":
    main()
posted @ 2025-02-07 23:38  芊羽鱼  阅读(11)  评论(0)    收藏  举报