import re
import csv
from datetime import datetime
import geoip2.database
import os
# Load the IP geolocation database. The reader is opened once, at import time,
# and is used by process_data() for IP-to-city lookups.
# NOTE(review): 'path/to/GeoLite2-City.mmdb' looks like a placeholder path —
# confirm it points at a real GeoLite2 City database before running.
geoip_reader = geoip2.database.Reader('path/to/GeoLite2-City.mmdb')
# Compiled once at import. A log line is comma-separated:
#   ip, timestamp, <number that is matched but not captured>, traffic, type, id
# The UTC offset accepts either sign so that timestamps such as "-0500" are
# not rejected as invalid lines.
_LOG_LINE_RE = re.compile(
    r"(\d+\.\d+\.\d+\.\d+),"
    r"(\d{2}/\w+/\d{4}:\d{2}:\d{2}:\d{2} [+-]\d{4}),"
    r"\d+,"
    r"(\d+),"
    r"(video|article),"
    r"(\d+)"
)


def extract_data(log_line):
    """Extract the key fields from one raw log line.

    The line is matched from its start; anything after the last captured
    field is ignored.

    Args:
        log_line: A single log line (without the trailing newline).

    Returns:
        A dict with the keys ``ip`` (str), ``time`` (str, still in the log's
        ``dd/Mon/yyyy:HH:MM:SS +zzzz`` form), ``traffic`` (int), ``type``
        (``"video"`` or ``"article"``) and ``id`` (int), or ``None`` when the
        line does not match the expected layout.
    """
    match = _LOG_LINE_RE.match(log_line)
    if match is None:
        return None

    ip, time, traffic, content_type, content_id = match.groups()
    return {
        "ip": ip,
        "time": time,
        "traffic": int(traffic),
        "type": content_type,
        "id": int(content_id),
    }
def process_data(extracted_data):
    """Refine one extracted log record into its final cleaned form.

    Two transformations are applied:

    * The IP address is looked up in the module-level ``geoip_reader`` to get
      a city name. The lookup is best-effort: if it fails for any reason, or
      the database has no city name for the address, the city is "未知"
      ("unknown").
    * The timestamp is parsed from ``dd/Mon/yyyy:HH:MM:SS +zzzz`` and
      re-formatted as ``YYYY-MM-DD HH:MM:SS``. The UTC offset is parsed but
      not included in the output, so the time stays in the log's own offset.

    Args:
        extracted_data: A dict as returned by ``extract_data`` with the keys
            ``ip``, ``time``, ``traffic``, ``type`` and ``id``.

    Returns:
        A dict with the keys ``ip``, ``city``, ``time``, ``day`` (day of the
        month as an int), ``traffic``, ``type`` and ``id``.

    Raises:
        ValueError: If ``extracted_data["time"]`` does not match the expected
            timestamp format.
    """
    # IP -> city mapping. Catch Exception rather than using a bare except so
    # that KeyboardInterrupt / SystemExit still propagate while every ordinary
    # lookup failure falls back to the placeholder city.
    try:
        response = geoip_reader.city(extracted_data["ip"])
        city = response.city.name or "未知"
    except Exception:
        city = "未知"

    # Timestamp conversion.
    time_format = "%d/%b/%Y:%H:%M:%S %z"
    time_obj = datetime.strptime(extracted_data["time"], time_format)
    formatted_time = time_obj.strftime("%Y-%m-%d %H:%M:%S")
    day = time_obj.day

    # Final cleaned record.
    return {
        "ip": extracted_data["ip"],
        "city": city,
        "time": formatted_time,
        "day": day,
        "traffic": extracted_data["traffic"],
        "type": extracted_data["type"],
        "id": extracted_data["id"],
    }
def main():
    """Clean the raw log file and write the result as a CSV file.

    Each input line is stripped and passed to ``extract_data``. Lines that
    match are refined by ``process_data`` and written as one CSV row; lines
    that do not match are reported and skipped. Progress is printed to
    standard output for every line.

    NOTE(review): both files are opened with the platform default text
    encoding — confirm this matches the log file and the CSV consumers.
    """
    # Forward slashes throughout: the previous literal used "Desktop\00.",
    # where "\0" is an octal escape that put a NUL character into the path
    # and made open() fail.
    input_file = "C:/Users/Administrator/Desktop/00.软件体系结构课前测试-日志数据分析result.txt"  # raw log file
    output_file = "cleaned_data.csv"  # cleaned data file

    # Column order, shared by the header row and every data row.
    columns = ["ip", "city", "time", "day", "traffic", "type", "id"]

    with open(input_file, "r") as infile, open(output_file, "w", newline="") as outfile:
        writer = csv.writer(outfile)
        writer.writerow(columns)

        # Stream the log line by line.
        for line in infile:
            line = line.strip()
            extracted_data = extract_data(line)
            if not extracted_data:
                print(f"Skipped invalid line: {line}")
                continue

            processed_data = process_data(extracted_data)
            writer.writerow([processed_data[column] for column in columns])
            print(f"Processed: {processed_data}")

    print(f"Data cleaning completed. Cleaned data saved to {output_file}")


if __name__ == "__main__":
    main()
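# 浙公网安备 33010602011771号 (web-page footer residue: a Zhejiang public-security network filing number; not part of the program)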