#!/usr/bin/env python
import re
import os
import os.path
import gzip
import json
from urlparse import *
import datetime
import time
# Module-level tally filled by get_info(): maps a JSON-encoded record
# summary (a string) to the number of times that summary has been seen.
DICT = {}
def print_time(s):
    """Print the current local time followed by ``s``.

    The time is formatted as ``HH:MM:SS.ffffff``; ``s`` is converted with
    ``str`` and separated from the timestamp by a single space.
    """
    stamp = datetime.datetime.now().strftime("%H:%M:%S.%f")
    # A single parenthesised argument prints the same text whether ``print``
    # is a statement or a function.
    print(stamp + " " + str(s))
def remove_flag(s):
    """Return ``s`` with every ``[`` and ``]`` character deleted."""
    for bracket in ("[", "]"):
        s = s.replace(bracket, "")
    return s
def remove_session_prefix(s):
    """Return ``s`` with the text ``session:`` deleted.

    Every occurrence is removed, not only one at the start of the string.
    """
    pieces = s.split("session:")
    return "".join(pieces)
def parse_url(url):
    """Parse ``url`` (after prefixing it with ``http://``) into a dict.

    Returns a dict with three keys:
      * ``"domain"`` -- the network location of the parsed URL,
      * ``"path"``   -- the path component,
      * ``"params"`` -- a dict mapping each query parameter name to the
        first value given for it.
    """
    parts = urlparse("http://" + url)
    first_values = {}
    for name, values in parse_qs(parts.query).items():
        # parse_qs yields a list of values per name; only the first is kept.
        first_values[name] = values[0]
    return {
        "domain": parts.netloc,
        "path": parts.path,
        "params": first_values,
    }
def get_info(a_dict):
    """Tally one parsed log record in the module-level ``DICT`` counter.

    ``a_dict`` must provide ``"userid"``, ``"session"`` and ``"url"``, where
    ``"url"`` is a dict holding ``"domain"``, ``"path"`` and ``"params"``.
    The record is reduced to the fields userid, domain, path, fr, ct and ac,
    serialised with ``json.dumps``, and the resulting string is the key whose
    count in ``DICT`` is incremented.  Nothing is returned.
    """
    summary = {}

    # The session value stands in for the user id when the latter is "" or "0".
    user = a_dict["userid"]
    summary["userid"] = a_dict["session"] if user in ("", "0") else user

    url = a_dict["url"]
    summary["domain"] = url["domain"]
    summary["path"] = url["path"]

    # Query parameters that are absent are recorded as "-".
    params = url["params"]
    for name in ("fr", "ct", "ac"):
        summary[name] = params.get(name, "-")

    key = json.dumps(summary)
    DICT[key] = DICT.get(key, 0) + 1
def read_logs(path):
    """Tally every record of every gzip-compressed log file under ``path``.

    Each entry of ``os.listdir(path)`` is opened with ``gzip.open`` and read
    line by line.  The bracketed ``[...]`` fields of a line are extracted:
    the third field is used as the user id, the fifth as the URL and the
    eighth as the session.  The resulting record is handed to ``get_info``.
    Timing messages are printed through ``print_time`` for every 10000th
    line of a file.

    Lines with fewer than eight bracketed fields (for example blank or
    truncated lines) are skipped instead of aborting the run; the number of
    skipped lines is reported once per file when it is non-zero.

    Errors from ``os.listdir``, ``gzip.open`` or reading the file propagate
    to the caller; the file being read is closed first.
    """
    # Compiled once up front rather than looked up for every line.
    field_re = re.compile(r'\[.*?\]')

    for item in os.listdir(path):
        # gzip.open raises on failure and never returns None, so there is
        # nothing to check here.
        f = gzip.open(os.path.join(path, item), "r")
        try:
            skipped = 0
            i = 0
            # readline loop kept (rather than iterating the file) so the
            # timing messages around the read are emitted exactly as before,
            # including on the final end-of-file read.
            while True:
                verbose = (i % 10000 == 0)
                if verbose:
                    print_time(i)
                line = f.readline()
                if verbose:
                    print_time("read to memory")
                if not line:
                    break

                fields = field_re.findall(line)
                if verbose:
                    print_time("split to items")
                if len(fields) < 8:
                    # Not enough fields to index the URL (4) and session (7).
                    skipped += 1
                    i += 1
                    continue

                urlDict = parse_url(remove_flag(fields[4]))
                if verbose:
                    print_time("url fommat to dict")

                aDict = {
                    "userid": remove_flag(fields[2]),
                    "session": remove_session_prefix(remove_flag(fields[7])),
                    "url": urlDict,
                }
                if verbose:
                    print_time("create new dict")

                get_info(aDict)
                if verbose:
                    print_time("save and diff")
                i += 1

            if skipped:
                print_time("skipped %d malformed line(s) in %s" % (skipped, item))
        finally:
            # Release the handle even when a line fails to process.
            f.close()
if __name__ == "__main__":
    # Process the hard-coded log directory, then dump the tally collected in
    # DICT to the file "data" in the current working directory.
    read_logs("logs/20130908")
    # ``with`` closes the output file even if a write fails.
    with open("data", "w") as f:
        for key in DICT:
            # One "<json key> <count>" record per line; previously no
            # separator was written and all records ran together.
            f.write(key + " " + str(DICT[key]) + "\n")