# readzip_add_maxL2
#!/usr/bin/env python
import os
import numpy as np
import py7zr
import shutil
import pandas as pd
import time
# Process 7z archives of tick-by-tick trade data.
path = r'G:\datas of status\tick-by-tick trade'  # where the data files are stored
pathsave = 'G:\\datas of status\\python codes\\'  # where temporary files are written

# Entries directly under `path` (the month folders): show the bare names
# first, then prefix each one with `path` to obtain its full path.
_month_entries = os.listdir(path)
listM = np.array(_month_entries)
print(listM)
listM = np.char.add(f"{path}\\", listM)
def fun_time_l2(a, b):
    """Return 1 when ``a`` is at most ``b`` (both converted to float), else 0."""
    return int(float(a) <= float(b))
def _sum_by_size(amount):
    """Sum signed amounts into [small, medium, large, extra-large] buckets.

    Buckets are selected by absolute value: below 40000, 40000 up to (not
    including) 200000, 200000 up to (not including) 1000000, and 1000000 or
    more.  Zero amounts fall into no bucket.
    """
    size = amount.abs()
    return [
        amount[(size > 0) & (size < 40000)].sum(),
        amount[(size >= 40000) & (size < 200000)].sum(),
        amount[(size >= 200000) & (size < 1000000)].sum(),
        amount[size >= 1000000].sum(),
    ]


def read_files(filename):
    """Read one tick file and return its summary statistics.

    Every line that does not contain the text "find" is stripped and split on
    tabs; the first three fields are used as time, price and vol.  Price and
    vol are parsed as integers and ``amount`` is ``price * vol`` (so it carries
    the sign of vol).

    Args:
        filename: Path of the text file to read.

    Returns:
        A list, in this order:
        ``[vol_t, amount_t, z_xiao, z_zhong, z_da, z_te_da,
        f_xiao, f_zhong, f_da, f_te_da, re_min_L2, time_l2]`` where

        * ``vol_t`` / ``amount_t`` are the sums of ``|vol|`` / ``|amount|``;
        * ``z_*`` are the sums of positive amounts and ``f_*`` the sums of
          negative amounts, bucketed by size (see ``_sum_by_size``);
        * ``re_min_L2`` is the percentage of ``|vol|`` traded at a price
          strictly below 1.02 x the lowest price in the file;
        * ``time_l2`` is the number of distinct time keys (the time field
          with its last two characters removed -- presumably the seconds,
          confirm against the data format) whose lowest price is at or below
          1.02 x the lowest price in the file.
    """
    print(filename)
    with open(filename, "r") as f:
        lines = pd.Series(f.readlines(), dtype="object")

    # Lines containing "find" are not tick rows and are discarded.
    lines = lines[~lines.str.contains("find")]
    fields = lines.str.strip().str.split("\t", expand=True)

    # Parse once, and as explicit int64: the "int"/"long" dtype names are
    # platform dependent and can be 32-bit, where price * vol may overflow.
    ticks = pd.DataFrame({
        "time": fields[0],
        "price": fields[1].astype("int64"),
        "vol": fields[2].astype("int64"),
    })
    amount = ticks["price"] * ticks["vol"]

    vol_t = ticks["vol"].abs().sum()
    amount_t = amount.abs().sum()
    z_xiao, z_zhong, z_da, z_te_da = _sum_by_size(amount[amount > 0])
    f_xiao, f_zhong, f_da, f_te_da = _sum_by_size(amount[amount < 0])

    # Share of volume traded within 2% of the file's lowest price.
    min_L = ticks["price"].min()
    min_2 = min_L * 1.02
    sum_min_2_v = ticks.loc[ticks["price"] < min_2, "vol"].abs().sum()
    re_min_L2 = sum_min_2_v / vol_t * 100

    # Lowest price per time key.  The minimum must be taken on the numeric
    # prices: on the raw strings "1020" sorts before "990".  A single groupby
    # also avoids re-filtering the whole frame once per time key.
    time_key = ticks["time"].str[:-2]
    lowest_per_key = ticks["price"].groupby(time_key).min()
    time_l2 = (lowest_per_key <= min_2).sum()

    list_return = [vol_t, amount_t, z_xiao, z_zhong, z_da, z_te_da,
                   f_xiao, f_zhong, f_da, f_te_da, re_min_L2, time_l2]
    return list_return
def read_dirs(savedir):
    """Summarise every non-empty file in a folder.

    Each entry of ``savedir`` with a non-zero size is passed to
    ``read_files``; the entry's name without its extension is put in front of
    the returned statistics.  Zero-size files are reported on stdout and
    skipped.

    Args:
        savedir: Path of the folder to scan.

    Returns:
        A DataFrame with one row per processed file and the columns
        ``name, vol, amount, z_xiao, z_zhong, z_da, z_te_da, f_xiao,
        f_zhong, f_da, f_te_da, re_min_L2, time_l2``.  When nothing was
        processed the frame is empty but still has these columns.
    """
    columns = ["name", "vol", "amount", "z_xiao", "z_zhong", "z_da", "z_te_da",
               "f_xiao", "f_zhong", "f_da", "f_te_da", "re_min_L2", "time_l2"]
    listdir_return = []
    for entry in os.listdir(savedir):
        file = os.path.join(savedir, entry)
        filename = os.path.splitext(entry)[0]
        if not os.path.getsize(file):  # nothing to parse in a zero-size file
            print("file siz = 0")
            print(file)
            continue
        list_t = read_files(file)
        list_t.insert(0, filename)
        listdir_return.append(list_t)
    # Passing the column names to the constructor (instead of assigning them
    # afterwards) keeps an empty result valid.
    return pd.DataFrame(listdir_return, columns=columns)
def extract_files(filename):
    """Extract one 7z archive, summarise its files and clean up.

    The archive is unpacked under ``pathsave``.  The first path component of
    the archive's first member is taken as the folder holding the extracted
    files (assumes every member lives under that one folder -- confirm
    against the archives) and is also used as the value of the ``date``
    column.

    Args:
        filename: Path of the 7z archive.

    Returns:
        The DataFrame produced by ``read_dirs`` for the extracted folder,
        with a ``date`` column inserted at position 1.
    """
    with py7zr.SevenZipFile(filename, 'r') as archive:
        allfiles = archive.getnames()  # names of the members inside the archive
        tempdir = allfiles[0].split("/")[0]  # top-level folder inside the archive
        savedir = pathsave + str(tempdir)
        if os.path.exists(savedir):
            shutil.rmtree(savedir)  # drop leftovers from an earlier run
        os.mkdir(savedir)
        try:
            archive.extractall(pathsave)
            pdM2 = read_dirs(savedir)
        finally:
            # Remove the extracted files even when extraction or parsing
            # fails, so a bad archive does not leave the folder behind.
            shutil.rmtree(savedir)
    pdM2.insert(1, "date", tempdir, allow_duplicates=False)
    return pdM2
def do_work(listD):
    """Process a batch of daily 7z archives and write the CSV summaries.

    For every archive the per-file summary is sorted by ``time_l2`` and
    written to ``<pathsave>\\everyday_data\\<date>.csv``.  All daily
    summaries are then concatenated and written to
    ``<pathsave><first six characters of the first date>.csv``.

    Args:
        listD: Iterable of archive paths.  When it is empty, or no archive
            yields any row, no combined file is written.
    """
    daily_frames = []
    for filename in listD:
        print("=========")
        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        pdD_t = extract_files(filename)
        if pdD_t.empty:
            # No usable file in this archive: there is no date to name a
            # daily CSV after, so skip it.
            print(filename)
            continue
        daily_dir = pathsave + "\\" + "everyday_data"
        os.makedirs(daily_dir, exist_ok=True)
        # Positional access: the row label 0 is not guaranteed to exist.
        save_dfile = daily_dir + "\\" + pdD_t["date"].iloc[0] + ".csv"
        pdD_t = pdD_t.sort_values(by=['time_l2'], ascending=True)
        pdD_t.to_csv(save_dfile, sep=",", index=False, header=True)
        daily_frames.append(pdD_t)
        print(filename)

    if not daily_frames:
        return

    # DataFrame.append was removed in pandas 2.0; concatenating once at the
    # end also avoids copying the accumulated frame on every iteration.
    pdM_all = pd.concat(daily_frames)
    # Taking the first row positionally works for a single day as well as
    # for several (label-based [0] returns a scalar in one case and a Series
    # in the other).
    save_file = pathsave + pdM_all["date"].iloc[0][0:6] + ".csv"
    print(save_file)
    pdM_all.to_csv(save_file, sep=",", index=False, header=True)
def start_work(m=0, do_num=1):
    """Run ``do_work`` on one or more folders listed in ``listM``.

    Folders are visited at indices ``m, m - 1, ..., m - do_num + 1`` of
    ``listM``.  NOTE(review): once the index drops below zero it wraps around
    to the end of ``listM`` -- confirm that this is intended.

    Args:
        m: Index into ``listM`` of the first folder to process.
        do_num: Number of folders to process.
    """
    for n in range(do_num):
        i = m - n  # index of the folder handled in this pass
        print(listM[i])
        # dtype=str keeps an empty folder listing a string array, so the
        # path prefixing below still works.
        listD = np.array(os.listdir(listM[i]), dtype=str)
        print(listD)
        listD = np.char.add(listM[i] + "\\", listD)  # full path of every entry
        print(listD)
        do_work(listD)
        print(i)
# Module-level entry point: this call runs whenever the file is executed
# (or imported), before the definitions that follow it.
start_work()
# The helpers below process a single day of data.
def do_one_day(tempdir="20200718"):
    """Summarise one already-extracted day folder and write its CSV.

    Args:
        tempdir: Name of the folder under ``pathsave`` that holds the
            extracted files of one day; it is also used as the value of the
            ``date`` column and as the CSV file name.
    """
    savedir = pathsave + tempdir
    pdM2 = read_dirs(savedir)
    pdM2.insert(1, "date", tempdir, allow_duplicates=False)
    daily_dir = pathsave + "\\" + "everyday_data"
    os.makedirs(daily_dir, exist_ok=True)
    save_dfile = daily_dir + "\\" + tempdir + ".csv"
    pdM2 = pdM2.sort_values(by=['time_l2'], ascending=True)
    pdM2.to_csv(save_dfile, sep=",", index=False, header=True)
# Uncomment to process a single day:
#do_one_day()
def do_one_file(file_name="G:\\datas of status\\python codes\\20200714\\SH600000.txt"):
    """Print the statistics ``read_files`` computes for a single tick file.

    Args:
        file_name: Path of the tick file to read.
    """
    print(read_files(file_name))
# Uncomment to process a single file:
#do_one_file()
# Single-threaded; the time-calculation part still needs to be optimized.

浙公网安备 33010602011771号