Feature Engineering

Even with this round of feature engineering, the accuracy did not improve by much.
import numpy as np
import pandas as pd

ep = 1e-5  # small constant that keeps the 'div' combiner finite when y is 0
# Binary operators used later to combine pairs of columns into new features.
func_dict = {
    'add': lambda x, y: x + y,
    'sub': lambda x, y: x - y,
    'div': lambda x, y: x / (y + ep),
    'mul': lambda x, y: x * y,
}
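A quick sanity check of the four combiners on scalar inputs (illustrative values only):
print(func_dict['add'](2.0, 3.0))  # 5.0
print(func_dict['sub'](2.0, 3.0))  # -1.0
print(func_dict['div'](1.0, 0.0))  # ~1e5: ep keeps the division finite
print(func_dict['mul'](2.0, 3.0))  # 6.0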

df = pd.read_csv('train.csv')
df = df.drop(['ID'], axis=1)
# Per-row variance and std were computed as candidate features,
# but they are not used anywhere below.
prfe = df.iloc[:, :-1]
me = np.reshape(prfe.var(axis=1).to_numpy(), (-1, 1))
me1 = np.reshape(prfe.std(axis=1).to_numpy(), (-1, 1))
df = df.to_numpy()
# Row-wise FFT magnitude of the raw features; the last column of df is the label.
# (np.fft.fft works along the last axis by default, i.e. per sample.)
feature = np.abs(np.fft.fft(df[:, :-1]))
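As a quick, self-contained sanity check (not part of the pipeline): for a real-valued row the magnitude spectrum is symmetric, so roughly half of these FFT columns carry mirrored information.
demo = np.abs(np.fft.fft(np.array([[0.0, 1.0, 0.0, -1.0]])))
print(demo)  # [[0. 2. 0. 2.]] -- bins k and n-k have equal magnitude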

df1 = pd.read_csv('test.csv')
df1 = df1.drop(['ID'], axis=1)
df1 = df1.to_numpy()
# Apply the same row-wise FFT magnitude transform to the test set.
yunt_feature = np.abs(np.fft.fft(df1))

feature = pd.DataFrame(feature)
yunt_feature = pd.DataFrame(yunt_feature)


def auto_fea(fe, funcs, col_list):
    # For every ordered pair of distinct columns, apply each combiner and
    # append the result as a new column named '<i>-<op>-<j>', in place.
    for col_i in col_list:
        for col_j in col_list:
            if col_i != col_j:
                for func_name, func in funcs.items():
                    func_feature = func(fe[col_i], fe[col_j])
                    col_name = '-'.join([str(col_i), func_name, str(col_j)])
                    fe[col_name] = func_feature
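For instance, on a hypothetical two-column frame the function adds 2 * 1 * 4 = 8 combination columns (toy data, not the contest set):
toy = pd.DataFrame({0: [1.0, 2.0], 1: [3.0, 4.0]})
auto_fea(toy, func_dict, list(toy.columns))
print(list(toy.columns))
# [0, 1, '0-add-1', '0-sub-1', '0-div-1', '0-mul-1',
#  '1-add-0', '1-sub-0', '1-div-0', '1-mul-0']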


# Append the label as the last column, then keep only the columns whose
# absolute correlation with it exceeds 0.3 (the label itself passes trivially).
feature = np.concatenate((feature, np.reshape(df[:, -1], (-1, 1))), axis=1)
train = pd.DataFrame(feature)
heat = train.corr()
fe = heat.index[abs(heat[feature.shape[1] - 1]) > 0.3]
train = train.to_numpy()
feature = train[:, fe]
feature = feature[:, :-1]  # drop the label column again
feature = pd.DataFrame(feature)
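A standalone illustration of this correlation filter with made-up numbers: 'b' is uncorrelated with the label and is dropped, while 'a' and the label itself survive.
toy = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [1, 3, 3, 1], 'y': [1, 2, 3, 4]})
corr_with_label = toy.corr()['y'].abs()
print(corr_with_label[corr_with_label > 0.3].index.tolist())  # ['a', 'y']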

# Select the same columns in the test features (fe minus the label column),
# then expand both sets with the pairwise combinations.
yunt_feature = yunt_feature.to_numpy()
yunt_feature = yunt_feature[:, fe[:-1]]
yunt_feature = pd.DataFrame(yunt_feature)
auto_fea(feature, func_dict, list(feature.columns))
auto_fea(yunt_feature, func_dict, list(yunt_feature.columns))
yunt_feature = yunt_feature.to_numpy()
feature = feature.to_numpy()

# Second selection round over the expanded features, with a stricter
# threshold of 0.42 on the correlation with the label.
feature = np.concatenate((feature, np.reshape(df[:, -1], (-1, 1))), axis=1)
train = pd.DataFrame(feature)
heat = train.corr()
fe1 = heat.index[abs(heat[feature.shape[1] - 1]) > 0.42]
print(len(fe1))  # number of surviving columns, including the label
train = train.to_numpy()
train = train[:, fe1]
yunt_feature = yunt_feature[:, fe1[:-1]]
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, cross_val_score

kf = KFold(n_splits=5, shuffle=False)
# Sweep max_depth from 1 to 10, printing the mean in-fold training accuracy,
# the mean held-out accuracy, and sklearn's own 5-fold cross_val_score.
for k in range(10):
    train_acc = 0.0
    test_acc = 0.0
    n_folds = 0
    for train_index, test_index in kf.split(train):
        n_folds += 1
        tfeature = train[train_index, :-1]
        label = train[train_index, -1]
        clf = tree.DecisionTreeClassifier(criterion='gini', random_state=0, max_depth=k + 1)
        clf.fit(tfeature, label)
        pred = clf.predict(tfeature)
        ttest = train[test_index, :-1]
        testlabel = train[test_index, -1]
        pred1 = clf.predict(ttest)
        train_acc += accuracy_score(label, pred)
        test_acc += accuracy_score(testlabel, pred1)
    clf1 = tree.DecisionTreeClassifier(criterion='gini', random_state=0, max_depth=k + 1)
    scores = cross_val_score(clf1, train[:, :-1], train[:, -1], cv=5)
    print(k, train_acc / n_folds, test_acc / n_folds, scores.mean())
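Instead of eyeballing the printout, the depth could also be chosen programmatically. A small sketch using the same 5-fold cross_val_score criterion (an alternative to, not part of, the original run):
best_depth = max(
    range(1, 11),
    key=lambda d: cross_val_score(
        tree.DecisionTreeClassifier(criterion='gini', random_state=0, max_depth=d),
        train[:, :-1], train[:, -1], cv=5,
    ).mean(),
)
print('best max_depth:', best_depth)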
   
# Train the final tree with the depth picked from the sweep above (max_depth=5)
# on the full training set and write the submission; test IDs start at 210.
clf1 = tree.DecisionTreeClassifier(criterion='gini', random_state=0, max_depth=5)
clf1.fit(train[:, :-1], train[:, -1])
out = pd.DataFrame(clf1.predict(yunt_feature), columns=['CLASS'])
out['ID'] = np.arange(210, 210 + out.shape[0])
out[['ID', 'CLASS']].to_csv('out.csv', index=False)