Running a decision tree baseline directly

The accuracy is 0.67; the next post will remove the features with large bias.
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, cross_val_score

# Load the data and drop the ID column, which carries no predictive signal.
df = pd.read_csv('train.csv')
df1 = pd.read_csv('test.csv')
df = df.drop(['ID'], axis=1).to_numpy()

# 5-fold cross-validation; shuffle=False keeps the original row order,
# which is fine as long as train.csv is not sorted by class.
kf = KFold(n_splits=5, shuffle=False)
# Sweep max_depth from 1 to 10 and report the mean train/validation accuracy
# over the 5 folds, cross-checked against sklearn's cross_val_score.
for depth in range(1, 11):
    train_acc = 0.0
    val_acc = 0.0
    n_folds = 0
    for train_index, test_index in kf.split(df):
        n_folds += 1
        tfeature = df[train_index, :-1]
        tlabel = df[train_index, -1]
        clf = tree.DecisionTreeClassifier(criterion='gini', random_state=0, max_depth=depth)
        clf.fit(tfeature, tlabel)
        ttest = df[test_index, :-1]
        testlabel = df[test_index, -1]
        train_acc += accuracy_score(tlabel, clf.predict(tfeature))
        val_acc += accuracy_score(testlabel, clf.predict(ttest))
    clf1 = tree.DecisionTreeClassifier(criterion='gini', random_state=0, max_depth=depth)
    scores = cross_val_score(clf1, df[:, :-1], df[:, -1], cv=5)
    print(depth, train_acc / n_folds, val_acc / n_folds, scores.mean())
   
# Refit at the chosen depth on the full training set and predict the test set.
clf = tree.DecisionTreeClassifier(criterion='gini', random_state=0, max_depth=11)
clf.fit(df[:, :-1], df[:, -1])
df1 = df1.drop(['ID'], axis=1).to_numpy()
y = clf.predict(df1)

# Test-set IDs start at 210, so rebuild the ID column for the submission file.
out = pd.DataFrame({'ID': np.arange(210, 210 + len(y)), 'CLASS': y})
out[['ID', 'CLASS']].to_csv('out.csv', index=False)
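
As a teaser for that feature-pruning step, here is a minimal sketch of how the fitted tree's impurity-based importances could be used to rank features. It assumes the `clf` fitted above; the printed indices are column positions in the feature matrix, since the original column names were lost in the to_numpy() conversion.

# Minimal sketch: rank features by the fitted tree's impurity-based
# importances (assumes `clf` from the script above).
importances = clf.feature_importances_      # one score per feature column
ranked = np.argsort(importances)[::-1]      # indices, most important first
for idx in ranked[:10]:
    print(f'feature column {idx}: importance {importances[idx]:.4f}')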