openfe的使用

文章和代码:https://www.baidu.com/link?url=nz86RZmDqSVtQ85oRCyiQw1NLINAy4c65DoIC-uFJ7jwrM_H7qX9cQHZxltjl7RW&wd=&eqid=f8a55867001e79ac000000036394620b
直接把数据丢进去,提升效果不太大。它会生成很多特征,导致计算机运算需要很长时间,因此不建议直接放入维度较大的特征,需要自己先筛选一下。
import pandas as pd
from sklearn.datasets import fetch_california_housing
from openfe import openfe, transform, tree_to_formula
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn import tree
import warnings
warnings.filterwarnings("ignore")
def get_score(train_x, test_x, train_y, test_y, n_jobs=3):
    """Fit a LightGBM regressor and return its mean squared error on the test set.

    The training data is split 80/20 (``random_state=1``) into a fitting part
    and a validation part; the validation part is only used for early
    stopping (50 rounds without improvement).

    Args:
        train_x: Training features.
        test_x: Test features; must expose ``.index`` (e.g. a DataFrame),
            because the predictions are re-indexed with it.
        train_y: Training targets.
        test_y: Test targets.
        n_jobs: Number of threads passed to ``LGBMRegressor``. Previously this
            was read from an undefined global, which raised ``NameError``.

    Returns:
        The mean squared error between ``test_y`` and the predictions.
    """
    train_x, val_x, train_y, val_y = train_test_split(
        train_x, train_y, test_size=0.2, random_state=1
    )
    params = {'n_estimators': 1000, 'n_jobs': n_jobs, 'seed': 1}
    gbm = lgb.LGBMRegressor(**params)
    gbm.fit(
        train_x,
        train_y,
        eval_set=[(val_x, val_y)],
        callbacks=[lgb.early_stopping(50, verbose=False)],
    )
    # Keep the test index so the predictions align with test_y row by row.
    pred = pd.DataFrame(gbm.predict(test_x), index=test_x.index)
    return mean_squared_error(test_y, pred)
import numpy as np
import warnings
if __name__ == '__main__':
    # First ID written to the submission file.
    # NOTE(review): presumably the test IDs continue after the training IDs;
    # confirm against the ID column of test.csv.
    first_test_id = 210

    # ---- training data ------------------------------------------------
    df = pd.read_csv('train.csv')
    df = df.drop(['ID'], axis=1)
    df = df.to_numpy()
    # Magnitude of the FFT of every row (np.fft.fft works along the last
    # axis by default). The last column is the label and is appended as-is.
    feature = np.abs(np.fft.fft(df[:, :-1]))
    feature = np.concatenate((feature, np.reshape(df[:, -1], (-1, 1))), axis=1)
    label_col = feature.shape[1] - 1

    # Keep the FFT columns whose absolute correlation with the label
    # exceeds 0.3; the label column itself is excluded explicitly.
    heat = pd.DataFrame(feature).corr()
    fe = heat.index[abs(heat[label_col]) > 0.3]
    selected = fe[fe != label_col].to_numpy()

    label = feature[:, label_col]
    # Take the training columns from the FFT features (not from the raw
    # data) so that training and test rows live in the same feature space.
    train = pd.DataFrame(feature[:, selected])
    w = [f'T{i}' for i in range(train.shape[1])]
    train.columns = w

    # ---- test data ----------------------------------------------------
    df1 = pd.read_csv('test.csv')
    df1 = df1.drop(['ID'], axis=1)
    df1 = df1.to_numpy()
    test_feature = np.abs(np.fft.fft(df1[:, :]))
    test_fe = pd.DataFrame(test_feature[:, selected])
    test_fe.columns = w

    # ---- feature generation --------------------------------------------
    ofe = openfe()
    ofe.fit(data=train, task='classification', label=pd.DataFrame(label), n_jobs=3, stage1_metric='corr', stage2_metric='permutation')
    train_x, test_x = transform(train, test_fe, ofe.new_features_list[:10], n_jobs=3)
    print(train_x.shape)
    print(train.shape)

    # Compare 5-fold CV accuracy without / with the generated features for
    # tree depths 1..10.
    for i in range(10):
        clf = tree.DecisionTreeClassifier(criterion='gini', random_state=0, max_depth=i + 1)
        clf1 = tree.DecisionTreeClassifier(criterion='gini', random_state=0, max_depth=i + 1)
        scores = cross_val_score(clf, train, label, cv=5)
        scores1 = cross_val_score(clf1, train_x, label, cv=5)
        print(i, scores.mean(), scores1.mean())

    print("The top 10 generated features are")
    for new_feature in ofe.new_features_list[:10]:
        print(tree_to_formula(new_feature))

    # ---- final model and submission file -------------------------------
    clf1 = tree.DecisionTreeClassifier(criterion='gini', random_state=0, max_depth=3 + 1)
    clf1.fit(train_x, label)
    print(clf1.predict(train_x))
    out = clf1.predict(test_x)
    print(out)
    out = pd.DataFrame(out)
    out.columns = ['CLASS']
    out['ID'] = np.arange(out.shape[0]) + first_test_id
    out[['ID', 'CLASS']].to_csv('out3.csv', index=False)
posted @ 2022-12-10 18:56  祥瑞哈哈哈  阅读(818)  评论(0)    收藏  举报