代码改变世界

生成K-fold交叉验证数据集

2021-05-18 15:21  DataBases  阅读(262)  评论(0)    收藏  举报
import pandas as pd
from sklearn import model_selection
if __name__ == "__main__":
    # Load the training rows from train.csv.
    data = pd.read_csv("train.csv")

    # Every row starts in the placeholder fold -1 until it is assigned below.
    data["kfold"] = -1

    # Shuffle all rows, then renumber the index from 0 so that the positional
    # indices produced by the splitter line up with the labels used by .loc.
    shuffled = data.sample(frac=1)
    data = shuffled.reset_index(drop=True)

    # Five-way splitter from scikit-learn's model_selection module.
    splitter = model_selection.KFold(n_splits=5)

    # Tag each row with the number of the fold in which it is validation data;
    # the training indices of each split are not needed here.
    for fold_number, (_, valid_rows) in enumerate(splitter.split(X=data)):
        data.loc[valid_rows, 'kfold'] = fold_number

    # Write the table, now including the kfold column, without the index.
    data.to_csv("train_folds.csv", index=False)
============================================
import pandas as pd
from sklearn.datasets import make_regression
from sklearn import model_selection
if __name__ == '__main__':
    # Build a synthetic regression problem: 15000 samples, 8 features,
    # one target.
    X, y = make_regression(n_samples=15000, n_features=8, n_targets=1)

    # Wrap the features in a DataFrame with columns f_0 ... f_7 and append
    # the target as its own column.
    df = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])])
    df.loc[:, "target"] = y

    # Placeholder fold id; every row is overwritten in the loop below.
    df["kfold"] = -1

    # Shuffle the rows and renumber the index from 0 so the positional
    # indices produced by the splitter line up with the labels used by .loc.
    df = df.sample(frac=1).reset_index(drop=True)

    kf = model_selection.KFold(n_splits=5)

    # Record, for each row, the fold in which it is used for validation.
    for f, (t_, v_) in enumerate(kf.split(X=df)):
        df.loc[v_, "kfold"] = f

    # NOTE(review): the original paste had lost all indentation; printing
    # once after the loop, inside the main guard, is the assumed intent.
    print(df)