切分数据集

# Load the tab-separated FAQ table and discard its first column
# (presumably a saved row index -- confirm against the file).
faqs = (
    pd.read_csv('./data/FAQ.csv', sep='\t')
    .iloc[:, 1:]
)
faqs

# In[3]
# Bare expression: shows the table when run as a notebook cell.
faqs

# In[3]
# Split the data into train / dev / test sets (60% / 20% / 20%),
# stratified by label so every split keeps the same label proportions.
#
# Imported here because the split below is the first use of
# train_test_split; without it, running the file top to bottom
# raises NameError.
from sklearn.model_selection import train_test_split

faqs_len = len(faqs)
print('len(faqs):', faqs_len)

# Convert the columns to plain lists once and reuse them.
questions = faqs['question'].to_list()
labels = faqs['label'].to_list()

# Step 1: hold out 40% of the rows for dev + test.
X_train, X_dev_test, y_train, y_dev_test = train_test_split(
    questions,
    labels,
    test_size=0.4,
    random_state=6,  # fixed seed keeps the split reproducible
    stratify=labels,
)

# Step 2: cut the held-out 40% in half -> 20% dev, 20% test.
X_dev, X_test, y_dev, y_test = train_test_split(
    X_dev_test,
    y_dev_test,
    test_size=0.5,
    random_state=6,
    stratify=y_dev_test,
)

print('train: ', len(X_train), len(y_train))
print('dev: ', len(X_dev), len(y_dev))
print('test: ', len(X_test), len(y_test))

# In[3]
from sklearn.model_selection import train_test_split
# Save the training split.
# Questions and labels are wrapped in single-column frames, then aligned
# side by side on their shared default index.
X_train_DataFrame = pd.DataFrame({'question': X_train})
y_train_DataFrame = pd.DataFrame({'label': y_train})
train_all = X_train_DataFrame.join(y_train_DataFrame)
# The row index is written too (to_csv default), so the file starts with
# an extra index column.
train_all.to_csv('./data/train.csv', sep='\t')

# In[4]
# Save the dev (validation) split.
# Questions and labels are wrapped in single-column frames, then aligned
# side by side on their shared default index.
X_dev_DataFrame = pd.DataFrame({'question': X_dev})
y_dev_DataFrame = pd.DataFrame({'label': y_dev})
dev_all = X_dev_DataFrame.join(y_dev_DataFrame)
# The row index is written too (to_csv default), so the file starts with
# an extra index column.
dev_all.to_csv('./data/dev.csv', sep='\t')

# In[4]
# Save the test split.
# Questions and labels are wrapped in single-column frames, then aligned
# side by side on their shared default index.
X_test_DataFrame = pd.DataFrame({'question': X_test})
y_test_DataFrame = pd.DataFrame({'label': y_test})
test_all = X_test_DataFrame.join(y_test_DataFrame)
# The row index is written too (to_csv default), so the file starts with
# an extra index column.
test_all.to_csv('./data/test.csv', sep='\t')
posted @ 2021-03-27 11:11  douzujun  阅读(266)  评论(0)  编辑  收藏  举报