将数据集分割为 train、dev、test 三部分的代码样例
数据分割代码，可以根据需要调整分割比例以及文件路径。
import os
import random
def split_data(file_path, train_ratio=0.8, dev_ratio=0.1, test_ratio=0.1,
               output_dir='./data', seed=None, encoding='utf-8'):
    """Shuffle the lines of a text file and write train/dev/test splits.

    The input file is read line by line, shuffled, and cut into three
    consecutive slices that are written to ``train.txt``, ``dev.txt`` and
    ``test.txt`` inside ``output_dir``.

    Args:
        file_path: Path of the text file holding one sample per line.
        train_ratio: Fraction of lines written to ``train.txt``.
        dev_ratio: Fraction of lines written to ``dev.txt``.
        test_ratio: Accepted for interface compatibility only. The test
            split always receives every line left over after the train and
            dev slices, so this value does not influence the result.
        output_dir: Directory that receives the three output files. It is
            created if it does not exist.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` state is used, so a prior
            ``random.seed(...)`` call by the caller is still honoured.
        encoding: Text encoding used for both reading and writing.

    Returns:
        None. The splits are written to disk.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        lines = file.readlines()

    # A final line without a trailing newline would be glued to whichever
    # line ends up after it once shuffled, silently merging two samples.
    if lines and not lines[-1].endswith('\n'):
        lines[-1] += '\n'

    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(lines)

    total_lines = len(lines)
    train_end = int(total_lines * train_ratio)
    dev_end = train_end + int(total_lines * dev_ratio)

    splits = {
        'train.txt': lines[:train_end],
        'dev.txt': lines[train_end:dev_end],
        # Everything that is left goes to test, so rounding never drops lines.
        'test.txt': lines[dev_end:],
    }

    os.makedirs(output_dir, exist_ok=True)
    for file_name, split_lines in splits.items():
        out_path = os.path.join(output_dir, file_name)
        with open(out_path, 'w', encoding=encoding) as out_file:
            out_file.writelines(split_lines)
if __name__ == "__main__":
    # Script entry point: split the combined dataset using the default ratios.
    source_path = './data/all.txt'
    split_data(source_path)

浙公网安备 33010602011771号