将数据集分割为 train、dev、test 的代码样例

数据分割代码，可以调整分割比例以及文件路径。

import os
import random
import warnings

def split_data(file_path, train_ratio=0.8, dev_ratio=0.1, test_ratio=0.1,
               *, output_dir='./data', seed=None, encoding=None):
    """Shuffle the lines of a text file and write train/dev/test splits.

    The file is read line by line, shuffled, and cut into three consecutive
    slices that are written to ``train.txt``, ``dev.txt`` and ``test.txt``
    inside ``output_dir``.

    Args:
        file_path: Path of the text file holding one sample per line.
        train_ratio: Fraction of lines written to ``train.txt``.
        dev_ratio: Fraction of lines written to ``dev.txt``.
        test_ratio: Expected fraction for ``test.txt``. The test split
            always receives every line left over after the train and dev
            splits (so rounding never drops a line); this value is only
            checked for consistency with the other two ratios.
        output_dir: Directory that receives the three output files. It is
            created when missing.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` generator is used.
        encoding: Text encoding used for reading and writing. ``None``
            keeps the platform default used by ``open``.

    Raises:
        ValueError: If any ratio is negative or ``train_ratio + dev_ratio``
            exceeds 1.
    """
    if min(train_ratio, dev_ratio, test_ratio) < 0:
        raise ValueError("split ratios must be non-negative")
    # Small tolerance so float noise such as 0.9 + 0.1 is not rejected.
    if train_ratio + dev_ratio > 1 + 1e-9:
        raise ValueError("train_ratio + dev_ratio must not exceed 1")
    if abs(train_ratio + dev_ratio + test_ratio - 1) > 1e-9:
        warnings.warn(
            "split ratios do not sum to 1; the test split receives every "
            "line left over after the train and dev splits",
            stacklevel=2,
        )

    with open(file_path, 'r', encoding=encoding) as file:
        lines = file.readlines()

    # Only the final line can lack a newline. Without this, shuffling could
    # move it mid-file and writelines() would fuse it with the next sample.
    if lines and not lines[-1].endswith('\n'):
        lines[-1] += '\n'

    rng = random if seed is None else random.Random(seed)
    rng.shuffle(lines)

    total_lines = len(lines)
    train_end = int(total_lines * train_ratio)
    dev_end = train_end + int(total_lines * dev_ratio)

    splits = {
        'train.txt': lines[:train_end],
        'dev.txt': lines[train_end:dev_end],
        'test.txt': lines[dev_end:],
    }

    # An empty output_dir means "current directory"; makedirs('') would fail.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    for name, chunk in splits.items():
        out_path = os.path.join(output_dir, name)
        with open(out_path, 'w', encoding=encoding) as out_file:
            out_file.writelines(chunk)

if __name__ == "__main__":
    # Script entry point: split the combined data file using split_data's
    # default ratios.
    source_path = './data/all.txt'
    split_data(source_path)
posted @ 2025-01-24 11:32  我千五可以  阅读(39)  评论(0)    收藏  举报