Look up each folder's split files as <folder>/<folder>-train.jsonl and
<folder>/<folder>-test.jsonl instead of the fixed cot-train.jsonl /
cot-test.jsonl names, and drop the stale "add more as needed" comment.

diff --git a/OpenOrca.py b/OpenOrca.py
index 7faef74..67ca7e9 100644
--- a/OpenOrca.py
+++ b/OpenOrca.py
@@ -17,11 +17,11 @@ class CustomDataset(DatasetBuilder):
     def _split_generators(self, dl_manager):
         base_path = dl_manager.download_and_extract('https://huggingface.co/datasets/Open-Orca/OpenOrca')
         folders = ['001-cot', '002-flan', '003-flan-1m', '004-flan1m-aug-shuf', '005-flan-5m',
-                   '006-flan-chatgpt', '007-gpt4_100k', '008-niv', '009-t0'] # add more as needed
+                   '006-flan-chatgpt', '007-gpt4_100k', '008-niv', '009-t0']
         split_generators = []
         for folder in folders:
-            train_file_path = os.path.join(base_path, f'{folder}/cot-train.jsonl')
-            test_file_path = os.path.join(base_path, f'{folder}/cot-test.jsonl')
+            train_file_path = os.path.join(base_path, f'{folder}/{folder}-train.jsonl')
+            test_file_path = os.path.join(base_path, f'{folder}/{folder}-test.jsonl')
             if os.path.isfile(train_file_path) and os.path.isfile(test_file_path):
                 split_generators.extend([
                     SplitGenerator(name=f'{folder.replace("-", "_")}_train', gen_kwargs={"filepath": train_file_path}),