Patch for OpenOrca.py (CustomDataset._split_generators).

Replaces the placeholder base_path with the value returned by
dl_manager.download_and_extract(...) and joins each split's
'<folder>/train.jsonl' and '<folder>/test.jsonl' onto it, so the "filepath"
entries in gen_kwargs are located under base_path instead of being bare
relative paths. The second hunk only appends a blank line.

NOTE(review): the URL passed to download_and_extract looks like the dataset
landing page rather than an archive or file URL; confirm it really yields a
directory that contains the listed folders.
NOTE(review): the added lines call os.path.join, and no hunk here adds
`import os`; confirm the file already imports it.
NOTE(review): indentation and blank context lines were restored to match the
hunk line counts; use `git apply --ignore-whitespace` if the context does not
match the target file exactly.

diff --git a/OpenOrca.py b/OpenOrca.py
index 3d4f215..ecdfcd6 100644
--- a/OpenOrca.py
+++ b/OpenOrca.py
@@ -15,14 +15,14 @@ class CustomDataset(DatasetBuilder):
         )
 
     def _split_generators(self, dl_manager):
-        base_path = 'path_to_your_data'
+        base_path = dl_manager.download_and_extract('https://huggingface.co/datasets/Open-Orca/OpenOrca')
         folders = ['001-cot', '002-flan', '003-flan-1m', '004-flan1m-aug-shuf', '005-flan-5m', '006-flan-chatgpt', '007-gpt4_100k', '008-niv', '009-t0']  # add more as needed
         split_generators = []
 
         for folder in folders:
             split_generators.extend([
-                SplitGenerator(name=f'{folder.replace("-", "_")}_train', gen_kwargs={"filepath": f'{folder}/train.jsonl'}),
-                SplitGenerator(name=f'{folder.replace("-", "_")}_test', gen_kwargs={"filepath": f'{folder}/test.jsonl'}),
+                SplitGenerator(name=f'{folder.replace("-", "_")}_train', gen_kwargs={"filepath": os.path.join(base_path, f'{folder}/train.jsonl')}),
+                SplitGenerator(name=f'{folder.replace("-", "_")}_test', gen_kwargs={"filepath": os.path.join(base_path, f'{folder}/test.jsonl')}),
             ])
 
         return split_generators
@@ -36,3 +36,4 @@ class CustomDataset(DatasetBuilder):
                     'question': data['question'],
                     'response': data['response']
                 }
+