Update OpenOrca.py

This commit is contained in:
Alignment Lab AI 2023-06-29 23:54:28 +00:00 committed by huggingface-web
parent e30de90a53
commit 762ff80316

@@ -15,14 +15,14 @@ class CustomDataset(DatasetBuilder):
) )
def _split_generators(self, dl_manager): def _split_generators(self, dl_manager):
base_path = 'path_to_your_data' base_path = dl_manager.download_and_extract('https://huggingface.co/datasets/Open-Orca/OpenOrca')
folders = ['001-cot', '002-flan', '003-flan-1m', '004-flan1m-aug-shuf', '005-flan-5m', folders = ['001-cot', '002-flan', '003-flan-1m', '004-flan1m-aug-shuf', '005-flan-5m',
'006-flan-chatgpt', '007-gpt4_100k', '008-niv', '009-t0'] # add more as needed '006-flan-chatgpt', '007-gpt4_100k', '008-niv', '009-t0'] # add more as needed
split_generators = [] split_generators = []
for folder in folders: for folder in folders:
split_generators.extend([ split_generators.extend([
SplitGenerator(name=f'{folder.replace("-", "_")}_train', gen_kwargs={"filepath": f'{folder}/train.jsonl'}), SplitGenerator(name=f'{folder.replace("-", "_")}_train', gen_kwargs={"filepath": os.path.join(base_path, f'{folder}/train.jsonl')}),
SplitGenerator(name=f'{folder.replace("-", "_")}_test', gen_kwargs={"filepath": f'{folder}/test.jsonl'}), SplitGenerator(name=f'{folder.replace("-", "_")}_test', gen_kwargs={"filepath": os.path.join(base_path, f'{folder}/test.jsonl')}),
]) ])
return split_generators return split_generators
@@ -36,3 +36,4 @@ class CustomDataset(DatasetBuilder):
'question': data['question'], 'question': data['question'],
'response': data['response'] 'response': data['response']
} }