# Provenance: contents of OpenOrca.py as created by the patch
# "Upload OpenOrca.py" (commit cd1cb0237bf4d4a14506a31e4054cfe16a60682a,
# Alignment Lab AI, Thu, 29 Jun 2023 23:44:20 +0000).
"""Hugging Face ``datasets`` loading script for JSONL sub-corpora.

Each configured folder is expected to contain ``<folder>-train.jsonl`` and
``<folder>-test.jsonl``. Every line of those files is a JSON object with the
string fields ``id``, ``system_prompt``, ``question`` and ``response``.
"""
import json
import os

from datasets import (
    DatasetBuilder,
    DatasetInfo,
    GeneratorBasedBuilder,
    SplitGenerator,
    SplitInfo,
)
from datasets.features import Features, Value

# Fallback data root used when no ``data_dir`` is supplied to the builder.
_DEFAULT_BASE_PATH = 'path_to_your_data'

# Sub-corpus folders under the data root; add more as needed.
_FOLDERS = (
    '001-cot',
    '002-flan',
    '003-flan-1m',
    '004-flan1m-aug-shuf',
    '005-flan-5m',
    '006-flan-chatgpt',
    '007-gpt4_100k',
    '008-niv',
    '009-t0',
)

# Columns exposed by the dataset; every one is a plain string.
_FIELDS = ('id', 'system_prompt', 'question', 'response')


class CustomDataset(GeneratorBasedBuilder):
    """Builder that exposes a train and a test split per sub-corpus folder.

    The class derives from ``GeneratorBasedBuilder`` (itself a
    ``DatasetBuilder``) because that is the base class which turns
    ``_generate_examples`` into prepared splits; the bare ``DatasetBuilder``
    leaves split preparation unimplemented.
    """

    def _info(self) -> DatasetInfo:
        """Return the dataset schema: four string columns."""
        return DatasetInfo(
            features=Features({name: Value('string') for name in _FIELDS}),
        )

    def _split_generators(self, dl_manager):
        """Build one train and one test ``SplitGenerator`` per folder.

        Args:
            dl_manager: Download manager supplied by ``datasets``; unused
                because the files are read from a local directory.

        Returns:
            A list of ``SplitGenerator`` objects, two per folder, each
            carrying the JSONL ``filepath`` for ``_generate_examples``.
        """
        # Prefer the user-supplied ``data_dir`` (e.g. ``load_dataset(...,
        # data_dir=...)``); fall back to the original placeholder otherwise.
        # NOTE(review): assumes ``self.config.data_dir`` is how the root is
        # passed in -- confirm against the intended calling convention.
        base_path = getattr(self.config, 'data_dir', None) or _DEFAULT_BASE_PATH

        split_generators = []
        for folder in _FOLDERS:
            # Split names may only contain word characters (letters, digits,
            # underscore), so hyphens from the folder name are mapped to
            # underscores. File and folder names on disk are left untouched.
            split_prefix = folder.replace('-', '_')
            for subset in ('train', 'test'):
                filepath = os.path.join(
                    base_path, folder, f'{folder}-{subset}.jsonl'
                )
                split_generators.append(
                    SplitGenerator(
                        name=f'{split_prefix}_{subset}',
                        gen_kwargs={'filepath': filepath},
                    )
                )
        return split_generators

    def _generate_examples(self, filepath):
        """Yield ``(key, example)`` pairs from one JSONL file.

        Args:
            filepath: Path to a JSONL file with one JSON object per line.

        Yields:
            Tuples of the zero-based line index and a dict holding the
            ``id``, ``system_prompt``, ``question`` and ``response`` values.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
            KeyError: If a record lacks one of the expected fields.
        """
        # Explicit UTF-8 so decoding does not depend on the platform locale.
        with open(filepath, 'r', encoding='utf-8') as f:
            for id_, line in enumerate(f):
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                if not line.strip():
                    continue
                data = json.loads(line)
                yield id_, {field: data[field] for field in _FIELDS}