From 7eff4d5e71910fa78cf0870b9742e06c2f0a40b5 Mon Sep 17 00:00:00 2001 From: Alignment Lab AI Date: Fri, 30 Jun 2023 00:20:44 +0000 Subject: [PATCH] Update OpenOrca.py --- OpenOrca.py | 67 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 56 insertions(+), 11 deletions(-) diff --git a/OpenOrca.py b/OpenOrca.py index f6f6932..f5a0f58 100644 --- a/OpenOrca.py +++ b/OpenOrca.py @@ -3,6 +3,22 @@ from datasets.features import Features, Value import json import os +class CustomDataset(DatasetBuilder): + def _info(self) -> DatasetInfo: + return DatasetInfo( + features=Features({ + 'id': Value('string'), + 'system_prompt': Value('string'), + 'question': Value('string'), + 'response': Value('string') + }), + ) + +from datasets import DatasetBuilder, DatasetInfo, SplitGenerator, SplitInfo +from datasets.features import Features, Value +import json +import os + class CustomDataset(DatasetBuilder): def _info(self) -> DatasetInfo: return DatasetInfo( @@ -16,19 +32,48 @@ class CustomDataset(DatasetBuilder): def _split_generators(self, dl_manager): base_path = dl_manager.download_and_extract('https://huggingface.co/datasets/Open-Orca/OpenOrca') - folders = ['001-cot', '002-flan', '003-flan-1m', '004-flan1m-aug-shuf', '005-flan-5m', - '006-flan-chatgpt', '007-gpt4_100k', '008-niv', '009-t0'] - split_generators = [] - for folder in folders: - train_file_path = os.path.join(base_path, f'{folder}/{folder}-train.jsonl') - test_file_path = os.path.join(base_path, f'{folder}/{folder}-test.jsonl') - if os.path.isfile(train_file_path) and os.path.isfile(test_file_path): - split_generators.extend([ - SplitGenerator(name=f'{folder.replace("-", "_")}_train', gen_kwargs={"filepath": train_file_path}), - SplitGenerator(name=f'{folder.replace("-", "_")}_test', gen_kwargs={"filepath": test_file_path}), - ]) + + split_generators = [ + SplitGenerator(name='001_cot_train', gen_kwargs={"filepath": os.path.join(base_path, '001-cot/cot-train.jsonl')}), + SplitGenerator(name='001_cot_test', gen_kwargs={"filepath": os.path.join(base_path, '001-cot/cot-test.jsonl')}), + + SplitGenerator(name='002_flan_train', gen_kwargs={"filepath": os.path.join(base_path, '002-flan/flan-train.jsonl')}), + SplitGenerator(name='002_flan_test', gen_kwargs={"filepath": os.path.join(base_path, '002-flan/flan-test.jsonl')}), + + SplitGenerator(name='003_flan1m_train', gen_kwargs={"filepath": os.path.join(base_path, '003-flan-1m/flan-1m-train.jsonl')}), + SplitGenerator(name='003_flan1m_test', gen_kwargs={"filepath": os.path.join(base_path, '003-flan-1m/flan-1m-test.jsonl')}), + + SplitGenerator(name='004_flan1m_aug_shuf_train', gen_kwargs={"filepath": os.path.join(base_path, '004-flan1m-aug-shuf/flan1m-aug-shuf-train.jsonl')}), + SplitGenerator(name='004_flan1m_aug_shuf_test', gen_kwargs={"filepath": os.path.join(base_path, '004-flan1m-aug-shuf/flan1m-aug-shuf-test.jsonl')}), + + SplitGenerator(name='005_flan5m_train', gen_kwargs={"filepath": os.path.join(base_path, '005-flan-5m/flan-5m-train.jsonl')}), + SplitGenerator(name='005_flan5m_test', gen_kwargs={"filepath": os.path.join(base_path, '005-flan-5m/flan-5m-test.jsonl')}), + + SplitGenerator(name='006_flan_chatgpt_train', gen_kwargs={"filepath": os.path.join(base_path, '006-flan-chatgpt/flan-chatgpt-train.jsonl')}), + SplitGenerator(name='006_flan_chatgpt_test', gen_kwargs={"filepath": os.path.join(base_path, '006-flan-chatgpt/flan-chatgpt-test.jsonl')}), + + SplitGenerator(name='007_gpt4_100k_train', gen_kwargs={"filepath": os.path.join(base_path, '007-gpt4_100k/gpt4_100k-train.jsonl')}), + SplitGenerator(name='007_gpt4_100k_test', gen_kwargs={"filepath": os.path.join(base_path, '007-gpt4_100k/gpt4_100k-test.jsonl')}), + + SplitGenerator(name='008_niv_train', gen_kwargs={"filepath": os.path.join(base_path, '008-niv/niv-train.jsonl')}), + SplitGenerator(name='008_niv_test', gen_kwargs={"filepath": os.path.join(base_path, '008-niv/niv-test.jsonl')}), + + SplitGenerator(name='009_t0_train', gen_kwargs={"filepath": os.path.join(base_path, '009-t0/t0-train.jsonl')}), + SplitGenerator(name='009_t0_test', gen_kwargs={"filepath": os.path.join(base_path, '009-t0/t0-test.jsonl')}), + ] + return split_generators + def _generate_examples(self, filepath): + with open(filepath, 'r') as f: + for id_, line in enumerate(f): + data = json.loads(line) + yield id_, { + 'id': data['id'], + 'system_prompt': data['system_prompt'], + 'question': data['question'], + 'response': data['response'] + } def _generate_examples(self, filepath): with open(filepath, 'r', encoding='utf-8') as f: for id_, line in enumerate(f):