import json
import os

from datasets import (
    DatasetBuilder,
    DatasetInfo,
    GeneratorBasedBuilder,
    SplitGenerator,
    SplitInfo,
)
from datasets.features import Features, Value


class CustomDataset(GeneratorBasedBuilder):
    """Dataset builder that exposes per-folder train/test JSONL files as splits.

    Each example carries four string fields: ``id``, ``system_prompt``,
    ``question`` and ``response``.

    The class derives from ``GeneratorBasedBuilder`` (itself a
    ``DatasetBuilder``) because that is the base class which turns the
    ``_generate_examples`` generator into prepared splits; the bare
    ``DatasetBuilder`` leaves split preparation unimplemented.
    """

    def _info(self) -> DatasetInfo:
        """Return the dataset metadata describing the four string features."""
        return DatasetInfo(
            features=Features({
                'id': Value('string'),
                'system_prompt': Value('string'),
                'question': Value('string'),
                'response': Value('string'),
            }),
        )

    def _split_generators(self, dl_manager):
        """Build one train and one test split for every folder found on disk.

        Args:
            dl_manager: Download manager used to fetch and extract the data.

        Returns:
            A list of ``SplitGenerator`` objects named ``<folder>_train`` and
            ``<folder>_test`` (dashes in the folder name replaced by
            underscores). Folders missing either file are skipped, so the
            list may be empty.
        """
        # NOTE(review): this URL is the dataset's hub page, not an archive;
        # confirm that download_and_extract really yields a directory that
        # contains the folders listed below.
        base_path = dl_manager.download_and_extract(
            'https://huggingface.co/datasets/Open-Orca/OpenOrca'
        )
        folders = [
            '001-cot',
            '002-flan',
            '003-flan-1m',
            '004-flan1m-aug-shuf',
            '005-flan-5m',
            '006-flan-chatgpt',
            '007-gpt4_100k',
            '008-niv',
            '009-t0',
        ]  # add more as needed

        split_generators = []
        for folder in folders:
            # NOTE(review): the 'cot-' file prefix is used for every folder,
            # not only '001-cot' - confirm the real file names.
            train_file_path = os.path.join(base_path, f'{folder}/cot-train.jsonl')
            test_file_path = os.path.join(base_path, f'{folder}/cot-test.jsonl')

            # Only expose a folder when both of its files are present.
            if not (os.path.isfile(train_file_path) and os.path.isfile(test_file_path)):
                continue

            split_prefix = folder.replace("-", "_")
            split_generators.extend([
                SplitGenerator(
                    name=f'{split_prefix}_train',
                    gen_kwargs={"filepath": train_file_path},
                ),
                SplitGenerator(
                    name=f'{split_prefix}_test',
                    gen_kwargs={"filepath": test_file_path},
                ),
            ])
        return split_generators

    def _generate_examples(self, filepath):
        """Yield ``(key, example)`` pairs from a JSON Lines file.

        Args:
            filepath: Path to a UTF-8 encoded file with one JSON object per
                line.

        Yields:
            Tuples of the zero-based line index and a dict with the keys
            ``id``, ``system_prompt``, ``question`` and ``response``; a key
            missing from the JSON object defaults to an empty string.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            for id_, line in enumerate(f):
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline at end of file)
                    # carry no record and would make json.loads raise.
                    continue
                data = json.loads(line)
                yield id_, {
                    'id': data.get('id', ''),
                    'system_prompt': data.get('system_prompt', ''),
                    'question': data.get('question', ''),
                    'response': data.get('response', ''),
                }