OpenOrca/OpenOrca.py

42 lines
1.9 KiB
Python
Raw Normal View History

2023-06-29 23:44:20 +00:00
import json
import os

from datasets import DatasetBuilder, DatasetInfo, GeneratorBasedBuilder, SplitGenerator, SplitInfo
from datasets.features import Features, Value
class CustomDataset(GeneratorBasedBuilder):
    """Dataset builder that reads per-folder JSON Lines files.

    Each record exposes four string fields: ``id``, ``system_prompt``,
    ``question`` and ``response``.

    The class derives from ``GeneratorBasedBuilder`` (itself a
    ``DatasetBuilder``) because it supplies ``_generate_examples``; the plain
    ``DatasetBuilder`` base does not consume that hook.
    """

    def _info(self) -> DatasetInfo:
        """Return the dataset metadata declaring the four string features."""
        return DatasetInfo(
            features=Features({
                'id': Value('string'),
                'system_prompt': Value('string'),
                'question': Value('string'),
                'response': Value('string'),
            }),
        )

    def _split_generators(self, dl_manager):
        """Build one train and one test split per data folder.

        Args:
            dl_manager: Download manager used to fetch and extract the data.

        Returns:
            A list of ``SplitGenerator`` objects. A folder contributes a
            ``<folder>_train`` / ``<folder>_test`` pair (dashes replaced by
            underscores) only when both of its files exist on disk; other
            folders are silently skipped.
        """
        # NOTE(review): this URL is the dataset's repository page rather than
        # an archive; confirm that download_and_extract really yields a
        # directory containing the folders listed below.
        base_path = dl_manager.download_and_extract('https://huggingface.co/datasets/Open-Orca/OpenOrca')
        folders = ['001-cot', '002-flan', '003-flan-1m', '004-flan1m-aug-shuf', '005-flan-5m',
                   '006-flan-chatgpt', '007-gpt4_100k', '008-niv', '009-t0']  # add more as needed
        split_generators = []
        for folder in folders:
            # NOTE(review): every folder is probed for 'cot-*.jsonl' files;
            # verify the non-cot folders actually use these file names.
            train_file_path = os.path.join(base_path, f'{folder}/cot-train.jsonl')
            test_file_path = os.path.join(base_path, f'{folder}/cot-test.jsonl')
            if not (os.path.isfile(train_file_path) and os.path.isfile(test_file_path)):
                continue
            # Split names use underscores in place of the folder's dashes.
            split_prefix = folder.replace("-", "_")
            split_generators.extend([
                SplitGenerator(name=f'{split_prefix}_train', gen_kwargs={"filepath": train_file_path}),
                SplitGenerator(name=f'{split_prefix}_test', gen_kwargs={"filepath": test_file_path}),
            ])
        return split_generators

    def _generate_examples(self, filepath):
        """Yield ``(key, example)`` pairs from a JSON Lines file.

        Args:
            filepath: Path to a UTF-8 encoded file with one JSON object per line.

        Yields:
            Tuples of the zero-based line index and a dict with the keys
            ``id``, ``system_prompt``, ``question`` and ``response``. A key
            missing from a record defaults to the empty string.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            for id_, line in enumerate(f):
                stripped = line.strip()
                if not stripped:
                    # Blank lines (e.g. a trailing newline at end of file)
                    # carry no record and would make json.loads raise.
                    continue
                data = json.loads(stripped)
                yield id_, {
                    'id': data.get('id', ''),
                    'system_prompt': data.get('system_prompt', ''),
                    'question': data.get('question', ''),
                    'response': data.get('response', ''),
                }