60 lines
3.5 KiB
Python
60 lines
3.5 KiB
Python
from datasets import DatasetBuilder, DatasetInfo, SplitGenerator, SplitInfo
|
|
from datasets.features import Features, Value
|
|
import json
|
|
import os
|
|
|
|
class CustomDataset(DatasetBuilder):
    """Builder exposing the OpenOrca JSONL shards as train/test splits.

    Every record carries four string fields: ``id``, ``system_prompt``,
    ``question`` and ``response``. Each shard listed in ``_ORCA_SHARDS``
    contributes a ``<prefix>_train`` and a ``<prefix>_test`` split.

    NOTE(review): ``_generate_examples`` looks like the hook consumed by
    ``datasets.GeneratorBasedBuilder``; confirm that plain ``DatasetBuilder``
    is the intended base class.
    """

    # Location handed to the download manager.
    # NOTE(review): this appears to be the dataset landing page rather than
    # an archive; confirm that download_and_extract yields a directory that
    # contains the shard folders referenced below.
    _ORCA_URL = 'https://huggingface.co/datasets/Open-Orca/OpenOrca'

    # Names of the (string-valued) columns of every example.
    _ORCA_FIELDS = ('id', 'system_prompt', 'question', 'response')

    # The two split kinds generated for every shard, in emission order.
    _ORCA_SPLIT_KINDS = ('train', 'test')

    # (split-name prefix, relative path template); ``{}`` is replaced by the
    # split kind. Shard 006 deliberately has no file-name prefix.
    _ORCA_SHARDS = (
        ('001_cot', '001-cot/cot-{}.jsonl'),
        ('002_flan', '002-flan/flan-{}.jsonl'),
        ('003_flan1m', '003-flan-1m/flan-1m-{}.jsonl'),
        ('004_flan1m_aug_shuf', '004-flan1m-aug-shuf/flan1m-aug-shuf-{}.jsonl'),
        ('005_flan5m', '005-flan-5m/flan-5m-{}.jsonl'),
        ('006_flan_chatgpt', '006-flan-chatgpt/{}.jsonl'),
        ('007_gpt4_100k', '007-gpt4_100k/gpt4_100k-{}.jsonl'),
        ('008_niv', '008-niv/niv-{}.jsonl'),
        ('009_t0', '009-t0/t0-{}.jsonl'),
    )

    def _info(self) -> DatasetInfo:
        """Return the dataset metadata: four string-typed features."""
        return DatasetInfo(
            features=Features(
                {name: Value('string') for name in self._ORCA_FIELDS}
            ),
        )

    def _split_generators(self, dl_manager):
        """Build one ``SplitGenerator`` per shard and split kind.

        Args:
            dl_manager: download manager; its ``download_and_extract`` result
                is used as the base directory for all shard paths.

        Returns:
            A list of ``SplitGenerator`` objects ordered shard by shard,
            train before test, each passing ``filepath`` to
            ``_generate_examples``.
        """
        base_path = dl_manager.download_and_extract(self._ORCA_URL)

        return [
            SplitGenerator(
                name=f'{prefix}_{kind}',
                gen_kwargs={
                    "filepath": os.path.join(base_path, template.format(kind)),
                },
            )
            for prefix, template in self._ORCA_SHARDS
            for kind in self._ORCA_SPLIT_KINDS
        ]

    def _generate_examples(self, filepath):
        """Yield ``(key, example)`` pairs from one JSONL file.

        Args:
            filepath: path of a JSON-lines file with one object per line.

        Yields:
            The zero-based physical line index as key, and a dict holding
            the four fields named in ``_ORCA_FIELDS``.

        Raises:
            KeyError: if a record lacks one of the expected fields.
            json.JSONDecodeError: if a non-blank line is not valid JSON.
        """
        # Explicit UTF-8: the locale default would make decoding
        # platform-dependent.
        with open(filepath, 'r', encoding='utf-8') as f:
            for id_, line in enumerate(f):
                # Tolerate blank lines (e.g. a trailing newline at EOF);
                # keys stay unique because they follow the physical line.
                if not line.strip():
                    continue
                data = json.loads(line)
                yield id_, {name: data[name] for name in self._ORCA_FIELDS}