Upload OpenOrca.py
This commit is contained in:
parent
e23f0c7e01
commit
cd1cb0237b
38
OpenOrca.py
Normal file
38
OpenOrca.py
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
import json
import os

from datasets import DatasetBuilder, DatasetInfo, GeneratorBasedBuilder, SplitGenerator, SplitInfo
from datasets.features import Features, Value
|
||||||
|
|
||||||
|
class CustomDataset(GeneratorBasedBuilder):
    """Dataset builder that reads per-folder train/test JSONL files.

    Each record exposes four string fields: ``id``, ``system_prompt``,
    ``question`` and ``response``.

    The builder derives from ``GeneratorBasedBuilder`` because it produces
    examples through ``_generate_examples``; the plain ``DatasetBuilder`` base
    leaves split preparation unimplemented.
    """

    def _info(self) -> DatasetInfo:
        """Return the dataset metadata: four string-typed features."""
        return DatasetInfo(
            features=Features({
                'id': Value('string'),
                'system_prompt': Value('string'),
                'question': Value('string'),
                'response': Value('string'),
            }),
        )

    def _split_generators(self, dl_manager):
        """Build one train and one test split generator per data folder.

        Args:
            dl_manager: Download manager supplied by the library; not used,
                since all files are read from a local directory.

        Returns:
            A list of ``SplitGenerator`` objects, two per folder, each carrying
            the ``filepath`` keyword passed on to ``_generate_examples``.
        """
        # Prefer the directory given through the builder config (data_dir);
        # fall back to the original placeholder so existing setups still work.
        base_path = getattr(self.config, 'data_dir', None) or 'path_to_your_data'
        folders = ['001-cot', '002-flan', '003-flan-1m', '004-flan1m-aug-shuf', '005-flan-5m',
                   '006-flan-chatgpt', '007-gpt4_100k', '008-niv', '009-t0']  # add more as needed

        # NOTE(review): these split names contain hyphens; recent `datasets`
        # releases may reject split names that are not word characters only.
        # Names are left unchanged here - confirm against the installed version.
        split_generators = []
        for folder in folders:
            for part in ('train', 'test'):
                split_generators.append(
                    SplitGenerator(
                        name=f'{folder}_{part}',
                        gen_kwargs={'filepath': os.path.join(base_path, folder, f'{folder}-{part}.jsonl')},
                    )
                )
        return split_generators

    def _generate_examples(self, filepath):
        """Yield ``(key, example)`` pairs from a JSONL file.

        Args:
            filepath: Path of the JSONL file to read; one JSON object per line.

        Yields:
            The zero-based line index and a dict with the ``id``,
            ``system_prompt``, ``question`` and ``response`` fields.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
            KeyError: If a record lacks one of the four expected fields.
        """
        # Explicit UTF-8 so decoding does not depend on the platform locale.
        with open(filepath, 'r', encoding='utf-8') as f:
            for id_, line in enumerate(f):
                # Blank lines (e.g. a trailing newline) carry no record.
                if not line.strip():
                    continue
                data = json.loads(line)
                yield id_, {
                    'id': data['id'],
                    'system_prompt': data['system_prompt'],
                    'question': data['question'],
                    'response': data['response'],
                }
|
Loading…
x
Reference in New Issue
Block a user