Update OpenOrca.py

This commit is contained in:
Alignment Lab AI 2023-06-30 00:02:09 +00:00 committed by huggingface-web
parent 762ff80316
commit 4e4ee209e8

@ -20,20 +20,22 @@ class CustomDataset(DatasetBuilder):
'006-flan-chatgpt', '007-gpt4_100k', '008-niv', '009-t0'] # add more as needed
split_generators = []
for folder in folders:
split_generators.extend([
SplitGenerator(name=f'{folder.replace("-", "_")}_train', gen_kwargs={"filepath": os.path.join(base_path, f'{folder}/train.jsonl')}),
SplitGenerator(name=f'{folder.replace("-", "_")}_test', gen_kwargs={"filepath": os.path.join(base_path, f'{folder}/test.jsonl')}),
])
train_file_path = os.path.join(base_path, f'{folder}/cot-train.jsonl')
test_file_path = os.path.join(base_path, f'{folder}/cot-test.jsonl')
if os.path.isfile(train_file_path) and os.path.isfile(test_file_path):
split_generators.extend([
SplitGenerator(name=f'{folder.replace("-", "_")}_train', gen_kwargs={"filepath": train_file_path}),
SplitGenerator(name=f'{folder.replace("-", "_")}_test', gen_kwargs={"filepath": test_file_path}),
])
return split_generators
def _generate_examples(self, filepath):
    """Yield ``(key, example)`` pairs parsed from a JSON Lines file.

    Each non-blank line of ``filepath`` is decoded as one JSON object.
    The yielded example always has the keys ``id``, ``system_prompt``,
    ``question`` and ``response``; a key missing from the record is
    filled with an empty string instead of raising ``KeyError``.

    Args:
        filepath: Path to a ``.jsonl`` file, read as UTF-8.

    Yields:
        Tuples ``(line_index, example_dict)`` where ``line_index`` is the
        zero-based physical line number, so keys are unique per file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(filepath, 'r', encoding='utf-8') as f:
        for id_, line in enumerate(f):
            line = line.strip()
            if not line:
                # Blank lines hold no record; json.loads('') would raise.
                continue
            data = json.loads(line)
            yield id_, {
                'id': data.get('id', ''),
                'system_prompt': data.get('system_prompt', ''),
                'question': data.get('question', ''),
                'response': data.get('response', ''),
            }