# Patch summary (commentary only; the patch text below is unchanged).
#
# Target: OpenOrca.py, inside class CustomDataset(DatasetBuilder).
#
# 1. Split registration (loop over `folders`):
#    - The per-folder data files change from `{folder}/train.jsonl` and
#      `{folder}/test.jsonl` to `{folder}/cot-train.jsonl` and
#      `{folder}/cot-test.jsonl`.
#    - The two SplitGenerator entries for a folder are now added only when
#      os.path.isfile() is true for BOTH paths. The visible hunk has no
#      else branch, so a folder missing either file adds no splits and
#      nothing is reported.
#      NOTE(review): confirm that silently skipping is intended; a warning
#      may be useful here.
#
# 2. _generate_examples(self, filepath):
#    - The file is opened with an explicit encoding='utf-8'.
#    - Each line is passed through .strip() before json.loads().
#    - 'id', 'system_prompt', 'question' and 'response' are read with
#      data.get(key, '') instead of data[key], so a missing key yields ''
#      rather than raising KeyError (assumes each line decodes to a JSON
#      object -- TODO confirm).
#      NOTE(review): a blank or whitespace-only line becomes '' after
#      strip(), and json.loads('') raises json.JSONDecodeError; consider
#      skipping empty lines.
#
# 3. The final lone "-" presumably removes a trailing blank line at the end
#    of the file (consistent with the 20 -> 22 line counts in the hunk
#    header) -- TODO confirm.
#
# NOTE(review): the hunk below appears here with its line breaks and
# indentation collapsed onto a single line; restore the original layout
# before trying to apply it.
diff --git a/OpenOrca.py b/OpenOrca.py index ecdfcd6..7faef74 100644 --- a/OpenOrca.py +++ b/OpenOrca.py @@ -20,20 +20,22 @@ class CustomDataset(DatasetBuilder): '006-flan-chatgpt', '007-gpt4_100k', '008-niv', '009-t0'] # add more as needed split_generators = [] for folder in folders: - split_generators.extend([ - SplitGenerator(name=f'{folder.replace("-", "_")}_train', gen_kwargs={"filepath": os.path.join(base_path, f'{folder}/train.jsonl')}), - SplitGenerator(name=f'{folder.replace("-", "_")}_test', gen_kwargs={"filepath": os.path.join(base_path, f'{folder}/test.jsonl')}), - ]) + train_file_path = os.path.join(base_path, f'{folder}/cot-train.jsonl') + test_file_path = os.path.join(base_path, f'{folder}/cot-test.jsonl') + if os.path.isfile(train_file_path) and os.path.isfile(test_file_path): + split_generators.extend([ + SplitGenerator(name=f'{folder.replace("-", "_")}_train', gen_kwargs={"filepath": train_file_path}), + SplitGenerator(name=f'{folder.replace("-", "_")}_test', gen_kwargs={"filepath": test_file_path}), + ]) return split_generators def _generate_examples(self, filepath): - with open(filepath, 'r') as f: + with open(filepath, 'r', encoding='utf-8') as f: for id_, line in enumerate(f): - data = json.loads(line) + data = json.loads(line.strip()) yield id_, { - 'id': data['id'], - 'system_prompt': data['system_prompt'], - 'question': data['question'], - 'response': data['response'] + 'id': data.get('id', ''), + 'system_prompt': data.get('system_prompt', ''), + 'question': data.get('question', ''), + 'response': data.get('response', '') } -