diff --git a/README.md b/README.md index e8c0ccc..aec2504 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ dataset_info: - name: test num_bytes: 713732 num_examples: 1319 - download_size: 4915944 + download_size: 2725633 dataset_size: 4676934 - config_name: socratic features: @@ -49,8 +49,21 @@ dataset_info: - name: test num_bytes: 936859 num_examples: 1319 - download_size: 6374717 + download_size: 3164254 dataset_size: 6134967 +configs: +- config_name: main + data_files: + - split: train + path: main/train-* + - split: test + path: main/test-* +- config_name: socratic + data_files: + - split: train + path: socratic/train-* + - split: test + path: socratic/test-* --- # Dataset Card for GSM8K diff --git a/dataset_infos.json b/dataset_infos.json deleted file mode 100644 index cc67a6f..0000000 --- a/dataset_infos.json +++ /dev/null @@ -1 +0,0 @@ -{"main": {"description": "GSM8K (Grade School Math 8K) is a dataset of 8.5K high quality\nlinguistically diverse grade school math word problems. The\ndataset was created to support the task of question answering\non basic mathematical problems that require multi-step reasoning.\n", "citation": "@misc{cobbe2021training,\n title={Training Verifiers to Solve Math Word Problems},\n author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},\n year={2021},\n eprint={2110.14168},\n archivePrefix={arXiv},\n primaryClass={cs.LG}\n}\n", "homepage": "https://openai.com/blog/grade-school-math", "license": "MIT", "features": {"question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "gsm8k", "config_name": "main", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3963202, "num_examples": 7473, "dataset_name": "gsm8k"}, "test": {"name": "test", "num_bytes": 713732, "num_examples": 1319, "dataset_name": "gsm8k"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/train.jsonl": {"num_bytes": 4166206, "checksum": "17f347dc51477c50d4efb83959dbb7c56297aba886e5544ee2aaed3024813465"}, "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl": {"num_bytes": 749738, "checksum": "3730d312f6e3440559ace48831e51066acaca737f6eabec99bccb9e4b3c39d14"}}, "download_size": 4915944, "post_processing_size": null, "dataset_size": 4676934, "size_in_bytes": 9592878}, "socratic": {"description": "GSM8K (Grade School Math 8K) is a dataset of 8.5K high quality\nlinguistically diverse grade school math word problems. The\ndataset was created to support the task of question answering\non basic mathematical problems that require multi-step reasoning.\n", "citation": "@misc{cobbe2021training,\n title={Training Verifiers to Solve Math Word Problems},\n author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},\n year={2021},\n eprint={2110.14168},\n archivePrefix={arXiv},\n primaryClass={cs.LG}\n}\n", "homepage": "https://openai.com/blog/grade-school-math", "license": "MIT", "features": {"question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "gsm8k", "config_name": "socratic", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 5198108, "num_examples": 7473, "dataset_name": "gsm8k"}, "test": {"name": "test", "num_bytes": 936859, "num_examples": 1319, "dataset_name": "gsm8k"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/train_socratic.jsonl": {"num_bytes": 5401739, "checksum": "153d86551187cfd64ef7afb59bfd0ef75cea3ae9388e7ad31e43920b6dd77872"}, "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test_socratic.jsonl": {"num_bytes": 972978, "checksum": "c96673362fa7a699f4836a9b6474a067448f95fe58064727501ee63ba4c3fdb6"}}, "download_size": 6374717, "post_processing_size": null, "dataset_size": 6134967, "size_in_bytes": 12509684}} \ No newline at end of file diff --git a/gsm8k.py b/gsm8k.py deleted file mode 100644 index 044df0b..0000000 --- a/gsm8k.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Grade School Math 8k dataset.""" - -import json -import textwrap - -import datasets - - -_CITATION = """\ -@misc{cobbe2021training, - title={Training Verifiers to Solve Math Word Problems}, - author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman}, - year={2021}, - eprint={2110.14168}, - archivePrefix={arXiv}, - primaryClass={cs.LG} -} -""" - -_DESCRIPTION = """\ -GSM8K (Grade School Math 8K) is a dataset of 8.5K high quality -linguistically diverse grade school math word problems. The -dataset was created to support the task of question answering -on basic mathematical problems that require multi-step reasoning. -""" - -_HOMEPAGE = "https://openai.com/blog/grade-school-math" - -_LICENSE = "MIT" - -_BASE_URL = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/" - - -class Gsm8kConfig(datasets.BuilderConfig): - """BuilderConfig for GSM8K.""" - - def __init__(self, urls, **kwargs): - """BuilderConfig for GSM8K. - - Args: - urls: *dict[string]*, the urls for each split of the GSM8k set. - """ - super().__init__(version=datasets.Version("1.1.0"), **kwargs) - self.urls = urls - - -class Gsm8k(datasets.GeneratorBasedBuilder): - """Grade School Math 8k (GSM8K)""" - - BUILDER_CONFIGS = [ - Gsm8kConfig( - name="main", - description=textwrap.dedent( - """ - It is segmented into 7.5K training problems and 1K test problems. - These problems take between 2 and 8 steps to solve, and solutions - primarily involve performing a sequence of elementary calculations - using basic arithmetic operations (+ - / *) to reach the final - answer. A bright middle school student should be able to solve - every problem. - """, - ), - urls={ - "train": _BASE_URL + "train.jsonl", - "test": _BASE_URL + "test.jsonl", - }, - ), - Gsm8kConfig( - name="socratic", - description=textwrap.dedent( - """ - Additionally, there is a modified solution format that injects - automatically generated "Socratic subquestions" before each step. - """ - ), - urls={ - "train": _BASE_URL + "train_socratic.jsonl", - "test": _BASE_URL + "test_socratic.jsonl", - }, - ), - ] - - def _info(self): - features = datasets.Features( - { - "question": datasets.Value("string"), - "answer": datasets.Value("string"), - } - ) - return datasets.DatasetInfo( - description=_DESCRIPTION, - features=features, - homepage=_HOMEPAGE, - license=_LICENSE, - citation=_CITATION, - ) - - def _split_generators(self, dl_manager): - data_dir = dl_manager.download_and_extract(self.config.urls) - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={ - "filepath": data_dir["train"], - }, - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={ - "filepath": data_dir["test"], - }, - ), - ] - - def _generate_examples(self, filepath): - with open(filepath, encoding="utf-8") as f: - for key, row in enumerate(f): - data = json.loads(row) - yield key, { - "question": data["question"], - "answer": data["answer"], - } diff --git a/main/test-00000-of-00001.parquet b/main/test-00000-of-00001.parquet new file mode 100644 index 0000000..861005f --- /dev/null +++ b/main/test-00000-of-00001.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee7b8da9e381df27b9e3f7758a159ab2bdaa4dbaa910546cbbc47e0cb44e4f59 +size 419088 diff --git a/main/train-00000-of-00001.parquet b/main/train-00000-of-00001.parquet new file mode 100644 index 0000000..61b47c9 --- /dev/null +++ b/main/train-00000-of-00001.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea82612ea9582142387730c793eb67d3b12849002bc0b7fa6f8efafa7351419d +size 2306545 diff --git a/socratic/test-00000-of-00001.parquet b/socratic/test-00000-of-00001.parquet new file mode 100644 index 0000000..a3283da --- /dev/null +++ b/socratic/test-00000-of-00001.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:477dba7028204b465491b8f346ec774262ccd77147d942a0881e94cc6da7c99e +size 486995 diff --git a/socratic/train-00000-of-00001.parquet b/socratic/train-00000-of-00001.parquet new file mode 100644 index 0000000..439b1ce --- /dev/null +++ b/socratic/train-00000-of-00001.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54eb5fd2105a9126ac6410541b2e9dbe0199701258957c9af02d9ed675c90378 +size 2677259