Convert dataset to Parquet (#3)
- Convert dataset to Parquet (2c868b8af0ba15693ce607396871eb07802efab1) - Add socratic data files (60b4b82b84c2dac2b54f7d7f3056eae01514f12e) - Delete loading script (1383163dbe0fd47ea822c587d5acbeb46f53bc61) - Delete legacy dataset_infos.json (d14e435e3a378c1bc8ad3ce2038b366968b8cf55)
This commit is contained in:
parent
25fb0412db
commit
e53f048856
17
README.md
17
README.md
@ -34,7 +34,7 @@ dataset_info:
|
|||||||
- name: test
|
- name: test
|
||||||
num_bytes: 713732
|
num_bytes: 713732
|
||||||
num_examples: 1319
|
num_examples: 1319
|
||||||
download_size: 4915944
|
download_size: 2725633
|
||||||
dataset_size: 4676934
|
dataset_size: 4676934
|
||||||
- config_name: socratic
|
- config_name: socratic
|
||||||
features:
|
features:
|
||||||
@ -49,8 +49,21 @@ dataset_info:
|
|||||||
- name: test
|
- name: test
|
||||||
num_bytes: 936859
|
num_bytes: 936859
|
||||||
num_examples: 1319
|
num_examples: 1319
|
||||||
download_size: 6374717
|
download_size: 3164254
|
||||||
dataset_size: 6134967
|
dataset_size: 6134967
|
||||||
|
configs:
|
||||||
|
- config_name: main
|
||||||
|
data_files:
|
||||||
|
- split: train
|
||||||
|
path: main/train-*
|
||||||
|
- split: test
|
||||||
|
path: main/test-*
|
||||||
|
- config_name: socratic
|
||||||
|
data_files:
|
||||||
|
- split: train
|
||||||
|
path: socratic/train-*
|
||||||
|
- split: test
|
||||||
|
path: socratic/test-*
|
||||||
---
|
---
|
||||||
|
|
||||||
# Dataset Card for GSM8K
|
# Dataset Card for GSM8K
|
||||||
|
@ -1 +0,0 @@
|
|||||||
{"main": {"description": "GSM8K (Grade School Math 8K) is a dataset of 8.5K high quality\nlinguistically diverse grade school math word problems. The\ndataset was created to support the task of question answering\non basic mathematical problems that require multi-step reasoning.\n", "citation": "@misc{cobbe2021training,\n title={Training Verifiers to Solve Math Word Problems},\n author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},\n year={2021},\n eprint={2110.14168},\n archivePrefix={arXiv},\n primaryClass={cs.LG}\n}\n", "homepage": "https://openai.com/blog/grade-school-math", "license": "MIT", "features": {"question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "gsm8k", "config_name": "main", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3963202, "num_examples": 7473, "dataset_name": "gsm8k"}, "test": {"name": "test", "num_bytes": 713732, "num_examples": 1319, "dataset_name": "gsm8k"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/train.jsonl": {"num_bytes": 4166206, "checksum": "17f347dc51477c50d4efb83959dbb7c56297aba886e5544ee2aaed3024813465"}, "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl": {"num_bytes": 749738, "checksum": "3730d312f6e3440559ace48831e51066acaca737f6eabec99bccb9e4b3c39d14"}}, "download_size": 4915944, "post_processing_size": null, "dataset_size": 4676934, "size_in_bytes": 9592878}, "socratic": {"description": "GSM8K (Grade School Math 8K) is a dataset of 8.5K high quality\nlinguistically diverse grade school math word problems. 
The\ndataset was created to support the task of question answering\non basic mathematical problems that require multi-step reasoning.\n", "citation": "@misc{cobbe2021training,\n title={Training Verifiers to Solve Math Word Problems},\n author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},\n year={2021},\n eprint={2110.14168},\n archivePrefix={arXiv},\n primaryClass={cs.LG}\n}\n", "homepage": "https://openai.com/blog/grade-school-math", "license": "MIT", "features": {"question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "gsm8k", "config_name": "socratic", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 5198108, "num_examples": 7473, "dataset_name": "gsm8k"}, "test": {"name": "test", "num_bytes": 936859, "num_examples": 1319, "dataset_name": "gsm8k"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/train_socratic.jsonl": {"num_bytes": 5401739, "checksum": "153d86551187cfd64ef7afb59bfd0ef75cea3ae9388e7ad31e43920b6dd77872"}, "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test_socratic.jsonl": {"num_bytes": 972978, "checksum": "c96673362fa7a699f4836a9b6474a067448f95fe58064727501ee63ba4c3fdb6"}}, "download_size": 6374717, "post_processing_size": null, "dataset_size": 6134967, "size_in_bytes": 12509684}}
|
|
135
gsm8k.py
135
gsm8k.py
@ -1,135 +0,0 @@
|
|||||||
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
"""Grade School Math 8k dataset."""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import textwrap
|
|
||||||
|
|
||||||
import datasets
|
|
||||||
|
|
||||||
|
|
||||||
# BibTeX entry (an ``@misc`` record) that is passed as ``citation=`` to
# ``datasets.DatasetInfo`` in ``Gsm8k._info``.
_CITATION = """\
|
|
||||||
@misc{cobbe2021training,
|
|
||||||
title={Training Verifiers to Solve Math Word Problems},
|
|
||||||
author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},
|
|
||||||
year={2021},
|
|
||||||
eprint={2110.14168},
|
|
||||||
archivePrefix={arXiv},
|
|
||||||
primaryClass={cs.LG}
|
|
||||||
}
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Human-readable summary of the dataset; used as the ``description`` of the
# ``datasets.DatasetInfo`` returned by ``Gsm8k._info``.
_DESCRIPTION = (
    "GSM8K (Grade School Math 8K) is a dataset of 8.5K high quality\n"
    "linguistically diverse grade school math word problems. The\n"
    "dataset was created to support the task of question answering\n"
    "on basic mathematical problems that require multi-step reasoning.\n"
)

# Landing page reported as the dataset homepage.
_HOMEPAGE = "https://openai.com/blog/grade-school-math"

# License identifier reported in the dataset info.
_LICENSE = "MIT"

# Common prefix of the raw JSONL files; each config appends a file name to it.
_BASE_URL = (
    "https://raw.githubusercontent.com/openai/grade-school-math/"
    "master/grade_school_math/data/"
)
|
|
||||||
|
|
||||||
|
|
||||||
class Gsm8kConfig(datasets.BuilderConfig):
    """Builder configuration describing one variant of GSM8K."""

    def __init__(self, urls, **kwargs):
        """Create a GSM8K configuration pinned to version 1.1.0.

        Args:
            urls: *dict[string]*, the download URL for each split of the
                GSM8K set (the configs in this file use the keys
                ``"train"`` and ``"test"``).
            **kwargs: forwarded unchanged to ``datasets.BuilderConfig``.
        """
        pinned_version = datasets.Version("1.1.0")
        super().__init__(version=pinned_version, **kwargs)
        # Kept on the config so the builder can hand it to the download manager.
        self.urls = urls
|
|
||||||
|
|
||||||
|
|
||||||
class Gsm8k(datasets.GeneratorBasedBuilder):
    """Grade School Math 8k (GSM8K).

    Exposes two configurations, ``main`` and ``socratic``; both yield
    examples with the string fields ``question`` and ``answer`` and differ
    only in the JSONL files they download.
    """

    BUILDER_CONFIGS = [
        Gsm8kConfig(
            name="main",
            description=textwrap.dedent(
                """
                It is segmented into 7.5K training problems and 1K test problems.
                These problems take between 2 and 8 steps to solve, and solutions
                primarily involve performing a sequence of elementary calculations
                using basic arithmetic operations (+ - / *) to reach the final
                answer. A bright middle school student should be able to solve
                every problem.
                """,
            ),
            urls={
                "train": _BASE_URL + "train.jsonl",
                "test": _BASE_URL + "test.jsonl",
            },
        ),
        Gsm8kConfig(
            name="socratic",
            description=textwrap.dedent(
                """
                Additionally, there is a modified solution format that injects
                automatically generated "Socratic subquestions" before each step.
                """
            ),
            urls={
                "train": _BASE_URL + "train_socratic.jsonl",
                "test": _BASE_URL + "test_socratic.jsonl",
            },
        ),
    ]

    def _info(self):
        """Return the ``datasets.DatasetInfo`` shared by both configurations."""
        features = datasets.Features(
            {
                "question": datasets.Value("string"),
                "answer": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the active config's files and declare the two splits.

        Args:
            dl_manager: download manager supplied by ``datasets``; its
                ``download_and_extract`` is called with ``self.config.urls``
                and the result is indexed by ``"train"`` and ``"test"``.

        Returns:
            A list with one ``datasets.SplitGenerator`` for TRAIN and one for
            TEST, each passing ``filepath`` to ``_generate_examples``.
        """
        data_dir = dl_manager.download_and_extract(self.config.urls)
        splits = (
            (datasets.Split.TRAIN, "train"),
            (datasets.Split.TEST, "test"),
        )
        return [
            datasets.SplitGenerator(
                name=split,
                gen_kwargs={"filepath": data_dir[url_key]},
            )
            for split, url_key in splits
        ]

    def _generate_examples(self, filepath):
        """Yield ``(key, example)`` pairs from a JSON Lines file.

        Args:
            filepath: path of a UTF-8 text file with one JSON object per
                line, each object holding ``"question"`` and ``"answer"``.

        Yields:
            ``(line_index, {"question": ..., "answer": ...})`` for every
            non-blank line.

        Raises:
            json.JSONDecodeError: if a non-blank line is not valid JSON.
            KeyError: if a record lacks ``"question"`` or ``"answer"``.
        """
        with open(filepath, encoding="utf-8") as f:
            for key, row in enumerate(f):
                # A blank or whitespace-only line (e.g. a stray trailing
                # newline) is not a record; json.loads would raise on it.
                # The key stays the physical line index, so keys of real
                # records are the same as before.
                if not row.strip():
                    continue
                data = json.loads(row)
                yield key, {
                    "question": data["question"],
                    "answer": data["answer"],
                }
|
|
BIN
main/test-00000-of-00001.parquet
(Stored with Git LFS)
Normal file
BIN
main/test-00000-of-00001.parquet
(Stored with Git LFS)
Normal file
Binary file not shown.
BIN
main/train-00000-of-00001.parquet
(Stored with Git LFS)
Normal file
BIN
main/train-00000-of-00001.parquet
(Stored with Git LFS)
Normal file
Binary file not shown.
BIN
socratic/test-00000-of-00001.parquet
(Stored with Git LFS)
Normal file
BIN
socratic/test-00000-of-00001.parquet
(Stored with Git LFS)
Normal file
Binary file not shown.
BIN
socratic/train-00000-of-00001.parquet
(Stored with Git LFS)
Normal file
BIN
socratic/train-00000-of-00001.parquet
(Stored with Git LFS)
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user