From c1ee3417f7333f93248590f03eac23c61247760b Mon Sep 17 00:00:00 2001
From: Bleys
Date: Sat, 15 Jul 2023 00:21:18 +0000
Subject: [PATCH] Update README.md
---
README.md | 52 +++++++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 47 insertions(+), 5 deletions(-)
diff --git a/README.md b/README.md
index 89610b6..1b6ee5b 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ task_categories:
- feature-extraction
- text-generation
- text2text-generation
-pretty_name: Open Orca
+pretty_name: OpenOrca
size_categories:
- 10M<n<100M
-🐋 The Open Orca Dataset! 🐋
+🐋 The OpenOrca Dataset! 🐋
![OpenOrca Logo](https://huggingface.co/datasets/Open-Orca/OpenOrca/resolve/main/OpenOrcaLogo.png "OpenOrca Logo")
-We are thrilled to announce the release of the Open Orca dataset!
+We are thrilled to announce the release of the OpenOrca dataset!
This rich collection of augmented FLAN data aligns, as best as possible, with the distributions outlined in the [Orca paper](https://arxiv.org/abs/2306.02707).
It has been instrumental in generating high-performing model checkpoints and serves as a valuable resource for all NLP researchers and developers!
@@ -56,7 +56,7 @@ It beats current state of the art models on BigBench-Hard and AGIEval, and achie
# Dataset Summary
-The Open Orca dataset is a collection of augmented [FLAN Collection data](https://arxiv.org/abs/2301.13688).
+The OpenOrca dataset is a collection of augmented [FLAN Collection data](https://arxiv.org/abs/2301.13688).
Currently ~1M GPT-4 completions, and ~3.2M GPT-3.5 completions.
It is tabularized in alignment with the distributions presented in the ORCA paper and currently represents a partial completion of the full intended dataset, with ongoing generation to expand its scope.
The data is primarily used for training and evaluation in the field of natural language processing.
@@ -185,4 +185,46 @@ Further, the data should be used in accordance with the guidelines and recommend
This dataset is organized such that it can be naively loaded via Hugging Face datasets library.
We recommend using streaming due to the large size of the files.
-Regular updates and data generation progress can be monitored through the OpenOrca repository on Hugging Face.
\ No newline at end of file
+Regular updates and data generation progress can be monitored through the OpenOrca repository on Hugging Face.
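+
+For example, here is a minimal sketch of streaming the data with the 🤗 Datasets library, as recommended above. The `load_dataset` call and `streaming=True` flag are standard Datasets usage; the field names shown assume the current schema and may evolve as generation continues.
+
+```python
+# Minimal sketch: stream OpenOrca rather than downloading the full files up front.
+from datasets import load_dataset
+
+ds = load_dataset("Open-Orca/OpenOrca", split="train", streaming=True)
+
+# Iterate lazily; each row is a dict with system prompt, question, and model response.
+for row in ds:
+    print(row["system_prompt"])
+    print(row["question"])
+    print(row["response"])
+    break
+```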
+
+
+# Citation
+
+```bibtex
+@misc{OpenOrca,
+ title = {OpenOrca: An Open Dataset of Augmented Reasoning Traces},
+ author = {Wing Lian and Bleys Goodson and Eugene Pentland and Austin Cook and Chanvichet Vong and "Teknium"},
+ year = {2023},
+ publisher = {HuggingFace},
+ journal = {HuggingFace repository},
+ howpublished = {\url{https://huggingface.co/Open-Orca/OpenOrca}},
+}
+```
+```bibtex
+@misc{mukherjee2023orca,
+ title={Orca: Progressive Learning from Complex Explanation Traces of GPT-4},
+ author={Subhabrata Mukherjee and Arindam Mitra and Ganesh Jawahar and Sahaj Agarwal and Hamid Palangi and Ahmed Awadallah},
+ year={2023},
+ eprint={2306.02707},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+}
+```
+```bibtex
+@misc{longpre2023flan,
+ title={The Flan Collection: Designing Data and Methods for Effective Instruction Tuning},
+ author={Shayne Longpre and Le Hou and Tu Vu and Albert Webson and Hyung Won Chung and Yi Tay and Denny Zhou and Quoc V. Le and Barret Zoph and Jason Wei and Adam Roberts},
+ year={2023},
+ eprint={2301.13688},
+ archivePrefix={arXiv},
+ primaryClass={cs.AI}
+}
+```
+```bibtex
+@misc{touvron2023llama,
+ title={LLaMA: Open and Efficient Foundation Language Models},
+ author={Touvron, Hugo and Lavril, Thibaut and Izacard, Gautier and Martinet, Xavier and Lachaux, Marie-Anne and Lacroix, Timoth{\'e}e and Rozi{\`e}re, Baptiste and Goyal, Naman and Hambro, Eric and Azhar, Faisal and Rodriguez, Aurelien and Joulin, Armand and Grave, Edouard and Lample, Guillaume},
+ year={2023},
+ eprint={2302.13971},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+}
+```
\ No newline at end of file