Upload 7 files

This commit is contained in:
Eric Hartford 2023-06-17 12:21:00 +00:00 committed by huggingface-web
parent dfc0bd752a
commit 50b61f9f1d
8 changed files with 501 additions and 0 deletions

4
.gitattributes vendored

@@ -52,3 +52,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.jpg filter=lfs diff=lfs merge=lfs -text
*.jpeg filter=lfs diff=lfs merge=lfs -text
*.webp filter=lfs diff=lfs merge=lfs -text
data/cot.jsonl filter=lfs diff=lfs merge=lfs -text
data/flan.jsonl filter=lfs diff=lfs merge=lfs -text
data/niv.jsonl filter=lfs diff=lfs merge=lfs -text
data/t0.jsonl filter=lfs diff=lfs merge=lfs -text

204
FLAN-5m.ipynb Normal file

@@ -0,0 +1,204 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Generates FLAN-5M data mixture from FLAN-v2 collection"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"\n",
"# System Messages\n",
"# Page 9, Table 2\n",
"SM = {\n",
" 1: \"\",\n",
" 2: \"You are an AI assistant. Provide a detailed answer so user dont need to search outside to understand the answer.\",\n",
" 3: \"You are an AI assistant. You will be given a task. You must generate a detailed and long answer.\",\n",
" 4: \"You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.\",\n",
" 5: \"You are an AI assistant that follows instruction extremely well. Help as much as you can.\",\n",
" 6: \"You are an AI assistant that helps people find information. Provide a detailed answer so user dont need to search outside to understand the answer.\",\n",
" 7: \"You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\",\n",
" 8: \"You should describe the task and explain your answer. While answering a multiple choice question, first output the correct answer(s). Then explain why other answers are wrong. Think like you are answering to a five year old.\",\n",
" 9: \"Explain how you used the definition to come up with the answer.\",\n",
" 10: \"You are an AI assistant. You should describe the task and explain your answer. While answering a multiple choice question, first output the correct answer(s). Then explain why other answers are wrong. You might need to use additional knowledge to answer the question.\",\n",
" 11: \"You are an AI assistant that helps people find information. User will you give you a question. Your task is to answer as faithfully as you can. While answering think step-bystep and justify your answer.\",\n",
" 12: \"User will you give you a task with some instruction. Your job is follow the instructions as faithfully as you can. While answering think step-by-step and justify your answer.\",\n",
" 13: \"You are a teacher. Given a task, you explain in simple steps what the task is asking, any guidelines it provides and how to use those guidelines to find the answer.\",\n",
" 14: \"You are an AI assistant, who knows every language and how to translate one language to another. Given a task, you explain in simple steps what the task is asking, any guidelines that it provides. You solve the task and show how you used the guidelines to solve the task.\",\n",
" 15: \"Given a definition of a task and a sample input, break the definition into small parts.\\nEach of those parts will have some instruction. Explain their meaning by showing an example that meets the criteria in the instruction. Use the following format:\\nPart # : a key part of the definition.\\nUsage: Sample response that meets the criteria from the key part. Explain why you think it meets the criteria.\",\n",
" 16: \"You are an AI assistant that helps people find information.\",\n",
"}\n",
"\n",
"# System Message Pickers \n",
"# Figure 6 page 10\n",
"sm_cot = lambda: SM[random.choice([6, 11, 16])]\n",
"sm_niv = lambda: SM[random.choice([1, 2, 5, 7, 9, 12, 13, 14, 15])]\n",
"sm_t0 = lambda: SM[random.choice([1, 2, 3, 5, 7])]\n",
"sm_flan = lambda multiple_choice: SM[random.choice([3, 4, 7, 8, 10])] if multiple_choice else SM[random.choice([3, 4, 7])]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"import pandas as pd\n",
"from IPython.display import display\n",
"import datasets\n",
"import tqdm\n",
"from check_if_multiple_choice import check_if_multiple_choice\n",
"\n",
"# Table 3 Page 10\n",
"cot_total = 150000\n",
"niv_total = 440000\n",
"flan_total = 2500000\n",
"t0_total = 2000000\n",
"\n",
"output_dir = \"data\"\n",
"os.makedirs(output_dir, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cot = iter(datasets.load_dataset(\n",
" \"conceptofmind/cot_submix_original\", split=\"train\", streaming=True))\n",
"\n",
"def process_cot(cot):\n",
" f = open(\"data/cot.jsonl\", \"w\", encoding='utf8')\n",
" stream = tqdm.tqdm(cot, total=cot_total) \n",
" for i, data in enumerate(stream):\n",
" if data['template_type'] != 'zs_opt':\n",
" continue\n",
" question = data['inputs']\n",
" system_prompt = sm_cot()\n",
" json.dump({\"id\": f\"cot.{i}\", \"messages\": [{\"role\": \"system\", \"content\": system_prompt}, {\"role\": \"user\", \"content\": question}]}, f, ensure_ascii=False)\n",
" f.write(\"\\n\")\n",
" if i >= cot_total:\n",
" break\n",
" f.close()\n",
" \n",
"process_cot(cot)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"niv = iter(datasets.load_dataset(\n",
" \"conceptofmind/niv2_submix_original\", split=\"train\", streaming=True))\n",
"\n",
"def process_niv(niv) :\n",
" f = open(\"data/niv.jsonl\", \"w\", encoding='utf8')\n",
" stream = tqdm.tqdm(niv, total=niv_total)\n",
" for i, data in enumerate(stream):\n",
" if not 'zs' in data['template_type']:\n",
" continue\n",
" question = data['inputs'] \n",
" system_prompt = sm_niv()\n",
" json.dump({\"id\": f\"niv.{i}\", \"messages\": [{\"role\": \"system\", \"content\": system_prompt}, {\"role\": \"user\", \"content\": question}]}, f, ensure_ascii=False)\n",
" f.write(\"\\n\")\n",
" if i >= niv_total:\n",
" break\n",
" f.close()\n",
" \n",
"process_niv(niv)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"flan = iter(datasets.load_dataset(\n",
" \"conceptofmind/flan2021_submix_original\", split=\"train\", streaming=True))\n",
"\n",
"def process_flan(flan) :\n",
" f = open(\"data/flan.jsonl\", \"w\", encoding='utf8')\n",
" stream = tqdm.tqdm(flan, total=flan_total)\n",
" for i, data in enumerate(stream):\n",
" question = data['inputs']\n",
" if not 'zs' in data['template_type']:\n",
" continue\n",
" system_prompt = sm_flan(check_if_multiple_choice(data))\n",
" json.dump({\"id\": f\"flan.{i}\", \"messages\": [{\"role\": \"system\", \"content\": system_prompt}, {\"role\": \"user\", \"content\": question}]}, f, ensure_ascii=False)\n",
" f.write(\"\\n\")\n",
" if i >= flan_total:\n",
" break\n",
" f.close()\n",
"\n",
"process_flan(flan)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"t0 = iter(datasets.load_dataset(\n",
" \"conceptofmind/t0_submix_original\", split=\"train\", streaming=True))\n",
"\n",
"def process_t0(t0) :\n",
" f = open(\"data/t0.jsonl\", \"w\", encoding='utf8')\n",
" stream = tqdm.tqdm(t0, total=t0_total)\n",
" for i, data in enumerate(stream):\n",
" question = data['inputs']\n",
" if not 'zs' in data['template_type']:\n",
" continue\n",
" system_prompt = sm_t0()\n",
" json.dump({\"id\": f\"t0.{i}\", \"messages\": [{\"role\": \"system\", \"content\": system_prompt}, {\"role\": \"user\", \"content\": question}]}, f, ensure_ascii=False)\n",
" f.write(\"\\n\")\n",
" if i >= t0_total:\n",
" break\n",
" f.close()\n",
"\n",
"process_t0(t0)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "281f1c8753b18c9d2968280632816a025c721e632f5f355c2f6dfab2614fba3c"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

281
check_if_multiple_choice.py Normal file

@@ -0,0 +1,281 @@
# From https://github.com/google-research/FLAN/blob/main/flan/templates.py
# Modified to be used for figuring out which one is multiple choice
def rte_check(string):
    """RTE prompt check: False for context/hypothesis generation prompts.

    Any other RTE prompt is treated as (potentially) multiple choice.
    """
    if string.startswith("Generate a context and a hypothesis"):
        return False
    return True
def cosmos_qa_check(string: str):
    """Cosmos QA prompt check: False for question-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    is_generation = (
        string.startswith("Write a question about the article")
        or string.endswith("Generate a question about the above context.")
    )
    return not is_generation
def ag_news_subset_check(string: str):
    """AG News prompt check: False for title-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.

    Note: the original body contained a bare list literal of FLAN template
    tuples — a no-op expression statement left over from the template
    table this check was derived from; it has been removed.
    """
    if string.startswith("Write a title:"):
        return False
    if string.endswith("What is a good title for this?"):
        return False
    return True
def imdb_reviews_check(string: str):
    """IMDB reviews prompt check: False for review-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    generation_prefixes = (
        "Write a",
        "Generate a movie review with",
        "What's an example of a movie review?",
    )
    return not string.startswith(generation_prefixes)
def paws_wiki_check(string: str):
    """PAWS-Wiki prompt check: False for the plain yes/no phrasing prompt.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    plain_yes_no = 'Please check if these have the same meaning. Answer "yes" if they do, otherwise "no".'
    return not string.startswith(plain_yes_no)
def sentiment140_check(string: str):
    """Sentiment140 prompt check: False for tweet-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    generation_prefixes = (
        "Generate a tweet that has the following sentiment: ",
        "Write a ",
        "What is an example of a tweet?",
    )
    return not string.startswith(generation_prefixes)
def story_cloze_check(string: str):
    """Story Cloze prompt check: False for story-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    generation_prefixes = (
        "Write a story that ends with this",
        "Write a plausible story that ends with this sentence?",
    )
    return not string.startswith(generation_prefixes)
def copa_check(string: str):
    """COPA prompt check: False for sentence-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    generation_prefixes = (
        "Write a sentence.",
        "Write two sentences.",
    )
    return not string.startswith(generation_prefixes)
def yelp_polarity_reviews_check(string: str):
    """Yelp polarity prompt check: False for review-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    generation_prefixes = (
        "What would be an example of an ",
        "Generate a ",
        "Write a ",
    )
    return not string.startswith(generation_prefixes)
def arc_check(string: str):
    """ARC prompt check: False for question-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    generation_prefixes = (
        "Write a question you would see in a school textbook.",
        "What's an example of a grad-school level question?",
        "I just took a test in school today. What question was I asked?",
    )
    return not string.startswith(generation_prefixes)
def anli_check(string: str):
    """ANLI prompt check: False for context/hypothesis generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    return not string.startswith("Generate a context and a hypothesis.")
def multirc_check(string: str):
    """MultiRC prompt check: False for question-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    generation_suffixes = (
        "Do you have any questions?",
        "What question would one ask from this paragraph?",
    )
    return not string.endswith(generation_suffixes)
def cb_check(string: str):
    """CB prompt check: False for context/hypothesis generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    return not string.startswith("Generate a context and a hypothesis.")
def cola_check(string: str):
    """CoLA prompt check: False for sentence-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    generation_prefixes = (
        "Generate short a sentence that is linguistically",
        "Produce a brief English sentence that would be considered grammatically",
    )
    return not string.startswith(generation_prefixes)
def sst2_check(string: str):
    """SST-2 prompt check: False for review-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    generation_prefixes = (
        "Write a ",
        "Generate a short movie review that has",
    )
    return not string.startswith(generation_prefixes)
def qnli_check(string: str):
    """QNLI prompt check: False for question-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    return not string.startswith("Can you generate a question with a factual answer?")
def snli_check(string: str):
    """SNLI prompt check: False for sentence-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    return not string.startswith("Write a brief sentence.")
def trec_check(string: str):
    """TREC prompt check: False for question-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    return not string.startswith("Please ask me a question.")
def stsb_check(string: str):
    """STS-B prompt check: False for sentence-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    Note the first suffix contains a literal "{answer_str}" placeholder,
    matching the raw FLAN template text.
    """
    generation_suffixes = (
        "Generate a new sentence that is, on a scale from 0 to 5, a {answer_str} in textual similarity to the above sentence.",
        "out of 5 in terms of textual similarity to the above sentence?",
    )
    return not string.endswith(generation_suffixes)
def piqa_check(string: str):
    """PIQA prompt check: False for task-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    generation_prefixes = (
        "What's an example of a task that requires knowledge of physical objects to perform?",
        "What kind of task would test someone's ability to perform physical reasoning?",
    )
    return not string.startswith(generation_prefixes)
def openbookqa_check(string: str):
    """OpenBookQA prompt check: False for fact-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    generation_prefixes = (
        "What sentence would provide a factual answer to this question:",
        "What is a random fact?",
        "Generate a sentence that contains a fact.",
    )
    return not string.startswith(generation_prefixes)
# Dispatch table: FLAN task name -> predicate(inputs: str) -> bool.
# The value answers "could this prompt be multiple choice?":
#   - lambda x: True   -> task's zero-shot prompts are always treated as MC-capable
#   - lambda x: False  -> task is pure generation/translation/summarization, never MC
#   - <task>_check     -> decided per prompt by inspecting the input text
# Keys are matched against `task_name` with a trailing ":" in
# check_if_multiple_choice below.
PATTERNS = {
    "rte": rte_check,
    "wsc": lambda x: True,
    "wsc273": lambda x: True,
    "wic": lambda x: True,
    "record": lambda x: True,
    # Open-ended QA / math: free-form answers, never multiple choice.
    "natural_questions": lambda x: False,
    "trivia_qa": lambda x: False,
    "math_dataset": lambda x: False,
    # Summarization tasks: always generation.
    "aeslc": lambda x: False,
    "cnn_dailymail": lambda x: False,
    "gigaword": lambda x: False,
    "multi_news": lambda x: False,
    "newsroom": lambda x: False,
    "samsum": lambda x: False,
    "xsum": lambda x: False,
    # Extractive QA: span answers, not options.
    "squad_v1": lambda x: False,
    "squad_v2": lambda x: False,
    "drop": lambda x: False,
    "quac": lambda x: False,
    # Translation / text normalization: always generation.
    "para_crawl": lambda x: False,
    "wmt16_translate": lambda x: False,
    "wmt14_enfr": lambda x: False,
    "true_case": lambda x: False,
    "fix_punct": lambda x: False,
    "word_segment": lambda x: False,
    "cosmos_qa": cosmos_qa_check,
    "ag_news_subset": ag_news_subset_check,
    "bool_q": lambda x: True,
    "definite_pronoun_resolution": lambda x: True,
    "glue_mrpc": lambda x: True,
    "glue_qqp": lambda x: True,
    "imdb_reviews": imdb_reviews_check,
    "paws_wiki": paws_wiki_check,
    "sentiment140": sentiment140_check,
    "story_cloze": story_cloze_check,
    "copa": copa_check,
    # Technically has multiple choice but ignored because of string parsing issues
    "winogrande": lambda x: False,
    "yelp_polarity_reviews": yelp_polarity_reviews_check,
    "arc": arc_check,
    "anli": anli_check,
    "coqa": lambda x: False,
    # Opinion/structured-data-to-text generation tasks.
    "opinion_abstracts_rotten_tomatoes": lambda x: False,
    "opinion_abstracts_idebate": lambda x: False,
    "common_gen": lambda x: False,
    "dart": lambda x: False,
    "e2e_nlg": lambda x: False,
    "web_nlg_en": lambda x: False,
    "wiki_lingua_english_en": lambda x: False,
    "multirc": multirc_check,
    "cb": cb_check,
    "cola": cola_check,
    "sst2": sst2_check,
    "mnli": lambda x: True,
    "qnli": qnli_check,
    "wnli": lambda x: True,
    "snli": snli_check,
    "trec": trec_check,
    "stsb": stsb_check,
    "hellaswag": lambda x: True,
    "piqa": piqa_check,
    "openbookqa": openbookqa_check,
}
def check_if_multiple_choice(data_item):
    """Return True if a zero-shot FLAN data item is a multiple-choice prompt.

    Matches the item's task against the PATTERNS table (key + ":" appears in
    `task_name`) and applies that task's prompt-level predicate to the input
    text.

    Args:
        data_item: mapping with at least 'inputs', 'task_name' and
            'template_type' keys (a FLAN-v2 submix row).

    Returns:
        bool: True when the prompt is (potentially) multiple choice;
        False otherwise, including when no known task key matches
        (the original fell through and returned an implicit None here).

    Raises:
        ValueError: if the template type is not a zero-shot ('zs') one.
    """
    inputs = data_item['inputs']
    task_name = data_item['task_name']
    template_type = data_item['template_type']
    # '_noopt' templates have the answer options stripped, so they can
    # never be multiple choice regardless of the task.
    if '_noopt' in template_type:
        return False
    if 'zs' not in template_type:
        raise ValueError(
            "Template type does not contain zs, do not use this function for non-zs templates")
    for key in PATTERNS:
        if key + ":" in task_name:
            return PATTERNS[key](inputs)
    # Unknown task: be explicit instead of falling off the end with None.
    return False

BIN
data/cot.jsonl (Stored with Git LFS) Normal file

Binary file not shown.

BIN
data/flan.jsonl (Stored with Git LFS) Normal file

Binary file not shown.

BIN
data/niv.jsonl (Stored with Git LFS) Normal file

Binary file not shown.

BIN
data/t0.jsonl (Stored with Git LFS) Normal file

Binary file not shown.

0
requirements.txt Normal file