Upload 7 files

This commit is contained in:
Eric Hartford 2023-06-17 12:21:00 +00:00 committed by huggingface-web
parent dfc0bd752a
commit 50b61f9f1d
8 changed files with 501 additions and 0 deletions

4
.gitattributes vendored

@@ -52,3 +52,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.jpg filter=lfs diff=lfs merge=lfs -text
*.jpeg filter=lfs diff=lfs merge=lfs -text
*.webp filter=lfs diff=lfs merge=lfs -text
data/cot.jsonl filter=lfs diff=lfs merge=lfs -text
data/flan.jsonl filter=lfs diff=lfs merge=lfs -text
data/niv.jsonl filter=lfs diff=lfs merge=lfs -text
data/t0.jsonl filter=lfs diff=lfs merge=lfs -text

204
FLAN-5m.ipynb Normal file

@@ -0,0 +1,204 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Generates FLAN-5M data mixture from FLAN-v2 collection"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"\n",
"# System Messages\n",
"# Page 9, Table 2\n",
"SM = {\n",
" 1: \"\",\n",
" 2: \"You are an AI assistant. Provide a detailed answer so user dont need to search outside to understand the answer.\",\n",
" 3: \"You are an AI assistant. You will be given a task. You must generate a detailed and long answer.\",\n",
" 4: \"You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.\",\n",
" 5: \"You are an AI assistant that follows instruction extremely well. Help as much as you can.\",\n",
" 6: \"You are an AI assistant that helps people find information. Provide a detailed answer so user dont need to search outside to understand the answer.\",\n",
" 7: \"You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\",\n",
" 8: \"You should describe the task and explain your answer. While answering a multiple choice question, first output the correct answer(s). Then explain why other answers are wrong. Think like you are answering to a five year old.\",\n",
" 9: \"Explain how you used the definition to come up with the answer.\",\n",
" 10: \"You are an AI assistant. You should describe the task and explain your answer. While answering a multiple choice question, first output the correct answer(s). Then explain why other answers are wrong. You might need to use additional knowledge to answer the question.\",\n",
" 11: \"You are an AI assistant that helps people find information. User will you give you a question. Your task is to answer as faithfully as you can. While answering think step-bystep and justify your answer.\",\n",
" 12: \"User will you give you a task with some instruction. Your job is follow the instructions as faithfully as you can. While answering think step-by-step and justify your answer.\",\n",
" 13: \"You are a teacher. Given a task, you explain in simple steps what the task is asking, any guidelines it provides and how to use those guidelines to find the answer.\",\n",
" 14: \"You are an AI assistant, who knows every language and how to translate one language to another. Given a task, you explain in simple steps what the task is asking, any guidelines that it provides. You solve the task and show how you used the guidelines to solve the task.\",\n",
" 15: \"Given a definition of a task and a sample input, break the definition into small parts.\\nEach of those parts will have some instruction. Explain their meaning by showing an example that meets the criteria in the instruction. Use the following format:\\nPart # : a key part of the definition.\\nUsage: Sample response that meets the criteria from the key part. Explain why you think it meets the criteria.\",\n",
" 16: \"You are an AI assistant that helps people find information.\",\n",
"}\n",
"\n",
"# System Message Pickers \n",
"# Figure 6 page 10\n",
"sm_cot = lambda: SM[random.choice([6, 11, 16])]\n",
"sm_niv = lambda: SM[random.choice([1, 2, 5, 7, 9, 12, 13, 14, 15])]\n",
"sm_t0 = lambda: SM[random.choice([1, 2, 3, 5, 7])]\n",
"sm_flan = lambda multiple_choice: SM[random.choice([3, 4, 7, 8, 10])] if multiple_choice else SM[random.choice([3, 4, 7])]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"import pandas as pd\n",
"from IPython.display import display\n",
"import datasets\n",
"import tqdm\n",
"from check_if_multiple_choice import check_if_multiple_choice\n",
"\n",
"# Table 3 Page 10\n",
"cot_total = 150000\n",
"niv_total = 440000\n",
"flan_total = 2500000\n",
"t0_total = 2000000\n",
"\n",
"output_dir = \"data\"\n",
"os.makedirs(output_dir, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cot = iter(datasets.load_dataset(\n",
" \"conceptofmind/cot_submix_original\", split=\"train\", streaming=True))\n",
"\n",
"def process_cot(cot):\n",
" f = open(\"data/cot.jsonl\", \"w\", encoding='utf8')\n",
" stream = tqdm.tqdm(cot, total=cot_total) \n",
" for i, data in enumerate(stream):\n",
" if data['template_type'] != 'zs_opt':\n",
" continue\n",
" question = data['inputs']\n",
" system_prompt = sm_cot()\n",
" json.dump({\"id\": f\"cot.{i}\", \"messages\": [{\"role\": \"system\", \"content\": system_prompt}, {\"role\": \"user\", \"content\": question}]}, f, ensure_ascii=False)\n",
" f.write(\"\\n\")\n",
" if i >= cot_total:\n",
" break\n",
" f.close()\n",
" \n",
"process_cot(cot)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"niv = iter(datasets.load_dataset(\n",
" \"conceptofmind/niv2_submix_original\", split=\"train\", streaming=True))\n",
"\n",
"def process_niv(niv) :\n",
" f = open(\"data/niv.jsonl\", \"w\", encoding='utf8')\n",
" stream = tqdm.tqdm(niv, total=niv_total)\n",
" for i, data in enumerate(stream):\n",
" if not 'zs' in data['template_type']:\n",
" continue\n",
" question = data['inputs'] \n",
" system_prompt = sm_niv()\n",
" json.dump({\"id\": f\"niv.{i}\", \"messages\": [{\"role\": \"system\", \"content\": system_prompt}, {\"role\": \"user\", \"content\": question}]}, f, ensure_ascii=False)\n",
" f.write(\"\\n\")\n",
" if i >= niv_total:\n",
" break\n",
" f.close()\n",
" \n",
"process_niv(niv)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"flan = iter(datasets.load_dataset(\n",
" \"conceptofmind/flan2021_submix_original\", split=\"train\", streaming=True))\n",
"\n",
"def process_flan(flan) :\n",
" f = open(\"data/flan.jsonl\", \"w\", encoding='utf8')\n",
" stream = tqdm.tqdm(flan, total=flan_total)\n",
" for i, data in enumerate(stream):\n",
" question = data['inputs']\n",
" if not 'zs' in data['template_type']:\n",
" continue\n",
" system_prompt = sm_flan(check_if_multiple_choice(data))\n",
" json.dump({\"id\": f\"flan.{i}\", \"messages\": [{\"role\": \"system\", \"content\": system_prompt}, {\"role\": \"user\", \"content\": question}]}, f, ensure_ascii=False)\n",
" f.write(\"\\n\")\n",
" if i >= flan_total:\n",
" break\n",
" f.close()\n",
"\n",
"process_flan(flan)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"t0 = iter(datasets.load_dataset(\n",
" \"conceptofmind/t0_submix_original\", split=\"train\", streaming=True))\n",
"\n",
"def process_t0(t0) :\n",
" f = open(\"data/t0.jsonl\", \"w\", encoding='utf8')\n",
" stream = tqdm.tqdm(t0, total=t0_total)\n",
" for i, data in enumerate(stream):\n",
" question = data['inputs']\n",
" if not 'zs' in data['template_type']:\n",
" continue\n",
" system_prompt = sm_t0()\n",
" json.dump({\"id\": f\"t0.{i}\", \"messages\": [{\"role\": \"system\", \"content\": system_prompt}, {\"role\": \"user\", \"content\": question}]}, f, ensure_ascii=False)\n",
" f.write(\"\\n\")\n",
" if i >= t0_total:\n",
" break\n",
" f.close()\n",
"\n",
"process_t0(t0)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "281f1c8753b18c9d2968280632816a025c721e632f5f355c2f6dfab2614fba3c"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

281
check_if_multiple_choice.py Normal file

@@ -0,0 +1,281 @@
# From https://github.com/google-research/FLAN/blob/main/flan/templates.py
# Modified to be used for figuring out which one is multiple choice
def rte_check(string):
    """RTE prompt check: False for context/hypothesis generation prompts.

    Any other RTE prompt is treated as (potentially) multiple choice.
    """
    if string.startswith("Generate a context and a hypothesis"):
        return False
    return True
def cosmos_qa_check(string: str):
    """Cosmos QA prompt check: False for question-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    is_generation = (
        string.startswith("Write a question about the article")
        or string.endswith("Generate a question about the above context.")
    )
    return not is_generation
def ag_news_subset_check(string: str):
    """AG News prompt check: False for title-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.

    Note: the original body contained a bare list literal of FLAN template
    tuples — a no-op expression statement left over from the template
    table this check was derived from; it has been removed.
    """
    if string.startswith("Write a title:"):
        return False
    if string.endswith("What is a good title for this?"):
        return False
    return True
def imdb_reviews_check(string: str):
    """IMDB reviews prompt check: False for review-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    generation_prefixes = (
        "Write a",
        "Generate a movie review with",
        "What's an example of a movie review?",
    )
    return not string.startswith(generation_prefixes)
def paws_wiki_check(string: str):
    """PAWS-Wiki prompt check: False for the plain yes/no phrasing prompt.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    plain_yes_no = 'Please check if these have the same meaning. Answer "yes" if they do, otherwise "no".'
    return not string.startswith(plain_yes_no)
def sentiment140_check(string: str):
    """Sentiment140 prompt check: False for tweet-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    generation_prefixes = (
        "Generate a tweet that has the following sentiment: ",
        "Write a ",
        "What is an example of a tweet?",
    )
    return not string.startswith(generation_prefixes)
def story_cloze_check(string: str):
    """Story Cloze prompt check: False for story-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    generation_prefixes = (
        "Write a story that ends with this",
        "Write a plausible story that ends with this sentence?",
    )
    return not string.startswith(generation_prefixes)
def copa_check(string: str):
    """COPA prompt check: False for sentence-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    generation_prefixes = (
        "Write a sentence.",
        "Write two sentences.",
    )
    return not string.startswith(generation_prefixes)
def yelp_polarity_reviews_check(string: str):
    """Yelp polarity prompt check: False for review-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    generation_prefixes = (
        "What would be an example of an ",
        "Generate a ",
        "Write a ",
    )
    return not string.startswith(generation_prefixes)
def arc_check(string: str):
    """ARC prompt check: False for question-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    generation_prefixes = (
        "Write a question you would see in a school textbook.",
        "What's an example of a grad-school level question?",
        "I just took a test in school today. What question was I asked?",
    )
    return not string.startswith(generation_prefixes)
def anli_check(string: str):
    """ANLI prompt check: False for context/hypothesis generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    return not string.startswith("Generate a context and a hypothesis.")
def multirc_check(string: str):
    """MultiRC prompt check: False for question-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    generation_suffixes = (
        "Do you have any questions?",
        "What question would one ask from this paragraph?",
    )
    return not string.endswith(generation_suffixes)
def cb_check(string: str):
    """CB prompt check: False for context/hypothesis generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    return not string.startswith("Generate a context and a hypothesis.")
def cola_check(string: str):
    """CoLA prompt check: False for sentence-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    generation_prefixes = (
        "Generate short a sentence that is linguistically",
        "Produce a brief English sentence that would be considered grammatically",
    )
    return not string.startswith(generation_prefixes)
def sst2_check(string: str):
    """SST-2 prompt check: False for review-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    generation_prefixes = (
        "Write a ",
        "Generate a short movie review that has",
    )
    return not string.startswith(generation_prefixes)
def qnli_check(string: str):
    """QNLI prompt check: False for question-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    return not string.startswith("Can you generate a question with a factual answer?")
def snli_check(string: str):
    """SNLI prompt check: False for sentence-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    return not string.startswith("Write a brief sentence.")
def trec_check(string: str):
    """TREC prompt check: False for question-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    return not string.startswith("Please ask me a question.")
def stsb_check(string: str):
    """STS-B prompt check: False for sentence-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    Note the first suffix contains a literal "{answer_str}" placeholder,
    matching the raw FLAN template text.
    """
    generation_suffixes = (
        "Generate a new sentence that is, on a scale from 0 to 5, a {answer_str} in textual similarity to the above sentence.",
        "out of 5 in terms of textual similarity to the above sentence?",
    )
    return not string.endswith(generation_suffixes)
def piqa_check(string: str):
    """PIQA prompt check: False for task-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    generation_prefixes = (
        "What's an example of a task that requires knowledge of physical objects to perform?",
        "What kind of task would test someone's ability to perform physical reasoning?",
    )
    return not string.startswith(generation_prefixes)
def openbookqa_check(string: str):
    """OpenBookQA prompt check: False for fact-generation prompts.

    Returns True otherwise, i.e. the prompt may be multiple choice.
    """
    generation_prefixes = (
        "What sentence would provide a factual answer to this question:",
        "What is a random fact?",
        "Generate a sentence that contains a fact.",
    )
    return not string.startswith(generation_prefixes)
# Dispatch table: FLAN task name -> predicate(inputs: str) -> bool.
# The value answers "could this prompt be multiple choice?":
#   - lambda x: True   -> task's zero-shot prompts are always treated as MC-capable
#   - lambda x: False  -> task is pure generation/translation/summarization, never MC
#   - <task>_check     -> decided per prompt by inspecting the input text
# Keys are matched against `task_name` with a trailing ":" in
# check_if_multiple_choice below.
PATTERNS = {
    "rte": rte_check,
    "wsc": lambda x: True,
    "wsc273": lambda x: True,
    "wic": lambda x: True,
    "record": lambda x: True,
    # Open-ended QA / math: free-form answers, never multiple choice.
    "natural_questions": lambda x: False,
    "trivia_qa": lambda x: False,
    "math_dataset": lambda x: False,
    # Summarization tasks: always generation.
    "aeslc": lambda x: False,
    "cnn_dailymail": lambda x: False,
    "gigaword": lambda x: False,
    "multi_news": lambda x: False,
    "newsroom": lambda x: False,
    "samsum": lambda x: False,
    "xsum": lambda x: False,
    # Extractive QA: span answers, not options.
    "squad_v1": lambda x: False,
    "squad_v2": lambda x: False,
    "drop": lambda x: False,
    "quac": lambda x: False,
    # Translation / text normalization: always generation.
    "para_crawl": lambda x: False,
    "wmt16_translate": lambda x: False,
    "wmt14_enfr": lambda x: False,
    "true_case": lambda x: False,
    "fix_punct": lambda x: False,
    "word_segment": lambda x: False,
    "cosmos_qa": cosmos_qa_check,
    "ag_news_subset": ag_news_subset_check,
    "bool_q": lambda x: True,
    "definite_pronoun_resolution": lambda x: True,
    "glue_mrpc": lambda x: True,
    "glue_qqp": lambda x: True,
    "imdb_reviews": imdb_reviews_check,
    "paws_wiki": paws_wiki_check,
    "sentiment140": sentiment140_check,
    "story_cloze": story_cloze_check,
    "copa": copa_check,
    # Technically has multiple choice but ignored because of string parsing issues
    "winogrande": lambda x: False,
    "yelp_polarity_reviews": yelp_polarity_reviews_check,
    "arc": arc_check,
    "anli": anli_check,
    "coqa": lambda x: False,
    # Opinion/structured-data-to-text generation tasks.
    "opinion_abstracts_rotten_tomatoes": lambda x: False,
    "opinion_abstracts_idebate": lambda x: False,
    "common_gen": lambda x: False,
    "dart": lambda x: False,
    "e2e_nlg": lambda x: False,
    "web_nlg_en": lambda x: False,
    "wiki_lingua_english_en": lambda x: False,
    "multirc": multirc_check,
    "cb": cb_check,
    "cola": cola_check,
    "sst2": sst2_check,
    "mnli": lambda x: True,
    "qnli": qnli_check,
    "wnli": lambda x: True,
    "snli": snli_check,
    "trec": trec_check,
    "stsb": stsb_check,
    "hellaswag": lambda x: True,
    "piqa": piqa_check,
    "openbookqa": openbookqa_check,
}
def check_if_multiple_choice(data_item):
    """Return True if a zero-shot FLAN data item is a multiple-choice prompt.

    Matches the item's task against the PATTERNS table (key + ":" appears in
    `task_name`) and applies that task's prompt-level predicate to the input
    text.

    Args:
        data_item: mapping with at least 'inputs', 'task_name' and
            'template_type' keys (a FLAN-v2 submix row).

    Returns:
        bool: True when the prompt is (potentially) multiple choice;
        False otherwise, including when no known task key matches
        (the original fell through and returned an implicit None here).

    Raises:
        ValueError: if the template type is not a zero-shot ('zs') one.
    """
    inputs = data_item['inputs']
    task_name = data_item['task_name']
    template_type = data_item['template_type']
    # '_noopt' templates have the answer options stripped, so they can
    # never be multiple choice regardless of the task.
    if '_noopt' in template_type:
        return False
    if 'zs' not in template_type:
        raise ValueError(
            "Template type does not contain zs, do not use this function for non-zs templates")
    for key in PATTERNS:
        if key + ":" in task_name:
            return PATTERNS[key](inputs)
    # Unknown task: be explicit instead of falling off the end with None.
    return False

BIN
data/cot.jsonl (Stored with Git LFS) Normal file

Binary file not shown.

BIN
data/flan.jsonl (Stored with Git LFS) Normal file

Binary file not shown.

BIN
data/niv.jsonl (Stored with Git LFS) Normal file

Binary file not shown.

BIN
data/t0.jsonl (Stored with Git LFS) Normal file

Binary file not shown.

0
requirements.txt Normal file