From bb8db30333679ba5c70ef5dae9351b7bcd83d96d Mon Sep 17 00:00:00 2001 From: Yoach Lacombe Date: Wed, 2 Oct 2024 08:21:21 +0000 Subject: [PATCH] Update README.md (#4) - Update README.md (c329f74454be12bf7e475f5406b456f7a5855f96) Co-authored-by: Vaibhav Srivastav --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index e338234..a6be9be 100644 --- a/README.md +++ b/README.md @@ -119,8 +119,8 @@ Whisper is a state-of-the-art model for automatic speech recognition (ASR) and s et al. from OpenAI. Trained on >5M hours of labeled data, Whisper demonstrates a strong ability to generalise to many datasets and domains in a zero-shot setting. -Whisper large-v3-turbo is a distilled version of [Whisper large-v3](https://huggingface.co/openai/whisper-large-v3). In other words, it's the exact same model, except that the number of decoding layers have reduced from 32 to 4. -As a result, the model is way faster, at the expense of a minor quality degradation. +Whisper large-v3-turbo is a finetuned version of a pruned [Whisper large-v3](https://huggingface.co/openai/whisper-large-v3). In other words, it's the exact same model, except that the number of decoding layers has been reduced from 32 to 4. +As a result, the model is way faster, at the expense of a minor quality degradation. You can find more details about it [in this GitHub discussion](https://github.com/openai/whisper/discussions/2363). **Disclaimer**: Content for this model card has partly been written by the 🤗 Hugging Face team, and partly copied and pasted from the original model card. 
@@ -148,7 +148,7 @@ from datasets import load_dataset device = "cuda:0" if torch.cuda.is_available() else "cpu" torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 -model_id = "ylacombe/whisper-large-v3-turbo" +model_id = "openai/whisper-large-v3-turbo" model = AutoModelForSpeechSeq2Seq.from_pretrained( model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True @@ -252,7 +252,7 @@ from datasets import Audio, load_dataset device = "cuda:0" if torch.cuda.is_available() else "cpu" torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 -model_id = "ylacombe/whisper-large-v3-turbo" +model_id = "openai/whisper-large-v3-turbo" model = AutoModelForSpeechSeq2Seq.from_pretrained( model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True @@ -327,7 +327,7 @@ from datasets import load_dataset device = "cuda:0" if torch.cuda.is_available() else "cpu" torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 -model_id = "ylacombe/whisper-large-v3-turbo" +model_id = "openai/whisper-large-v3-turbo" model = AutoModelForSpeechSeq2Seq.from_pretrained( model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True @@ -373,7 +373,7 @@ torch.set_float32_matmul_precision("high") device = "cuda:0" if torch.cuda.is_available() else "cpu" torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 -model_id = "ylacombe/whisper-large-v3-turbo" +model_id = "openai/whisper-large-v3-turbo" model = AutoModelForSpeechSeq2Seq.from_pretrained( model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True @@ -472,7 +472,7 @@ checkpoints are summarised in the following table with links to the models on th | large | 1550 M | x | [✓](https://huggingface.co/openai/whisper-large) | | large-v2 | 1550 M | x | [✓](https://huggingface.co/openai/whisper-large-v2) | | large-v3 | 1550 M | x | [✓](https://huggingface.co/openai/whisper-large-v3) | -| large-v3-turbo | 809 M | x | 
[✓](https://huggingface.co/ylacombe/whisper-large-v3-turbo) | +| large-v3-turbo | 809 M | x | [✓](https://huggingface.co/openai/whisper-large-v3-turbo) | ## Fine-Tuning