From 4a70170c215b36a3cce4b4253f6d0612bb7d4146 Mon Sep 17 00:00:00 2001 From: Matthew Carrigan Date: Fri, 29 Sep 2023 14:32:25 +0000 Subject: [PATCH] Move to in-library checkpoint (for real this time) (#107) - Move to in-library checkpoint (e48d41620636bcb72a4f28b1e09308d1e3d1fdf7) --- config.json | 23 +- configuration_RW.py | 75 --- configuration_falcon.py | 152 ++++++ generation_config.json | 8 +- modelling_RW.py => modeling_falcon.py | 680 ++++++++++++++++---------- tokenizer_config.json | 6 +- 6 files changed, 591 insertions(+), 353 deletions(-) delete mode 100644 configuration_RW.py create mode 100644 configuration_falcon.py rename modelling_RW.py => modeling_falcon.py (59%) diff --git a/config.json b/config.json index a39a009..1644d27 100644 --- a/config.json +++ b/config.json @@ -2,16 +2,16 @@ "alibi": false, "apply_residual_connection_post_layernorm": false, "architectures": [ - "RWForCausalLM" + "FalconForCausalLM" ], "attention_dropout": 0.0, "auto_map": { - "AutoConfig": "configuration_RW.RWConfig", - "AutoModel": "modelling_RW.RWModel", - "AutoModelForSequenceClassification": "modelling_RW.RWForSequenceClassification", - "AutoModelForTokenClassification": "modelling_RW.RWForTokenClassification", - "AutoModelForQuestionAnswering": "modelling_RW.RWForQuestionAnswering", - "AutoModelForCausalLM": "modelling_RW.RWForCausalLM" + "AutoConfig": "configuration_falcon.FalconConfig", + "AutoModel": "modeling_falcon.FalconModel", + "AutoModelForSequenceClassification": "modeling_falcon.FalconForSequenceClassification", + "AutoModelForTokenClassification": "modeling_falcon.FalconForTokenClassification", + "AutoModelForQuestionAnswering": "modeling_falcon.FalconForQuestionAnswering", + "AutoModelForCausalLM": "modeling_falcon.FalconForCausalLM" }, "bias": false, "bos_token_id": 11, @@ -20,10 +20,11 @@ "hidden_size": 8192, "initializer_range": 0.02, "layer_norm_epsilon": 1e-05, - "model_type": "RefinedWeb", - "n_head": 128, - "n_head_kv": 8, - "n_layer": 60, + "model_type": "falcon", + "new_decoder_architecture": true, + "num_attention_heads": 128, + "num_hidden_layers": 60, + "num_kv_heads": 8, "parallel_attn": true, "torch_dtype": "bfloat16", "transformers_version": "4.27.4", diff --git a/configuration_RW.py b/configuration_RW.py deleted file mode 100644 index 85edccd..0000000 --- a/configuration_RW.py +++ /dev/null @@ -1,75 +0,0 @@ -# coding=utf-8 -# Copyright 2022 the Big Science Workshop and HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" Bloom configuration""" -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - - -class RWConfig(PretrainedConfig): - model_type = "RefinedWeb" - keys_to_ignore_at_inference = ["past_key_values"] - attribute_map = { - "num_hidden_layers": "n_layer", - "num_attention_heads": "n_head", - } - - def __init__( - self, - vocab_size=250880, - hidden_size=64, - n_layer=2, - n_head=8, - layer_norm_epsilon=1e-5, - initializer_range=0.02, - use_cache=True, - bos_token_id=1, - eos_token_id=2, - apply_residual_connection_post_layernorm=False, - hidden_dropout=0.0, - attention_dropout=0.0, - n_head_kv=None, - alibi=False, - **kwargs, - ): - self.vocab_size = vocab_size - # Backward compatibility with n_embed kwarg - n_embed = kwargs.pop("n_embed", None) - self.hidden_size = hidden_size if n_embed is None else n_embed - self.n_layer = n_layer - self.n_head = n_head - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range - self.use_cache = use_cache - self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm - self.hidden_dropout = hidden_dropout - self.attention_dropout = attention_dropout - - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.n_head_kv = n_head if n_head_kv is None else n_head_kv - self.alibi = alibi - - super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) - - @property - def head_dim(self): - return self.hidden_size // self.n_head - - @property - def rotary(self): - return not self.alibi diff --git a/configuration_falcon.py b/configuration_falcon.py new file mode 100644 index 0000000..def8c2b --- /dev/null +++ b/configuration_falcon.py @@ -0,0 +1,152 @@ +# coding=utf-8 +# Copyright 2023 the Falcon authors and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Falcon configuration""" +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +FALCON_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "tiiuae/falcon-40b": "https://huggingface.co/tiiuae/falcon-40b/resolve/main/config.json", + "tiiuae/falcon-7b": "https://huggingface.co/tiiuae/falcon-7b/resolve/main/config.json", +} + + +class FalconConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`FalconModel`]. It is used to instantiate a Falcon + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the + [tiiuae/falcon-7b](https://huggingface.co/tiiuae/falcon-7b) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. 
+ + + Args: + vocab_size (`int`, *optional*, defaults to 65024): + Vocabulary size of the Falcon model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`FalconModel`] + hidden_size (`int`, *optional*, defaults to 4544): + Dimension of the hidden representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 71): + Number of attention heads for each attention layer in the Transformer encoder. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + use_cache (`bool`, *optional*, defaults to `True`): + Whether the model should return the last key/values attentions (not used by all models). Only relevant if + `config.is_decoder=True`. + layer_norm_epsilon (`float`, *optional*, defaults to 1e-5): + The epsilon used by the layer normalization layers. + hidden_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for MLP layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for attention layers. + num_kv_heads (`int`, *optional*): + Number of key-value heads to use per attention layer. If unset, defaults to the same value as + `num_attention_heads`. + alibi (`bool`, *optional*, defaults to `False`): + Whether to use ALiBi positional biases during self-attention. + new_decoder_architecture (`bool`, *optional*, defaults to `False`): + Whether to use the new (Falcon-40B) decoder architecture. If `True`, the `multi_query` and `parallel_attn` + arguments are ignored, as the new decoder always uses parallel attention. + multi_query (`bool`, *optional*, defaults to `True`): + Whether to use multi-query attention in the decoder. Ignored when `new_decoder_architecture` is `True`. + parallel_attn (`bool`, *optional*, defaults to `True`): + Whether to compute attention in parallel with the feedforward layer. If False, they are consecutive + instead, as in the original Transformer architecture. Ignored when `new_decoder_architecture` is `True`. + bias (`bool`, *optional*, defaults to `False`): + Whether to use bias on Linear layers. + bos_token_id (`int`, *optional*, defaults to 11): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 11): + The id of the "end-of-sequence" token. + + Example: + + ```python + >>> from transformers import FalconModel, FalconConfig + + >>> # Initializing a small (2-layer) Falcon configuration + >>> configuration = FalconConfig(num_hidden_layers=2) + + >>> # Initializing a model from the small configuration + >>> model = FalconModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "falcon" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=65024, + hidden_size=4544, + num_hidden_layers=32, + num_attention_heads=71, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + use_cache=True, + hidden_dropout=0.0, + attention_dropout=0.0, + num_kv_heads=None, + alibi=False, + new_decoder_architecture=False, + multi_query=True, + parallel_attn=True, + bias=False, + bos_token_id=11, + eos_token_id=11, + **kwargs, + ): + logger.warning_once( + "\nWARNING: You are currently loading Falcon using legacy code contained in the model repository. 
Falcon has now been fully ported into the Hugging Face transformers library. " + "For the most up-to-date and high-performance version of the Falcon model code, please update to the latest version of transformers and then load the model " + "without the trust_remote_code=True argument.\n" + ) + self.vocab_size = vocab_size + # Backward compatibility with n_embed kwarg + n_embed = kwargs.pop("n_embed", None) + self.hidden_size = hidden_size if n_embed is None else n_embed + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.use_cache = use_cache + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + self.num_kv_heads = num_attention_heads if num_kv_heads is None else num_kv_heads + self.alibi = alibi + self.new_decoder_architecture = new_decoder_architecture + self.multi_query = multi_query # Ignored when new_decoder_architecture is True + self.parallel_attn = parallel_attn + self.bias = bias + + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + @property + def head_dim(self): + return self.hidden_size // self.num_attention_heads + + @property + def rotary(self): + return not self.alibi diff --git a/generation_config.json b/generation_config.json index 2cb1ddd..02b145e 100644 --- a/generation_config.json +++ b/generation_config.json @@ -1,6 +1,6 @@ { "_from_model_config": true, - "bos_token_id": 1, - "eos_token_id": 2, - "transformers_version": "4.27.4" -} + "bos_token_id": 11, + "eos_token_id": 11, + "transformers_version": "4.33.0.dev0" +} \ No newline at end of file diff --git a/modelling_RW.py b/modeling_falcon.py similarity index 59% rename from modelling_RW.py rename to modeling_falcon.py index f0c38a9..834822c 100644 --- a/modelling_RW.py +++ b/modeling_falcon.py @@ -1,9 +1,20 @@ -# port of models described in RW -# We use the bloom model as a starting point for these model. -# Please refer to the bloom models for usage instructions. +# coding=utf-8 +# Copyright 2023 the Falcon authors and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch Falcon model.""" import math -import warnings from typing import Optional, Tuple, Union import torch @@ -20,59 +31,60 @@ from transformers.modeling_outputs import ( TokenClassifierOutput, ) from transformers.modeling_utils import PreTrainedModel -from transformers.utils import logging -from .configuration_RW import RWConfig +from transformers.utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging +from .configuration_falcon import FalconConfig + logger = logging.get_logger(__name__) +FALCON_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "tiiuae/falcon-40b", + "tiiuae/falcon-40b-instruct", + "tiiuae/falcon-7b", + "tiiuae/falcon-7b-instruct", + "tiiuae/falcon-rw-7b", + "tiiuae/falcon-rw-1b", +] +_CHECKPOINT_FOR_DOC = "Rocketknight1/falcon-rw-1b" +_CONFIG_FOR_DOC = "FalconConfig" + + # NOTE(Hesslow): Unfortunately we did not fuse matmul and bias during training, this means that there's one additional quantization to bfloat16 between the operations. # In order not to degrade the quality of our HF-port, we keep these characteristics in the final model. -class Linear(nn.Linear): +class FalconLinear(nn.Linear): def forward(self, input: torch.Tensor) -> torch.Tensor: - ret = input @ self.weight.T + hidden_states = input @ self.weight.T if self.bias is None: - return ret - else: - return ret + self.bias + return hidden_states + return hidden_states + self.bias -from einops import rearrange - # rotary pos emb helpers (torch.jit.script does not seem to support staticmethod...) def rotate_half(x): x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=x1.ndim - 1) # dim=-1 triggers a bug in torch < 1.8.0 + return torch.cat((-x2, x1), dim=-1) -class RotaryEmbedding(torch.nn.Module): +class FalconRotaryEmbedding(nn.Module): """Implementation of RotaryEmbedding from GPT-NeoX. - This implementation is design to operate on queries and keys that are compatible with - [batch_size, n_heads_per_partition, seq_len, head_dim] (e.g. MinGPTAttention format). + This implementation is designed to operate on queries and keys that are compatible with `[batch_size, + n_heads_per_partition, seq_len, head_dim]` (e.g. MinGPTAttention format). 
""" - def __init__( - self, - head_dim: int, - base=10000, - ): + def __init__(self, head_dim: int, base=10000): super().__init__() inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim)) self.register_buffer("inv_freq", inv_freq, persistent=False) self.head_dim = head_dim - self.seq_len_cached = None - self.batch_size_cached = None + self.seq_len_cached = -1 self.cos_cached: torch.Tensor | None = None self.sin_cached: torch.Tensor | None = None - def cos_sin( - self, - seq_len: int, - device="cuda", - dtype=torch.bfloat16, - ) -> torch.Tensor: - if seq_len != self.seq_len_cached: - self.seq_len_cached = seq_len - t = torch.arange(seq_len, device=device).type_as(self.inv_freq) + def cos_sin(self, seq_len: int, past_key_values_length: int, device="cpu", dtype=torch.bfloat16) -> torch.Tensor: + total_length = seq_len + past_key_values_length + if total_length > self.seq_len_cached: + self.seq_len_cached = total_length + t = torch.arange(total_length, device=device, dtype=self.inv_freq.dtype) freqs = torch.einsum("i,j->ij", t, self.inv_freq) emb = torch.cat((freqs, freqs), dim=-1).to(device) @@ -85,36 +97,46 @@ class RotaryEmbedding(torch.nn.Module): self.cos_cached = self.cos_cached.type(dtype) self.sin_cached = self.sin_cached.type(dtype) - return self.cos_cached, self.sin_cached + return ( + self.cos_cached[:, past_key_values_length : seq_len + past_key_values_length], + self.sin_cached[:, past_key_values_length : seq_len + past_key_values_length], + ) - def forward(self, q, k): - batch, seq_len, head_dim = q.shape - cos, sin = self.cos_sin(seq_len, q.device, q.dtype) - return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin) + def forward(self, query, key, past_key_values_length=0): + batch, seq_len, head_dim = query.shape + cos, sin = self.cos_sin(seq_len, past_key_values_length, query.device, query.dtype) + return (query * cos) + (rotate_half(query) * sin), (key * cos) + (rotate_half(key) * sin) def _make_causal_mask( input_ids_shape: torch.Size, device: torch.device, past_key_values_length: int ) -> torch.BoolTensor: + """ + Make causal mask used for self-attention. This mask does not take the existing attention mask into account - it + just blocks tokens from attending forwards in the sequence. The output shape will be `[batch_size, 1, + target_length, target_length+past_key_values_length]`. + """ batch_size, target_length = input_ids_shape - mask = torch.empty((target_length, target_length + past_key_values_length), dtype=torch.bool, device=device) - # ONNX doesn't support `torch.Tensor.triu` properly, thus we use this workaround - seq_ids = torch.arange(target_length, device=device) - mask[:, past_key_values_length:] = seq_ids[:, None] < seq_ids[None, :] - - if past_key_values_length > 0: - mask[:, :past_key_values_length] = False + mask = torch.triu(torch.ones((target_length, target_length), dtype=torch.bool, device=device), diagonal=1) + # If past_key_values_length is 0 this is an empty tensor and the concatenation is a no-op. + # This code style is an unfortunate consequence of getting your TF engineer to port models; doing it this + # way avoids a data-dependent conditional, which will help me when I have to port this to XLA later. 
+ past_mask = torch.zeros((target_length, past_key_values_length), dtype=torch.bool, device=device) + mask = torch.cat([past_mask, mask], dim=-1) expanded_mask = mask[None, None, :, :].expand(batch_size, 1, target_length, target_length + past_key_values_length) return expanded_mask -def _expand_mask(mask: torch.Tensor, tgt_length: int) -> torch.BoolTensor: - batch_size, src_length = mask.shape - tgt_length = tgt_length if tgt_length is not None else src_length +def _expand_mask(mask: torch.Tensor, past_key_values_length: int) -> torch.BoolTensor: + """ + Expands attention_mask from `[batch_size, seq_length]` to `[batch_size, 1, seq_length, seq_length + past_length]`. + """ + batch_size, total_length = mask.shape + seq_length = total_length - past_key_values_length if past_key_values_length is not None else total_length expanded_mask = ~(mask[:, None, None, :].to(torch.bool)) - return expanded_mask.expand(batch_size, 1, tgt_length, src_length) + return expanded_mask.expand(batch_size, 1, seq_length, total_length) def build_alibi_tensor(attention_mask: torch.Tensor, num_heads: int, dtype: torch.dtype) -> torch.Tensor: @@ -145,18 +167,32 @@ def build_alibi_tensor(attention_mask: torch.Tensor, num_heads: int, dtype: torc return alibi.reshape(batch_size * num_heads, 1, seq_length).to(dtype) +# Copied from transformers.models.bloom.modeling_bloom.dropout_add def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training: bool) -> torch.Tensor: + """ + Dropout add function + + Args: + x (`torch.tensor`, *required*): + input tensor + residual (`torch.tensor`, *required*): + residual tensor + prob (`float`, *required*): + dropout probability + training (`bool`, *required*): + training mode + """ out = F.dropout(x, p=prob, training=training) out = residual + out return out -class Attention(nn.Module): - def __init__(self, config: RWConfig): +class FalconAttention(nn.Module): + def __init__(self, config: FalconConfig): super().__init__() self.hidden_size = config.hidden_size - self.num_heads = config.n_head + self.num_heads = config.num_attention_heads self.head_dim = self.hidden_size // self.num_heads self.split_size = self.hidden_size self.hidden_dropout = config.hidden_dropout @@ -167,59 +203,62 @@ class Attention(nn.Module): f" {self.num_heads})." 
) - self.maybe_rotary = RotaryEmbedding(config.head_dim) if config.rotary else lambda q, k: (q, k) + self.maybe_rotary = FalconRotaryEmbedding(config.head_dim) if config.rotary else lambda q, k, t: (q, k) # Layer-wise attention scaling self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim) self.beta = self.inv_norm_factor - - self.query_key_value = Linear( - self.hidden_size, - (config.n_head_kv * 2 + config.n_head) * self.head_dim, - bias=config.bias, - ) - self.dense = Linear(self.hidden_size, self.hidden_size, bias=config.bias) + if config.new_decoder_architecture: + qkv_out_dim = (config.num_kv_heads * 2 + config.num_attention_heads) * self.head_dim + elif config.multi_query: + qkv_out_dim = self.hidden_size + 2 * self.head_dim + else: + qkv_out_dim = 3 * self.hidden_size + self.query_key_value = FalconLinear(self.hidden_size, qkv_out_dim, bias=config.bias) + self.new_decoder_architecture = config.new_decoder_architecture + self.multi_query = config.multi_query + self.dense = FalconLinear(self.hidden_size, self.hidden_size, bias=config.bias) self.attention_dropout = nn.Dropout(config.attention_dropout) - self.num_kv = config.n_head_kv + self.num_kv_heads = config.num_kv_heads if (self.new_decoder_architecture or not self.multi_query) else 1 def _split_heads(self, fused_qkv: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ - Split the last dimension into (num_heads, head_dim), results share same memory - storage as `fused_qkv` + Split the last dimension into (num_heads, head_dim), results share same memory storage as `fused_qkv` Args: fused_qkv (`torch.tensor`, *required*): [batch_size, seq_length, num_heads * 3 * head_dim] Returns: - query: [batch_size, seq_length, num_heads, head_dim] - key: [batch_size, seq_length, num_heads, head_dim] + query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim] value: [batch_size, seq_length, num_heads, head_dim] """ - batch, seq_len, _ = fused_qkv.shape - qkv = fused_qkv.view(batch, seq_len, -1, self.num_heads // self.num_kv + 2, 64) - q = qkv[:, :, :, :-2] - k = qkv[:, :, :, [-2]] - v = qkv[:, :, :, [-1]] - k = torch.broadcast_to(k, q.shape) - v = torch.broadcast_to(v, q.shape) + if self.new_decoder_architecture: + batch, seq_len, _ = fused_qkv.shape + qkv = fused_qkv.view(batch, seq_len, -1, self.num_heads // self.num_kv_heads + 2, self.head_dim) + query = qkv[:, :, :, :-2] + key = qkv[:, :, :, [-2]] + value = qkv[:, :, :, [-1]] + key = torch.broadcast_to(key, query.shape) + value = torch.broadcast_to(value, query.shape) - q, k, v = [ - rearrange( - x, - "batch seq_len group num_heads head_dim ->\ - batch seq_len (group num_heads) head_dim", - head_dim=self.head_dim, - ) - for x in [q, k, v] - ] - return q, k, v + query, key, value = [x.flatten(2, 3) for x in (query, key, value)] + return query, key, value + elif not self.multi_query: + batch_size, seq_length, three_times_hidden_size = fused_qkv.shape + fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads, 3, self.head_dim) + return fused_qkv[..., 0, :], fused_qkv[..., 1, :], fused_qkv[..., 2, :] + else: + batch_size, seq_length, three_times_hidden_size = fused_qkv.shape + fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads + 2, self.head_dim) + return fused_qkv[..., :-2, :], fused_qkv[..., [-2], :], fused_qkv[..., [-1], :] + # Copied from transformers.models.bloom.modeling_bloom.BloomAttention._merge_heads def _merge_heads(self, x: torch.Tensor) -> torch.Tensor: """ Merge heads together over the last 
dimenstion Args: - x: (`torch.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim] + x (`torch.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim] Returns: torch.tensor: [batch_size, seq_length, num_heads * head_dim] @@ -242,7 +281,7 @@ class Attention(nn.Module): def forward( self, hidden_states: torch.Tensor, - alibi: torch.Tensor, + alibi: Optional[torch.Tensor], attention_mask: torch.Tensor, layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, head_mask: Optional[torch.Tensor] = None, @@ -250,106 +289,120 @@ class Attention(nn.Module): output_attentions: bool = False, ): fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] - + num_kv_heads = self.num_heads if self.new_decoder_architecture else self.num_kv_heads # 3 x [batch_size, seq_length, num_heads, head_dim] (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv) - batch_size, q_length, _, _ = query_layer.shape + batch_size, query_length, _, _ = query_layer.shape - query_layer = query_layer.transpose(1, 2).reshape(batch_size * self.num_heads, q_length, self.head_dim) + query_layer = query_layer.transpose(1, 2).reshape(batch_size * self.num_heads, query_length, self.head_dim) key_layer = key_layer.transpose(1, 2).reshape( - batch_size * self.num_heads, - q_length, + batch_size * num_kv_heads, + query_length, self.head_dim, ) - value_layer = value_layer.transpose(1, 2).reshape(batch_size * self.num_heads, q_length, self.head_dim) + value_layer = value_layer.transpose(1, 2).reshape(batch_size * num_kv_heads, query_length, self.head_dim) - query_layer, key_layer = self.maybe_rotary(query_layer, key_layer) + past_kv_length = 0 if layer_past is None else layer_past[0].shape[1] + query_layer, key_layer = self.maybe_rotary(query_layer, key_layer, past_kv_length) if layer_past is not None: past_key, past_value = layer_past # concatenate along seq_length dimension: - # - key: [batch_size * self.num_heads, head_dim, kv_length] + # - key: [batch_size * self.num_heads, kv_length, head_dim] # - value: [batch_size * self.num_heads, kv_length, head_dim] key_layer = torch.cat((past_key, key_layer), dim=1) value_layer = torch.cat((past_value, value_layer), dim=1) _, kv_length, _ = key_layer.shape - - if use_cache is True: + if use_cache: present = (key_layer, value_layer) else: present = None + attention_mask_float = (attention_mask * 1.0).masked_fill(attention_mask, float("-1e9")).to(query_layer.dtype) + + query_layer_ = query_layer.reshape(batch_size, self.num_heads, -1, self.head_dim) + key_layer_ = key_layer.reshape(batch_size, num_kv_heads, -1, self.head_dim) + value_layer_ = value_layer.reshape(batch_size, num_kv_heads, -1, self.head_dim) + if alibi is None: - query_layer_ = query_layer.reshape(batch_size, self.num_heads, -1, self.head_dim) - key_layer_ = key_layer.reshape(batch_size, self.num_heads, -1, self.head_dim) - value_layer_ = value_layer.reshape(batch_size, self.num_heads, -1, self.head_dim) + if output_attentions: + # F.scaled_dot_product_attention doesn't return the attention weights, so we have + # to do it by hand if we want them + attention_scores = query_layer_ @ key_layer_.transpose(-1, -2) + attention_scores /= math.sqrt(self.head_dim) - attn_output = F.scaled_dot_product_attention( - query_layer_, key_layer_, value_layer_, None, 0.0, is_causal=True - ) + attention_scores = F.softmax( + attention_scores + attention_mask_float, dim=-1, dtype=hidden_states.dtype + ) + attn_output = attention_scores @ value_layer_ + else: + 
attn_output = F.scaled_dot_product_attention( + query_layer_, key_layer_, value_layer_, attention_mask_float, 0.0, is_causal=False + ) + attention_scores = None - x = attn_output.view(batch_size, self.num_heads, q_length, self.head_dim) - x = x.permute(0, 2, 1, 3) - attn_output = x.reshape(batch_size, q_length, self.num_heads * self.head_dim) + attn_output = attn_output.view(batch_size, self.num_heads, query_length, self.head_dim) + attn_output = attn_output.permute(0, 2, 1, 3) + attn_output = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim) output_tensor = self.dense(attn_output) - outputs = (output_tensor, present) - assert not output_attentions # not supported. - return outputs + if output_attentions: + return output_tensor, present, attention_scores + else: + return output_tensor, present + else: - attention_mask_float = (attention_mask * 1.0).masked_fill(attention_mask, -1e9).to(torch.bfloat16) - matmul_result = query_layer @ key_layer.transpose(-1, -2) + matmul_result = query_layer_ @ key_layer_.transpose(-1, -2) # change view to [batch_size, num_heads, q_length, kv_length] - attention_scores = matmul_result.view(batch_size, self.num_heads, q_length, kv_length) + attention_scores = matmul_result.view(batch_size, self.num_heads, query_length, kv_length) # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length] input_dtype = attention_scores.dtype # `float16` has a minimum value of -65504.0, whereas `bfloat16` and `float32` have a minimum value of `-3.4e+38` if input_dtype == torch.float16 or input_dtype == torch.bfloat16: attention_scores = attention_scores.to(torch.float32) - # attn_weights = torch.masked_fill(attention_scores, attention_mask, torch.finfo(attention_scores.dtype).min) - attention_probs = F.softmax( - (attention_scores + alibi.view(batch_size, self.num_heads, 1, -1)) * self.inv_norm_factor - + attention_mask_float, - dim=-1, - dtype=hidden_states.dtype, - ) + # Matt (HF) note: We could possibly use F.scaled_dot_product_attention here too, by + # adding (alibi * self.inv_norm_factor) to attention_mask_float. I think this would be mathematically + # equivalent and more performant, but there might be a numerical difference. If you're reading this + # and you'd like to experiment and maybe file a PR, feel free! 
+ attention_logits = attention_scores + alibi.view(batch_size, self.num_heads, 1, -1) + attention_logits *= self.inv_norm_factor + attention_probs = F.softmax(attention_logits + attention_mask_float, dim=-1, dtype=hidden_states.dtype) # [batch_size, num_heads, q_length, kv_length] attention_probs = self.attention_dropout(attention_probs) if head_mask is not None: attention_probs = attention_probs * head_mask - # change view [batch_size x num_heads, q_length, kv_length] - attention_probs_reshaped = attention_probs.view(batch_size * self.num_heads, q_length, kv_length) + # change view [batch_size, num_heads, q_length, kv_length] + attention_probs_reshaped = attention_probs.view(batch_size, self.num_heads, query_length, kv_length) # matmul: [batch_size * num_heads, q_length, head_dim] - context_layer = attention_probs_reshaped @ value_layer + context_layer = (attention_probs_reshaped @ value_layer_).flatten(0, 1) # change view [batch_size, num_heads, q_length, head_dim] context_layer = self._merge_heads(context_layer) output_tensor = self.dense(context_layer) - outputs = (output_tensor, present) if output_attentions: - outputs += (attention_probs,) - - return outputs + return output_tensor, present, attention_probs + else: + return output_tensor, present -class MLP(nn.Module): - def __init__(self, config: RWConfig): +class FalconMLP(nn.Module): + def __init__(self, config: FalconConfig): super().__init__() hidden_size = config.hidden_size - self.dense_h_to_4h = Linear(hidden_size, 4 * hidden_size, bias=config.bias) + self.dense_h_to_4h = FalconLinear(hidden_size, 4 * hidden_size, bias=config.bias) self.act = nn.GELU() - self.dense_4h_to_h = Linear(4 * hidden_size, hidden_size, bias=config.bias) + self.dense_4h_to_h = FalconLinear(4 * hidden_size, hidden_size, bias=config.bias) self.hidden_dropout = config.hidden_dropout def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -358,43 +411,47 @@ class MLP(nn.Module): return x -class DecoderLayer(nn.Module): - def __init__(self, config: RWConfig): +class FalconDecoderLayer(nn.Module): + def __init__(self, config: FalconConfig): super().__init__() hidden_size = config.hidden_size - - self.ln_attn = LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - self.ln_mlp = LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - - self.num_heads = config.n_head - self.self_attention = Attention(config) - - self.mlp = MLP(config) - - self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm + self.num_heads = config.num_attention_heads + self.self_attention = FalconAttention(config) + self.mlp = FalconMLP(config) self.hidden_dropout = config.hidden_dropout - self.config = config + if config.new_decoder_architecture: + # The layer norm before self-attention + self.ln_attn = LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + # The layer norm before the MLP + self.ln_mlp = LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + else: + self.input_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + if not config.parallel_attn: + self.post_attention_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + def forward( self, hidden_states: torch.Tensor, - alibi: torch.Tensor, + alibi: Optional[torch.Tensor], attention_mask: torch.Tensor, layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, head_mask: Optional[torch.Tensor] = None, use_cache: bool = False, output_attentions: bool = False, ): - - ln_attn = self.ln_attn(hidden_states) - ln_mlp = self.ln_mlp(hidden_states) - residual = 
hidden_states + if self.config.new_decoder_architecture: + attention_layernorm_out = self.ln_attn(hidden_states) + mlp_layernorm_out = self.ln_mlp(hidden_states) + else: + attention_layernorm_out = self.input_layernorm(hidden_states) + # Self attention. attn_outputs = self.self_attention( - ln_attn, + attention_layernorm_out, layer_past=layer_past, attention_mask=attention_mask, alibi=alibi, @@ -405,14 +462,24 @@ class DecoderLayer(nn.Module): attention_output = attn_outputs[0] + if not self.config.new_decoder_architecture: + if self.config.parallel_attn: + mlp_layernorm_out = attention_layernorm_out + else: + residual = dropout_add( + attention_output, residual, self.config.attention_dropout, training=self.training + ) + mlp_layernorm_out = self.post_attention_layernorm(residual) + outputs = attn_outputs[1:] # MLP. - mlp_output = self.mlp(ln_mlp) + mlp_output = self.mlp(mlp_layernorm_out) - output = dropout_add( - mlp_output + attention_output, residual, self.config.hidden_dropout, training=self.training - ) + if self.config.new_decoder_architecture or self.config.parallel_attn: + mlp_output += attention_output + + output = dropout_add(mlp_output, residual, self.config.hidden_dropout, training=self.training) if use_cache: outputs = (output,) + outputs @@ -422,24 +489,93 @@ class DecoderLayer(nn.Module): return outputs # hidden_states, present, attentions -class RWPreTrainedModel(PreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"] +FALCON_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`FalconConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +FALCON_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0][0].shape[2]` + (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary. + + If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as + `input_ids`. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + past_key_values (`Tuple[Tuple[torch.Tensor]]` of length `config.num_hidden_layers`): + Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see + `past_key_values` output below). Can be used to speed up sequential decoding. The `input_ids` which have + their past given to this model should not be passed as `input_ids` as they have already been computed. 
+ + Each element of `past_key_values` is a tuple (past_key, past_value): + - past_key: [batch_size * num_heads, head_dim, kv_length] + - past_value: [batch_size * num_heads, kv_length, head_dim] + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + + If `past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see + `past_key_values`). + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. +""" + + +class FalconPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. 
""" - config_class = RWConfig + config_class = FalconConfig base_model_prefix = "transformer" supports_gradient_checkpointing = True - _no_split_modules = ["DecoderLayer"] + _no_split_modules = ["FalconDecoderLayer"] def __init__(self, *inputs, **kwargs): super().__init__(*inputs, **kwargs) def _init_weights(self, module: nn.Module): """Initialize the weights.""" - if isinstance(module, nn.Linear) or isinstance(module, Linear): + if isinstance(module, nn.Linear) or isinstance(module, FalconLinear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) @@ -453,26 +589,28 @@ class RWPreTrainedModel(PreTrainedModel): module.bias.data.zero_() module.weight.data.fill_(1.0) + # Copied from transformers.models.bloom.modeling_bloom.BloomPreTrainedModel._set_gradient_checkpointing with BloomModel->FalconModel def _set_gradient_checkpointing(self, module: nn.Module, value: bool = False): - if isinstance(module, RWModel): + if isinstance(module, FalconModel): module.gradient_checkpointing = value @staticmethod - def _convert_to_standard_cache( + def _convert_cache_to_standard_format( past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]], batch_size: int ) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]: """ Standardizes the format of the cache so as to match most implementations, i.e. to tuple(tuple([batch_size, num_heads, ...])) """ - batch_size_times_num_heads, head_dim, seq_length = past_key_value[0][0].shape + batch_size_times_num_heads, kv_length, head_dim = past_key_value[0][0].shape + # [batch_size * self.num_heads, kv_length, head_dim] -> [batch_size, num_heads, kv_length, head_dim] + # Note that don't want to use self.num_attention_heads because the number of heads may vary depending + # on whether we use multi_query attention. 
num_heads = batch_size_times_num_heads // batch_size - # key: [batch_size * num_heads, head_dim, seq_length] -> [batch_size, num_heads, head_dim, seq_length] - # value: [batch_size * num_heads, seq_length, head_dim] -> [batch_size, num_heads, seq_length, head_dim] return tuple( ( - layer_past[0].view(batch_size, num_heads, head_dim, seq_length), - layer_past[1].view(batch_size, num_heads, seq_length, head_dim), + layer_past[0].view(batch_size, num_heads, kv_length, head_dim), + layer_past[1].view(batch_size, num_heads, kv_length, head_dim), ) for layer_past in past_key_value ) @@ -481,32 +619,35 @@ class RWPreTrainedModel(PreTrainedModel): def _convert_to_rw_cache( past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]] ) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]: - batch_size, num_heads, head_dim, seq_length = past_key_value[0][0].shape + batch_size, num_heads, kv_length, head_dim = past_key_value[0][0].shape batch_size_times_num_heads = batch_size * num_heads - # key: [batch_size, num_heads, head_dim, seq_length] -> [batch_size * num_heads, head_dim, seq_length] - # value: [batch_size, num_heads, seq_length, head_dim] -> [batch_size * num_heads, seq_length, head_dim] + # [batch_size, num_heads, kv_length, head_dim] -> [batch_size * num_heads, kv_length, head_dim] return tuple( ( - layer_past[0].view(batch_size_times_num_heads, head_dim, seq_length), - layer_past[1].view(batch_size_times_num_heads, seq_length, head_dim), + layer_past[0].view(batch_size_times_num_heads, kv_length, head_dim), + layer_past[1].view(batch_size_times_num_heads, kv_length, head_dim), ) for layer_past in past_key_value ) -class RWModel(RWPreTrainedModel): - def __init__(self, config: RWConfig): +@add_start_docstrings( + "The bare Falcon Model transformer outputting raw hidden-states without any specific head on top.", + FALCON_START_DOCSTRING, +) +class FalconModel(FalconPreTrainedModel): + def __init__(self, config: FalconConfig): super().__init__(config) self.embed_dim = config.hidden_size - self.num_heads = config.n_head - self.alibi = config.alibi + self.num_heads = config.num_attention_heads + self.use_alibi = config.alibi # Embedding + LN Embedding self.word_embeddings = nn.Embedding(config.vocab_size, self.embed_dim) # Transformer blocks - self.h = nn.ModuleList([DecoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.h = nn.ModuleList([FalconDecoderLayer(config) for _ in range(config.num_hidden_layers)]) # Final Layer Norm self.ln_f = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) @@ -519,22 +660,31 @@ class RWModel(RWPreTrainedModel): def get_input_embeddings(self): return self.word_embeddings + @staticmethod def _prepare_attn_mask( - self, attention_mask: torch.Tensor, input_shape: Tuple[int, int], past_key_values_length: int + attention_mask: torch.Tensor, input_shape: Tuple[int, int], past_key_values_length: int ) -> torch.BoolTensor: - # create causal mask - # [batch_size, seq_length] -> [batch_size, 1, tgt_length, src_length] + # Create a causal mask + # The attention mask we receive as input should cover the whole extended sequence, including any past + # cache, so its shape should be [batch_size, seq_length + past_key_values_length] + # The output shape will be [batch_size, 1, seq_length, seq_length + past_key_values_length] + if input_shape[1] + past_key_values_length != attention_mask.shape[1]: + raise ValueError( + "Attention mask shape should be (batch_size, seq_length + past_key_values_length)" + f" but is {attention_mask.shape} with input_ids shape {input_shape} 
and past length" + f" {past_key_values_length}." + ) combined_attention_mask = None device = attention_mask.device - _, src_length = input_shape + _, seq_length = input_shape - if src_length > 1: + if seq_length > 1: combined_attention_mask = _make_causal_mask( input_shape, device=device, past_key_values_length=past_key_values_length ) - # [batch_size, seq_length] -> [batch_size, 1, tgt_length, src_length] - expanded_attn_mask = _expand_mask(attention_mask, tgt_length=src_length) + # [batch_size, seq_length + past_key_values_length] -> [batch_size, 1, seq_length, seq_length + past_key_values_length] + expanded_attn_mask = _expand_mask(attention_mask, past_key_values_length=past_key_values_length) combined_attention_mask = ( expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask | combined_attention_mask ) @@ -544,6 +694,12 @@ class RWModel(RWPreTrainedModel): def set_input_embeddings(self, new_embeddings: torch.Tensor): self.word_embeddings = new_embeddings + @add_start_docstrings_to_model_forward(FALCON_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPastAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -555,18 +711,7 @@ class RWModel(RWPreTrainedModel): output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - **deprecated_arguments, ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]: - if deprecated_arguments.pop("position_ids", False) is not False: - # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` - warnings.warn( - "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. 
You can safely ignore" - " passing `position_ids`.", - FutureWarning, - ) - if len(deprecated_arguments) > 0: - raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -585,12 +730,14 @@ class RWModel(RWPreTrainedModel): if past_key_values is None: past_key_values = tuple([None] * len(self.h)) + else: + past_key_values = self._convert_to_rw_cache(past_key_values) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape batch_size x num_heads x N x N # head_mask has shape n_layer x batch x num_heads x N x N - head_mask = self.get_head_mask(head_mask, self.config.n_layer) + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) @@ -602,17 +749,15 @@ class RWModel(RWPreTrainedModel): all_hidden_states = () if output_hidden_states else None # Compute alibi tensor: check build_alibi_tensor documentation - seq_length_with_past = seq_length past_key_values_length = 0 if past_key_values[0] is not None: - past_key_values_length = past_key_values[0][0].shape[2] - seq_length_with_past = seq_length_with_past + past_key_values_length + past_key_values_length = past_key_values[0][0].shape[1] # 1 because RW-cache, not standard format if attention_mask is None: - attention_mask = torch.ones((batch_size, seq_length_with_past), device=hidden_states.device) + attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=hidden_states.device) else: attention_mask = attention_mask.to(hidden_states.device) - if self.alibi: + if self.use_alibi: alibi = build_alibi_tensor(attention_mask, self.num_heads, dtype=hidden_states.dtype) else: alibi = None @@ -624,12 +769,10 @@ class RWModel(RWPreTrainedModel): ) for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): - if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
@@ -674,6 +817,9 @@ class RWModel(RWPreTrainedModel): if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) + if presents is not None: + presents = self._convert_cache_to_standard_format(presents, batch_size) + if not return_dict: return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) @@ -685,12 +831,16 @@ class RWModel(RWPreTrainedModel): ) -class RWForCausalLM(RWPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"] +@add_start_docstrings( + "The Falcon Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings).", + FALCON_START_DOCSTRING, +) +class FalconForCausalLM(FalconPreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] - def __init__(self, config: RWConfig): + def __init__(self, config: FalconConfig): super().__init__(config) - self.transformer = RWModel(config) + self.transformer = FalconModel(config) self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) # Initialize weights and apply final processing @@ -705,25 +855,26 @@ class RWForCausalLM(RWPreTrainedModel): def prepare_inputs_for_generation( self, input_ids: torch.LongTensor, - past: Optional[torch.Tensor] = None, + past_key_values: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, **kwargs, ) -> dict: - # only last token for input_ids if past is not None - if past: - input_ids = input_ids[:, -1].unsqueeze(-1) - - # the cache may be in the stardard format (e.g. in contrastive search), convert to our's format if needed - if past[0][0].shape[0] == input_ids.shape[0]: - past = self._convert_to_rw_cache(past) + if past_key_values is not None: + input_ids = input_ids[:, -1:] return { "input_ids": input_ids, - "past_key_values": past, + "past_key_values": past_key_values, "use_cache": kwargs.get("use_cache"), "attention_mask": attention_mask, } + @add_start_docstrings_to_model_forward(FALCON_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=CausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -736,7 +887,6 @@ class RWForCausalLM(RWPreTrainedModel): output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - **deprecated_arguments, ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): @@ -744,15 +894,6 @@ class RWForCausalLM(RWPreTrainedModel): `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` """ - if deprecated_arguments.pop("position_ids", False) is not False: - # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` - warnings.warn( - "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. 
You can safely ignore" - " passing `position_ids`.", - FutureWarning, - ) - if len(deprecated_arguments) > 0: - raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -805,7 +946,6 @@ class RWForCausalLM(RWPreTrainedModel): Output shares the same memory storage as `past`. """ - standardized_past = self._convert_to_standard_cache(past, batch_size=len(beam_idx)) # Get a copy of `beam_idx` on all the devices where we need those indices. device_to_beam_idx = { @@ -816,23 +956,42 @@ class RWForCausalLM(RWPreTrainedModel): layer_past[0].index_select(0, device_to_beam_idx[layer_past[0].device]), layer_past[1].index_select(0, device_to_beam_idx[layer_past[0].device]), ) - for layer_past in standardized_past + for layer_past in past ) - return self._convert_to_rw_cache(reordered_past) + return reordered_past -class RWForSequenceClassification(RWPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"] +@add_start_docstrings( + """ + The Falcon Model transformer with a sequence classification head on top (linear layer). - def __init__(self, config: RWConfig): + [`FalconForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-1) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + FALCON_START_DOCSTRING, +) +class FalconForSequenceClassification(FalconPreTrainedModel): + def __init__(self, config: FalconConfig): super().__init__(config) self.num_labels = config.num_labels - self.transformer = RWModel(config) + self.transformer = FalconModel(config) self.score = nn.Linear(config.hidden_size, config.num_labels, bias=False) # Initialize weights and apply final processing self.post_init() + @add_start_docstrings_to_model_forward(FALCON_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=SequenceClassifierOutputWithPast, + config_class=_CONFIG_FOR_DOC, + ) def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -845,7 +1004,6 @@ class RWForSequenceClassification(RWPreTrainedModel): output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - **deprecated_arguments, ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutputWithPast]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -853,15 +1011,6 @@ class RWForSequenceClassification(RWPreTrainedModel): config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - if deprecated_arguments.pop("position_ids", False) is not False: - # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` - warnings.warn( - "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. 
You can safely ignore" - " passing `position_ids`.", - FutureWarning, - ) - if len(deprecated_arguments) > 0: - raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -936,17 +1085,22 @@ class RWForSequenceClassification(RWPreTrainedModel): ) -class RWForTokenClassification(RWPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"] - - def __init__(self, config: RWConfig): +@add_start_docstrings( + """ + Falcon Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + FALCON_START_DOCSTRING, +) +class FalconForTokenClassification(FalconPreTrainedModel): + def __init__(self, config: FalconConfig): super().__init__(config) self.num_labels = config.num_labels - self.transformer = RWModel(config) - if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None: + self.transformer = FalconModel(config) + if getattr(config, "classifier_dropout", None) is not None: classifier_dropout = config.classifier_dropout - elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None: + elif getattr(config, "hidden_dropout", None) is not None: classifier_dropout = config.hidden_dropout else: classifier_dropout = 0.1 @@ -956,6 +1110,12 @@ class RWForTokenClassification(RWPreTrainedModel): # Initialize weights and apply final processing self.post_init() + @add_start_docstrings_to_model_forward(FALCON_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -968,7 +1128,6 @@ class RWForTokenClassification(RWPreTrainedModel): output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - **deprecated_arguments, ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -976,15 +1135,6 @@ class RWForTokenClassification(RWPreTrainedModel): config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - if deprecated_arguments.pop("position_ids", False) is not False: - # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` - warnings.warn( - "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. 
You can safely ignore" - " passing `position_ids`.", - FutureWarning, - ) - if len(deprecated_arguments) > 0: - raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1008,7 +1158,9 @@ class RWForTokenClassification(RWPreTrainedModel): if labels is not None: batch_size, seq_length = labels.shape loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length)) + loss = loss_fct( + logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length) + ) if not return_dict: output = (logits,) + transformer_outputs[2:] @@ -1022,22 +1174,27 @@ class RWForTokenClassification(RWPreTrainedModel): ) -class RWForQuestionAnswering(RWPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"] - +@add_start_docstrings( + """ + The Falcon Model transformer with a span classification head on top for extractive question-answering tasks like + SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + FALCON_START_DOCSTRING, +) +class FalconForQuestionAnswering(FalconPreTrainedModel): def __init__(self, config): super().__init__(config) - self.transformer = RWModel(config) + self.transformer = FalconModel(config) self.qa_outputs = nn.Linear(config.hidden_size, 2) # Initialize weights and apply final processing self.post_init() + @add_start_docstrings_to_model_forward(FALCON_INPUTS_DOCSTRING) def forward( self, input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, head_mask: Optional[torch.FloatTensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None, start_positions: Optional[torch.LongTensor] = None, @@ -1061,7 +1218,6 @@ class RWForQuestionAnswering(RWPreTrainedModel): outputs = self.transformer( input_ids, attention_mask=attention_mask, - position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, diff --git a/tokenizer_config.json b/tokenizer_config.json index 5b966c7..0216f62 100644 --- a/tokenizer_config.json +++ b/tokenizer_config.json @@ -1,7 +1,11 @@ { "add_prefix_space": false, "eos_token": "<|endoftext|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], "model_max_length": 2048, "special_tokens_map_file": null, "tokenizer_class": "PreTrainedTokenizerFast" -} +} \ No newline at end of file
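The practical effect of this patch is that the checkpoint now resolves to the Falcon classes that ship with `transformers` (note the `"model_type": "falcon"` and `FalconForCausalLM` entries in `config.json`, and the deprecation warning added to `configuration_falcon.py`), so `trust_remote_code=True` is no longer needed when loading. Below is a minimal loading sketch, not part of the patch itself; it assumes a recent `transformers` release (>= 4.33, which registers the `falcon` model type), that `accelerate` is installed for `device_map="auto"`, and that the `tiiuae/falcon-40b` checkpoint listed in `FALCON_PRETRAINED_MODEL_ARCHIVE_LIST` is the target. The prompt text and generation settings are illustrative only.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "tiiuae/falcon-40b"  # illustrative; any Falcon checkpoint using the in-library code path

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# No trust_remote_code=True: with "model_type": "falcon" in config.json, AutoModelForCausalLM
# resolves to transformers' own FalconForCausalLM instead of modeling_falcon.py in this repo.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16" in config.json
    device_map="auto",           # requires the accelerate package; adjust to your hardware
)

inputs = tokenizer("The Falcon models were trained on", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))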