diff --git a/model_index.json b/model_index.json new file mode 100644 index 0000000..0ee02d7 --- /dev/null +++ b/model_index.json @@ -0,0 +1,24 @@ +{ + "_class_name": "AllegroPipeline", + "_diffusers_version": "0.31.0.dev0", + "scheduler": [ + "diffusers", + "EulerAncestralDiscreteScheduler" + ], + "text_encoder": [ + "transformers", + "T5EncoderModel" + ], + "tokenizer": [ + "transformers", + "T5Tokenizer" + ], + "transformer": [ + "diffusers", + "AllegroTransformer3DModel" + ], + "vae": [ + "diffusers", + "AutoencoderKLAllegro" + ] +} diff --git a/transformer/config.json b/transformer/config.json index e7d8fa6..178383f 100644 --- a/transformer/config.json +++ b/transformer/config.json @@ -1,38 +1,30 @@ { "_class_name": "AllegroTransformer3DModel", - "_diffusers_version": "0.28.0", + "_diffusers_version": "0.31.0.dev0", "activation_fn": "gelu-approximate", "attention_bias": true, "attention_head_dim": 96, - "ca_attention_mode": "xformers", "caption_channels": 4096, "cross_attention_dim": 2304, - "double_self_attention": false, - "downsampler": null, "dropout": 0.0, "in_channels": 4, "interpolation_scale_h": 2.0, "interpolation_scale_t": 2.2, "interpolation_scale_w": 2.0, - "model_max_length": 300, "norm_elementwise_affine": false, "norm_eps": 1e-06, "norm_type": "ada_norm_single", "num_attention_heads": 24, - "num_embeds_ada_norm": 1000, "num_layers": 32, - "only_cross_attention": false, "out_channels": 4, "patch_size": 2, "patch_size_t": 1, - "sa_attention_mode": "flash", + "sample_frames": 22, + "sample_height": 90, "sample_size": [ 90, 160 ], "sample_size_t": 22, - "upcast_attention": false, - "use_additional_conditions": null, - "use_linear_projection": false, - "use_rope": true + "sample_width": 160 } diff --git a/vae/config.json b/vae/config.json index 9b540d4..347eb37 100644 --- a/vae/config.json +++ b/vae/config.json @@ -1,6 +1,6 @@ { - "_class_name": "AllegroAutoencoderKL3D", - "_diffusers_version": "0.28.0", + "_class_name": "AutoencoderKLAllegro", + "_diffusers_version": "0.31.0.dev0", "act_fn": "silu", "block_out_channels": [ 128, @@ -8,33 +8,37 @@ 512, 512 ], - "blocks_tempdown_li": [ - true, - true, - false, - false + "down_block_types": [ + "AllegroDownBlock3D", + "AllegroDownBlock3D", + "AllegroDownBlock3D", + "AllegroDownBlock3D" ], - "blocks_tempup_li": [ - false, - true, - true, - false - ], - "chunk_len": 24, - "down_block_num": 4, "force_upcast": true, "in_channels": 3, "latent_channels": 4, "layers_per_block": 2, - "load_mode": "full", "norm_num_groups": 32, "out_channels": 3, "sample_size": 320, - "scale_factor": 0.13, - "t_over": 8, - "tile_overlap": [ - 120, - 80 + "scaling_factor": 0.13, + "temporal_compression_ratio": 4, + "temporal_downsample_blocks": [ + true, + true, + false, + false ], - "up_block_num": 4 + "temporal_upsample_blocks": [ + false, + true, + true, + false + ], + "up_block_types": [ + "AllegroUpBlock3D", + "AllegroUpBlock3D", + "AllegroUpBlock3D", + "AllegroUpBlock3D" + ] }