support sdxl

2026-04-03 17:56:15 +02:00 · 2023-11-10 11:57:39 +08:00
parent 60dfd554c0
commit d6f459dbd6
111 changed files with 5620 additions and 3750 deletions
--- a/animatediff/models/attention.py
+++ b/animatediff/models/attention.py
@@ -1,300 +0,0 @@
-# Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py
-
-from dataclasses import dataclass
-from typing import Optional
-
-import torch
-import torch.nn.functional as F
-from torch import nn
-
-from diffusers.configuration_utils import ConfigMixin, register_to_config
-from diffusers.modeling_utils import ModelMixin
-from diffusers.utils import BaseOutput
-from diffusers.utils.import_utils import is_xformers_available
-from diffusers.models.attention import CrossAttention, FeedForward, AdaLayerNorm
-
-from einops import rearrange, repeat
-import pdb
-
-@dataclass
-class Transformer3DModelOutput(BaseOutput):
-    sample: torch.FloatTensor
-
-
-if is_xformers_available():
-    import xformers
-    import xformers.ops
-else:
-    xformers = None
-
-
-class Transformer3DModel(ModelMixin, ConfigMixin):
-    @register_to_config
-    def __init__(
-        self,
-        num_attention_heads: int = 16,
-        attention_head_dim: int = 88,
-        in_channels: Optional[int] = None,
-        num_layers: int = 1,
-        dropout: float = 0.0,
-        norm_num_groups: int = 32,
-        cross_attention_dim: Optional[int] = None,
-        attention_bias: bool = False,
-        activation_fn: str = "geglu",
-        num_embeds_ada_norm: Optional[int] = None,
-        use_linear_projection: bool = False,
-        only_cross_attention: bool = False,
-        upcast_attention: bool = False,
-
-        unet_use_cross_frame_attention=None,
-        unet_use_temporal_attention=None,
-    ):
-        super().__init__()
-        self.use_linear_projection = use_linear_projection
-        self.num_attention_heads = num_attention_heads
-        self.attention_head_dim = attention_head_dim
-        inner_dim = num_attention_heads * attention_head_dim
-
-        # Define input layers
-        self.in_channels = in_channels
-
-        self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
-        if use_linear_projection:
-            self.proj_in = nn.Linear(in_channels, inner_dim)
-        else:
-            self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
-
-        # Define transformers blocks
-        self.transformer_blocks = nn.ModuleList(
-            [
-                BasicTransformerBlock(
-                    inner_dim,
-                    num_attention_heads,
-                    attention_head_dim,
-                    dropout=dropout,
-                    cross_attention_dim=cross_attention_dim,
-                    activation_fn=activation_fn,
-                    num_embeds_ada_norm=num_embeds_ada_norm,
-                    attention_bias=attention_bias,
-                    only_cross_attention=only_cross_attention,
-                    upcast_attention=upcast_attention,
-
-                    unet_use_cross_frame_attention=unet_use_cross_frame_attention,
-                    unet_use_temporal_attention=unet_use_temporal_attention,
-                )
-                for d in range(num_layers)
-            ]
-        )
-
-        # 4. Define output layers
-        if use_linear_projection:
-            self.proj_out = nn.Linear(in_channels, inner_dim)
-        else:
-            self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
-
-    def forward(self, hidden_states, encoder_hidden_states=None, timestep=None, return_dict: bool = True):
-        # Input
-        assert hidden_states.dim() == 5, f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}."
-        video_length = hidden_states.shape[2]
-        hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
-        encoder_hidden_states = repeat(encoder_hidden_states, 'b n c -> (b f) n c', f=video_length)
-
-        batch, channel, height, weight = hidden_states.shape
-        residual = hidden_states
-
-        hidden_states = self.norm(hidden_states)
-        if not self.use_linear_projection:
-            hidden_states = self.proj_in(hidden_states)
-            inner_dim = hidden_states.shape[1]
-            hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
-        else:
-            inner_dim = hidden_states.shape[1]
-            hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
-            hidden_states = self.proj_in(hidden_states)
-
-        # Blocks
-        for block in self.transformer_blocks:
-            hidden_states = block(
-                hidden_states,
-                encoder_hidden_states=encoder_hidden_states,
-                timestep=timestep,
-                video_length=video_length
-            )
-
-        # Output
-        if not self.use_linear_projection:
-            hidden_states = (
-                hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous()
-            )
-            hidden_states = self.proj_out(hidden_states)
-        else:
-            hidden_states = self.proj_out(hidden_states)
-            hidden_states = (
-                hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous()
-            )
-
-        output = hidden_states + residual
-
-        output = rearrange(output, "(b f) c h w -> b c f h w", f=video_length)
-        if not return_dict:
-            return (output,)
-
-        return Transformer3DModelOutput(sample=output)
-
-
-class BasicTransformerBlock(nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        num_attention_heads: int,
-        attention_head_dim: int,
-        dropout=0.0,
-        cross_attention_dim: Optional[int] = None,
-        activation_fn: str = "geglu",
-        num_embeds_ada_norm: Optional[int] = None,
-        attention_bias: bool = False,
-        only_cross_attention: bool = False,
-        upcast_attention: bool = False,
-
-        unet_use_cross_frame_attention = None,
-        unet_use_temporal_attention = None,
-    ):
-        super().__init__()
-        self.only_cross_attention = only_cross_attention
-        self.use_ada_layer_norm = num_embeds_ada_norm is not None
-        self.unet_use_cross_frame_attention = unet_use_cross_frame_attention
-        self.unet_use_temporal_attention = unet_use_temporal_attention
-
-        # SC-Attn
-        assert unet_use_cross_frame_attention is not None
-        if unet_use_cross_frame_attention:
-            self.attn1 = SparseCausalAttention2D(
-                query_dim=dim,
-                heads=num_attention_heads,
-                dim_head=attention_head_dim,
-                dropout=dropout,
-                bias=attention_bias,
-                cross_attention_dim=cross_attention_dim if only_cross_attention else None,
-                upcast_attention=upcast_attention,
-            )
-        else:
-            self.attn1 = CrossAttention(
-                query_dim=dim,
-                heads=num_attention_heads,
-                dim_head=attention_head_dim,
-                dropout=dropout,
-                bias=attention_bias,
-                upcast_attention=upcast_attention,
-            )
-        self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)
-
-        # Cross-Attn
-        if cross_attention_dim is not None:
-            self.attn2 = CrossAttention(
-                query_dim=dim,
-                cross_attention_dim=cross_attention_dim,
-                heads=num_attention_heads,
-                dim_head=attention_head_dim,
-                dropout=dropout,
-                bias=attention_bias,
-                upcast_attention=upcast_attention,
-            )
-        else:
-            self.attn2 = None
-
-        if cross_attention_dim is not None:
-            self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)
-        else:
-            self.norm2 = None
-
-        # Feed-forward
-        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)
-        self.norm3 = nn.LayerNorm(dim)
-
-        # Temp-Attn
-        assert unet_use_temporal_attention is not None
-        if unet_use_temporal_attention:
-            self.attn_temp = CrossAttention(
-                query_dim=dim,
-                heads=num_attention_heads,
-                dim_head=attention_head_dim,
-                dropout=dropout,
-                bias=attention_bias,
-                upcast_attention=upcast_attention,
-            )
-            nn.init.zeros_(self.attn_temp.to_out[0].weight.data)
-            self.norm_temp = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)
-
-    def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
-        if not is_xformers_available():
-            print("Here is how to install it")
-            raise ModuleNotFoundError(
-                "Refer to https://github.com/facebookresearch/xformers for more information on how to install"
-                " xformers",
-                name="xformers",
-            )
-        elif not torch.cuda.is_available():
-            raise ValueError(
-                "torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is only"
-                " available for GPU "
-            )
-        else:
-            try:
-                # Make sure we can run the memory efficient attention
-                _ = xformers.ops.memory_efficient_attention(
-                    torch.randn((1, 2, 40), device="cuda"),
-                    torch.randn((1, 2, 40), device="cuda"),
-                    torch.randn((1, 2, 40), device="cuda"),
-                )
-            except Exception as e:
-                raise e
-            self.attn1._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
-            if self.attn2 is not None:
-                self.attn2._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
-            # self.attn_temp._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
-
-    def forward(self, hidden_states, encoder_hidden_states=None, timestep=None, attention_mask=None, video_length=None):
-        # SparseCausal-Attention
-        norm_hidden_states = (
-            self.norm1(hidden_states, timestep) if self.use_ada_layer_norm else self.norm1(hidden_states)
-        )
-
-        # if self.only_cross_attention:
-        #     hidden_states = (
-        #         self.attn1(norm_hidden_states, encoder_hidden_states, attention_mask=attention_mask) + hidden_states
-        #     )
-        # else:
-        #     hidden_states = self.attn1(norm_hidden_states, attention_mask=attention_mask, video_length=video_length) + hidden_states
-
-        # pdb.set_trace()
-        if self.unet_use_cross_frame_attention:
-            hidden_states = self.attn1(norm_hidden_states, attention_mask=attention_mask, video_length=video_length) + hidden_states
-        else:
-            hidden_states = self.attn1(norm_hidden_states, attention_mask=attention_mask) + hidden_states
-
-        if self.attn2 is not None:
-            # Cross-Attention
-            norm_hidden_states = (
-                self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
-            )
-            hidden_states = (
-                self.attn2(
-                    norm_hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask
-                )
-                + hidden_states
-            )
-
-        # Feed-forward
-        hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states
-
-        # Temporal-Attention
-        if self.unet_use_temporal_attention:
-            d = hidden_states.shape[1]
-            hidden_states = rearrange(hidden_states, "(b f) d c -> (b d) f c", f=video_length)
-            norm_hidden_states = (
-                self.norm_temp(hidden_states, timestep) if self.use_ada_layer_norm else self.norm_temp(hidden_states)
-            )
-            hidden_states = self.attn_temp(norm_hidden_states) + hidden_states
-            hidden_states = rearrange(hidden_states, "(b d) f c -> (b f) d c", d=d)
-
-        return hidden_states
--- a/animatediff/models/motion_module.py
+++ b/animatediff/models/motion_module.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union
+from typing import Callable, List, Optional, Tuple, Union

 import torch
 import numpy as np
@@ -8,324 +8,418 @@ from torch import nn
 import torchvision

 from diffusers.configuration_utils import ConfigMixin, register_to_config
-from diffusers.modeling_utils import ModelMixin
+from diffusers.models.modeling_utils import ModelMixin
 from diffusers.utils import BaseOutput
 from diffusers.utils.import_utils import is_xformers_available
-from diffusers.models.attention import CrossAttention, FeedForward
+from diffusers.models.attention_processor import Attention
+from diffusers.models.attention import FeedForward
+
+from animatediff.utils.util import zero_rank_print

 from einops import rearrange, repeat
-import math
+import math, pdb
+import random


 def zero_module(module):
-    # Zero out the parameters of a module and return it.
-    for p in module.parameters():
-        p.detach().zero_()
-    return module
+	# Zero out the parameters of a module and return it.
+	for p in module.parameters():
+		p.detach().zero_()
+	return module


@dataclass
 class TemporalTransformer3DModelOutput(BaseOutput):
-    sample: torch.FloatTensor
-
-
-if is_xformers_available():
-    import xformers
-    import xformers.ops
-else:
-    xformers = None
+	sample: torch.FloatTensor


 def get_motion_module(
-    in_channels,
-    motion_module_type: str, 
-    motion_module_kwargs: dict
+	in_channels,
+	motion_module_type: str, 
+	motion_module_kwargs: dict
 ):
-    if motion_module_type == "Vanilla":
-        return VanillaTemporalModule(in_channels=in_channels, **motion_module_kwargs,)    
-    else:
-        raise ValueError
-
+	if motion_module_type == "Vanilla":
+		return VanillaTemporalModule(in_channels=in_channels, **motion_module_kwargs)
+	elif motion_module_type == "Conv":
+		return ConvTemporalModule(in_channels=in_channels, **motion_module_kwargs)
+	else:
+		raise ValueError

 class VanillaTemporalModule(nn.Module):
-    def __init__(
-        self,
-        in_channels,
-        num_attention_heads                = 8,
-        num_transformer_block              = 2,
-        attention_block_types              =( "Temporal_Self", "Temporal_Self" ),
-        cross_frame_attention_mode         = None,
-        temporal_position_encoding         = False,
-        temporal_position_encoding_max_len = 24,
-        temporal_attention_dim_div         = 1,
-        zero_initialize                    = True,
-    ):
-        super().__init__()
-        
-        self.temporal_transformer = TemporalTransformer3DModel(
-            in_channels=in_channels,
-            num_attention_heads=num_attention_heads,
-            attention_head_dim=in_channels // num_attention_heads // temporal_attention_dim_div,
-            num_layers=num_transformer_block,
-            attention_block_types=attention_block_types,
-            cross_frame_attention_mode=cross_frame_attention_mode,
-            temporal_position_encoding=temporal_position_encoding,
-            temporal_position_encoding_max_len=temporal_position_encoding_max_len,
-        )
-        
-        if zero_initialize:
-            self.temporal_transformer.proj_out = zero_module(self.temporal_transformer.proj_out)
+	def __init__(
+		self,
+		in_channels,
+		num_attention_heads				   = 8,
+		num_transformer_block			   = 2,
+		attention_block_types			   =( "Temporal_Self", ),
+		spatial_position_encoding		   = False,
+		temporal_position_encoding		   = True,
+		temporal_position_encoding_max_len = 32,
+		temporal_attention_dim_div		   = 1,
+		zero_initialize					   = True,
+		
+		causal_temporal_attention			= False,
+		causal_temporal_attention_mask_type = "",
+	):
+		super().__init__()
+		
+		self.temporal_transformer = TemporalTransformer3DModel(
+			in_channels=in_channels,
+			num_attention_heads=num_attention_heads,
+			attention_head_dim=in_channels // num_attention_heads // temporal_attention_dim_div,
+			num_layers=num_transformer_block,
+			attention_block_types=attention_block_types,
+			temporal_position_encoding=temporal_position_encoding,
+			temporal_position_encoding_max_len=temporal_position_encoding_max_len,
+			spatial_position_encoding = spatial_position_encoding,
+			causal_temporal_attention=causal_temporal_attention,
+			causal_temporal_attention_mask_type=causal_temporal_attention_mask_type,
+		)
+		
+		if zero_initialize:
+			self.temporal_transformer.proj_out = zero_module(self.temporal_transformer.proj_out)

-    def forward(self, input_tensor, temb, encoder_hidden_states, attention_mask=None, anchor_frame_idx=None):
-        hidden_states = input_tensor
-        hidden_states = self.temporal_transformer(hidden_states, encoder_hidden_states, attention_mask)
+	def forward(self, input_tensor, temb=None, encoder_hidden_states=None, attention_mask=None):
+		hidden_states = input_tensor
+		hidden_states = self.temporal_transformer(hidden_states, encoder_hidden_states, attention_mask)

-        output = hidden_states
-        return output
+		output = hidden_states
+		return output


-class TemporalTransformer3DModel(nn.Module):
-    def __init__(
-        self,
-        in_channels,
-        num_attention_heads,
-        attention_head_dim,
+class TemporalTransformer3DModel(nn.Module):	
+	def __init__(
+		self,
+		in_channels,
+		num_attention_heads,
+		attention_head_dim,
+		num_layers,
+		attention_block_types			   = ( "Temporal_Self", "Temporal_Self", ),		   
+		dropout							   = 0.0,
+		norm_num_groups					   = 32,
+		cross_attention_dim				   = 768,
+		activation_fn					   = "geglu",
+		attention_bias					   = False,
+		upcast_attention				   = False,
+		temporal_position_encoding		   = False,
+		temporal_position_encoding_max_len = 32,
+		spatial_position_encoding		   = False,
+		
+		causal_temporal_attention			= None,
+		causal_temporal_attention_mask_type = "",
+	):
+		super().__init__()
+		assert causal_temporal_attention is not None
+		self.causal_temporal_attention			 = causal_temporal_attention

-        num_layers,
-        attention_block_types              = ( "Temporal_Self", "Temporal_Self", ),        
-        dropout                            = 0.0,
-        norm_num_groups                    = 32,
-        cross_attention_dim                = 768,
-        activation_fn                      = "geglu",
-        attention_bias                     = False,
-        upcast_attention                   = False,
-        
-        cross_frame_attention_mode         = None,
-        temporal_position_encoding         = False,
-        temporal_position_encoding_max_len = 24,
-    ):
-        super().__init__()
+		assert (not causal_temporal_attention) or (causal_temporal_attention_mask_type != "")
+		self.causal_temporal_attention_mask_type = causal_temporal_attention_mask_type
+		self.causal_temporal_attention_mask		 = None
+		self.spatial_position_encoding = spatial_position_encoding
+		inner_dim = num_attention_heads * attention_head_dim

-        inner_dim = num_attention_heads * attention_head_dim
+		self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
+		self.proj_in = nn.Linear(in_channels, inner_dim)
+		if spatial_position_encoding:
+			self.pos_encoder_2d = PositionalEncoding2D(inner_dim)
+		

-        self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
-        self.proj_in = nn.Linear(in_channels, inner_dim)
+		self.transformer_blocks = nn.ModuleList(
+			[
+				TemporalTransformerBlock(
+					dim=inner_dim,
+					num_attention_heads=num_attention_heads,
+					attention_head_dim=attention_head_dim,
+					attention_block_types=attention_block_types,
+					dropout=dropout,
+					norm_num_groups=norm_num_groups,
+					cross_attention_dim=cross_attention_dim,
+					activation_fn=activation_fn,
+					attention_bias=attention_bias,
+					upcast_attention=upcast_attention,
+					temporal_position_encoding=temporal_position_encoding,
+					temporal_position_encoding_max_len=temporal_position_encoding_max_len,
+				)
+				for d in range(num_layers)
+			]
+		)
+		self.proj_out = nn.Linear(inner_dim, in_channels)
+			
+	def get_causal_temporal_attention_mask(self, hidden_states):
+		batch_size, sequence_length, dim = hidden_states.shape
+		
+		if self.causal_temporal_attention_mask is None or self.causal_temporal_attention_mask.shape != (batch_size, sequence_length, sequence_length):
+			zero_rank_print(f"build attn mask of type {self.causal_temporal_attention_mask_type}")
+			if self.causal_temporal_attention_mask_type == "causal":
+				# 1. vanilla causal mask
+				mask = torch.tril(torch.ones(sequence_length, sequence_length))

-        self.transformer_blocks = nn.ModuleList(
-            [
-                TemporalTransformerBlock(
-                    dim=inner_dim,
-                    num_attention_heads=num_attention_heads,
-                    attention_head_dim=attention_head_dim,
-                    attention_block_types=attention_block_types,
-                    dropout=dropout,
-                    norm_num_groups=norm_num_groups,
-                    cross_attention_dim=cross_attention_dim,
-                    activation_fn=activation_fn,
-                    attention_bias=attention_bias,
-                    upcast_attention=upcast_attention,
-                    cross_frame_attention_mode=cross_frame_attention_mode,
-                    temporal_position_encoding=temporal_position_encoding,
-                    temporal_position_encoding_max_len=temporal_position_encoding_max_len,
-                )
-                for d in range(num_layers)
-            ]
-        )
-        self.proj_out = nn.Linear(inner_dim, in_channels)    
-    
-    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None):
-        assert hidden_states.dim() == 5, f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}."
-        video_length = hidden_states.shape[2]
-        hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
+			elif self.causal_temporal_attention_mask_type == "2-seq":
+				# 2. 2-seq
+				mask = torch.zeros(sequence_length, sequence_length)
+				mask[:sequence_length // 2,  :sequence_length // 2]  = 1
+				mask[-sequence_length // 2:, -sequence_length // 2:] = 1
+			
+			elif self.causal_temporal_attention_mask_type == "0-prev":
+				# attn to the previous frame
+				indices			= torch.arange(sequence_length)
+				indices_prev	= indices - 1
+				indices_prev[0] = 0
+				mask = torch.zeros(sequence_length, sequence_length)
+				mask[:,  0]					= 1.
+				mask[indices, indices_prev] = 1.

-        batch, channel, height, weight = hidden_states.shape
-        residual = hidden_states
+			elif self.causal_temporal_attention_mask_type == "0":
+				# only attn to first frame
+				mask	  = torch.zeros(sequence_length, sequence_length)
+				mask[:,0] = 1

-        hidden_states = self.norm(hidden_states)
-        inner_dim = hidden_states.shape[1]
-        hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
-        hidden_states = self.proj_in(hidden_states)
+			elif self.causal_temporal_attention_mask_type == "wo-self":
+				indices = torch.arange(sequence_length)
+				mask				   = torch.ones(sequence_length, sequence_length)
+				mask[indices, indices] = 0

-        # Transformer Blocks
-        for block in self.transformer_blocks:
-            hidden_states = block(hidden_states, encoder_hidden_states=encoder_hidden_states, video_length=video_length)
-        
-        # output
-        hidden_states = self.proj_out(hidden_states)
-        hidden_states = hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous()
+			elif self.causal_temporal_attention_mask_type == "circle":
+				indices			= torch.arange(sequence_length)
+				indices_prev	= indices - 1
+				indices_prev[0] = 0

-        output = hidden_states + residual
-        output = rearrange(output, "(b f) c h w -> b c f h w", f=video_length)
-        
-        return output
+				mask = torch.eye(sequence_length)
+				mask[indices, indices_prev] = 1
+				mask[0,-1]					= 1

+			else: raise ValueError
+
+			# for sanity check
+			if dim == 320: zero_rank_print(mask)
+
+			# generate attention mask from binary values
+			mask = mask.masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
+			mask = mask.unsqueeze(0)
+			mask = mask.repeat(batch_size, 1, 1)
+
+			self.causal_temporal_attention_mask = mask.to(hidden_states.device)
+		
+		return self.causal_temporal_attention_mask
+	
+	def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None):
+		residual = hidden_states
+		assert hidden_states.dim() == 5, f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}."
+		height, width = hidden_states.shape[-2:]
+		
+		hidden_states = self.norm(hidden_states)
+
+		hidden_states = rearrange(hidden_states, "b c f h w -> (b h w) f c")
+		hidden_states = self.proj_in(hidden_states)
+		if self.spatial_position_encoding:
+
+			video_length = hidden_states.shape[1]
+			hidden_states = rearrange(hidden_states, "(b h w) f c -> (b f) h w c", h=height, w=width)
+			pos_encoding = self.pos_encoder_2d(hidden_states)
+			pos_encoding = rearrange(pos_encoding, "(b f) h w c -> (b h w) f c", f = video_length)
+			hidden_states = rearrange(hidden_states, "(b f) h w c -> (b h w) f c", f=video_length)
+
+		attention_mask = self.get_causal_temporal_attention_mask(hidden_states) if self.causal_temporal_attention else attention_mask
+
+		# Transformer Blocks
+		for block in self.transformer_blocks:
+			if not self.spatial_position_encoding :
+				pos_encoding = None
+			
+			hidden_states = block(hidden_states, pos_encoding=pos_encoding, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask)
+
+		hidden_states = self.proj_out(hidden_states)
+
+		hidden_states = rearrange(hidden_states, "(b h w) f c -> b c f h w", h=height, w=width)
+
+		output = hidden_states + residual
+		# output = hidden_states
+
+		return output

 class TemporalTransformerBlock(nn.Module):
-    def __init__(
-        self,
-        dim,
-        num_attention_heads,
-        attention_head_dim,
-        attention_block_types              = ( "Temporal_Self", "Temporal_Self", ),
-        dropout                            = 0.0,
-        norm_num_groups                    = 32,
-        cross_attention_dim                = 768,
-        activation_fn                      = "geglu",
-        attention_bias                     = False,
-        upcast_attention                   = False,
-        cross_frame_attention_mode         = None,
-        temporal_position_encoding         = False,
-        temporal_position_encoding_max_len = 24,
-    ):
-        super().__init__()
+	def __init__(
+		self,
+		dim,
+		num_attention_heads,
+		attention_head_dim,
+		attention_block_types			   = ( "Temporal_Self", "Temporal_Self", ),
+		dropout							   = 0.0,
+		norm_num_groups					   = 32,
+		cross_attention_dim				   = 768,
+		activation_fn					   = "geglu",
+		attention_bias					   = False,
+		upcast_attention				   = False,
+		temporal_position_encoding		   = False,
+		temporal_position_encoding_max_len = 32,
+	):
+		super().__init__()

-        attention_blocks = []
-        norms = []
-        
-        for block_name in attention_block_types:
-            attention_blocks.append(
-                VersatileAttention(
-                    attention_mode=block_name.split("_")[0],
-                    cross_attention_dim=cross_attention_dim if block_name.endswith("_Cross") else None,
-                    
-                    query_dim=dim,
-                    heads=num_attention_heads,
-                    dim_head=attention_head_dim,
-                    dropout=dropout,
-                    bias=attention_bias,
-                    upcast_attention=upcast_attention,
-        
-                    cross_frame_attention_mode=cross_frame_attention_mode,
-                    temporal_position_encoding=temporal_position_encoding,
-                    temporal_position_encoding_max_len=temporal_position_encoding_max_len,
-                )
-            )
-            norms.append(nn.LayerNorm(dim))
-            
-        self.attention_blocks = nn.ModuleList(attention_blocks)
-        self.norms = nn.ModuleList(norms)
-
-        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)
-        self.ff_norm = nn.LayerNorm(dim)
+		attention_blocks = []
+		norms = []
+		
+		for block_name in attention_block_types:
+			attention_blocks.append(
+				TemporalSelfAttention(
+					attention_mode=block_name.split("_")[0],
+					cross_attention_dim=cross_attention_dim if block_name.endswith("_Cross") else None,
+					
+					query_dim=dim,
+					heads=num_attention_heads,
+					dim_head=attention_head_dim,
+					dropout=dropout,
+					bias=attention_bias,
+					upcast_attention=upcast_attention,
+		
+					temporal_position_encoding=temporal_position_encoding,
+					temporal_position_encoding_max_len=temporal_position_encoding_max_len,
+				)
+			)
+			norms.append(nn.LayerNorm(dim))
+			
+		self.attention_blocks = nn.ModuleList(attention_blocks)
+		self.norms = nn.ModuleList(norms)
+		
+		self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)
+		self.ff_norm = nn.LayerNorm(dim)


-    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, video_length=None):
-        for attention_block, norm in zip(self.attention_blocks, self.norms):
-            norm_hidden_states = norm(hidden_states)
-            hidden_states = attention_block(
-                norm_hidden_states,
-                encoder_hidden_states=encoder_hidden_states if attention_block.is_cross_attention else None,
-                video_length=video_length,
-            ) + hidden_states
-            
-        hidden_states = self.ff(self.ff_norm(hidden_states)) + hidden_states
-        
-        output = hidden_states  
-        return output
+	def forward(self, hidden_states, pos_encoding=None, encoder_hidden_states=None, attention_mask=None):
+		for attention_block, norm in zip(self.attention_blocks, self.norms):
+			if pos_encoding is not None:
+				hidden_states += pos_encoding
+			norm_hidden_states = norm(hidden_states)
+			hidden_states = attention_block(
+				norm_hidden_states,
+				encoder_hidden_states=encoder_hidden_states,
+				attention_mask=attention_mask,
+			) + hidden_states

+		hidden_states = self.ff(self.ff_norm(hidden_states)) + hidden_states
+		
+		output = hidden_states
+		return output
+
+
+def get_emb(sin_inp):
+	"""
+	Gets a base embedding for one dimension with sin and cos intertwined
+	"""
+	emb = torch.stack((sin_inp.sin(), sin_inp.cos()), dim=-1)
+	return torch.flatten(emb, -2, -1)
+
+class PositionalEncoding2D(nn.Module):
+	"""
+	Sinusoidal positional encoding over the two middle axes of a (batch_size, x, y, ch) tensor.
+
+	The most recent encoding is cached and reused for as long as the input's
+	shape, dtype and device stay the same.
+	"""
+
+	def __init__(self, channels):
+		"""
+		:param channels: The last dimension of the tensor you want to apply pos emb to.
+		"""
+		super().__init__()
+		self.org_channels = channels
+		# Number of channels used per axis (x gets the first `channels`, y the
+		# next `channels`); always even so sin and cos values come in pairs.
+		channels = int(np.ceil(channels / 4) * 2)
+		self.channels = channels
+		inv_freq = 1.0 / (10000 ** (torch.arange(0, channels, 2).float() / channels))
+		self.register_buffer("inv_freq", inv_freq)
+		self.register_buffer("cached_penc", None)
+
+	def forward(self, tensor):
+		"""
+		:param tensor: A 4d tensor of size (batch_size, x, y, ch)
+		:return: Positional Encoding Matrix of size (batch_size, x, y, ch)
+		:raises RuntimeError: If `tensor` is not 4-dimensional.
+		"""
+		if len(tensor.shape) != 4:
+			raise RuntimeError("The input tensor has to be 4d!")
+
+		cached = self.cached_penc
+		if (
+			cached is not None
+			and cached.shape == tensor.shape
+			and cached.dtype == tensor.dtype
+			and cached.device == tensor.device
+		):
+			# A shape match alone is not sufficient: an input with the same shape
+			# but another dtype or device must not receive the stale encoding.
+			return cached
+
+		self.cached_penc = None
+		batch_size, x, y, orig_ch = tensor.shape
+		pos_x = torch.arange(x, device=tensor.device).type(self.inv_freq.type())
+		pos_y = torch.arange(y, device=tensor.device).type(self.inv_freq.type())
+		sin_inp_x = torch.einsum("i,j->ij", pos_x, self.inv_freq)
+		sin_inp_y = torch.einsum("i,j->ij", pos_y, self.inv_freq)
+		# emb_x gets a singleton y axis so it broadcasts across every y position.
+		emb_x = get_emb(sin_inp_x).unsqueeze(1)
+		emb_y = get_emb(sin_inp_y)
+		emb = torch.zeros((x, y, self.channels * 2), device=tensor.device).type(
+			tensor.type()
+		)
+		emb[:, :, : self.channels] = emb_x
+		emb[:, :, self.channels : 2 * self.channels] = emb_y
+
+		# Trim to the input's channel count and replicate over the batch axis.
+		self.cached_penc = emb[None, :, :, :orig_ch].repeat(batch_size, 1, 1, 1)
+		return self.cached_penc

 class PositionalEncoding(nn.Module):
-    def __init__(
-        self, 
-        d_model, 
-        dropout = 0., 
-        max_len = 24
-    ):
-        super().__init__()
-        self.dropout = nn.Dropout(p=dropout)
-        position = torch.arange(max_len).unsqueeze(1)
-        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
-        pe = torch.zeros(1, max_len, d_model)
-        pe[0, :, 0::2] = torch.sin(position * div_term)
-        pe[0, :, 1::2] = torch.cos(position * div_term)
-        self.register_buffer('pe', pe)
+	def __init__(self, d_model, dropout=0., max_len=32):
+		"""
+		Precompute a sinusoidal table of shape (1, max_len, d_model) and register it as the `pe` buffer.
+
+		:param d_model: Size of the last (feature) dimension of the table.
+		:param dropout: Probability handed to the dropout layer.
+		:param max_len: Number of positions stored in the table.
+		"""
+		super().__init__()
+		self.dropout = nn.Dropout(p=dropout)
+
+		frequencies = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
+		angles = torch.arange(max_len).unsqueeze(1) * frequencies
+
+		table = torch.zeros(1, max_len, d_model)
+		table[0, :, 0::2] = torch.sin(angles)  # even feature indices
+		table[0, :, 1::2] = torch.cos(angles)  # odd feature indices
+		self.register_buffer('pe', table)

-    def forward(self, x):
-        x = x + self.pe[:, :x.size(1)]
-        return self.dropout(x)
+	def forward(self, x):
+		"""
+		Add the first `x.size(1)` rows of the precomputed table to `x`, then apply dropout.
+
+		:param x: Tensor whose dimension 1 indexes positions.
+		:return: `x` plus its positional encoding, after dropout.
+		"""
+		seq_len = x.size(1)
+		encoded = x + self.pe[:, :seq_len]
+		return self.dropout(encoded)


-class VersatileAttention(CrossAttention):
-    def __init__(
-            self,
-            attention_mode                     = None,
-            cross_frame_attention_mode         = None,
-            temporal_position_encoding         = False,
-            temporal_position_encoding_max_len = 24,            
-            *args, **kwargs
-        ):
-        super().__init__(*args, **kwargs)
-        assert attention_mode == "Temporal"
+class TemporalSelfAttention(Attention):
+	def __init__(
+		self,
+		attention_mode=None,
+		temporal_position_encoding=False,
+		temporal_position_encoding_max_len=32,
+		*args,
+		**kwargs,
+	):
+		"""
+		Construct the parent attention layer and, optionally, a temporal positional encoder.
+
+		:param attention_mode: Must be "Temporal".
+		:param temporal_position_encoding: When true, a PositionalEncoding is created as `pos_encoder`;
+			otherwise `pos_encoder` is None.
+		:param temporal_position_encoding_max_len: `max_len` given to the positional encoder.
+		:param args: Forwarded to the parent constructor.
+		:param kwargs: Forwarded to the parent constructor; `query_dim` must be present here
+			when `temporal_position_encoding` is enabled.
+		"""
+		super().__init__(*args, **kwargs)
+		assert attention_mode == "Temporal"

-        self.attention_mode = attention_mode
-        self.is_cross_attention = kwargs["cross_attention_dim"] is not None
-        
-        self.pos_encoder = PositionalEncoding(
-            kwargs["query_dim"],
-            dropout=0., 
-            max_len=temporal_position_encoding_max_len
-        ) if (temporal_position_encoding and attention_mode == "Temporal") else None
+		if temporal_position_encoding:
+			self.pos_encoder = PositionalEncoding(
+				kwargs["query_dim"],
+				max_len=temporal_position_encoding_max_len,
+			)
+		else:
+			self.pos_encoder = None

-    def extra_repr(self):
-        return f"(Module Info) Attention_Mode: {self.attention_mode}, Is_Cross_Attention: {self.is_cross_attention}"
+	def set_use_memory_efficient_attention_xformers(
+		self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None
+	):
+		"""
+		Ignore the request: this method deliberately does nothing.
+
+		:param use_memory_efficient_attention_xformers: Unused.
+		:param attention_op: Unused.
+		"""
+		# Memory-efficient xformers attention is kept disabled for the motion module
+		# because it was observed to give bad results; the cause is not yet known.
+		# TODO: find the root cause and re-enable it.
+		pass

-    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, video_length=None):
-        batch_size, sequence_length, _ = hidden_states.shape
+	def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs):
+		"""
+		Optionally add the temporal positional encoding, then delegate to the attention processor.
+
+		:param hidden_states: Input tensor handed to the positional encoder and the processor.
+		:param encoder_hidden_states: Accepted for interface compatibility but ignored; the
+			processor is always called with `encoder_hidden_states=None` (self-attention).
+		:param attention_mask: Forwarded to the processor.
+		:param cross_attention_kwargs: Extra keyword arguments forwarded to the processor.
+		:return: Whatever the processor returns.
+		"""

-        if self.attention_mode == "Temporal":
-            d = hidden_states.shape[1]
-            hidden_states = rearrange(hidden_states, "(b f) d c -> (b d) f c", f=video_length)
-            
-            if self.pos_encoder is not None:
-                hidden_states = self.pos_encoder(hidden_states)
-            
-            encoder_hidden_states = repeat(encoder_hidden_states, "b n c -> (b d) n c", d=d) if encoder_hidden_states is not None else encoder_hidden_states
-        else:
-            raise NotImplementedError
+		# `pos_encoder` is None when the layer was built without temporal position
+		# encoding, so it must be checked before it is called.
+		if self.pos_encoder is not None:
+			hidden_states = self.pos_encoder(hidden_states)

-        encoder_hidden_states = encoder_hidden_states
+		# The selected processor receives this module as its first argument.
+		return self.processor(
+			self,
+			hidden_states,
+			encoder_hidden_states=None,
+			attention_mask=attention_mask,
+			**cross_attention_kwargs,
+		)

-        if self.group_norm is not None:
-            hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
-
-        query = self.to_q(hidden_states)
-        dim = query.shape[-1]
-        query = self.reshape_heads_to_batch_dim(query)
-
-        if self.added_kv_proj_dim is not None:
-            raise NotImplementedError
-
-        encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
-        key = self.to_k(encoder_hidden_states)
-        value = self.to_v(encoder_hidden_states)
-
-        key = self.reshape_heads_to_batch_dim(key)
-        value = self.reshape_heads_to_batch_dim(value)
-
-        if attention_mask is not None:
-            if attention_mask.shape[-1] != query.shape[1]:
-                target_length = query.shape[1]
-                attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
-                attention_mask = attention_mask.repeat_interleave(self.heads, dim=0)
-
-        # attention, what we cannot get enough of
-        if self._use_memory_efficient_attention_xformers:
-            hidden_states = self._memory_efficient_attention_xformers(query, key, value, attention_mask)
-            # Some versions of xformers return output in fp32, cast it back to the dtype of the input
-            hidden_states = hidden_states.to(query.dtype)
-        else:
-            if self._slice_size is None or query.shape[0] // self._slice_size == 1:
-                hidden_states = self._attention(query, key, value, attention_mask)
-            else:
-                hidden_states = self._sliced_attention(query, key, value, sequence_length, dim, attention_mask)
-
-        # linear proj
-        hidden_states = self.to_out[0](hidden_states)
-
-        # dropout
-        hidden_states = self.to_out[1](hidden_states)
-
-        if self.attention_mode == "Temporal":
-            hidden_states = rearrange(hidden_states, "(b d) f c -> (b f) d c", d=d)
-
-        return hidden_states
--- a/animatediff/models/resnet.py
+++ b/animatediff/models/resnet.py
@@ -1,217 +0,0 @@
-# Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/resnet.py
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-from einops import rearrange
-
-
-class InflatedConv3d(nn.Conv2d):
-    def forward(self, x):
-        video_length = x.shape[2]
-
-        x = rearrange(x, "b c f h w -> (b f) c h w")
-        x = super().forward(x)
-        x = rearrange(x, "(b f) c h w -> b c f h w", f=video_length)
-
-        return x
-
-
-class InflatedGroupNorm(nn.GroupNorm):
-    def forward(self, x):
-        video_length = x.shape[2]
-
-        x = rearrange(x, "b c f h w -> (b f) c h w")
-        x = super().forward(x)
-        x = rearrange(x, "(b f) c h w -> b c f h w", f=video_length)
-
-        return x
-
-
-class Upsample3D(nn.Module):
-    def __init__(self, channels, use_conv=False, use_conv_transpose=False, out_channels=None, name="conv"):
-        super().__init__()
-        self.channels = channels
-        self.out_channels = out_channels or channels
-        self.use_conv = use_conv
-        self.use_conv_transpose = use_conv_transpose
-        self.name = name
-
-        conv = None
-        if use_conv_transpose:
-            raise NotImplementedError
-        elif use_conv:
-            self.conv = InflatedConv3d(self.channels, self.out_channels, 3, padding=1)
-
-    def forward(self, hidden_states, output_size=None):
-        assert hidden_states.shape[1] == self.channels
-
-        if self.use_conv_transpose:
-            raise NotImplementedError
-
-        # Cast to float32 to as 'upsample_nearest2d_out_frame' op does not support bfloat16
-        dtype = hidden_states.dtype
-        if dtype == torch.bfloat16:
-            hidden_states = hidden_states.to(torch.float32)
-
-        # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
-        if hidden_states.shape[0] >= 64:
-            hidden_states = hidden_states.contiguous()
-
-        # if `output_size` is passed we force the interpolation output
-        # size and do not make use of `scale_factor=2`
-        if output_size is None:
-            hidden_states = F.interpolate(hidden_states, scale_factor=[1.0, 2.0, 2.0], mode="nearest")
-        else:
-            hidden_states = F.interpolate(hidden_states, size=output_size, mode="nearest")
-
-        # If the input is bfloat16, we cast back to bfloat16
-        if dtype == torch.bfloat16:
-            hidden_states = hidden_states.to(dtype)
-
-        # if self.use_conv:
-        #     if self.name == "conv":
-        #         hidden_states = self.conv(hidden_states)
-        #     else:
-        #         hidden_states = self.Conv2d_0(hidden_states)
-        hidden_states = self.conv(hidden_states)
-
-        return hidden_states
-
-
-class Downsample3D(nn.Module):
-    def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"):
-        super().__init__()
-        self.channels = channels
-        self.out_channels = out_channels or channels
-        self.use_conv = use_conv
-        self.padding = padding
-        stride = 2
-        self.name = name
-
-        if use_conv:
-            self.conv = InflatedConv3d(self.channels, self.out_channels, 3, stride=stride, padding=padding)
-        else:
-            raise NotImplementedError
-
-    def forward(self, hidden_states):
-        assert hidden_states.shape[1] == self.channels
-        if self.use_conv and self.padding == 0:
-            raise NotImplementedError
-
-        assert hidden_states.shape[1] == self.channels
-        hidden_states = self.conv(hidden_states)
-
-        return hidden_states
-
-
-class ResnetBlock3D(nn.Module):
-    def __init__(
-        self,
-        *,
-        in_channels,
-        out_channels=None,
-        conv_shortcut=False,
-        dropout=0.0,
-        temb_channels=512,
-        groups=32,
-        groups_out=None,
-        pre_norm=True,
-        eps=1e-6,
-        non_linearity="swish",
-        time_embedding_norm="default",
-        output_scale_factor=1.0,
-        use_in_shortcut=None,
-        use_inflated_groupnorm=None,
-    ):
-        super().__init__()
-        self.pre_norm = pre_norm
-        self.pre_norm = True
-        self.in_channels = in_channels
-        out_channels = in_channels if out_channels is None else out_channels
-        self.out_channels = out_channels
-        self.use_conv_shortcut = conv_shortcut
-        self.time_embedding_norm = time_embedding_norm
-        self.output_scale_factor = output_scale_factor
-
-        if groups_out is None:
-            groups_out = groups
-
-        assert use_inflated_groupnorm != None
-        if use_inflated_groupnorm:
-            self.norm1 = InflatedGroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
-        else:
-            self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
-
-        self.conv1 = InflatedConv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
-
-        if temb_channels is not None:
-            if self.time_embedding_norm == "default":
-                time_emb_proj_out_channels = out_channels
-            elif self.time_embedding_norm == "scale_shift":
-                time_emb_proj_out_channels = out_channels * 2
-            else:
-                raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ")
-
-            self.time_emb_proj = torch.nn.Linear(temb_channels, time_emb_proj_out_channels)
-        else:
-            self.time_emb_proj = None
-
-        if use_inflated_groupnorm:
-            self.norm2 = InflatedGroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True)
-        else:
-            self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True)
-
-        self.dropout = torch.nn.Dropout(dropout)
-        self.conv2 = InflatedConv3d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
-
-        if non_linearity == "swish":
-            self.nonlinearity = lambda x: F.silu(x)
-        elif non_linearity == "mish":
-            self.nonlinearity = Mish()
-        elif non_linearity == "silu":
-            self.nonlinearity = nn.SiLU()
-
-        self.use_in_shortcut = self.in_channels != self.out_channels if use_in_shortcut is None else use_in_shortcut
-
-        self.conv_shortcut = None
-        if self.use_in_shortcut:
-            self.conv_shortcut = InflatedConv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
-
-    def forward(self, input_tensor, temb):
-        hidden_states = input_tensor
-
-        hidden_states = self.norm1(hidden_states)
-        hidden_states = self.nonlinearity(hidden_states)
-
-        hidden_states = self.conv1(hidden_states)
-
-        if temb is not None:
-            temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None, None]
-
-        if temb is not None and self.time_embedding_norm == "default":
-            hidden_states = hidden_states + temb
-
-        hidden_states = self.norm2(hidden_states)
-
-        if temb is not None and self.time_embedding_norm == "scale_shift":
-            scale, shift = torch.chunk(temb, 2, dim=1)
-            hidden_states = hidden_states * (1 + scale) + shift
-
-        hidden_states = self.nonlinearity(hidden_states)
-
-        hidden_states = self.dropout(hidden_states)
-        hidden_states = self.conv2(hidden_states)
-
-        if self.conv_shortcut is not None:
-            input_tensor = self.conv_shortcut(input_tensor)
-
-        output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
-
-        return output_tensor
-
-
-class Mish(torch.nn.Module):
-    def forward(self, hidden_states):
-        return hidden_states * torch.tanh(torch.nn.functional.softplus(hidden_states))
--- a/animatediff/models/unet.py
+++ b/animatediff/models/unet.py
--- a/animatediff/models/unet_blocks.py
+++ b/animatediff/models/unet_blocks.py