support sdxl

2026-04-03 17:56:15 +02:00 · 2023-11-10 11:57:39 +08:00
parent 60dfd554c0
commit d6f459dbd6
111 changed files with 5620 additions and 3750 deletions
--- a/animatediff/models/motion_module.py
+++ b/animatediff/models/motion_module.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union
+from typing import Callable, List, Optional, Tuple, Union

 import torch
 import numpy as np
@@ -8,324 +8,418 @@ from torch import nn
 import torchvision

 from diffusers.configuration_utils import ConfigMixin, register_to_config
-from diffusers.modeling_utils import ModelMixin
+from diffusers.models.modeling_utils import ModelMixin
 from diffusers.utils import BaseOutput
 from diffusers.utils.import_utils import is_xformers_available
-from diffusers.models.attention import CrossAttention, FeedForward
+from diffusers.models.attention_processor import Attention
+from diffusers.models.attention import FeedForward
+
+from animatediff.utils.util import zero_rank_print

 from einops import rearrange, repeat
-import math
+import math, pdb
+import random


 def zero_module(module):
-    # Zero out the parameters of a module and return it.
-    for p in module.parameters():
-        p.detach().zero_()
-    return module
+	# Zero out the parameters of a module and return it.
+	for p in module.parameters():
+		p.detach().zero_()
+	return module


@dataclass
 class TemporalTransformer3DModelOutput(BaseOutput):
-    sample: torch.FloatTensor
-
-
-if is_xformers_available():
-    import xformers
-    import xformers.ops
-else:
-    xformers = None
+	sample: torch.FloatTensor


 def get_motion_module(
-    in_channels,
-    motion_module_type: str, 
-    motion_module_kwargs: dict
+	in_channels,
+	motion_module_type: str, 
+	motion_module_kwargs: dict
 ):
-    if motion_module_type == "Vanilla":
-        return VanillaTemporalModule(in_channels=in_channels, **motion_module_kwargs,)    
-    else:
-        raise ValueError
-
+	if motion_module_type == "Vanilla":
+		return VanillaTemporalModule(in_channels=in_channels, **motion_module_kwargs)
+	elif motion_module_type == "Conv":
+		return ConvTemporalModule(in_channels=in_channels, **motion_module_kwargs)
+	else:
+		raise ValueError

 class VanillaTemporalModule(nn.Module):
-    def __init__(
-        self,
-        in_channels,
-        num_attention_heads                = 8,
-        num_transformer_block              = 2,
-        attention_block_types              =( "Temporal_Self", "Temporal_Self" ),
-        cross_frame_attention_mode         = None,
-        temporal_position_encoding         = False,
-        temporal_position_encoding_max_len = 24,
-        temporal_attention_dim_div         = 1,
-        zero_initialize                    = True,
-    ):
-        super().__init__()
-        
-        self.temporal_transformer = TemporalTransformer3DModel(
-            in_channels=in_channels,
-            num_attention_heads=num_attention_heads,
-            attention_head_dim=in_channels // num_attention_heads // temporal_attention_dim_div,
-            num_layers=num_transformer_block,
-            attention_block_types=attention_block_types,
-            cross_frame_attention_mode=cross_frame_attention_mode,
-            temporal_position_encoding=temporal_position_encoding,
-            temporal_position_encoding_max_len=temporal_position_encoding_max_len,
-        )
-        
-        if zero_initialize:
-            self.temporal_transformer.proj_out = zero_module(self.temporal_transformer.proj_out)
+	def __init__(
+		self,
+		in_channels,
+		num_attention_heads				   = 8,
+		num_transformer_block			   = 2,
+		attention_block_types			   =( "Temporal_Self", ),
+		spatial_position_encoding		   = False,
+		temporal_position_encoding		   = True,
+		temporal_position_encoding_max_len = 32,
+		temporal_attention_dim_div		   = 1,
+		zero_initialize					   = True,
+		
+		causal_temporal_attention			= False,
+		causal_temporal_attention_mask_type = "",
+	):
+		super().__init__()
+		
+		self.temporal_transformer = TemporalTransformer3DModel(
+			in_channels=in_channels,
+			num_attention_heads=num_attention_heads,
+			attention_head_dim=in_channels // num_attention_heads // temporal_attention_dim_div,
+			num_layers=num_transformer_block,
+			attention_block_types=attention_block_types,
+			temporal_position_encoding=temporal_position_encoding,
+			temporal_position_encoding_max_len=temporal_position_encoding_max_len,
+			spatial_position_encoding = spatial_position_encoding,
+			causal_temporal_attention=causal_temporal_attention,
+			causal_temporal_attention_mask_type=causal_temporal_attention_mask_type,
+		)
+		
+		if zero_initialize:
+			self.temporal_transformer.proj_out = zero_module(self.temporal_transformer.proj_out)

-    def forward(self, input_tensor, temb, encoder_hidden_states, attention_mask=None, anchor_frame_idx=None):
-        hidden_states = input_tensor
-        hidden_states = self.temporal_transformer(hidden_states, encoder_hidden_states, attention_mask)
+	def forward(self, input_tensor, temb=None, encoder_hidden_states=None, attention_mask=None):
+		hidden_states = input_tensor
+		hidden_states = self.temporal_transformer(hidden_states, encoder_hidden_states, attention_mask)

-        output = hidden_states
-        return output
+		output = hidden_states
+		return output


-class TemporalTransformer3DModel(nn.Module):
-    def __init__(
-        self,
-        in_channels,
-        num_attention_heads,
-        attention_head_dim,
+class TemporalTransformer3DModel(nn.Module):	
+	def __init__(
+		self,
+		in_channels,
+		num_attention_heads,
+		attention_head_dim,
+		num_layers,
+		attention_block_types			   = ( "Temporal_Self", "Temporal_Self", ),		   
+		dropout							   = 0.0,
+		norm_num_groups					   = 32,
+		cross_attention_dim				   = 768,
+		activation_fn					   = "geglu",
+		attention_bias					   = False,
+		upcast_attention				   = False,
+		temporal_position_encoding		   = False,
+		temporal_position_encoding_max_len = 32,
+		spatial_position_encoding		   = False,
+		
+		causal_temporal_attention			= None,
+		causal_temporal_attention_mask_type = "",
+	):
+		super().__init__()
+		assert causal_temporal_attention is not None
+		self.causal_temporal_attention			 = causal_temporal_attention

-        num_layers,
-        attention_block_types              = ( "Temporal_Self", "Temporal_Self", ),        
-        dropout                            = 0.0,
-        norm_num_groups                    = 32,
-        cross_attention_dim                = 768,
-        activation_fn                      = "geglu",
-        attention_bias                     = False,
-        upcast_attention                   = False,
-        
-        cross_frame_attention_mode         = None,
-        temporal_position_encoding         = False,
-        temporal_position_encoding_max_len = 24,
-    ):
-        super().__init__()
+		assert (not causal_temporal_attention) or (causal_temporal_attention_mask_type != "")
+		self.causal_temporal_attention_mask_type = causal_temporal_attention_mask_type
+		self.causal_temporal_attention_mask		 = None
+		self.spatial_position_encoding = spatial_position_encoding
+		inner_dim = num_attention_heads * attention_head_dim

-        inner_dim = num_attention_heads * attention_head_dim
+		self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
+		self.proj_in = nn.Linear(in_channels, inner_dim)
+		if spatial_position_encoding:
+			self.pos_encoder_2d = PositionalEncoding2D(inner_dim)
+		

-        self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
-        self.proj_in = nn.Linear(in_channels, inner_dim)
+		self.transformer_blocks = nn.ModuleList(
+			[
+				TemporalTransformerBlock(
+					dim=inner_dim,
+					num_attention_heads=num_attention_heads,
+					attention_head_dim=attention_head_dim,
+					attention_block_types=attention_block_types,
+					dropout=dropout,
+					norm_num_groups=norm_num_groups,
+					cross_attention_dim=cross_attention_dim,
+					activation_fn=activation_fn,
+					attention_bias=attention_bias,
+					upcast_attention=upcast_attention,
+					temporal_position_encoding=temporal_position_encoding,
+					temporal_position_encoding_max_len=temporal_position_encoding_max_len,
+				)
+				for d in range(num_layers)
+			]
+		)
+		self.proj_out = nn.Linear(inner_dim, in_channels)
+			
+	def get_causal_temporal_attention_mask(self, hidden_states):
+		batch_size, sequence_length, dim = hidden_states.shape
+		
+		if self.causal_temporal_attention_mask is None or self.causal_temporal_attention_mask.shape != (batch_size, sequence_length, sequence_length):
+			zero_rank_print(f"build attn mask of type {self.causal_temporal_attention_mask_type}")
+			if self.causal_temporal_attention_mask_type == "causal":
+				# 1. vanilla causal mask
+				mask = torch.tril(torch.ones(sequence_length, sequence_length))

-        self.transformer_blocks = nn.ModuleList(
-            [
-                TemporalTransformerBlock(
-                    dim=inner_dim,
-                    num_attention_heads=num_attention_heads,
-                    attention_head_dim=attention_head_dim,
-                    attention_block_types=attention_block_types,
-                    dropout=dropout,
-                    norm_num_groups=norm_num_groups,
-                    cross_attention_dim=cross_attention_dim,
-                    activation_fn=activation_fn,
-                    attention_bias=attention_bias,
-                    upcast_attention=upcast_attention,
-                    cross_frame_attention_mode=cross_frame_attention_mode,
-                    temporal_position_encoding=temporal_position_encoding,
-                    temporal_position_encoding_max_len=temporal_position_encoding_max_len,
-                )
-                for d in range(num_layers)
-            ]
-        )
-        self.proj_out = nn.Linear(inner_dim, in_channels)    
-    
-    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None):
-        assert hidden_states.dim() == 5, f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}."
-        video_length = hidden_states.shape[2]
-        hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
+			elif self.causal_temporal_attention_mask_type == "2-seq":
+				# 2. 2-seq
+				mask = torch.zeros(sequence_length, sequence_length)
+				mask[:sequence_length // 2,  :sequence_length // 2]  = 1
+				mask[-sequence_length // 2:, -sequence_length // 2:] = 1
+			
+			elif self.causal_temporal_attention_mask_type == "0-prev":
+				# attend to the first frame and to the immediately preceding frame
+				indices			= torch.arange(sequence_length)
+				indices_prev	= indices - 1
+				indices_prev[0] = 0
+				mask = torch.zeros(sequence_length, sequence_length)
+				mask[:,  0]					= 1.
+				mask[indices, indices_prev] = 1.

-        batch, channel, height, weight = hidden_states.shape
-        residual = hidden_states
+			elif self.causal_temporal_attention_mask_type == "0":
+				# only attn to first frame
+				mask	  = torch.zeros(sequence_length, sequence_length)
+				mask[:,0] = 1

-        hidden_states = self.norm(hidden_states)
-        inner_dim = hidden_states.shape[1]
-        hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
-        hidden_states = self.proj_in(hidden_states)
+			elif self.causal_temporal_attention_mask_type == "wo-self":
+				indices = torch.arange(sequence_length)
+				mask				   = torch.ones(sequence_length, sequence_length)
+				mask[indices, indices] = 0

-        # Transformer Blocks
-        for block in self.transformer_blocks:
-            hidden_states = block(hidden_states, encoder_hidden_states=encoder_hidden_states, video_length=video_length)
-        
-        # output
-        hidden_states = self.proj_out(hidden_states)
-        hidden_states = hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous()
+			elif self.causal_temporal_attention_mask_type == "circle":
+				indices			= torch.arange(sequence_length)
+				indices_prev	= indices - 1
+				indices_prev[0] = 0

-        output = hidden_states + residual
-        output = rearrange(output, "(b f) c h w -> b c f h w", f=video_length)
-        
-        return output
+				mask = torch.eye(sequence_length)
+				mask[indices, indices_prev] = 1
+				mask[0,-1]					= 1

+			else: raise ValueError
+
+			# for sanity check
+			if dim == 320: zero_rank_print(mask)
+
+			# generate attention mask from binary values
+			mask = mask.masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
+			mask = mask.unsqueeze(0)
+			mask = mask.repeat(batch_size, 1, 1)
+
+			self.causal_temporal_attention_mask = mask.to(hidden_states.device)
+		
+		return self.causal_temporal_attention_mask
+	
+	def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None):
+		residual = hidden_states
+		assert hidden_states.dim() == 5, f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}."
+		height, width = hidden_states.shape[-2:]
+		
+		hidden_states = self.norm(hidden_states)
+
+		hidden_states = rearrange(hidden_states, "b c f h w -> (b h w) f c")
+		hidden_states = self.proj_in(hidden_states)
+		if self.spatial_position_encoding:
+
+			video_length = hidden_states.shape[1]
+			hidden_states = rearrange(hidden_states, "(b h w) f c -> (b f) h w c", h=height, w=width)
+			pos_encoding = self.pos_encoder_2d(hidden_states)
+			pos_encoding = rearrange(pos_encoding, "(b f) h w c -> (b h w) f c", f = video_length)
+			hidden_states = rearrange(hidden_states, "(b f) h w c -> (b h w) f c", f=video_length)
+
+		attention_mask = self.get_causal_temporal_attention_mask(hidden_states) if self.causal_temporal_attention else attention_mask
+
+		# Transformer Blocks
+		for block in self.transformer_blocks:
+			if not self.spatial_position_encoding :
+				pos_encoding = None
+			
+			hidden_states = block(hidden_states, pos_encoding=pos_encoding, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask)
+
+		hidden_states = self.proj_out(hidden_states)
+
+		hidden_states = rearrange(hidden_states, "(b h w) f c -> b c f h w", h=height, w=width)
+
+		output = hidden_states + residual
+		# output = hidden_states
+
+		return output

 class TemporalTransformerBlock(nn.Module):
-    def __init__(
-        self,
-        dim,
-        num_attention_heads,
-        attention_head_dim,
-        attention_block_types              = ( "Temporal_Self", "Temporal_Self", ),
-        dropout                            = 0.0,
-        norm_num_groups                    = 32,
-        cross_attention_dim                = 768,
-        activation_fn                      = "geglu",
-        attention_bias                     = False,
-        upcast_attention                   = False,
-        cross_frame_attention_mode         = None,
-        temporal_position_encoding         = False,
-        temporal_position_encoding_max_len = 24,
-    ):
-        super().__init__()
+	def __init__(
+		self,
+		dim,
+		num_attention_heads,
+		attention_head_dim,
+		attention_block_types			   = ( "Temporal_Self", "Temporal_Self", ),
+		dropout							   = 0.0,
+		norm_num_groups					   = 32,
+		cross_attention_dim				   = 768,
+		activation_fn					   = "geglu",
+		attention_bias					   = False,
+		upcast_attention				   = False,
+		temporal_position_encoding		   = False,
+		temporal_position_encoding_max_len = 32,
+	):
+		super().__init__()

-        attention_blocks = []
-        norms = []
-        
-        for block_name in attention_block_types:
-            attention_blocks.append(
-                VersatileAttention(
-                    attention_mode=block_name.split("_")[0],
-                    cross_attention_dim=cross_attention_dim if block_name.endswith("_Cross") else None,
-                    
-                    query_dim=dim,
-                    heads=num_attention_heads,
-                    dim_head=attention_head_dim,
-                    dropout=dropout,
-                    bias=attention_bias,
-                    upcast_attention=upcast_attention,
-        
-                    cross_frame_attention_mode=cross_frame_attention_mode,
-                    temporal_position_encoding=temporal_position_encoding,
-                    temporal_position_encoding_max_len=temporal_position_encoding_max_len,
-                )
-            )
-            norms.append(nn.LayerNorm(dim))
-            
-        self.attention_blocks = nn.ModuleList(attention_blocks)
-        self.norms = nn.ModuleList(norms)
-
-        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)
-        self.ff_norm = nn.LayerNorm(dim)
+		attention_blocks = []
+		norms = []
+		
+		for block_name in attention_block_types:
+			attention_blocks.append(
+				TemporalSelfAttention(
+					attention_mode=block_name.split("_")[0],
+					cross_attention_dim=cross_attention_dim if block_name.endswith("_Cross") else None,
+					
+					query_dim=dim,
+					heads=num_attention_heads,
+					dim_head=attention_head_dim,
+					dropout=dropout,
+					bias=attention_bias,
+					upcast_attention=upcast_attention,
+		
+					temporal_position_encoding=temporal_position_encoding,
+					temporal_position_encoding_max_len=temporal_position_encoding_max_len,
+				)
+			)
+			norms.append(nn.LayerNorm(dim))
+			
+		self.attention_blocks = nn.ModuleList(attention_blocks)
+		self.norms = nn.ModuleList(norms)
+		
+		self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)
+		self.ff_norm = nn.LayerNorm(dim)


-    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, video_length=None):
-        for attention_block, norm in zip(self.attention_blocks, self.norms):
-            norm_hidden_states = norm(hidden_states)
-            hidden_states = attention_block(
-                norm_hidden_states,
-                encoder_hidden_states=encoder_hidden_states if attention_block.is_cross_attention else None,
-                video_length=video_length,
-            ) + hidden_states
-            
-        hidden_states = self.ff(self.ff_norm(hidden_states)) + hidden_states
-        
-        output = hidden_states  
-        return output
+	def forward(self, hidden_states, pos_encoding=None, encoder_hidden_states=None, attention_mask=None):
+		for attention_block, norm in zip(self.attention_blocks, self.norms):
+			if pos_encoding is not None:
+				hidden_states += pos_encoding
+			norm_hidden_states = norm(hidden_states)
+			hidden_states = attention_block(
+				norm_hidden_states,
+				encoder_hidden_states=encoder_hidden_states,
+				attention_mask=attention_mask,
+			) + hidden_states

+		hidden_states = self.ff(self.ff_norm(hidden_states)) + hidden_states
+		
+		output = hidden_states
+		return output
+
+
+def get_emb(sin_inp):
+	"""
+	Gets a base embedding for one dimension with sin and cos intertwined
+	"""
+	emb = torch.stack((sin_inp.sin(), sin_inp.cos()), dim=-1)
+	return torch.flatten(emb, -2, -1)
+
+class PositionalEncoding2D(nn.Module):
+	def __init__(self, channels):
+		"""
+		:param channels: The last dimension of the tensor you want to apply pos emb to.
+		"""
+		super(PositionalEncoding2D, self).__init__()
+		self.org_channels = channels
+		channels = int(np.ceil(channels / 4) * 2)
+		self.channels = channels
+		inv_freq = 1.0 / (10000 ** (torch.arange(0, channels, 2).float() / channels))
+		self.register_buffer("inv_freq", inv_freq)
+		self.register_buffer("cached_penc", None)
+
+	def forward(self, tensor):
+		"""
+		:param tensor: A 4d tensor of size (batch_size, x, y, ch)
+		:return: Positional Encoding Matrix of size (batch_size, x, y, ch)
+		"""
+		if len(tensor.shape) != 4:
+			raise RuntimeError("The input tensor has to be 4d!")
+
+		if self.cached_penc is not None and self.cached_penc.shape == tensor.shape:
+			return self.cached_penc
+
+		self.cached_penc = None
+		batch_size, x, y, orig_ch = tensor.shape
+		pos_x = torch.arange(x, device=tensor.device).type(self.inv_freq.type())
+		pos_y = torch.arange(y, device=tensor.device).type(self.inv_freq.type())
+		sin_inp_x = torch.einsum("i,j->ij", pos_x, self.inv_freq)
+		sin_inp_y = torch.einsum("i,j->ij", pos_y, self.inv_freq)
+		emb_x = get_emb(sin_inp_x).unsqueeze(1)
+		emb_y = get_emb(sin_inp_y)
+		emb = torch.zeros((x, y, self.channels * 2), device=tensor.device).type(
+			tensor.type()
+		)
+		emb[:, :, : self.channels] = emb_x
+		emb[:, :, self.channels : 2 * self.channels] = emb_y
+
+		self.cached_penc = emb[None, :, :, :orig_ch].repeat(tensor.shape[0], 1, 1, 1)
+		return self.cached_penc

 class PositionalEncoding(nn.Module):
-    def __init__(
-        self, 
-        d_model, 
-        dropout = 0., 
-        max_len = 24
-    ):
-        super().__init__()
-        self.dropout = nn.Dropout(p=dropout)
-        position = torch.arange(max_len).unsqueeze(1)
-        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
-        pe = torch.zeros(1, max_len, d_model)
-        pe[0, :, 0::2] = torch.sin(position * div_term)
-        pe[0, :, 1::2] = torch.cos(position * div_term)
-        self.register_buffer('pe', pe)
+	def __init__(
+		self, 
+		d_model,
+		dropout = 0., 
+		max_len = 32,
+	):
+		super().__init__()
+		self.dropout = nn.Dropout(p=dropout)
+		position = torch.arange(max_len).unsqueeze(1)
+		div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
+		pe = torch.zeros(1, max_len, d_model)
+		pe[0, :, 0::2] = torch.sin(position * div_term)
+		pe[0, :, 1::2] = torch.cos(position * div_term)
+		self.register_buffer('pe', pe)

-    def forward(self, x):
-        x = x + self.pe[:, :x.size(1)]
-        return self.dropout(x)
+	def forward(self, x):
+		# if x.size(1) < 16:
+		# 	start_idx = random.randint(0, 12)
+		# else:
+		# 	start_idx = 0
+		
+		x = x + self.pe[:, :x.size(1)]
+		return self.dropout(x)


-class VersatileAttention(CrossAttention):
-    def __init__(
-            self,
-            attention_mode                     = None,
-            cross_frame_attention_mode         = None,
-            temporal_position_encoding         = False,
-            temporal_position_encoding_max_len = 24,            
-            *args, **kwargs
-        ):
-        super().__init__(*args, **kwargs)
-        assert attention_mode == "Temporal"
+class TemporalSelfAttention(Attention):
+	def __init__(
+			self,
+			attention_mode					   = None,
+			temporal_position_encoding		   = False,
+			temporal_position_encoding_max_len = 32,
+			*args, **kwargs
+		):
+		super().__init__(*args, **kwargs)
+		assert attention_mode == "Temporal"

-        self.attention_mode = attention_mode
-        self.is_cross_attention = kwargs["cross_attention_dim"] is not None
-        
-        self.pos_encoder = PositionalEncoding(
-            kwargs["query_dim"],
-            dropout=0., 
-            max_len=temporal_position_encoding_max_len
-        ) if (temporal_position_encoding and attention_mode == "Temporal") else None
+		self.pos_encoder = PositionalEncoding(
+			kwargs["query_dim"],
+			max_len=temporal_position_encoding_max_len
+		) if temporal_position_encoding else None

-    def extra_repr(self):
-        return f"(Module Info) Attention_Mode: {self.attention_mode}, Is_Cross_Attention: {self.is_cross_attention}"
+	def set_use_memory_efficient_attention_xformers(
+		self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None
+	):
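+		# Intentionally a no-op: memory-efficient xformers attention is kept disabled for the motion module because enabling it gave bad results (cause unknown).
+		# TODO: find the root cause of the bad results and fix this bug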
+		pass

-    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, video_length=None):
-        batch_size, sequence_length, _ = hidden_states.shape
+	def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs):
+		# The `Attention` class can call different attention processors / attention functions
+		# here we simply pass along all tensors to the selected processor class
+		# For standard processors that are defined here, `**cross_attention_kwargs` is empty

-        if self.attention_mode == "Temporal":
-            d = hidden_states.shape[1]
-            hidden_states = rearrange(hidden_states, "(b f) d c -> (b d) f c", f=video_length)
-            
-            if self.pos_encoder is not None:
-                hidden_states = self.pos_encoder(hidden_states)
-            
-            encoder_hidden_states = repeat(encoder_hidden_states, "b n c -> (b d) n c", d=d) if encoder_hidden_states is not None else encoder_hidden_states
-        else:
-            raise NotImplementedError
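+		# add position encoding (NOTE(review): self.pos_encoder is None when temporal_position_encoding is False, so this call needs a guard in that case)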
+		hidden_states = self.pos_encoder(hidden_states)

-        encoder_hidden_states = encoder_hidden_states
+		if hasattr(self.processor, "__call__"):
+			return self.processor.__call__(
+				self,
+				hidden_states,
+				encoder_hidden_states=None,
+				attention_mask=attention_mask,
+				**cross_attention_kwargs,
+			)

-        if self.group_norm is not None:
-            hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
-
-        query = self.to_q(hidden_states)
-        dim = query.shape[-1]
-        query = self.reshape_heads_to_batch_dim(query)
-
-        if self.added_kv_proj_dim is not None:
-            raise NotImplementedError
-
-        encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
-        key = self.to_k(encoder_hidden_states)
-        value = self.to_v(encoder_hidden_states)
-
-        key = self.reshape_heads_to_batch_dim(key)
-        value = self.reshape_heads_to_batch_dim(value)
-
-        if attention_mask is not None:
-            if attention_mask.shape[-1] != query.shape[1]:
-                target_length = query.shape[1]
-                attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
-                attention_mask = attention_mask.repeat_interleave(self.heads, dim=0)
-
-        # attention, what we cannot get enough of
-        if self._use_memory_efficient_attention_xformers:
-            hidden_states = self._memory_efficient_attention_xformers(query, key, value, attention_mask)
-            # Some versions of xformers return output in fp32, cast it back to the dtype of the input
-            hidden_states = hidden_states.to(query.dtype)
-        else:
-            if self._slice_size is None or query.shape[0] // self._slice_size == 1:
-                hidden_states = self._attention(query, key, value, attention_mask)
-            else:
-                hidden_states = self._sliced_attention(query, key, value, sequence_length, dim, attention_mask)
-
-        # linear proj
-        hidden_states = self.to_out[0](hidden_states)
-
-        # dropout
-        hidden_states = self.to_out[1](hidden_states)
-
-        if self.attention_mode == "Temporal":
-            hidden_states = rearrange(hidden_states, "(b d) f c -> (b f) d c", d=d)
-
-        return hidden_states
+		else:
+			return self.processor(
+				self,
+				hidden_states,
+				encoder_hidden_states=None,
+				attention_mask=attention_mask,
+				**cross_attention_kwargs,
+			)