o
    
j                     @   s  d dl Z d dlmZ d dlmZ d dlmZ d dlZd dlm	  m
Z d dlmZmZ d dlmZm	Z	 dZedd	d
gZedg dZdd Zdd Zdd ZG dd dZG dd dZG dd dZdd Zdd Zdd Zdd Zd d! Zd"d# Zd$d% Zd&d' Z G d(d) d)e	j!Z"G d*d+ d+e	j!Z#G d,d- d-e	j!Z$G d.d/ d/e	j!Z%G d0d1 d1e	j!Z&G d2d3 d3e&Z'G d4d5 d5e	j!Z(d6d7 Z)d8d9 Z*G d:d; d;e	j!Z+G d<d= d=e	j!Z,G d>d? d?e	j!Z-G d@dA dAe	j!Z.G dBdC dCe	j!Z/G dDdE dEe	j!Z0G dFdG dGe	j!Z1d`dHdIZ2G dJdK dKe	j!Z3G dLdM dMe	j!Z4G dNdO dOe	j!Z5G dPdQ dQe	j!Z6G dRdS dSe	j!Z7G dTdU dUe7Z8G dVdW dWe7Z9G dXdY dYe7Z:G dZd[ d[e	j!Z;G d\d] d]e	j!Z<G d^d_ d_e	j!Z=dS )a    N)
namedtuple)partial)
isfunction)	rearrangerepeat)einsumnn@   Intermediatespre_softmax_attnpost_softmax_attnhiddensattn_intermediatespast_key_valuesc                 C   s   | d uS N valr   r   X/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/layers/tortoise/xtransformers.pyexists   s   r   c                 C   s   t | r| S t|r| S |S r   )r   r   )r   dr   r   r   default    s   r   c                 C   s   t | tr| S | f| S r   )
isinstancetuple)r   depthr   r   r   
cast_tuple&   s   r   c                   @      e Zd Zdd Zdd ZdS )alwaysc                 C   
   || _ d S r   r   selfr   r   r   r   __init__+      
zalways.__init__c                 O      | j S r   r   )r!   argskwargsr   r   r   __call__.   s   zalways.__call__N__name__
__module____qualname__r"   r'   r   r   r   r   r   *       r   c                   @   r   )
not_equalsc                 C   r   r   r   r    r   r   r   r"   3   r#   znot_equals.__init__c                 O   s
   || j kS r   r   r!   xr%   r&   r   r   r   r'   6   r#   znot_equals.__call__Nr(   r   r   r   r   r-   2   r,   r-   c                   @   r   )equalsc                 C   r   r   r   r    r   r   r   r"   ;   r#   zequals.__init__c                 O   s
   || j kS r   r   r.   r   r   r   r'   >   r#   zequals.__call__Nr(   r   r   r   r   r0   :   r,   r0   c                 C   s   t | jj S r   )torchfinfodtypemax)tensorr   r   r   max_neg_valueB      r6   c                 C   s   t j| dddS )N   )pdim)F	normalizetr   r   r   l2normF   r7   r@   c                 C   s2   t j| jd t| jrt j| jd d S d S )N        )r   init	constant_weightr   bias)layerr   r   r   
init_zero_M   s   
rG   c                    s$   t t fdd| }tt| |S )Nc                    s
     | S r   )pop)keyr   r   r   <lambda>W      
 zpick_and_pop.<locals>.<lambda>)listmapdictzip)keysr   valuesr   rJ   r   pick_and_popV   s   rS   c                 C   sJ   t  t  g}| D ]}t| |}t| }|| || |< q
g |R S r   )rO   rQ   boolint)condr   
return_valrI   matchindr   r   r   group_dict_by_key[   s   

rZ   c                 C   s
   | | S r   )
startswith)prefixstrr   r   r   string_begins_withd   r#   r^   c                 C   s   t tt| |S r   )rZ   r   r^   )r\   r   r   r   r   group_by_key_prefixh   r7   r_   c                    s:   t tt |\}}tt fddt| }||fS )Nc                    s   | d t  d  | d fS Nr      )lenr/   r\   r   r   rK   n       z)groupby_prefix_and_trim.<locals>.<lambda>)rZ   r   r^   rO   rN   r   items)r\   r   kwargs_with_prefixr&   kwargs_without_prefixr   rd   r   groupby_prefix_and_triml   s   ri   c                   @   s   e Zd Zdd ZdS )ReluSquaredc                 C   s   t |d S Nr8   )r<   relur!   r/   r   r   r   forwardv   s   zReluSquared.forwardN)r)   r*   r+   rn   r   r   r   r   rj   u   s    rj   c                       $   e Zd Z fddZdd Z  ZS )AbsolutePositionalEmbeddingc                    s&   t    |d | _t||| _d S N      )superr"   scaler   	Embeddingemb)r!   r;   max_seq_len	__class__r   r   r"   ~   s   

z$AbsolutePositionalEmbedding.__init__c                 C   s4   t j|jd |jd}| |}t|d}|| j S )Nra   devicen d -> () n d)r1   arangeshaper{   rv   r   rt   )r!   r/   npos_embr   r   r   rn      s   


z#AbsolutePositionalEmbedding.forwardr)   r*   r+   r"   rn   __classcell__r   r   rx   r   rp   }       rp   c                       s&   e Zd Z fddZdddZ  ZS )FixedPositionalEmbeddingc                    8   t    ddtd|d |   }| d| d S Ng      ?i'  r   r8   inv_freqrs   r"   r1   r}   floatregister_bufferr!   r;   r   rx   r   r   r"         
z!FixedPositionalEmbedding.__init__ra   r   c                 C   sV   t j|j| |jd| j| }t d|| j}t j| |	 fdd}t
|dS )Nrz   i , j -> i jr9   r;   r|   )r1   r}   r~   r{   type_asr   r   catsincosr   )r!   r/   seq_dimoffsetr?   sinusoid_inprv   r   r   r   rn      s   "
z FixedPositionalEmbedding.forward)ra   r   r   r   r   rx   r   r      s    r   c                       s4   e Zd Zd fdd	Zeddd	Zd
d Z  ZS )RelativePositionBiasF          c                    s4   t    || _|| _|| _|| _t||| _d S r   )	rs   r"   rt   causalnum_bucketsmax_distancer   ru   relative_attention_bias)r!   rt   r   r   r   headsrx   r   r   r"      s   
zRelativePositionBias.__init__Tc           	      C   s   d}|  }|s|d }||dk   | 7 }t|}n	t|t|}|d }||k }|t| | t||  ||     }t|t	||d }|t
|||7 }|S )Nr   r8   ra   )longr1   absr4   
zeros_likelogr   mathmin	full_likewhere)	relative_positionr   r   r   retr   	max_exactis_smallval_if_larger   r   r   _relative_position_bucket   s    *z.RelativePositionBias._relative_position_bucketc                 C   s   g |j dd  |jR \}}}tj|tj|d}tj|tj|d}|d d d f |d d d f  }| j|| j| j| jd}| 	|}	t
|	d}
||
| j  S )N)r3   r{   )r   r   r   zi j h -> () h i j)r~   r{   r1   r}   r   r   r   r   r   r   r   rt   )r!   qk_dotsijr{   q_posk_posrel_pos	rp_bucketrR   rE   r   r   r   rn      s     

zRelativePositionBias.forward)Fr   r   r   )Tr   r   )r)   r*   r+   r"   staticmethodr   rn   r   r   r   rx   r   r      s
    r   c                       s0   e Zd Z fddZedd Zdd Z  ZS )AlibiPositionalBiasc                    sN   t    || _t| |}t|d}| jd|dd | jdd dd d S )Nzh -> () h () ()slopesF
persistentrE   )rs   r"   r   r1   Tensor_get_slopesr   r   )r!   r   r&   r   rx   r   r   r"      s   

zAlibiPositionalBias.__init__c                 C   s\   dd }t |  r|| S dt t |  }|||d| dd d d | |   S )Nc                    s6   ddt | d       fddt| D S )Nr8      c                    s   g | ]} |  qS r   r   ).0r   ratiostartr   r   
<listcomp>   s    zRAlibiPositionalBias._get_slopes.<locals>.get_slopes_power_of_2.<locals>.<listcomp>)r   log2range)r   r   r   r   get_slopes_power_of_2   s   z>AlibiPositionalBias._get_slopes.<locals>.get_slopes_power_of_2r8   r   )r   r   
is_integerfloor)r   r   closest_power_of_2r   r   r   r      s    zAlibiPositionalBias._get_slopesc              	   C   s   g |j dd  |jR \}}}}t| jr)| jj d |kr)|| jdd |f  S tj||d}t|d}|| j }||j d  }t	|ddddd|f}| j
d|d	d
 || j S )Nr9   .rz   j -> () () () jra   r   rE   Fr   )r~   r{   r   rE   r1   r}   r   r   r<   padr   )r!   r   hr   r   r{   rE   num_heads_unalibiedr   r   r   rn      s   "


zAlibiPositionalBias.forward)r)   r*   r+   r"   r   r   rn   r   r   r   rx   r   r      s
    
r   c                       &   e Zd Zd fdd	Zdd Z  ZS )LearnedAlibiPositionalBiasFc                    sD   t  | t| j}t|| _|| _| jr t|| _	d S d S r   )
rs   r"   r1   r   r   r   	Parameterlearned_logslopesbidirectionallearned_logslopes_future)r!   r   r   
los_slopesrx   r   r   r"      s   z#LearnedAlibiPositionalBias.__init__c                    s   g |j dd  |jR \ }}} fdd}t| jr1| jj d |kr1| jdd |d |f }n tj||d}tj||d}t|dt|d }| jd	|d
d | jrp|| j	}	|| j
}
t||	 t||
  }|| S || j	}|| }|| S )Nr   c                    s&   t |  ddddd | jd  fS r`   )r<   r   expr~   )paramr   r   r   
get_slopes   s   &z6LearnedAlibiPositionalBias.forward.<locals>.get_slopesr9   .rz   zj -> 1 1 1 jzi -> 1 1 i 1rE   Fr   )r~   r{   r   rE   r1   r}   r   r   r   r   r   triltriu)r!   r   r   r   r{   r   rE   i_arangej_arangepast_slopesfuture_slopesr   r   r   r   rn      s    "


z"LearnedAlibiPositionalBias.forwardFr   r   r   rx   r   r      s    	r   c                       ro   )RotaryEmbeddingc                    r   r   r   r   rx   r   r   r"     r   zRotaryEmbedding.__init__c                 C   sB   t j||d| j}t d|| j}t j||fdd}t|dS )Nrz   r   r9   r   zn d -> () () n d)r1   r}   r   r   r   r   r   )r!   rw   r{   r?   freqsrv   r   r   r   rn     s   
zRotaryEmbedding.forwardr   r   r   rx   r   r     r   r   c                 C   s2   t | ddd} | jdd\}}tj| |fddS )Nz... (j d) -> ... j dr8   )r   r   r   r9   )r   unbindr1   r   )r/   x1x2r   r   r   rotate_half  s   r   c                 C   sB   | j d }|d d d d | d f }| |  t| |   S )Nr   )r~   r   r   r   )r?   r   seq_lenr   r   r   apply_rotary_pos_emb$  s   
r   c                       ro   )Scalec                    s   t    || _|| _d S r   )rs   r"   valuefn)r!   r   r   rx   r   r   r"   .  s   

zScale.__init__c                    L    j |fi |} fdd}t|ts||S ||d g|dd  R S )Nc                    
   |  j  S r   r   r>   r!   r   r   rK   5  rL   zScale.forward.<locals>.<lambda>r   ra   r   r   r   )r!   r/   r&   outscale_fnr   r   r   rn   3  
   
zScale.forwardr   r   r   rx   r   r   -  r   r   c                       ro   )Rezeroc                    s&   t    || _ttd| _d S )Nra   )rs   r"   r   r   r   r1   zerosg)r!   r   rx   r   r   r"   >     
zRezero.__init__c                    r   )Nc                    r   r   )r   r>   r   r   r   rK   E  rL   z Rezero.forward.<locals>.<lambda>r   ra   r   )r!   r/   r&   r   	rezero_fnr   r   r   rn   C  r   zRezero.forwardr   r   r   rx   r   r   =  r   r   c                       r   )	ScaleNormh㈵>c                    s0   t    |d | _|| _ttd| _d S )Nrr   ra   	rs   r"   rt   epsr   r   r1   onesr   r!   r;   r   rx   r   r   r"   N     

zScaleNorm.__init__c                 C   .   t j|ddd| j }||j| jd | j S Nr9   Tr;   keepdimr   r1   normrt   clampr   r   r!   r/   r  r   r   r   rn   T     zScaleNorm.forward)r   r   r   r   rx   r   r   M      r   c                       r   )RMSNorm:0yE>c                    s0   t    |d | _|| _tt|| _d S rq   r   r   rx   r   r   r"   Z  r   zRMSNorm.__init__c                 C   r   r   r  r  r   r   r   rn   `  r  zRMSNorm.forwardr
  r   r   r   rx   r   r	  Y  r  r	  c                       r   )RMSScaleShiftNormr
  c                    sF   t    |d | _|| _tt|| _t	|d |d | _
d S )Nrr   r8   )rs   r"   rt   r   r   r   r1   r   r   Linearscale_shift_processr   rx   r   r   r"   f  s
   

zRMSScaleShiftNorm.__init__c                 C   sl   t j|ddd| j }||j| jd | j }| |}t j|ddd\}}|d|d  |d }|S )Nr9   Tr   r  r8   ra   r   )	r1   r  rt   r  r   r   r  chunk	unsqueeze)r!   r/   norm_scale_shift_inpr  ss_embrt   shiftr   r   r   r   rn   m  s   
zRMSScaleShiftNorm.forwardr  r   r   r   rx   r   r  e  s    r  c                       r   )ResidualFc                    s.   t    |rtt|| _d S d | _d S r   )rs   r"   r   r   r1   r   residual_scaler!   r;   scale_residualrx   r   r   r"   {  s   
$zResidual.__init__c                 C   s   t | jr
|| j }|| S r   )r   r  )r!   r/   residualr   r   r   rn     s   

zResidual.forwardr   r   r   r   rx   r   r  z  s    r  c                       r   )	GRUGatingFc                    s<   t    t||| _|rtt|| _d S d | _d S r   )	rs   r"   r   GRUCellgrur   r1   r   r  r  rx   r   r   r"     s   
$zGRUGating.__init__c                 C   s6   t | jr
|| j }| t|dt|d}||S )Nzb n d -> (b n) d)r   r  r  r   
reshape_as)r!   r/   r  gated_outputr   r   r   rn     s   


zGRUGating.forwardr   r   r   r   rx   r   r    s    r  c                 C   s@   |dkr| S t |r| |d  d} tj| dd|| fddS )Nr   ).NrA   r   )r   masked_fillr<   r   )r?   amountmaskr   r   r   r    s
   r  c                       ro   )ShiftTokensc                    s   t    || _t|| _d S r   )rs   r"   r   r   shifts)r!   r"  r   rx   r   r   r"     s   
zShiftTokens.__init__c           	         s   | dd  | j}t|}|jd | }|j|dd}|d | ||d  }}tt fddt||}tj	g ||R dd}| j
|fi |S )Nr   r9   r   c                    s   t | d iS )Nr   )r  )r%   r   r   r   rK         z%ShiftTokens.forward.<locals>.<lambda>)getr"  rb   r~   splitrM   rN   rP   r1   r   r   )	r!   r/   r&   r"  segmentsfeats_per_shiftsplittedsegments_to_shiftrestr   r#  r   rn     s   zShiftTokens.forwardr   r   r   rx   r   r!    r   r!  c                       ro   )GLUc                    s&   t    || _t||d | _d S rk   )rs   r"   actr   r  proj)r!   dim_indim_out
activationrx   r   r   r"     r   zGLU.__init__c                 C   s&   |  |jddd\}}|| | S )Nr8   r9   r   )r.  r  r-  )r!   r/   gater   r   r   rn     s   zGLU.forwardr   r   r   rx   r   r,    r   r,  c                       s4   e Zd Z							d	 fdd	Zdd Z  ZS )
FeedForwardN   FrA   c	              	      s   t    t|| }	t||}|rt nt }
|s%tt||	|
nt	||	|
}t||r5t
|	nt t|t|	|| _|rOt| jd  d S d S )Nr9   )rs   r"   rU   r   rj   r   GELU
Sequentialr  r,  	LayerNormIdentityDropoutnetrG   )r!   r;   r0  multglurelu_squaredpost_act_lndropoutzero_init_output	inner_dimr1  
project_inrx   r   r   r"     s   

"
zFeedForward.__init__c                 C   s
   |  |S r   )r:  rm   r   r   r   rn     r#   zFeedForward.forward)Nr4  FFFrA   Fr   r   r   rx   r   r3    s    r3  c                       sb   e Zd Zedddddddddddddddddddf fd	d
	Z									dddZ  ZS )	Attentionr   Fg333333?Nr   rA   r   r   c                    s(  t    |d | _|| _|| _|| _||  }}|| _| jr/t|| }t	t
||| _tj||dd| _tj||dd| _tj||dd| _t|| _d | _|rnt||| _tj| jjd tj| jjd || _|rt|d}t	t
d|dd| | _|| _|rt	t
||| _t	t
||| _|| _|rt	t
d|dd| _|	| _ t!j"| _#|| _$|dkrt	t
|||| _%t	t
|||| _&|| _'|rt(t||d t) nt||| _*|| _+|r||ksJ dt,|d	 ||||d
| _-|rt.| j* d S d S )Nrr   F)rE   r   ra   r   r8   zXnumber of relative position buckets must be less than the relative position max distance      ?)rt   r   r   r   r   )/rs   r"   rt   r   r   max_attend_pastcollab_headsrU   r   r   r1   randncollab_mixingr  to_qto_kto_vr9  r?  	to_v_gaterB   rC   rD   rE   qk_normr   r   talking_headspre_softmax_projpost_softmax_proj
head_scalehead_scale_paramssparse_topkr<   softmaxattn_fn
num_mem_kvmem_kmem_vattn_on_attnr6  r,  to_outrel_pos_biasr   r   rG   )r!   r;   dim_headr   r   rN  rQ  rF  collab_compressionrS  use_entmax15rV  r?  on_attngate_valuesr@  rE  rM  scale_init_valuer[  rel_pos_num_bucketsrel_pos_max_distanceqk_dimv_dimrx   r   r   r"     sn   

.
	zAttention.__init__c           7   
      s  g |j | j| j| j| j| j|jt|R \
 }}}}}}t||}|}|}|}t|	rGt	j
|	|fdd}t	j
|	|fdd}t|rc|j d |j d  }||||d }||| }| |}| || |}|stfdd||f\}}ntd|| j}tdt|dd	}|
d ur|
\}}t	j
|gddt	j
||gdd}}|}tr|sj d
 tfdd||f\\}}\}}\} }!tfdd||| f\}}} tdd ||f||f| |!ff\}}d }"ttt||fr0t| fdd}#t|s|#n|}$t|$ fdd}$t|#d}#t|$d}$|#|$ }"| jdkrgt fdd| j| jf\}%}&t	j
|%fddt	j
|&|fdd}t|"rgtj|"| jdfdd}"|rrd
d
d
| jrtt|f\}d| j jdd }td|| }'t|'}(t|r|'| }'|' })|rtd|'| j  }'| j!r| "|'}'t|"r|'#|" |( ~"t|rd|j$  krdksJ d J d|j$dkrt|d}n|j$dkrt|d }|'#| |( t| j%r3|'j dd  \}*}+t	j&|+|* |+d!},t	j&|+d!}-t|,d"t|-d# }.|.| j%k}|'#||( ~| j'rd|'j dd  \}*}+t	j&|*d!}/t|/d"t|/d#k }tj||+|* dfd$d}|'#||( ~t| j(r| j(|'j d
 k r|'j)| j(d
d\}0}|0d% *d
+|'}1|'|1k }|'#||( ~| j,|'d
d}2|2 }3| -|2}2|rtd|2| j.  }2td&|2|}4|r|4| j/ }4t|4d'}4t| j0r| 0|}5|4|51  }4t2|)|3d(}6| 3|4|6||fS ))Nr   r   )r   c                       t | d dS )Nb n (h d) -> b h n dr   )r   r>   r   r   r   rK   ~  r$  z#Attention.forward.<locals>.<lambda>zb i d, h d -> b h i dzb n d -> b () n drg  r   r9   c                    s    | dd  f | d d f fS )N.r   r>   )lr   r   rK     s     c                    s
   t |  S r   )r   r>   )rotary_pos_embr   r   rK     rL   c                 S   s   t j| ddS )Nr9   r   )r1   r   r>   r   r   r   rK     r$  c                      s   t j fd S )Nrz   )r1   r   rT   r   )br{   r   r   r   rK     s    c                      s   t j jd fd S )Nr   rz   )r1   r   r~   rT   r   )rj  r{   kr   r   rK     re   zb i -> b () i ()zb j -> b () () jr   c                    rf  )Nzh n d -> b h n drj  )r   r>   rl  r   r   rK     r$  Tr   ra   g{Gz?r  zb h i d, b h j d -> b h i jzb h i j, h k -> b k i jr8   r4  zNattention mask must have greater than 2 dimensions but less than or equal to 4zi j -> () () i jr   zh i j -> () h i jrz   zi -> () () i ()r   F).r9   zb h i j, b h j d -> b h i dzb h n d -> b n (h d))r   r   )4r~   r   rN  rF  rQ  rt   r{   r   r   r1   r   rI  rJ  rK  rN   r   rH  r   anyrV  rW  rX  r<   r   expandrM  r@   r   r  r6   clonerO  
contiguousr[  r   masked_fill_ndimrE  r}   r   rS  topkr  	expand_asrU  r?  rP  rR  rL  sigmoidr
   rZ  )7r!   r/   contextr   context_mask	attn_masksinusoidal_embri  	prev_attnmem
layer_past_rN  rF  rQ  rt   has_contextkv_inputq_inputk_inputv_inputr   qvpast_key
past_valuek_cachev_cacheqlqrklkrvlvr
input_maskq_maskk_maskrW  rX  dots
mask_valuer   r   r   range_qrange_kdistrtopvkattnr   r   gatesintermediatesr   )rj  r{   r   rk  rh  r   ri  r   rn   R  s   




 

*&












zAttention.forward)	NNNNNNNNN)r)   r*   r+   DEFAULT_DIM_HEADr"   rn   r   r   r   rx   r   rC    s@    krC  c                       st   e Zd Z																												d
 fdd	Z										ddd	Z  ZS )AttentionLayersr   FNTr   c           @         s  t    td|\} }td|\}!}"|!dt}#|| _|| _tg | _	|| _
d|!v }$|p1|$p1|| _|r9t|nd | _tt||#d d}|rLt|nd | _|rW|$rWJ d|rwt||}||ksfJ d|sj|sltnt}%|%|| d	| _nd | _|s|rJ d
|| _|| _|| _|| _|| _|rtntj}&|	rtn|&}&|rtn|&}&t|&|}'|
rtjn|'}'|
rt nd }(|r|sd})n	|r|rd})nd})|rd|) })|rt!|rt"#t"$|d |  nd }*i |!d|*d}!|ri |!ddi}!i | ddi} t!|r|}+nt!|rb|t%|) },d|  k r|,ks#J d J dt&t't(d|)})|,| }-|,d d }.|.|.|-  |- }/t%|)|/ksIJ d|)d|/t%|)   }0|0|- }1|1d|,t%|1   }+n&t!|r|dkrq||ksuJ dd| |)||   d|  }+n|)| }+|+| _)t%t*t't+d|+| _,t-|t%|+}t.t/| j)|D ]\}2\}3}4|2t%| j)d k}5|3dkrt0|f||d|!}6n/|3dkrt0|fd|i|!}6n|3dkrt1|fi | }6|s|6nt2d|6}6nt3d|3 |4dkr|4d }7|s|4 nd}8t4t5|8|7|6}6t!|(r|(|6}6|rt6nt7}9|9||d }:|o*|3d!v };|r4|;s4|' nd }<|s<|;r?|' nd }=|sJ|5sJ|' nd }>t|<|=|>g}?| j	8t|?|6|:g qd S )"Nff_attn_r\  r[  r8   r   zRyou can only choose Alibi positional bias or T5 relative positional bias, not bothzAnumber of ALiBi heads must be less than the total number of heads)r   r   z3sandwich norm cannot be used when not using prenorm)acf)r  r  )r  r  )r  T)rM  ra  r@  ra   zpar ratio out of ranger  r   z(default block is too large for par_ratior   z2sandwich coefficient should be less than the depth)r  r  )r   r   r  r   rD  zinvalid layer type )r  r  r  )9rs   r"   ri   r%  r  r;   r   r   
ModuleListlayersr   has_pos_embr   pia_pos_embr4   r   r   ri  r   r   r   pre_normsandwich_normresidual_attncross_residual_attncross_attendr   r7  r	  r  r   r8  r   r   r   r   r   rb   r   filterr-   layer_typesrM   r0   num_attn_layersr   	enumeraterP   rC  r3  r   	Exceptionr!  r   r  r  append)@r!   r;   r   r   r   r  
only_crossuse_scalenormuse_rms_scaleshift_normuse_rmsnorm
use_rezeroalibi_pos_biasalibi_num_headsalibi_learnedposition_infused_attnri  rotary_emb_dimcustom_layerssandwich_coef	par_ratior  r  macaronr  gate_residualr  shift_tokensr  use_qk_norm_attnqk_norm_attn_seq_lenzero_init_branch_outputr&   	ff_kwargsattn_kwargsr}  r\  r[  alibi_pos_klass
norm_classnorm_fn	branch_fndefault_blockattn_scale_init_valuer  	par_depthpar_attn	depth_cut	par_width	par_blockpar_headrY   
layer_typelayer_shift_tokensis_last_layerrF   shift_range_uppershift_range_lowerresidual_fnr  layer_uses_qk_normpre_branch_normpost_branch_normpost_main_normnormsrx   r   r   r"     s   
"



(





zAttentionLayers.__init__c           %         sP  | j t|p	t|A rJ d|d u s|d u sJ dg }g }d }d }t|r+| nd g| j }i }t|	r;|	|d< d }t| jr| jsQ| jrQ|d usPJ dn|d u rWd}jd  |
d urk |
d d jd 7  tt	t
 fdd	||g }| |j}g }d}tt| j| jD ]\}\}\}}}|d
kr|r|dnd }}|\}}}t|r|fi ||d
ks|dkr|
d ur|
d}tfdd|D } nd } |d
kr|d |d || j|||| 
\}!}"}#}$n9|dkrt|r||| ||d d d |d | 
\}!}"}#}$n||||d d d |d | 
\}!}"}#}$n	|dkr(|}!|d
ks7|dkrB|d urB||# |$ f t|rO||!fi |}!||!||dv r^||" |d
krk| jrk|"j}n|dkrw| jrw|"j}t|r|fi ||dkr|d7 }|dkr| q|rt|||d}|fS S )Nz8context must be passed in if cross_attend is set to Truez3only one of full_context or context can be providedr  zVTo decode a transformer with rotary embeddings, you must specify an `expected_seq_len`r   ra   r   c                    s   t | r| jd   S d  S )Nra   r   )r   r~   )m)r   r   r   rK     s    z)AttentionLayers.forward.<locals>.<lambda>r  r  c                 3   s    | ]	}|  jV  qd S r   )tor{   )r   src   r   r   	<genexpr>  s    z*AttentionLayers.forward.<locals>.<genexpr>r  r  r   )r  r   copyr  ri  trainingr   r~   r4   rM   rN   r{   r  rP   r  r  rH   r   r  r  detachr  r   r  LayerIntermediates)%r!   r/   rv  full_contextr   rw  rx  memsreturn_hiddensr  r   expected_seq_lenr   r  rz  prev_cross_attn	norm_argsri  max_rotary_emb_lengthpresent_key_valuescross_attn_countrY   r  r  blockr  	layer_memr  r  r  r  layer_kvr|  r   interrk  r  r   )r   r/   r   rn     s   


&












zAttentionLayers.forward)r   FFFFFFFFNFFFNNNNFFFTFFr   FFNF)
NNNNNNFNNNr   r   r   rx   r   r    sT     .r  c                          e Zd Z fddZ  ZS )Encoderc                    *   d|vsJ dt  jdddi| d S )Nr   zcannot set causality on encoderFr   rs   r"   r!   r&   rx   r   r   r"        zEncoder.__init__r)   r*   r+   r"   r   r   r   rx   r   r        r  c                       r  )Decoderc                    r  )Nr   zcannot set causality on decoderTr   r  r  rx   r   r   r"   #  r  zDecoder.__init__r  r   r   rx   r   r  "  r  r  c                       r  )CrossAttenderc                    s   t  jdddd| d S )NT)r  r  r   r  r  rx   r   r   r"   )  s   zCrossAttender.__init__r  r   r   rx   r   r  (  r  r  c                       s0   e Zd Zdddd fdd
Zd	ddZ  ZS )
ViTransformerWrapperNrA   )num_classesr?  emb_dropoutc          
         s   t    t|tsJ d|| dksJ d|j}|| d }d|d  }	|| _tt	d|d || _
t|	|| _tt	dd|| _t|| _|| _t|| _t|rft|||d| _d S d | _d S )Nz#attention layers must be an Encoderr   z4image dimensions must be divisible by the patch sizer8   r   ra   )r0  r?  )rs   r"   r   r  r;   
patch_sizer   r   r1   rG  pos_embeddingr  patch_to_embedding	cls_tokenr9  r?  attn_layersr7  r  r   r3  mlp_head)
r!   
image_sizer  r  r   r?  r  r;   num_patches	patch_dimrx   r   r   r"   .  s   
&zViTransformerWrapper.__init__Fc           	      C   s   | j }t|d||d}| |}|j\}}}t| jd|d}tj||fdd}|| jd d d |d f  }| 	|}| 
|}| |}t| jrL|rN|S | |d d df S )Nz&b c (h p1) (w p2) -> b (h w) (p1 p2 c))p1p2z() n d -> b n drl  ra   r   r   )r  r   r  r~   r   r  r1   r   r  r?  r  r  r   r  )	r!   imgreturn_embeddingsr:   r/   rj  r   r}  
cls_tokensr   r   r   rn   A  s   



zViTransformerWrapper.forwardr   r   r   r   rx   r   r  -  s    r  c                       sL   e Zd Zdddddddd fdd
Zd	d
 Z						dddZ  ZS )TransformerWrapperNrA   r   FT)emb_dimmax_mem_lenshift_mem_downr  num_memory_tokenstie_embeddinguse_pos_embc       
            s   t    t|tsJ d|j}t||}| _| _| _t	
|| _|
r0|js0t||ntd _t	| _||krEt	||nt	  _| _t	| _   |	s_t	||n fdd _t|d}| _|dkr~t	t|| _d S d S )N2attention layers must be one of Encoder or Decoderr   c                    s   |  j j  S r   )	token_embrD   r?   r>   r   r   r   rK   }  s    z-TransformerWrapper.__init__.<locals>.<lambda>)rs   r"   r   r  r;   r   rw   r  r  r   ru   r  r  rp   r   r   r9  r  r  r8  project_embr  r7  r  init_	to_logitsr  r   r1   rG  memory_tokens)r!   
num_tokensrw   r  r  r  r  r  r  r  r  r;   rx   r   r   r"   W  s2   


zTransformerWrapper.__init__c                 C   s   t j| jj d S r   )r   rB   kaiming_normal_r  rD   r   r   r   r   r    s   zTransformerWrapper.init_c                 K   s  g |j |j| jR \}	}
}}| |}|| | }| |}| |}|dkrIt| jd|	d}t	j
||fdd}t|rItj||dfdd}| jret|re|d | j || jd  }}g ||}| j|f||dd|\}}| |}|d d d |f |d d |d f }}|s| |n|}|r|j}||fS |g}|rttd	d
 |j}|| |r||j t|dkrt|S |d S )Nr   zn d -> b n drl  ra   r   Tr   r   r  r  c                 S   r$   r   r   r>   r   r   r   rK         z,TransformerWrapper.forward.<locals>.<lambda>)r~   r{   r  r  r   r  r  r   r  r1   r   r   r<   r   r  r  r  r  r   rM   rN   r   r  r   rb   r   )r!   r/   r  r   r  return_attnr  	use_cacher&   rj  r   r{   num_memr{  mems_lmems_rr  r   r   res	attn_mapsr   r   r   rn     s:    



*
zTransformerWrapper.forward)FNFFNF)r)   r*   r+   r"   r  rn   r   r   r   rx   r   r  V  s"    .r  c                       s4   e Zd Zdddddd fdd
Zd
dd	Z  ZS )ContinuousTransformerWrapperNrA   T)r/  r0  r  r  r  c          	         s   t    t|tsJ d|j}|| _|r|jst||ntd| _	t
|| _t|r3t
||nt
 | _|| _t
|| _t|rNt
||| _d S t
 | _d S )Nr  r   )rs   r"   r   r  r;   rw   r  rp   r   r   r   r9  r  r   r  r8  rB  r  r7  r  project_out)	r!   rw   r  r/  r0  r  r  r  r;   rx   r   r   r"     s   
(z%ContinuousTransformerWrapper.__init__Fc                 K   s   g |j |jR \}}	}
}| |}|| | }| |}| j|f||dd|\}}| |}|s9| |n|}|g}|rOtt	dd |j
}|| |rW||j t|dkrat|S |d S )NTr  c                 S   r$   r   r   r>   r   r   r   rK     r!  z6ContinuousTransformerWrapper.forward.<locals>.<lambda>ra   r   )r~   r{   rB  r   r  r  r  r*  rM   rN   r   r  r   rb   r   )r!   r/   r  r   r"  r  r#  r&   rj  r   r}  r{   r  r   r'  r(  r   r   r   rn     s    



z$ContinuousTransformerWrapper.forward)FNFNFr   r   r   rx   r   r)    s    
r)  r   )>r   collectionsr   	functoolsr   inspectr   r1   torch.nn.functionalr   
functionalr<   einopsr   r   r   r  r
   r  r   r   r   r   r-   r0   r6   r@   rG   rS   rZ   r^   r_   ri   Modulerj   rp   r   r   r   r   r   r   r   r   r   r   r	  r  r  r  r  r!  r,  r3  rC  r  r  r  r  r  r  r)  r   r   r   r   <module>   sx    			-*#	

'    *)g