o
    
j8                     @   sF  d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlm  mZ	 d dl
Z
d dlmZ d dlmZmZ dd ZG dd dejZdd	 ZG d
d dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZdZG dd dejZG dd dejZG dd dejZG dd deZdS )    N)LogitsWarper)ContinuousTransformerWrapperRelativePositionBiasc                 C   s   |   D ]}|   q| S )z<
    Zero out the parameters of a module and return it.
    )
parametersdetachzero_)modulep r
   U/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/layers/tortoise/arch_utils.pyzero_module   s   r   c                       s   e Zd Z fddZ  ZS )GroupNorm32c                    s   t  | |jS N)superforwardfloattypedtypeselfx	__class__r
   r   r      s   zGroupNorm32.forward)__name__
__module____qualname__r   __classcell__r
   r
   r   r   r      s    r   c                 C   sX   d}| dkr	d}n| dkrd}| | dkr!t |d }| | dks|dks'J t|| S )z
    Make a standard normalization layer.

    :param channels: number of input channels.
    :return: an nn.Module for normalization.
              @   r      )intr   )channelsgroupsr
   r
   r   normalization   s   
r%   c                       s*   e Zd ZdZ fddZdddZ  ZS )QKVAttentionLegacyzi
    A module which performs QKV attention. Matches legacy QKVAttention + input/output heads shaping
    c                       t    || _d S r   )r   __init__n_heads)r   r)   r   r
   r   r(   4      

zQKVAttentionLegacy.__init__Nc                 C   s"  |j \}}}|d| j  dksJ |d| j  }||| j |d |j|dd\}}	}
dtt| }td|| |	| }|durd|||| j|j d |j d || j |j d |j d }tj|	 dd
|j}|dur|| jdd}|| }td	||
}||d|S )
z
        Apply QKV attention.

        :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
        :return: an [N x (H * C) x T] tensor after attention.
           r      dimzbct,bcs->btsNzbts,bcs->bct)shaper)   reshapesplitmathsqrttorcheinsumsoftmaxr   r   r   repeat	unsqueeze)r   qkvmaskrel_posbswidthlengthchqkvscaleweightar
   r
   r   r   8   s    ("zQKVAttentionLegacy.forward)NNr   r   r   __doc__r(   r   r   r
   r
   r   r   r&   /   s    r&   c                       s4   e Zd ZdZ				d fdd	Zdd	d
Z  ZS )AttentionBlocka  
    An attention block that allows spatial positions to attend to each other.

    Originally ported from here, but adapted to the N-d case.
    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
    r,   r0   TFc                    s   t    || _|| _|dkr|| _n|| dks#J d| d| || | _t|| _t||d d| _	t
| j| _tt||d| _|rZt|| j d d|d	d
d| _d S d | _d S )Nr0   r   zq,k,v channels z' is not divisible by num_head_channels r+   r,   g      ?Fr   r    )rE   causalheadsnum_bucketsmax_distance)r   r(   r#   do_checkpoint	num_headsr%   normnnConv1dr;   r&   	attentionr   proj_outr   relative_pos_embeddings)r   r#   rP   num_head_channelsrO   rV   r   r
   r   r(   [   s,   



zAttentionBlock.__init__Nc                 C   s\   |j ^}}}|||d}| | |}| ||| j}| |}|| j||g|R  S Nr0   )r1   r2   r;   rQ   rT   rV   rU   )r   r   r<   bcspatialr;   hr
   r
   r   r   ~   s   
zAttentionBlock.forward)r,   r0   TFr   rH   r
   r
   r   r   rJ   S   s    
#rJ   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )	Upsamplez
    An upsampling layer with an optional convolution.

    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    N   c                    sR   t    || _|p|| _|| _|| _|r'd}d}tj| j| j||d| _d S d S )N   r!   padding)	r   r(   r#   out_channelsuse_convfactorrR   rS   conv)r   r#   rc   rb   rd   ksizepadr   r
   r   r(      s   

zUpsample.__init__c                 C   s:   |j d | jks
J tj|| jdd}| jr| |}|S )Nr,   nearest)scale_factormode)r1   r#   Finterpolaterd   rc   re   r   r
   r
   r   r      s
   
zUpsample.forward)Nr^   rH   r
   r
   r   r   r]      s    r]   c                       s*   e Zd ZdZd
 fdd	Zdd	 Z  ZS )
Downsamplez
    A downsampling layer with an optional convolution.

    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    Nr^   r_   r!   c                    sj   t    || _|p|| _|| _|}|r#tj| j| j|||d| _d S | j| jks+J tj||d| _d S )N)stridera   )kernel_sizern   )	r   r(   r#   rb   rc   rR   rS   op	AvgPool1d)r   r#   rc   rb   rd   rf   rg   rn   r   r
   r   r(      s   

zDownsample.__init__c                 C   s   |j d | jks
J | |S )Nr,   )r1   r#   rp   r   r
   r
   r   r      s   
zDownsample.forward)Nr^   r_   r!   rH   r
   r
   r   r   rm      s    rm   c                       s2   e Zd Z						d fdd	Zdd Z  ZS )	ResBlockNFr+   c	           
         s@  t    || _|| _|p|| _|| _|| _|dkrdnd}	tt	|t
 tj|| j||	d| _|p5|| _|rFt|d| _t|d| _n|rUt|d| _t|d| _nt  | _| _tt	| jt
 tj|dttj| j| j||	d| _| j|krt | _d S |rtj|| j||	d| _d S t|| jd| _d S )Nr+   r,   r!   r`   F)r	   )r   r(   r#   dropoutrb   rc   use_scale_shift_normrR   
Sequentialr%   SiLUrS   	in_layersupdownr]   h_updx_updrm   IdentityDropoutr   
out_layersskip_connection)
r   r#   rs   rb   rc   rt   updownro   ra   r   r
   r   r(      s>   




zResBlock.__init__c                 C   sh   | j r#| jd d | jd }}||}| |}| |}||}n| |}| |}| || S rX   )rx   rw   ry   rz   r}   r~   )r   r   in_restin_convr\   r
   r
   r   r      s   




zResBlock.forward)NFFFFr+   r   r   r   r(   r   r   r
   r
   r   r   rr      s    2rr   c                       s6   e Zd Z								d
 fdd	Zdd	 Z  ZS )AudioMiniEncoder   r!   r^   r   r+   c              	      s   t    ttj||ddd| _|}g }t|D ]$}t|D ]}|t|||
d q |t	|d|d |	d |d9 }qtj| | _
tt|t t||d| _g }t|D ]
}|t|| q\tj| | _|| _d S )Nr+   r,   r`   )ro   Tr!   )rc   rb   rd   )r   r(   rR   ru   rS   initrangeappendrr   rm   resr%   rv   finalrJ   attnr.   )r   spec_dimembedding_dimbase_channelsdepthresnet_blocksattn_blocksnum_attn_headsrs   downsample_factorro   rA   r   lrr   rG   r   r
   r   r(      s*   

"
zAudioMiniEncoder.__init__c                 C   s>   |  |}| |}| |}| |}|d d d d df S )Nr   )r   r   r   r   )r   r   r\   r
   r
   r   r      s
   



zAudioMiniEncoder.forward)r   r!   r!   r^   r^   r   r!   r+   r   r
   r
   r   r   r      s    #r   z;https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pthc                	       s8   e Zd Zddddddddef	 fdd		Zd
d Z  ZS )TorchMelSpectrogrami      P   r   i@  i"V  Fc
                    s   t    || _|| _|| _|| _|| _|| _|| _t	j
j| j| j| jd|| j| j| j| jdd
| _|	| _| jd urYt| j}
t|
| _W d    d S 1 sRw   Y  d S d | _d S )Nr!   slaney)
n_fft
hop_length
win_lengthpower
normalizedsample_ratef_minf_maxn_melsrQ   )r   r(   filter_lengthr   r   n_mel_channelsmel_fminmel_fmaxsampling_rate
torchaudio
transformsMelSpectrogrammel_stftmel_norm_filefsspecopenr6   load	mel_norms)r   r   r   r   r   r   r   r   	normalizer   fr   r
   r   r(   ,  s4   

"
zTorchMelSpectrogram.__init__c                 C   s   t |jdkr|d}t |jdksJ | j|j| _| |}ttj|dd}| j	d urD| j	|j| _	|| j	
d
d }|S )Nr+   r,   r!   gh㈵>)minr   r0   )lenr1   squeezer   todevicer6   logclampr   r:   )r   inpmelr
   r
   r   r   T  s   


zTorchMelSpectrogram.forward)r   r   r   DEFAULT_MEL_NORM_FILEr(   r   r   r
   r
   r   r   r   +  s    (r   c                       s(   e Zd ZdZ fddZdd Z  ZS )CheckpointedLayerz
    Wraps a module. When forward() is called, passes kwargs that require_grad through torch.checkpoint() and bypasses
    checkpoint for all other args.
    c                    r'   r   )r   r(   wrap)r   r   r   r
   r   r(   j  r*   zCheckpointedLayer.__init__c                 O   sL   |  D ]\}}t|tjr|jrJ qtj| jfi |}||g|R  S r   )items
isinstancer6   Tensorrequires_grad	functoolspartialr   )r   r   argskwargsrC   rD   r   r
   r
   r   r   n  s
   zCheckpointedLayer.forwardrH   r
   r
   r   r   r   d  s    r   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )CheckpointedXTransformerEncoderz
    Wraps a ContinuousTransformerWrapper and applies CheckpointedLayer to each layer and permutes from channels-mid
    to channels-last that XTransformer expects.
    Tc           	         s|   t    tdi || _|| _|| _|sd S tt| jjj	D ]}| jjj	| \}}}t
|t||g| jjj	|< q d S )Nr
   )r   r(   r   transformerneeds_permuteexit_permuter   r   attn_layerslayersrR   
ModuleListr   )	r   r   r   
checkpointxtransformer_kwargsinrY   r   r   r
   r   r(   {  s   
 z(CheckpointedXTransformerEncoder.__init__c                 K   s>   | j r
|ddd}| j|fi |}| jr|ddd}|S )Nr   r!   r,   )r   permuter   r   )r   r   r   r\   r
   r
   r   r     s   z'CheckpointedXTransformerEncoder.forward)TTTrH   r
   r
   r   r   r   u  s    r   c                   @   sL   e Zd Zded dfdededefddZd	ejd
ejdejfddZ	dS )TypicalLogitsWarperg?Infr,   massfilter_valuemin_tokens_to_keepc                 C   s   || _ || _|| _d S r   )r   r   r   )r   r   r   r   r
   r
   r   r(     s   
zTypicalLogitsWarper.__init__	input_idsscoresreturnc                 C   s   t jjj|dd}t |}|| jddd }t | | }t j|dd\}}|d|}	|	j	ddj
dd}
|
| jk jdd}d||dk < ||d|ddk}| jdkrbd|d	d | jf< |d||}||| j}|S )
Nr0   r-   T)keepdimF)
descendingr,   r   .)r6   rR   
functionallog_softmaxexpnansumabssortgatherr8   cumsumr   sumviewr   scattermasked_fillr   )r   r   r   r   r	   entshifted_scoressorted_scoressorted_indicessorted_logitscumulative_probslast_indsorted_indices_to_removeindices_to_remover
   r
   r   __call__  s   

zTypicalLogitsWarper.__call__N)
r   r   r   r   r"   r(   r6   
LongTensorFloatTensorr   r
   r
   r
   r   r     s    
 
r   ) r   r4   osr   r6   torch.nnrR   torch.nn.functionalr   rk   r   transformersr   %TTS.tts.layers.tortoise.xtransformersr   r   r   	GroupNormr   r%   Moduler&   rJ   r]   rm   rr   r   r   r   r   r   r   r
   r
   r
   r   <module>   s0    	$4@,9