o
    ´‹
j±D  ã                   @   sr   d dl Z d dlZd dlmZ d dlmZ d dlmZmZ G dd„ dej	ƒZ
G dd„ dej	ƒZG d	d
„ d
ej	ƒZdS )é    N)Únn)Ú
functional)Ú	LayerNormÚ
LayerNorm2c                       s†   e Zd ZdZ						d‡ fdd„	Zddd	„Zdd
d„Zedd„ ƒZedd„ ƒZ	dd„ Z
edd„ ƒZedd„ ƒZedd„ ƒZ‡  ZS )Ú"RelativePositionMultiHeadAttentionaµ  Multi-head attention with Relative Positional embedding.
    https://arxiv.org/pdf/1809.04281.pdf

    It learns positional embeddings for a window of neighbours. For keys and values,
    it learns different set of embeddings. Key embeddings are agregated with the attention
    scores and value embeddings are aggregated with the output.

    Note:
        Example with relative attention window size 2

        - input = [a, b, c, d, e]
        - rel_attn_embeddings = [e(t-2), e(t-1), e(t+1), e(t+2)]

        So it learns 4 embedding vectors (in total 8) separately for key and value vectors.

        Considering the input c

        - e(t-2) corresponds to c -> a
        - e(t-2) corresponds to c -> b
        - e(t-2) corresponds to c -> d
        - e(t-2) corresponds to c -> e

        These embeddings are shared among different time steps. So input a, b, d and e also uses
        the same embeddings.

        Embeddings are ignored when the relative window is out of limit for the first and the last
        n items.

    Args:
        channels (int): input and inner layer channels.
        out_channels (int): output channels.
        num_heads (int): number of attention heads.
        rel_attn_window_size (int, optional): relation attention window size.
            If 4, for each time step next and previous 4 time steps are attended.
            If default, relative encoding is disabled and it is a regular transformer.
            Defaults to None.
        heads_share (bool, optional): [description]. Defaults to True.
        dropout_p (float, optional): dropout rate. Defaults to 0..
        input_length (int, optional): intput length for positional encoding. Defaults to None.
        proximal_bias (bool, optional): enable/disable proximal bias as in the paper. Defaults to False.
        proximal_init (bool, optional): enable/disable poximal init as in the paper.
            Init key and query layer weights the same. Defaults to False.
    NTç        Fc
                    sˆ  t ƒ  ¡  || dksJ dƒ‚|| _|| _|| _|| _|| _|| _|| _|| _	d | _
|| | _t ||d¡| _t ||d¡| _t ||d¡| _t ||d¡| _t |¡| _|d ur’|r]dn|}
| jd }t t |
|d d | j¡| ¡}t t |
|d d | j¡| ¡}|  d|¡ |  d|¡ tj | jj¡ tj | jj¡ |	rº| jjj | jjj¡ | jjj | jjj¡ tj | jj¡ d S )Nr   z/ [!] channels should be divisible by num_heads.é   g      à¿é   Ú	emb_rel_kÚ	emb_rel_v)ÚsuperÚ__init__ÚchannelsÚout_channelsÚ	num_headsÚrel_attn_window_sizeÚheads_shareÚinput_lengthÚproximal_biasÚ	dropout_pÚattnÚ
k_channelsr   ÚConv1dÚconv_qÚconv_kÚconv_vÚconv_oÚDropoutÚdropoutÚ	ParameterÚtorchÚrandnÚregister_parameterÚinitÚxavier_uniform_ÚweightÚdataÚcopy_Úbias)Úselfr   r   r   r   r   r   r   r   Úproximal_initÚn_heads_relÚ
rel_stddevr
   r   ©Ú	__class__© úV/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/layers/glow_tts/transformer.pyr   7   sD   


ÿÿz+RelativePositionMultiHeadAttention.__init__c                 C   sD   |   |¡}|  |¡}|  |¡}| j||||d\}| _|  |¡}|S )z
        Shapes:
            - x: :math:`[B, C, T]`
            - c: :math:`[B, C, T]`
            - attn_mask: :math:`[B, 1, T, T]`
        )Úmask)r   r   r   Ú	attentionr   r   )r)   ÚxÚcÚ	attn_maskÚqÚkÚvr/   r/   r0   Úforwardm   s   



z*RelativePositionMultiHeadAttention.forwardc                 C   sè  g |  ¡ ¢|  d¡‘R \}}}}| || j| j|¡ dd¡}| || j| j|¡ dd¡}| || j| j|¡ dd¡}t || dd¡¡t | j¡ }	| j	d uru||ksWJ dƒ‚|  
| j|¡}
|  ||
¡}|  |¡}|t | j¡ }|	| }	| jrŽ||ks€J dƒ‚|	|  |¡j|	j|	jd }	|d ur¸|	 |dkd	¡}	| jd ur¸t |	¡ d| j ¡ | j¡}|	| d	d
|   }	tj|	dd}|  |¡}t ||¡}| j	d urã|  |¡}|  
| j|¡}||  ||¡ }| dd¡ ¡  |||¡}||fS )Nr	   é   éþÿÿÿéÿÿÿÿz8Relative attention is only available for self-attention.z3Proximal bias is only available for self-attention.)ÚdeviceÚdtyper   g     ˆÃÀr   )Údim)ÚsizeÚviewr   r   Ú	transposer    ÚmatmulÚmathÚsqrtr   Ú_get_relative_embeddingsr
   Ú_matmul_with_relative_keysÚ'_relative_position_to_absolute_positionr   Ú_attn_proximity_biasÚtor=   r>   Úmasked_fillr   Ú	ones_likeÚtriuÚtrilÚFÚsoftmaxr   Ú'_absolute_position_to_relative_positionr   Ú_matmul_with_relative_valuesÚ
contiguous)r)   ÚqueryÚkeyÚvaluer1   ÚbÚdÚt_sÚt_tÚscoresÚkey_relative_embeddingsÚ
rel_logitsÚscores_localÚ
block_maskÚp_attnÚoutputÚrelative_weightsÚvalue_relative_embeddingsr/   r/   r0   r2   {   s:     





z,RelativePositionMultiHeadAttention.attentionc                 C   s   t  | | d¡¡}|S )a  
        Args:
            p_attn (Tensor): attention weights.
            re (Tensor): relative value embedding vector. (a_(i,j)^V)

        Shapes:
            -p_attn: :math:`[B, H, T, V]`
            -re: :math:`[H or 1, V, D]`
            -logits: :math:`[B, H, T, D]`
        r   )r    rC   Ú	unsqueeze)r`   ÚreÚlogitsr/   r/   r0   rR   ¥   s   z?RelativePositionMultiHeadAttention._matmul_with_relative_valuesc                 C   s   t  | | d¡ dd¡¡}|S )a&  
        Args:
            query (Tensor): batch of query vectors. (x*W^Q)
            re (Tensor): relative key embedding vector. (a_(i,j)^K)

        Shapes:
            - query: :math:`[B, H, T, D]`
            - re: :math:`[H or 1, V, D]`
            - logits: :math:`[B, H, T, V]`
        r   r;   r<   )r    rC   rd   rB   )rT   re   rf   r/   r/   r0   rG   ´   s   z=RelativePositionMultiHeadAttention._matmul_with_relative_keysc              	   C   sv   t || jd  dƒ}t | jd | dƒ}|d|  d }|dkr-t |dd||ddg¡}n|}|dd…||…f }|S )z3Convert embedding vestors to a tensor of embeddingsr   r   r	   N)Úmaxr   rO   Úpad)r)   Úrelative_embeddingsÚlengthÚ
pad_lengthÚslice_start_positionÚslice_end_positionÚpadded_relative_embeddingsÚused_relative_embeddingsr/   r/   r0   rF   Ä   s   z;RelativePositionMultiHeadAttention._get_relative_embeddingsc              	   C   s–   |   ¡ \}}}}t | g d¢¡} |  |||d | g¡}t |d|d ddddg¡}| |||d d| d g¡dd…dd…d|…|d d…f }|S )zÐConverts tensor from relative to absolute indexing for local attention.
        Shapes:
            x: :math:`[B, C, T, 2 * T - 1]`
        Returns:
            A Tensor of shape :math:`[B, C, T, T]`
        )r   r   r   r   r   r   r   r   r	   r   r   N©r@   rO   rh   rA   ©r3   ÚbatchÚheadsrj   Ú_Úx_flatÚx_finalr/   r/   r0   rH   Ñ   s   >zJRelativePositionMultiHeadAttention._relative_position_to_absolute_positionc                 C   sž   |   ¡ \}}}}t | d|d ddddddg¡} |  |||d ||d   g¡}t ||dddddg¡}| |||d| g¡dd…dd…dd…dd…f }|S )zk
        Shapes:
            - x: :math:`[B, C, T, T]`
            - ret: :math:`[B, C, T, 2*T-1]`
        r   r   r	   Nrp   rq   r/   r/   r0   rQ   ã   s     2zJRelativePositionMultiHeadAttention._absolute_position_to_relative_positionc                 C   sJ   t j| t jd}t  |d¡t  |d¡ }t  t  |¡¡ }| d¡ d¡S )zÖProduce an attention mask that discourages distant
        attention values.
        Args:
            length (int): an integer scalar.
        Returns:
            a Tensor with shape :math:`[1, 1, T, T]`
        )r>   r   r   )r    ÚarangeÚfloat32rd   Úlog1pÚabs)rj   ÚrÚdiffr/   r/   r0   rI   ó   s   
z7RelativePositionMultiHeadAttention._attn_proximity_bias)NTr   NFF©N)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r9   r2   ÚstaticmethodrR   rG   rF   rH   rQ   rI   Ú__classcell__r/   r/   r-   r0   r   
   s,    1ö
6
*



r   c                       sF   e Zd ZdZd‡ fdd„	Zdd„ Zdd	„ Zd
d„ Zedd„ ƒZ	‡  Z
S )ÚFeedForwardNetworkaU  Feed Forward Inner layers for Transformer.

    Args:
        in_channels (int): input tensor channels.
        out_channels (int): output tensor channels.
        hidden_channels (int): inner layers hidden channels.
        kernel_size (int): conv1d filter kernel size.
        dropout_p (float, optional): dropout rate. Defaults to 0.
    r   Fc                    sn   t ƒ  ¡  || _|| _|| _|| _|| _|r| j| _n| j	| _t
 |||¡| _t
 |||¡| _t
 |¡| _d S r}   )r   r   Úin_channelsr   Úhidden_channelsÚkernel_sizer   Ú_causal_paddingÚpaddingÚ_same_paddingr   r   Úconv_1Úconv_2r   r   )r)   r…   r   r†   r‡   r   Úcausalr-   r/   r0   r     s   

zFeedForwardNetwork.__init__c                 C   sD   |   |  || ¡¡}t |¡}|  |¡}|  |  || ¡¡}|| S r}   )r‹   r‰   r    Úrelur   rŒ   )r)   r3   Úx_maskr/   r/   r0   r9   "  s
   

zFeedForwardNetwork.forwardc                 C   sH   | j dkr|S | j d }d}ddgddg||gg}t ||  |¡¡}|S )Nr   r   ©r‡   rO   rh   Ú
_pad_shape©r)   r3   Úpad_lÚpad_rr‰   r/   r/   r0   rˆ   )  s   

z"FeedForwardNetwork._causal_paddingc                 C   sR   | j dkr|S | j d d }| j d }ddgddg||gg}t ||  |¡¡}|S )Nr   r	   r   r   r’   r/   r/   r0   rŠ   2  s   

z FeedForwardNetwork._same_paddingc                 C   s    | d d d… }dd„ |D ƒ}|S )Nr<   c                 S   s   g | ]	}|D ]}|‘qqS r/   r/   )Ú.0ÚsublistÚitemr/   r/   r0   Ú
<listcomp>>  s    z1FeedForwardNetwork._pad_shape.<locals>.<listcomp>r/   )r‰   ÚlÚ	pad_shaper/   r/   r0   r‘   ;  s   zFeedForwardNetwork._pad_shape)r   F)r~   r   r€   r   r   r9   rˆ   rŠ   r‚   r‘   rƒ   r/   r/   r-   r0   r„     s    
		r„   c                       sZ   e Zd ZdZ					ddededed	ed
ededededef‡ fdd„Zdd„ Z‡  ZS )ÚRelativePositionTransformeraî  Transformer with Relative Potional Encoding.
    https://arxiv.org/abs/1803.02155

    Args:
        in_channels (int): number of channels of the input tensor.
        out_chanels (int): number of channels of the output tensor.
        hidden_channels (int): model hidden channels.
        hidden_channels_ffn (int): hidden channels of FeedForwardNetwork.
        num_heads (int): number of attention heads.
        num_layers (int): number of transformer layers.
        kernel_size (int, optional): kernel size of feed-forward inner layers. Defaults to 1.
        dropout_p (float, optional): dropout rate for self-attention and feed-forward inner layers_per_stack. Defaults to 0.
        rel_attn_window_size (int, optional): relation attention window size.
            If 4, for each time step next and previous 4 time steps are attended.
            If default, relative encoding is disabled and it is a regular transformer.
            Defaults to None.
        input_length (int, optional): input lenght to limit position encoding. Defaults to None.
        layer_norm_type (str, optional): type "1" uses torch tensor operations and type "2" uses torch layer_norm
            primitive. Use type "2", type "1: is for backward compat. Defaults to "1".
    r   r   NÚ1r…   r   r†   Úhidden_channels_ffnr   Ú
num_layersr   r   Úlayer_norm_typec                    s  t ƒ  ¡  || _|| _|| _|| _|| _|| _|	| _t	 
|¡| _t	 ¡ | _t	 ¡ | _t	 ¡ | _t	 ¡ | _t| jƒD ]Œ}| j t|dkrE|n||||	||
d¡ |dkr\| j t|ƒ¡ n|dkri| j t|ƒ¡ ntdƒ‚||kr€|d | jkr€t	 ||d¡| _| j t||d | jkrŽ|n||||d¡ |dkr¬| j t|d | jkr§|n|ƒ¡ q9|dkrÂ| j t|d | jkr½|n|ƒ¡ q9tdƒ‚d S )Nr   )r   r   r   rœ   Ú2z [!] Unknown layer norm typer   )r   )r   r   r†   r   r   rž   r‡   r   r   r   r   r   Ú
ModuleListÚattn_layersÚnorm_layers_1Ú
ffn_layersÚnorm_layers_2ÚrangeÚappendr   r   r   Ú
ValueErrorr   Úprojr„   )r)   r…   r   r†   r   r   rž   r‡   r   r   r   rŸ   Úidxr-   r/   r0   r   X  s\   




úÿ
ûÿ
$$Üz$RelativePositionTransformer.__init__c                 C   s´   |  d¡|  d¡ }t| jƒD ]D}|| }| j| |||ƒ}|  |¡}| j| || ƒ}| j| ||ƒ}|  |¡}|d | jkrJt| dƒrJ|  |¡}| j	| || ƒ}q|| }|S )zd
        Shapes:
            - x: :math:`[B, C, T]`
            - x_mask: :math:`[B, 1, T]`
        r	   r<   r   r©   )
rd   r¦   rž   r¢   r   r£   r¤   Úhasattrr©   r¥   )r)   r3   r   r5   ÚiÚyr/   r/   r0   r9   ›  s   


z#RelativePositionTransformer.forward)r   r   NNrœ   )	r~   r   r€   r   ÚintÚstrr   r9   rƒ   r/   r/   r-   r0   r›   B  s6    ôþýüûúù
öõôCr›   )rD   r    r   Útorch.nnr   rO   Ú$TTS.tts.layers.generic.normalizationr   r   ÚModuler   r„   r›   r/   r/   r/   r0   Ú<module>   s     }<