o
    
jO                     @   s8   d dl mZ d dlZd dlmZ G dd dejjZdS )    )TupleN)nnc                       sh   e Zd ZdZ				d fdd	Zdd Z		dd
ejdejdejdejdeejejf f
ddZ	  Z
S )AlignmentNetworka  Aligner Network for learning alignment between the input text and the model output with Gaussian Attention.

    ::

        query -> conv1d -> relu -> conv1d -> relu -> conv1d -> L2_dist -> softmax -> alignment
        key   -> conv1d -> relu -> conv1d -----------------------^

    Args:
        in_query_channels (int): Number of channels in the query network. Defaults to 80.
        in_key_channels (int): Number of channels in the key network. Defaults to 512.
        attn_channels (int): Number of inner channels in the attention layers. Defaults to 80.
        temperature (float): Temperature for the softmax. Defaults to 0.0005.
    P      Mb@?c                    s   t    || _tjjdd| _tjjdd| _t	tj
||d ddddtj tj
|d |dddd| _t	tj
||d ddddtj tj
|d |ddddtj tj
||dddd| _|   d S )N   )dim      T)kernel_sizepaddingbiasr   )super__init__temperaturetorchr   Softmaxsoftmax
LogSoftmaxlog_softmax
SequentialConv1dReLU	key_layerquery_layerinit_layers)selfin_query_channelsin_key_channelsattn_channelsr   	__class__ Q/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/layers/generic/aligner.pyr      s:   
zAlignmentNetwork.__init__c                 C   s   t jjj| jd jt jjdd t jjj| jd jt jjdd t jjj| jd jt jjdd t jjj| jd jt jjdd t jjj| jd jt jjdd d S )Nr   relu)gainr
   linear   )r   r   initxavier_uniform_r   weightcalculate_gainr   )r   r#   r#   r$   r   >   s
   $$$$(zAlignmentNetwork.init_layersNquerieskeysmask
attn_priorreturnc           
      C   s   |  |}| |}|dddddddf |dddddf  d }| j |jddd }|durF| |t|dddf d  }|durZ|j|	 
d td  | |}	|	|fS )ut  Forward pass of the aligner encoder.
        Shapes:
            - queries: :math:`[B, C, T_de]`
            - keys: :math:`[B, C_emb, T_en]`
            - mask: :math:`[B, T_de]`
        Output:
            attn (torch.tensor): :math:`[B, 1, T_en, T_de]` soft attention mask.
            attn_logp (torch.tensor): :math:`[ßB, 1, T_en , T_de]` log probabilities.
        Nr
   r   T)keepdimg:0yE>inf)r   r   r   sumr   r   logdatamasked_fill_bool	unsqueezefloatr   )
r   r-   r.   r/   r0   key_out	query_outattn_factor	attn_logpattnr#   r#   r$   forwardE   s   

6$ 
zAlignmentNetwork.forward)r   r   r   r   )NN)__name__
__module____qualname____doc__r   r   r   tensorr   r@   __classcell__r#   r#   r!   r$   r      s(    (r   )typingr   r   r   Moduler   r#   r#   r#   r$   <module>   s    