o
    ´‹
jï=  ã                   @   sˆ   d dl Z d dl mZ d dlmZ ddlmZ ddlmZm	Z	 G dd„ dej
ƒZG d	d
„ d
ej
ƒZG dd„ dej
ƒZG dd„ dej
ƒZdS )é    N)Únn)Ú
functionalé   )Ú	init_attn)ÚLinearÚPrenetc                       ó*   e Zd ZdZd‡ fdd„	Zdd„ Z‡  ZS )ÚConvBNBlockat  Convolutions with Batch Normalization and non-linear activation.

    Args:
        in_channels (int): number of input channels.
        out_channels (int): number of output channels.
        kernel_size (int): convolution kernel size.
        activation (str): 'relu', 'tanh', None (linear).

    Shapes:
        - input: (B, C_in, T)
        - output: (B, C_out, T)
    Nc                    s˜   t ƒ  ¡  |d d dksJ ‚|d d }tj||||d| _tj|ddd| _tjdd	| _|d
kr:t 	¡ | _
d S |dkrEt ¡ | _
d S t ¡ | _
d S )Nr   é   r   )Úpaddingçš™™™™™¹?gñhãˆµøä>)ÚmomentumÚepsç      à?)ÚpÚreluÚtanh)ÚsuperÚ__init__r   ÚConv1dÚconvolution1dÚBatchNorm1dÚbatch_normalizationÚDropoutÚdropoutÚReLUÚ
activationÚTanhÚIdentity)ÚselfÚin_channelsÚout_channelsÚkernel_sizer   r   ©Ú	__class__© úT/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/layers/tacotron/tacotron2.pyr      s   
zConvBNBlock.__init__c                 C   s,   |   |¡}|  |¡}|  |¡}|  |¡}|S ©N)r   r   r   r   )r   ÚxÚor%   r%   r&   Úforward'   s
   



zConvBNBlock.forwardr'   ©Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r*   Ú__classcell__r%   r%   r#   r&   r	      s    r	   c                       r   )ÚPostnetz¥Tacotron2 Postnet

    Args:
        in_out_channels (int): number of output channels.

    Shapes:
        - input: (B, C_in, T)
        - output: (B, C_in, T)
    é   c              	      st   t ƒ  ¡  t ¡ | _| j t|dddd¡ td|d ƒD ]}| j tddddd¡ q| j td|dd d¡ d S )Né   r2   r   )r"   r   r   )r   r   r   Ú
ModuleListÚconvolutionsÚappendr	   Úrange)r   Úin_out_channelsÚ	num_convsÚ_r#   r%   r&   r   :   s   

zPostnet.__init__c                 C   s   |}| j D ]}||ƒ}q|S r'   )r5   )r   r(   r)   Úlayerr%   r%   r&   r*   B   s   

zPostnet.forward)r2   r+   r%   r%   r#   r&   r1   /   s    
r1   c                       s2   e Zd ZdZd	‡ fdd„	Zdd„ Zdd„ Z‡  ZS )
ÚEncoderz¯Tacotron2 Encoder

    Args:
        in_out_channels (int): number of input and output channels.

    Shapes:
        - input: (B, C_in, T)
        - output: (B, C_in, T)
    r3   c                    sb   t ƒ  ¡  t ¡ | _tdƒD ]}| j t||ddƒ¡ qtj|t	|d ƒddddd| _
d | _d S )Né   r2   r   r
   r   T)Ú
num_layersÚbatch_firstÚbiasÚbidirectional)r   r   r   r4   r5   r7   r6   r	   ÚLSTMÚintÚlstmÚ	rnn_state)r   r8   r:   r#   r%   r&   r   T   s   

ÿ
zEncoder.__init__c                 C   sn   |}| j D ]}||ƒ}q| dd¡}tjjj|| ¡ dd}| j ¡  |  |¡\}}tjjj	|dd\}}|S )Nr   r
   T)r?   )
r5   Ú	transposer   ÚutilsÚrnnÚpack_padded_sequenceÚcpurD   Úflatten_parametersÚpad_packed_sequence)r   r(   Úinput_lengthsr)   r;   r:   r%   r%   r&   r*   ^   s   


zEncoder.forwardc                 C   s6   |}| j D ]}||ƒ}q| dd¡}|  |¡\}}|S )Nr   r
   )r5   rF   rD   )r   r(   r)   r;   r:   r%   r%   r&   Ú	inferencei   s   

zEncoder.inference)r3   )r,   r-   r.   r/   r   r*   rN   r0   r%   r%   r#   r&   r<   I   s
    

r<   c                       s|   e Zd ZdZ‡ fdd„Zdd„ Zdd„ Zdd	d
„Zdd„ Zdd„ Z	dd„ Z
dd„ Zdd„ Zdd„ Zdd„ Zddd„Z‡  ZS )ÚDecodera†  Tacotron2 decoder. We don't use Zoneout but Dropout between RNN layers.

    Args:
        in_channels (int): number of input channels.
        frame_channels (int): number of feature frame channels.
        r (int): number of outputs per time step (reduction rate).
        memory_size (int): size of the past window. if <= 0 memory_size = r
        attn_type (string): type of attention used in decoder.
        attn_win (bool): if true, define an attention window centered to maximum
            attention response. It provides more robust attention alignment especially
            at interence time.
        attn_norm (string): attention normalization function. 'sigmoid' or 'softmax'.
        prenet_type (string): 'original' or 'bn'.
        prenet_dropout (float): prenet dropout rate.
        forward_attn (bool): if true, use forward attention method. https://arxiv.org/abs/1807.06736
        trans_agent (bool): if true, use transition agent. https://arxiv.org/abs/1807.06736
        forward_attn_mask (bool): if true, mask attention values smaller than a threshold.
        location_attn (bool): if true, use location sensitive attention.
        attn_K (int): number of attention heads for GravesAttention.
        separate_stopnet (bool): if true, detach stopnet input to prevent gradient flow.
        max_decoder_steps (int): Maximum number of steps allowed for the decoder. Defaults to 10000.
    c                    s&  t ƒ  ¡  || _|| _|| _|| _|| _|| _d| _d| _	d| _
d| _d| _d| _d| _| j}t|||| j| jgdd| _tj| j| | j	dd	| _t|| j	|d|d
d|||	|
||d| _tj| j	| | j
dd	| _t| j
| | j| j ƒ| _t t d¡t| j
| j| j  dddd¡| _d | _d S )Nr   i   é   é€   r   F)Úout_featuresr@   T)r@   é    é   )Ú	attn_typeÚ	query_dimÚembedding_dimÚattention_dimÚlocation_attentionÚattention_location_n_filtersÚattention_location_kernel_sizeÚ	windowingÚnormÚforward_attnÚtrans_agentÚforward_attn_maskÚattn_Kr   Úsigmoid)r@   Ú	init_gain)r   r   Úframe_channelsÚr_initÚrÚencoder_embedding_dimÚseparate_stopnetÚmax_decoder_stepsÚstop_thresholdrV   Údecoder_rnn_dimÚ
prenet_dimÚattn_dimÚp_attention_dropoutÚp_decoder_dropoutr   Úprenetr   ÚLSTMCellÚattention_rnnr   Ú	attentionÚdecoder_rnnr   Úlinear_projectionÚ
Sequentialr   ÚstopnetÚmemory_truncated)r   r    rd   rf   rU   Úattn_winÚ	attn_normÚprenet_typeÚprenet_dropoutr^   r_   r`   Úlocation_attnra   rh   ri   rl   r#   r%   r&   r   Ž   sR   
ÿóþ
zDecoder.__init__c                 C   s
   || _ d S r'   )rf   )r   Únew_rr%   r%   r&   Úset_rÓ   s   
zDecoder.set_rc                 C   s.   |  d¡}tjd|jd || j| j ¡}|S ©Nr   r   )Údevice)ÚsizeÚtorchÚzerosr   Úrepeatrd   rf   )r   ÚinputsÚBÚmemoryr%   r%   r&   Úget_go_frameÖ   s   
 zDecoder.get_go_frameFc                 C   s¸   |  d¡}|sMtjd|jd || j¡| _tjd|jd || j¡| _tjd|jd || j¡| _	tjd|jd || j¡| _
tjd|jd || j¡| _|| _| j |¡| _|| _d S r€   )r‚   rƒ   r„   r   r…   rV   ÚqueryÚattention_rnn_cell_staterk   Údecoder_hiddenÚdecoder_cellrg   Úcontextr†   rs   Úpreprocess_inputsÚprocessed_inputsÚmask)r   r†   r‘   Úkeep_statesr‡   r%   r%   r&   Ú_init_statesÛ   s   

zDecoder._init_statesc                 C   s@   |  d¡| jkr| |jd |  d¡| j d¡}| dd¡}|S )z8
        Reshape the spectrograms for given 'r'
        éÿÿÿÿr   r   )r‚   rd   ÚviewÚshaperf   rF   ©r   rˆ   r%   r%   r&   Ú_reshape_memoryè   s    zDecoder._reshape_memoryc                 C   sf   t  |¡ dd¡}t  |¡ dd¡}t  |¡ dd¡ ¡ }| | d¡d| j¡}| dd¡}|||fS )Nr   r   r”   r
   )rƒ   ÚstackrF   Ú
contiguousr•   r‚   rd   )r   ÚoutputsÚstop_tokensÚ
alignmentsr%   r%   r&   Ú_parse_outputsó   s   
zDecoder._parse_outputsc                 C   sT   t |jƒdkr|d d …| j| jd  d …f S |d d …d d …| j| jd  d …f S )Nr
   r   )Úlenr–   rd   rf   r—   r%   r%   r&   Ú_update_memoryû   s    &zDecoder._update_memoryc                 C   s8  t  || jfd¡}|  || j| jf¡\| _| _t | j| j| j	¡| _t | j| j| j	¡| _|  
| j| j| j| j¡| _t  | j| jfd¡}|  || j| jf¡\| _| _t | j| j| j	¡| _t j| j| jfdd}|  |¡}t j| j|fdd}| jr‚|  | ¡ ¡}n|  |¡}|dd…d| j| j …f }|| j
j|fS )zJ
        shapes:
           - memory: B x r * self.frame_channels
        r”   r   ©ÚdimN)rƒ   ÚcatrŽ   rr   rŠ   r‹   ÚFr   rn   Útrainingrs   r†   r   r‘   rt   rŒ   r   ro   ru   rh   rw   Údetachrf   rd   Úattention_weights)r   rˆ   Úquery_inputÚdecoder_rnn_inputÚdecoder_hidden_contextÚdecoder_outputÚstopnet_inputÚ
stop_tokenr%   r%   r&   Údecode   s,   ÿÿÿ

zDecoder.decodec                 C   sú   |   |¡ d¡}|  |¡}tj||fdd}|  |¡}|  |¡}| j||d | j 	|¡ g g g }}}t
|ƒ| d¡d k rn|t
|ƒ }|  |¡\}}	}
|| d¡g7 }||
 d¡g7 }||	g7 }t
|ƒ| d¡d k s@|  |||¡\}}}|||fS )a«  Train Decoder with teacher forcing.
        Args:
            inputs: Encoder outputs.
            memories: Feature frames for teacher-forcing.
            mask: Attention mask for sequence padding.

        Shapes:
            - inputs: (B, T, D_out_enc)
            - memory: (B, T_mel, D_mel)
            - outputs: (B, T_mel, D_mel)
            - alignments: (B, T_in, T_out)
            - stop_tokens: (B, T_out)
        r   r¡   ©r‘   r   )r‰   Ú	unsqueezer˜   rƒ   r£   r    rp   r“   rs   Úinit_statesrŸ   r‚   r®   Úsqueezerž   )r   r†   Úmemoriesr‘   rˆ   r›   rœ   r   r«   r§   r­   r%   r%   r&   r*   '  s"   



û
zDecoder.forwardc           
      C   sþ   |   |¡}|  |¡}| j|dd | j |¡ g g g df\}}}}	 |  |¡}|  |¡\}}}	t |	j	¡}	|| 
d¡g7 }||	g7 }||g7 }|	| jkrV||jd d krVnt|ƒ| jkrftd| j› ƒ n
|  |¡}|d7 }q"|  |||¡\}}}|||fS )aA  Decoder inference without teacher forcing and use
        Stopnet to stop decoder.
        Args:
            inputs: Encoder outputs.

        Shapes:
            - inputs: (B, T, D_out_enc)
            - outputs: (B, T_mel, D_mel)
            - alignments: (B, T_in, T_out)
            - stop_tokens: (B, T_out)
        Nr¯   r   Tr   r
   z.   > Decoder stopped with `max_decoder_steps` )r‰   r    r“   rs   r±   rp   r®   rƒ   rb   Údatar²   rj   r–   rŸ   ri   Úprintrž   )
r   r†   rˆ   r›   rœ   r   Útr«   Ú	alignmentr­   r%   r%   r&   rN   I  s,   





ñ
zDecoder.inferencec           
      C   sö   | j du r|  |¡| _ | j|ddd n| j|ddd | j |¡ g g g df\}}}}	 |  | j ¡}|  |¡\}}}	t |	j	¡}	|| 
d¡g7 }||	g7 }||g7 }|	dkrXnt|ƒ| jkrdtdƒ n|| _ |d7 }q-|  |||¡\}}}|||fS )	zB
        Preserve decoder states for continuous inference
        NF)r‘   r’   Tr   r   gffffffæ?z.   | > Decoder stopped with 'max_decoder_steps)rx   r‰   r“   rs   r±   rp   r®   rƒ   rb   r´   r²   rŸ   ri   rµ   rž   )
r   r†   r›   rœ   r   r¶   rˆ   r«   r·   r­   r%   r%   r&   Úinference_truncatedq  s.   


ñ
zDecoder.inference_truncatedNc                 C   sT   |dkr|   |¡}| j|dd |  |¡}|  |¡\}}}t |j¡}|}|||fS )z$
        For debug purposes
        r   Nr¯   )r‰   r“   rp   r®   rƒ   rb   r´   )r   r†   r¶   rˆ   r«   r­   r·   r%   r%   r&   Úinference_step’  s   


zDecoder.inference_step)Fr'   )r,   r-   r.   r/   r   r   r‰   r“   r˜   rž   r    r®   r*   rN   r¸   r¹   r0   r%   r%   r#   r&   rO   t   s    E
'"(!rO   )rƒ   r   Útorch.nnr   r¤   Ú
attentionsr   Úcommon_layersr   r   ÚModuler	   r1   r<   rO   r%   r%   r%   r&   Ú<module>   s    $+