o
    
jaI                     @   s   d dl Z d dl mZ ddlmZ ddlmZ G dd dejZG dd	 d	ejZG d
d dejZ	G dd dejZ
G dd dejZG dd dejZG dd dejZG dd dejZdS )    N)nn   )	init_attn)Prenetc                       s2   e Zd ZdZd	 fdd	Zdd Zdd Z  ZS )
BatchNormConv1da4  A wrapper for Conv1d with BatchNorm. It sets the activation
    function between Conv and BatchNorm layers. BatchNorm layer
    is initialized with the TF default values for momentum and eps.

    Args:
        in_channels: size of each input sample
        out_channels: size of each output samples
        kernel_size: kernel size of conv filters
        stride: stride of conv filters
        padding: padding of conv filters
        activation: activation function set b/w Conv1d and BatchNorm

    Shapes:
        - input: (B, D)
        - output: (B, D)
    Nc                    sR   t    || _t|d| _tj||||ddd| _tj|ddd| _	|| _
d S )Nr   F)kernel_sizestridepaddingbiasgGz?gMbP?)momentumeps)super__init__r	   r   ConstantPad1dpadderConv1dconv1dBatchNorm1dbn
activation)selfin_channelsout_channelsr   r   r	   r   	__class__ S/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/layers/tacotron/tacotron.pyr      s   

zBatchNormConv1d.__init__c                 C   sh   t | jtjjrd}nt | jtjjrd}n| jd u rd}ntdtjjj| j	j
tjj|d d S )NrelutanhlinearzUnknown activation functiongain)
isinstancer   torchr   ReLUTanhRuntimeErrorinitxavier_uniform_r   weightcalculate_gain)r   w_gainr   r   r   init_layers)   s   
$zBatchNormConv1d.init_layersc                 C   s6   |  |}| |}| |}| jd ur| |}|S N)r   r   r   r   r   xr   r   r   forward4   s   




zBatchNormConv1d.forwardr-   __name__
__module____qualname____doc__r   r,   r0   __classcell__r   r   r   r   r      s
    r   c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )Highwaya  Highway layers as explained in https://arxiv.org/abs/1505.00387

    Args:
        in_features (int): size of each input sample
        out_feature (int): size of each output sample

    Shapes:
        - input: (B, *, H_in)
        - output: (B, *, H_out)
    c                    s\   t    t||| _| jjj  t||| _| jjj	d t
 | _t | _d S N)r   r   r   LinearHr
   datazero_Tfill_r$   r   Sigmoidsigmoid)r   in_featuresout_featurer   r   r   r   J   s   

zHighway.__init__c                 C   sD   t jjj| jjt jjdd t jjj| jjt jjdd d S )Nr   r    rA   )r#   r   r'   r(   r;   r)   r*   r>   r   r   r   r   r,   T   s    $zHighway.init_layersc                 C   s4   |  | |}| | |}|| |d|   S )Ng      ?)r   r;   rA   r>   )r   inputsr;   r>   r   r   r   r0   X   s   zHighway.forwardr1   r   r   r   r   r7   =   s
    
r7   c                       s:   e Zd ZdZddddgdddf fdd	Zdd Z  ZS )	CBHGa  CBHG module: a recurrent neural network composed of:
    - 1-d convolution banks
    - Highway networks + residual connections
    - Bidirectional gated recurrent units

    Args:
        in_features (int): sample size
        K (int): max filter size in conv bank
        projections (list): conv channel sizes for conv projections
        num_highways (int): number of highways layers

    Shapes:
        - input: (B, C, T_in)
        - output: (B, T_in, C*2)
             c              	      s4  t    _ __|_|_t _	t
 fddtd|d D _|  g|d d  }j	gt|d  }	|	d g7 }	g }
t|||	D ]\}}}t||ddddg|d}|
| qOt
|
_j|d kr~tj|d dd_t
fd	dt|D _tj||dd
d
d_d S )Nc              
      s2   g | ]}t  |d |d  d |d gjdqS )r      r   r   r	   r   )r   r   ).0k)conv_bank_featuresrB   r   r   r   
<listcomp>   s    	z!CBHG.__init__.<locals>.<listcomp>r   r9      rK   F)r
   c                    s   g | ]}t   qS r   )r7   rL   _)highway_featuresr   r   rO      s    T)batch_firstbidirectional)r   r   rB   rN   rS   gru_featuresconv_projectionsr   r$   r   
ModuleListrangeconv1d_bankslenzipr   appendconv1d_projectionsr:   pre_highwayhighwaysGRUgru)r   rB   KrN   rW   rS   rV   num_highwaysout_featuresactivations	layer_setin_sizeout_sizeaclayerr   )rN   rS   rB   r   r   r   p   s0   


	
zCBHG.__init__c           	      C   s   |}g }| j D ]}||}|| qtj|dd}|d| jt| j  ks)J | jD ]}||}q,||7 }|dd}| j	| j
d krJ| |}| jD ]}||}qM| j  | |\}}|S )Nr   dimrJ   r9   )rZ   r]   r#   catsizerN   r[   r^   	transposerS   rW   r_   r`   rb   flatten_parameters)	r   rE   r/   outsr   outhighwayoutputsrR   r   r   r   r0      s$   






zCBHG.forwardr2   r3   r4   r5   r   r0   r6   r   r   r   r   rF   ^   s    2rF   c                       (   e Zd ZdZ fddZdd Z  ZS )EncoderCBHGz+CBHG module with Encoder specific argumentsc              	      s*   t    tdddddgdddd| _d S )NrH   rG   rI   rc   rN   rW   rS   rV   rd   r   r   rF   cbhgrD   r   r   r   r         
zEncoderCBHG.__init__c                 C   
   |  |S r-   r{   r.   r   r   r   r0         
zEncoderCBHG.forwardrv   r   r   r   r   rx      s    rx   c                       rw   )EncoderzStack Prenet and CBHG module for encoder
    Args:
        inputs (FloatTensor): embedding features

    Shapes:
        - inputs: (B, T, D_in)
        - outputs: (B, T, 128 * 2)
    c                    s(   t    t|ddgd| _t | _d S )N   rH   re   )r   r   r   prenetrx   r{   r   rB   r   r   r   r      s   
zEncoder.__init__c                 C   s    |  |}| |dd}|S )Nr   rJ   )r   r{   rp   r   rE   ru   r   r   r   r0      s   
zEncoder.forwardrv   r   r   r   r   r      s    	r   c                       s$   e Zd Z fddZdd Z  ZS )PostCBHGc              	      s*   t    t|ddd|gdddd| _d S )N   rH   r   rI   ry   rz   )r   mel_dimr   r   r   r      r|   zPostCBHG.__init__c                 C   r}   r-   r~   r.   r   r   r   r0      r   zPostCBHG.forward)r2   r3   r4   r   r0   r6   r   r   r   r   r      s    r   c                       sb   e Zd ZdZ fddZdd Zdd Zdd	 Zd
d ZdddZ	dd Z
dd Zdd Z  ZS )Decodera  Tacotron decoder.

    Args:
        in_channels (int): number of input channels.
        frame_channels (int): number of feature frame channels.
        r (int): number of outputs per time step (reduction rate).
        memory_size (int): size of the past window. if <= 0 memory_size = r
        attn_type (string): type of attention used in decoder.
        attn_windowing (bool): if true, define an attention window centered to maximum
            attention response. It provides more robust attention alignment especially
            at interence time.
        attn_norm (string): attention normalization function. 'sigmoid' or 'softmax'.
        prenet_type (string): 'original' or 'bn'.
        prenet_dropout (float): prenet dropout rate.
        forward_attn (bool): if true, use forward attention method. https://arxiv.org/abs/1807.06736
        trans_agent (bool): if true, use transition agent. https://arxiv.org/abs/1807.06736
        forward_attn_mask (bool): if true, mask attention values smaller than a threshold.
        location_attn (bool): if true, use location sensitive attention.
        attn_K (int): number of attention heads for GravesAttention.
        separate_stopnet (bool): if true, detach stopnet input to prevent gradient flow.
        d_vector_dim (int): size of speaker embedding vector, for multi-speaker training.
        max_decoder_steps (int): Maximum number of steps allowed for the decoder. Defaults to 500.
    c                    s  t    || _|| _|| _|| _|dk| _|dkr|n|| _|| _|| _	d| _
| jr0|| j n|}t|||	ddgd| _t|d | j
| _t|| j
|d|dd|||
|||d| _td| d| _tdd	 td
D | _td|| j | _td|| j  | _d S )Nr   r   rH   r          )	attn_type	query_dimembedding_dimattention_dimlocation_attentionattention_location_n_filtersattention_location_kernel_size	windowingnormforward_attntrans_agentforward_attn_maskattn_Kc                 S   s   g | ]}t d d qS )r   )r   GRUCellrQ   r   r   r   rO   J  s    z$Decoder.__init__.<locals>.<listcomp>rJ   )r   r   r_initrr   max_decoder_stepsuse_memory_queuememory_sizeframe_channelsseparate_stopnetr   r   r   r   r   attention_rnnr   	attentionr:   project_to_decoder_inrX   rY   decoder_rnnsproj_to_melStopNetstopnet)r   r   r   r   r   r   attn_windowing	attn_normprenet_typeprenet_dropoutr   r   r   location_attnr   r   r   
prenet_dimr   r   r   r     s@   

zDecoder.__init__c                 C   s
   || _ d S r-   )r   )r   new_rr   r   r   set_rP  r   zDecoder.set_rc                 C   s@   | d| jkr||jd | d| j d}|dd}|S )z8
        Reshape the spectrograms for given 'r'
        r9   r   r   )ro   r   viewshaper   rp   )r   memoryr   r   r   _reshape_memoryS  s    zDecoder._reshape_memoryc                    s    d | jrtjdjd | j| j | _ntjdjd | j| _tjdjd d| _	 fddt
t| jD | _j | j | _| j| _dS )z2
        Initialization of decoder states
        r   r   devicer   c                    s$   g | ]}t jd jd dqS )r   r   r   )r#   zerosr   repeat)rL   idxBrE   r   r   rO   j  s    z(Decoder._init_states.<locals>.<listcomp>N)ro   r   r#   r   r   r   r   r   memory_inputattention_rnn_hiddenrY   r[   r   decoder_rnn_hiddensr<   newr   r=   context_vecr   preprocess_inputsprocessed_inputs)r   rE   r   r   r   _init_states^  s   
$zDecoder._init_statesc                 C   sf   t |dd}t |dd}t |dd }||dd| j}|dd}|||fS )Nr   r   r9   rJ   )r#   stackrp   
contiguousr   ro   r   )r   ru   
attentionsstop_tokensr   r   r   _parse_outputsq  s   
zDecoder._parse_outputsNc                 C   s   |  | j}| t|| jfd| j| _| | j|| j|| _| 	t| j| jfd}t
| jD ]\}}||| j| | j|< | j| | }q2|}| |}t||gd}	| jrc| |	 }
n| |	}
|d d d | j| j f }||
| jjfS r8   )r   r   r   r#   rn   r   r   r   r   r   	enumerater   r   r   r   r   detachr   r   attention_weights)r   rE   maskprocessed_memorydecoder_inputr   decoder_rnndecoder_outputoutputstopnet_input
stop_tokenr   r   r   decodez  s"   

zDecoder.decodec                 C   s   | j r8| j| jkr'tj|| jd d d | j| j | j f  gdd| _d S |d d d | j| j f | _d S |d d | j| jd  d f | _d S )Nr9   rl   r   )r   r   r   r#   rn   r   r   clone)r   
new_memoryr   r   r   _update_memory_input  s   *"&zDecoder._update_memory_inputc                 C   s   |  |}g }g }g }d}| | | j| t||dk rX|dkr0||d  }| | | ||\}	}
}||	g7 }||g7 }||
dg7 }|d7 }t||dk s!| 	|||S )a  
        Args:
            inputs: Encoder outputs.
            memory: Decoder memory (autoregression. If None (at eval-time),
              decoder outputs are used as decoder inputs. If None, it uses the last
              output as the input.
            mask: Attention mask for sequence padding.

        Shapes:
            - inputs: (B, T, D_out_enc)
            - memory: (B, T_mel, D_mel)
        r   r   )
r   r   r   init_statesr[   ro   r   r   squeezer   )r   rE   r   r   ru   r   r   tr   r   r   r   r   r   r   r0     s$   





zDecoder.forwardc           
      C   s   g }g }g }d}|  | | j| 	 |dkr!|d }| | | |d\}}}	t|j}||g7 }||	g7 }||g7 }|d7 }||jd d kr]|dks\|	dddf 	 dkr]n|| j
krgtd nq| |||S )	z
        Args:
            inputs: encoder outputs.
        Shapes:
            - inputs: batch x time x encoder_out_dim
        r   Tr9   Nr   rI   g333333?z.   | > Decoder stopped with 'max_decoder_steps)r   r   r   r   r   r#   rA   r<   r   itemr   printr   )
r   rE   ru   r   r   r   r   r   r   r   r   r   r   	inference  s.   




2
zDecoder.inferencer-   )r2   r3   r4   r5   r   r   r   r   r   r   r   r0   r   r6   r   r   r   r   r      s    ;
	!r   c                       rw   )r   zsStopnet signalling decoder to stop inference.
    Args:
        in_features (int): feature dimension of input.
    c                    sH   t    td| _t|d| _tjjj	| jj
tjjdd d S )Ng?r   r   r    )r   r   r   Dropoutdropoutr:   r   r#   r'   r(   r)   r*   r   r   r   r   r     s   
$zStopNet.__init__c                 C   s   |  |}| |}|S r-   )r   r   r   r   r   r   r0     s   

zStopNet.forwardrv   r   r   r   r   r     s    r   )r#   r   r   r   common_layersr   Moduler   r7   rF   rx   r   r   r   r   r   r   r   r   <module>   s   2!a p