o
    
j-                     @   s   d dl mZmZ d dlZd dlm  mZ d dlmZ d dlm	Z	 d dl
mZ d dlmZ G dd dejZG d	d
 d
ejZG dd dejZG dd dZdS )    )ListTupleN)nn)tqdmLinear)ConvBNBlockc                       sP   e Zd ZdZd fdd	Zdejdejdeejejf fd	d
Z	dd Z
  ZS )Encoderan  Neural HMM Encoder

    Same as Tacotron 2 encoder but increases the input length by states per phone

    Args:
        num_chars (int): Number of characters in the input.
        state_per_phone (int): Number of states per phone.
        in_out_channels (int): number of input and output channels.
        n_convolutions (int): number of convolutional layers.
          c                    s   t    || _|| _t||| _t | _t	|D ]}| j
t||dd qtj|t|d | ddddd| _d | _d S )N   relu      T)
num_layersbatch_firstbiasbidirectional)super__init__state_per_phonein_out_channelsr   	Embeddingemb
ModuleListconvolutionsrangeappendr   LSTMintlstm	rnn_state)self	num_charsr   r   n_convolutions_	__class__ X/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/layers/overflow/common_layers.pyr      s    


zEncoder.__init__xx_lenreturnc                 C   s   |j \}}| |dd}| jD ]}||}q|dd}tjjj|| dd}| j	
  | 	|\}}tjjj|dd\}}|||| j | j}|| j }||fS )a  Forward pass to the encoder.

        Args:
            x (torch.FloatTensor): input text indices.
                - shape: :math:`(b, T_{in})`
            x_len (torch.LongTensor): input text lengths.
                - shape: :math:`(b,)`

        Returns:
            Tuple[torch.FloatTensor, torch.LongTensor]: encoder outputs and output lengths.
                -shape: :math:`((b, T_{in} * states_per_phone, in_out_channels), (b,))`
        r   r   Tr   )shaper   	transposer   r   utilsrnnpack_padded_sequencecpur    flatten_parameterspad_packed_sequencereshaper   r   r"   r*   r+   bTolayerr%   r(   r(   r)   forward,   s   




zEncoder.forwardc                 C   sr   |j \}}| |dd}| jD ]}||}q|dd}| |\}}|||| j | j}|| j }||fS )a  Inference to the encoder.

        Args:
            x (torch.FloatTensor): input text indices.
                - shape: :math:`(b, T_{in})`
            x_len (torch.LongTensor): input text lengths.
                - shape: :math:`(b,)`

        Returns:
            Tuple[torch.FloatTensor, torch.LongTensor]: encoder outputs and output lengths.
                -shape: :math:`((b, T_{in} * states_per_phone, in_out_channels), (b,))`
        r   r   )r.   r   r/   r   r    r6   r   r   r7   r(   r(   r)   	inferenceF   s   



zEncoder.inference)r
   r   )__name__
__module____qualname____doc__r   torchFloatTensor
LongTensorr   r<   r=   __classcell__r(   r(   r&   r)   r	      s
    &r	   c                
       sJ   e Zd ZdZdee dedededef
 fddZd	d
 Zdd Z	  Z
S )ParameterModela  Main neural network of the outputnet

    Note: Do not put dropout layers here, the model will not converge.

    Args:
            outputnet_size (List[int]): the architecture of the parameter model
            input_size (int): size of input for the first layer
            output_size (int): size of output i.e size of the feature dim
            frame_channels (int): feature dim to set the flat start bias
            flat_start_params (dict): flat start parameters to set the bias
    outputnet_size
input_sizeoutput_sizeframe_channelsflat_start_paramsc                    sj   t    || _tdd t|g|d d  |D | _t|d || _| 	|d |d |d  d S )Nc                 S   s   g | ]	\}}t ||qS r(   r   ).0inpoutr(   r(   r)   
<listcomp>x   s    z+ParameterModel.__init__.<locals>.<listcomp>meanstdtransition_p)
r   r   rJ   r   r   ziplayersr   
last_layerflat_start_output_layer)r"   rG   rH   rI   rJ   rK   r&   r(   r)   r   l   s   
 zParameterModel.__init__c                 C   sd   | j jj  || j jjd| j< t|| j jj| jd| j < t|| j jjd| j d < d S )Nr   r   )	rV   weightdatazero_r   rJ   OverflowUtilsinverse_softplusinverse_sigmod)r"   rQ   rR   rS   r(   r(   r)   rW      s    "z&ParameterModel.flat_start_output_layerc                 C   s(   | j D ]	}t||}q| |}|S )N)rU   Fr   rV   )r"   r*   r;   r(   r(   r)   r<      s   

zParameterModel.forward)r>   r?   r@   rA   r   r   dictr   rW   r<   rE   r(   r(   r&   r)   rF   _   s    rF   c                       sR   e Zd ZdZ	ddedededee dedef fd	d
Zdd Z	dd Z
  ZS )	Outputnetz
    This network takes current state and previous observed values as input
    and returns its parameters, mean, standard deviation and probability
    of transition to the next state
    {Gz?encoder_dimmemory_rnn_dimrJ   rG   rK   	std_floorc           	         sH   t    || _|| _|| _|| }d| d }t|||||d| _d S )Nr   r   )rG   rH   rI   rK   rJ   )r   r   rJ   rK   rd   rF   parametermodel)	r"   rb   rc   rJ   rG   rK   rd   rH   rI   r&   r(   r)   r      s   
	zOutputnet.__init__c           	      C   s   |j d |j d }}|j d }|d|||}tj||fdd}| |}|ddddd| jf |dddd| jd| j f |ddddd| j df d}}}t	|}| 
|}|||fS )a  Inputs observation and returns the means, stds and transition probability for the current state

        Args:
            ar_mel_inputs (torch.FloatTensor): shape (batch, prenet_dim)
            states (torch.FloatTensor):  (batch, hidden_states, hidden_state_dim)

        Returns:
            means: means for the emission observation for each feature
                - shape: (B, hidden_states, feature_size)
            stds: standard deviations for the emission observation for each feature
                - shape: (batch, hidden_states, feature_size)
            transition_vectors: transition vector for the current hidden state
                - shape: (batch, hidden_states)
        r   r   r   dimN)r.   	unsqueezeexpandrB   catre   rJ   squeezer^   softplus
_floor_std)	r"   ar_melsinputs
batch_size
prenet_dimNrQ   rR   transition_vectorr(   r(   r)   r<      s   

 $



zOutputnet.forwardc                 C   s6   |   }tj|| jd}t||krtd |S )a@  
        It clamps the standard deviation to not to go below some level
        This removes the problem when the model tries to cheat for higher likelihoods by converting
        one of the gaussians to a point mass.

        Args:
            std (float Tensor): tensor containing the standard deviation to be
        minzg[*] Standard deviation was floored! The model is preventing overfitting, nothing serious to worry about)clonedetachrB   clamprd   anyprint)r"   rR   original_tensorr(   r(   r)   rm      s   	zOutputnet._floor_std)ra   )r>   r?   r@   rA   r   r   r_   floatr   r<   rm   rE   r(   r(   r&   r)   r`      s$    r`   c                   @   s~   e Zd ZedejjjdedefddZ	ee
 dd Zedd	d
Zedd Zedd Zedd Zedd ZdS )r[   data_loaderout_channelsstates_per_phonec                 C   s   d}d}d}d}t | ddD ].}|d }|d }	|d }
|t|7 }|t|
7 }|t|	7 }|tt|	d7 }q|||  }t|||  t|d }|t| j }|t| j }|| }d| }|||| fS )	a  Generates data parameters for flat starting the HMM.

        Args:
            data_loader (torch.utils.data.Dataloader): _description_
            out_channels (int): mel spectrogram channels
            states_per_phone (_type_): HMM states per phone
        r   F)leavetoken_id_lengthsmelmel_lengthsr   r   )r   rB   sumpowsqrtlendataset)r}   r~   r   total_state_lentotal_mel_lentotal_mel_sumtotal_mel_sq_sumbatchtext_lengthsmelsr   	data_meandata_stdaverage_num_statesaverage_mel_lenaverage_duration_each_stateinit_transition_probr(   r(   r)   "get_data_parameters_for_flat_start   s&   z0OverflowUtils.get_data_parameters_for_flat_startc                 C   s   | j jjdd| d S )Ng              ?)
neural_hmm
output_netre   rW   )modelrS   r(   r(   r)   update_flat_start_transition  s   z*OverflowUtils.update_flat_start_transition-C6?c                 C   s   t j| |d}t |S )z
        Avoids the log(0) problem

        Args:
            x (torch.tensor): input tensor
            eps (float, optional): lower bound. Defaults to 1e-04.

        Returns:
            torch.tensor: :math:`log(x)`
        rt   )rB   rx   log)r*   eps	clamped_xr(   r(   r)   log_clamped  s   
zOverflowUtils.log_clampedc                 C   s&   t | s
t | } t| d|   S )z1
        Inverse of the sigmoid function
        r   )rB   	is_tensortensorr[   r   r*   r(   r(   r)   r]     s   

zOverflowUtils.inverse_sigmodc                 C   s(   t | s
t | } tt | d S )z2
        Inverse of the softplus function
        r   )rB   r   r   r[   r   expr   r(   r(   r)   r\   #  s   

zOverflowUtils.inverse_softplusc                 C   sd   | j |d\}}|td k}| ||dj|d  j|d}||d ||td  S )a  
        Differentiable LogSumExp: Does not creates nan gradients
            when all the inputs are -inf yeilds 0 gradients.
        Args:
            x : torch.Tensor -  The input tensor
            dim: int - The dimension on which the log sum exp has to be applied
        rf   infr   r   )maxr|   masked_fill_rh   r   r   r   )r*   rg   mr%   masksr(   r(   r)   	logsumexp,  s   
$"zOverflowUtils.logsumexpc                    sD   dd dd | D D }t |  fdd| D }tjjj|ddS )z:
        Pads the list of tensors in 2 dimensions
        c                 S   s   g | ]}t |qS r(   )r   )rL   ar(   r(   r)   rO   @      z,OverflowUtils.double_pad.<locals>.<listcomp>c                 S   s   g | ]}|d  qS r   r(   )rL   ir(   r(   r)   rO   @  r   c              
      s(   g | ]}t |d  t|d   fqS r   )r^   padr   )rL   r*   second_dim_maxr(   r)   rO   B  s   ( Tr-   )r   r   r0   r1   pad_sequence)list_of_different_shape_tensorssecond_dim_lenspadded_xr(   r   r)   
double_pad;  s   zOverflowUtils.double_padN)r   )r>   r?   r@   staticmethodrB   r0   rY   
DataLoaderr   r   no_gradr   r   r]   r\   r   r   r(   r(   r(   r)   r[      s,    &


r[   )typingr   r   rB   torch.nn.functionalr   
functionalr^   	tqdm.autor   %TTS.tts.layers.tacotron.common_layersr   !TTS.tts.layers.tacotron.tacotron2r   Moduler	   rF   r`   r[   r(   r(   r(   r)   <module>   s    S-R