o
    
j#                     @   s   d dl mZmZmZ d dlZd dlmZ d dlm  mZ	 d dl
mZ d dlmZ d dlmZ dejdejfdd	ZddejdedejfddZG dd dejZG dd dejZG dd dejZdS )    )ListTupleUnionN)!ConformerMultiHeadedSelfAttention)CoordConv1d)STLlengthsreturnc                 C   sR   | j d }t|  }tjd|| jdd|d}|| dd|k}|S )Nr   )device   )shapetorchmaxitemaranger
   	unsqueezeexpand)r   
batch_sizemax_lenidsmask r   Y/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/layers/delightful_tts/encoders.pyget_mask_from_lengths   s
   
 r      lensstridec                 C   s   t | |  S )N)r   ceilint)r   r   r   r   r   stride_lens   s   r    c                       s   e Zd ZdZdedeeeeeeeef  dedeeeeeeef  def
 fddZd	ej	d
ej	de
ej	ej	ej	f fddZdedededededefddZ  ZS )ReferenceEncodera  
    Referance encoder for utterance and phoneme prosody encoders. Reference encoder
    made up of convolution and RNN layers.

    Args:
        num_mels (int): Number of mel frames to produce.
        ref_enc_filters (list[int]): List of channel sizes for encoder layers.
        ref_enc_size (int): Size of the kernel for the conv layers.
        ref_enc_strides (List[int]): List of strides to use for conv layers.
        ref_enc_gru_size (int): Number of hidden features for the gated recurrent unit.

    Inputs: inputs, mask
        - **inputs** (batch, dim, time): Tensor containing mel vector
        - **lengths** (batch): Tensor containing the mel lengths.
    Returns:
        - **outputs** (batch, time, dim): Tensor produced by Reference Encoder.
    num_melsref_enc_filtersref_enc_sizeref_enc_stridesref_enc_gru_sizec           
         s   t    |}|| _t}| jg  dg| t d  d d d ddg} fddtd|D }	||	 t|| _	tfddt|D | _
tjd	 |dd
| _d S )Nr   r   r   T)in_channelsout_channelskernel_sizer   paddingwith_rc              	      s4   g | ]}t j |  |d   | d dqS )r   r   )r'   r(   r)   r   r*   )nnConv1d.0i)filtersr$   stridesr   r   
<listcomp>E   s    
z-ReferenceEncoder.__init__.<locals>.<listcomp>c                    s   g | ]}t j | d dqS )T)num_featuresaffine)r,   InstanceNorm1dr.   )r#   r   r   r3   R   s    r   )
input_sizehidden_sizebatch_first)super__init__n_mel_channelslenr   rangeextendr,   
ModuleListconvsnormsGRUgru)
selfr"   r#   r$   r%   r&   r<   KrA   convs2	__class__)r1   r#   r$   r2   r   r;   +   s4   




zReferenceEncoder.__init__xmel_lensr	   c                 C   s   t |d}||d}t| j| jD ]\}}||}t|d}||}qtdD ]}t	|}q+t |}||dd}|
d}tjjjj||  ddd}| j  | |\}}tjjjj|dd	\}}|||fS )
zR
        inputs --- [N,  n_mels, timesteps]
        outputs --- [N, E//2]
        r   r   g333333?r   )r   r   r   TF)r9   enforce_sorted)r9   )r   r   masked_fillziprA   rB   F
leaky_relur>   r    permuter   r,   utilsrnnpack_padded_sequencecpur   rD   flatten_parameterspad_packed_sequence)rE   rJ   rK   	mel_masksconvnorm_memoryr   r   r   forwardZ   s    


 

zReferenceEncoder.forwardLr)   r   padn_convsc                 C   s*   t |D ]}|| d|  | d }q|S )Nr   r   )r>   )rE   r^   r)   r   r_   r`   r[   r   r   r   calculate_channelsv   s   z#ReferenceEncoder.calculate_channels)__name__
__module____qualname____doc__r   r   r   r;   r   Tensorr   r]   ra   __classcell__r   r   rH   r   r!      s6    */r!   c                       s   e Zd Zdedeeeeeeeef  dedeeeeeeef  dedededed	ef fd
dZdej	dej	dej	fddZ
  ZS )UtteranceLevelProsodyEncoderr"   r#   r$   r%   r&   dropoutn_hiddenbottleneck_size_u	token_numc
                    sv   t    || _| | _| _|}
t|||||d| _t|| jd | _	t
||	d| _t| j|
| _t|| _dS )a*  
        Encoder to extract prosody from utterance. it is made up of a reference encoder
        with a couple of linear layers and style token layer with dropout.

        Args:
            num_mels (int): Number of mel frames to produce.
            ref_enc_filters (list[int]): List of channel sizes for ref encoder layers.
            ref_enc_size (int): Size of the kernel for the ref encoder conv layers.
            ref_enc_strides (List[int]): List of strides to use for teh ref encoder conv layers.
            ref_enc_gru_size (int): Number of hidden features for the gated recurrent unit.
            dropout (float): Probability of dropout.
            n_hidden (int): Size of hidden layers.
            bottleneck_size_u (int): Size of the bottle neck layer.

        Inputs: inputs, mask
            - **inputs** (batch, dim, time): Tensor containing mel vector
            - **lengths** (batch): Tensor containing the mel lengths.
        Returns:
            - **outputs** (batch, 1, dim): Tensor produced by Utterance Level Prosody Encoder.
        r#   r&   r$   r%   r"   r   )rj   rl   N)r:   r;   Ed_qd_kr!   encoderr,   Linearencoder_prjr   stlencoder_bottleneckDropoutri   )rE   r"   r#   r$   r%   r&   ri   rj   rk   rl   bottleneck_sizerH   r   r   r;      s   
 z%UtteranceLevelProsodyEncoder.__init__melsrK   r	   c                 C   sP   |  ||\}}}| |}| | |}| |}|dd|jd f}|S )z
        Shapes:
            mels: :math: `[B, C, T]`
            mel_lens: :math: `[B]`

        out --- [N, seq_len, E]
        r   r      )rq   rs   ru   rt   ri   viewr   )rE   rx   rK   r[   embedded_prosodyoutr   r   r   r]      s   

z$UtteranceLevelProsodyEncoder.forwardrb   rc   rd   r   r   r   floatr;   r   rf   r]   rg   r   r   rH   r   rh   ~   s*    	
$2rh   c                       s   e Zd Zdedeeeeeeeef  dedeeeeeeef  dedededed	ef fd
dZdej	dej	dej	dej	dej	dej	fddZ
  ZS )PhonemeLevelProsodyEncoderr"   r#   r$   r%   r&   ri   rj   n_headsbottleneck_size_pc
                    sd   t    || _| | _| _|	}
t|||||d| _t||| _	t
|||d| _t||
| _d S )Nrm   )d_model	num_heads	dropout_p)r:   r;   rn   ro   rp   r!   rq   r,   rr   rs   r   	attentionru   )rE   r"   r#   r$   r%   r&   ri   rj   r   r   rw   rH   r   r   r;      s$   
z#PhonemeLevelProsodyEncoder.__init__rJ   src_maskrx   rK   encodingr	   c           
      C   sl   |  ||\}}}| |}||jd dddf}	| j||||	|d\}}| |}||dd}|S )z
        x --- [N, seq_len, encoder_embedding_dim]
        mels --- [N, Ty/r, n_mels*r], r=1
        out --- [N, seq_len, bottleneck_size]
        attn --- [N, seq_len, ref_len], Ty/r = ref_len
        r   r   r   )querykeyvaluer   r   g        )rq   rs   rz   r   r   ru   rM   r   )
rE   rJ   r   rx   rK   r   r{   r[   rX   	attn_maskr   r   r   r]      s   


z"PhonemeLevelProsodyEncoder.forwardr}   r   r   rH   r   r      sB    	
!r   )r   )typingr   r   r   r   torch.nnr,   torch.nn.functional
functionalrO   'TTS.tts.layers.delightful_tts.conformerr   )TTS.tts.layers.delightful_tts.conv_layersr   &TTS.tts.layers.delightful_tts.networksr   rf   r   r   r    Moduler!   rh   r   r   r   r   r   <module>   s    fH