o
    
j                     @   s   d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d d	lmZ G d
d dejZdS )    N)nn)GatedConvBlock)ResidualConv1dBNBlock)TimeDepthSeparableConvBlock)DurationPredictor)ResidualConv1dLayerNormBlock)RelativePositionTransformer)sequence_maskc                       s4   e Zd ZdZ				d fdd	Zdd	d
Z  ZS )Encoderu  Glow-TTS encoder module.

    ::

        embedding -> <prenet> -> encoder_module -> <postnet> --> proj_mean
                                                             |
                                                             |-> proj_var
                                                             |
                                                             |-> concat -> duration_predictor
                                                                    ↑
                                                              speaker_embed

    Args:
        num_chars (int): number of characters.
        out_channels (int): number of output channels.
        hidden_channels (int): encoder's embedding size.
        hidden_channels_ffn (int): transformer's feed-forward channels.
        kernel_size (int): kernel size for conv layers and duration predictor.
        dropout_p (float): dropout rate for any dropout layer.
        mean_only (bool): if True, output only mean values and use constant std.
        use_prenet (bool): if True, use pre-convolutional layers before transformer layers.
        c_in_channels (int): number of channels in conditional input.

    Shapes:
        - input: (B, T, C)

    ::

        suggested encoder params...

        for encoder_type == 'rel_pos_transformer'
            encoder_params={
                'kernel_size':3,
                'dropout_p': 0.1,
                'num_layers': 6,
                'num_heads': 2,
                'hidden_channels_ffn': 768,  # 4 times the hidden_channels
                'input_length': None
            }

        for encoder_type == 'gated_conv'
            encoder_params={
                'kernel_size':5,
                'dropout_p': 0.1,
                'num_layers': 9,
            }

        for encoder_type == 'residual_conv_bn'
            encoder_params={
                "kernel_size": 4,
                "dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1],
                "num_conv_blocks": 2,
                "num_res_blocks": 13
            }

         for encoder_type == 'time_depth_separable'
            encoder_params={
                "kernel_size": 5,
                'num_layers': 9,
            }
    皙?FTr   c                    s  t    || _|| _|| _|| _|| _|| _|	| _|
| _	|| _
t||| _tj| jjd|d  | dkrR|	rFt|||dddd| _t|||fi || _ng| dkrbt|fi || _nW| d	kr|	rxtt||d
t | _t|||fi || _tt| j| jd
t| j| _n#| dkr|	rt|||dddd| _t|||fi || _ntdt||d
| _|st||d
| _ t!||
 |d|| _"d S )Ng        g      rel_pos_transformer      g      ?)kernel_size
num_layers	dropout_p
gated_convresidual_conv_bn   time_depth_separablez [!] Unkown encoder type.)#super__init__	num_charsout_channelshidden_channelshidden_channels_dpdropout_p_dp	mean_only
use_prenetc_in_channelsencoder_typer   	Embeddingembinitnormal_weightlowerr   prenetr   encoderr   
SequentialConv1dReLUr   BatchNorm1dpostnetr   
ValueErrorproj_mproj_sr   duration_predictor)selfr   r   r   r   r    encoder_paramsr   r   r   r   	__class__ R/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/layers/glow_tts/encoder.pyr   N   s`   



zEncoder.__init__Nc           
      C   s  |  |t| j }t|dd}tt||dd	|j
}t| dr0| jr0| ||}| ||}t| drB| || }|dur[|dd|d}t| |gd}n| }| || }| jsq| || }nt|}| ||}	|||	|fS )z
        Shapes:
            - x: :math:`[B, C, T]`
            - x_lengths: :math:`[B]`
            - g (optional): :math:`[B, 1, T]`
        r      r'   r-   N)r"   mathsqrtr   torch	transpose	unsqueezer	   sizetodtypehasattrr   r'   r(   r-   expandcatdetachr/   r   r0   
zeros_liker1   )
r2   x	x_lengthsgx_maskg_expx_dpx_mx_logslogwr6   r6   r7   forward   s$   	 

zEncoder.forward)r   FTr   )N)__name__
__module____qualname____doc__r   rP   __classcell__r6   r6   r4   r7   r
      s    FAr
   )r:   r<   r   !TTS.tts.layers.generic.gated_convr   "TTS.tts.layers.generic.res_conv_bnr   *TTS.tts.layers.generic.time_depth_sep_convr   *TTS.tts.layers.glow_tts.duration_predictorr   TTS.tts.layers.glow_tts.glowr   #TTS.tts.layers.glow_tts.transformerr   TTS.tts.utils.helpersr	   Moduler
   r6   r6   r6   r7   <module>   s    