o
    
ji                      @   s   d dl Z d dl mZ d dlmZmZmZ d dlmZ d dlm	Z	 d dl
mZ G dd dejZG d	d
 d
ejZG dd dejZG dd dejZG dd dejZdS )    N)nn)Conv1dBNConv1dBNBlockResidualConv1dBNBlock)FFTransformerBlock)WNBlocks)RelativePositionTransformerc                       *   e Zd ZdZ fddZdddZ  ZS )WaveNetDecodera  WaveNet based decoder with a prenet and a postnet.

    prenet: conv1d_1x1
    postnet: 3 x [conv1d_1x1 -> relu] -> conv1d_1x1

    TODO: Integrate speaker conditioning vector.

    Note:
        default wavenet parameters;
            params = {
                "num_blocks": 12,
                "hidden_channels":192,
                "kernel_size": 5,
                "dilation_rate": 1,
                "num_layers": 4,
                "dropout_p": 0.05
            }

    Args:
        in_channels (int): number of input channels.
        out_channels (int): number of output channels.
        hidden_channels (int): number of hidden channels for prenet and postnet.
        params (dict): dictionary for residual convolutional blocks.
    c                    s   t    tj||d d| _t|d fd|i|| _tj|d |dtj tj||dtj tj||dtj tj||dg| _	tj
| j	 | _	d S )Nhidden_channels   c_in_channels)super__init__torchr   Conv1dprenetr   wnReLUpostnet
Sequential)selfin_channelsout_channelsr   r   params	__class__ V/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/layers/feed_forward/decoder.pyr   $   s   
	zWaveNetDecoder.__init__Nc                 C   s.   |  || }| |||}| || }|S N)r   r   r   r   xx_maskgor   r   r   forward6   s   zWaveNetDecoder.forwardNN__name__
__module____qualname____doc__r   r%   __classcell__r   r   r   r   r
   
   s    r
   c                       r	   )"RelativePositionTransformerDecodera  Decoder with Relative Positional Transformer.

    Note:
        Default params
            params={
                'hidden_channels_ffn': 128,
                'num_heads': 2,
                "kernel_size": 3,
                "dropout_p": 0.1,
                "num_layers": 8,
                "rel_attn_window_size": 4,
                "input_length": None
            }

    Args:
        in_channels (int): number of input channels.
        out_channels (int): number of output channels.
        hidden_channels (int): number of hidden channels including Transformer layers.
        params (dict): dictionary for residual convolutional blocks.
    c                    s4   t    t||dd| _t|||fi || _d S Nr   )r   r   r   r   r   rel_pos_transformerr   r   r   r   r   r   r   r   r   S   s   
z+RelativePositionTransformerDecoder.__init__Nc                 C   s   |  || }| ||}|S r   )r   r/   r    r   r   r   r%   X   s   z*RelativePositionTransformerDecoder.forwardr&   r'   r   r   r   r   r-   =   s    r-   c                       r	   )FFTransformerDecodera  Decoder with FeedForwardTransformer.

    Default params
            params={
                'hidden_channels_ffn': 1024,
                'num_heads': 2,
                "dropout_p": 0.1,
                "num_layers": 6,
            }

    Args:
        in_channels (int): number of input channels.
        out_channels (int): number of output channels.
        hidden_channels (int): number of hidden channels including Transformer layers.
        params (dict): dictionary for residual convolutional blocks.
    c                    s0   t    t|fi || _t||d| _d S r.   )r   r   r   transformer_blockr   r   r   )r   r   r   r   r   r   r   r   p   s   
zFFTransformerDecoder.__init__Nc                 C   s0   |d u rdn|}|  || }| || }|S r.   )r2   r   r    r   r   r   r%   u   s   zFFTransformerDecoder.forwardr&   r'   r   r   r   r   r1   ^   s    r1   c                       r	   )ResidualConv1dBNDecodera  Residual Convolutional Decoder as in the original Speedy Speech paper

    TODO: Integrate speaker conditioning vector.

    Note:
        Default params
                params = {
                    "kernel_size": 4,
                    "dilations": 4 * [1, 2, 4, 8] + [1],
                    "num_conv_blocks": 2,
                    "num_res_blocks": 17
                }

    Args:
        in_channels (int): number of input channels.
        out_channels (int): number of output channels.
        hidden_channels (int): number of hidden channels including ResidualConv1dBNBlock layers.
        params (dict): dictionary for residual convolutional blocks.
    c              
      s`   t    t|||fi || _t||d| _tt||||d dddt||d| _	d S )Nr   kernel_size   )num_conv_blocks)
r   r   r   res_conv_blockr   r   	post_convr   r   r   r0   r   r   r   r      s   

z ResidualConv1dBNDecoder.__init__Nc                 C   s(   |  ||}| || }| || S r   )r7   r8   r   r    r   r   r   r%      s   zResidualConv1dBNDecoder.forwardr&   r'   r   r   r   r   r3   }   s    r3   c                       sJ   e Zd ZdZdddg d dg dddd	f fd
d	ZdddZ  ZS )Decodera$  Decodes the expanded phoneme encoding into spectrograms
    Args:
        out_channels (int): number of output channels.
        in_hidden_channels (int): input and hidden channels. Model keeps the input channels for the intermediate layers.
        decoder_type (str): decoder layer types. 'transformers' or 'residual_conv_bn'. Default 'residual_conv_bn'.
        decoder_params (dict): model parameters for specified decoder type.
        c_in_channels (int): number of channels for conditional input.

    Shapes:
        - input: (B, C, T)
    residual_conv_bn   )r   r5   r;      r   r5      )r4   	dilationsr6   num_res_blocksr   c                    s   t    | dkrt||||d| _d S | dkr't||||d| _d S | dkr9t|||||d| _d S | dkrHt|||| _d S td| )Nrelative_position_transformer)r   r   r   r   r:   wavenet)r   r   r   r   r   fftransformerz[!] Unknown decoder type - )	r   r   lowerr-   decoderr3   r
   r1   
ValueError)r   r   in_hidden_channelsdecoder_typedecoder_paramsr   r   r   r   r      s4   
zDecoder.__init__Nc                 C   s   |  |||}|S )zi
        Args:
            x: [B, C, T]
            x_mask: [B, 1, T]
            g: [B, C_g, 1]
        )rD   r    r   r   r   r%      s   zDecoder.forwardr   r'   r   r   r   r   r9      s    *r9   )r   r   "TTS.tts.layers.generic.res_conv_bnr   r   r   "TTS.tts.layers.generic.transformerr   TTS.tts.layers.generic.wavenetr   #TTS.tts.layers.glow_tts.transformerr   Moduler
   r-   r1   r3   r9   r   r   r   r   <module>   s    3!&