o
    
j]                     @   s\  d dl mZ d dlZd dlmZ d dlm  mZ d dlm	Z	 d dl
mZ dedeeef fddZG d	d
 d
ejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejjjZG dd  d ejjjZG d!d" d"ejjZdS )#    )TupleN)parametrize)KernelPredictorkernel_sizereturnc                 C   s   | d }||| d d  fS )N       )r   padr	   r	   \/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/layers/delightful_tts/conv_layers.pycalc_same_padding   s   r   c                       s:   e Zd ZdZ							d fdd	Zdd	d
Z  ZS )ConvNorma  A 1-dimensional convolutional layer with optional weight normalization.

    This layer wraps a 1D convolutional layer from PyTorch and applies
    optional weight normalization. The layer can be used in a similar way to
    the convolutional layers in PyTorch's `torch.nn` module.

    Args:
        in_channels (int): The number of channels in the input signal.
        out_channels (int): The number of channels in the output signal.
        kernel_size (int, optional): The size of the convolving kernel.
            Defaults to 1.
        stride (int, optional): The stride of the convolution. Defaults to 1.
        padding (int, optional): Zero-padding added to both sides of the input.
            If `None`, the padding will be calculated so that the output has
            the same length as the input. Defaults to `None`.
        dilation (int, optional): Spacing between kernel elements. Defaults to 1.
        bias (bool, optional): If `True`, add bias after convolution. Defaults to `True`.
        w_init_gain (str, optional): The weight initialization function to use.
            Can be either 'linear' or 'relu'. Defaults to 'linear'.
        use_weight_norm (bool, optional): If `True`, apply weight normalization
            to the convolutional weights. Defaults to `False`.

    Shapes:
     - Input: :math:`[N, D, T]`

    - Output: :math:`[N, out_dim, T]` where `out_dim` is the number of output dimensions.

    r   NTlinearFc
              	      s   t t|   |d u r|d dksJ t||d  d }|| _|| _|	| _tj}
|
|||||||d| _	tj
j| j	jtj
|d | jrQtjj| j	| _	d S d S )Nr   r   )r   stridepaddingdilationbias)gain)superr   __init__intr   r   use_weight_normnnConv1dconvinitxavier_uniform_weightcalculate_gainutilsparametrizationsweight_norm)selfin_channelsout_channelsr   r   r   r   r   w_init_gainr   conv_fn	__class__r	   r   r   .   s*   	zConvNorm.__init__c                 C   s   |  |}|d ur|| }|S Nr   )r"   signalmaskconv_signalr	   r	   r   forwardO   s   
zConvNorm.forward)r   r   Nr   Tr   Fr)   )__name__
__module____qualname____doc__r   r.   __classcell__r	   r	   r'   r   r      s    !!r   c                       sB   e Zd Z						d fdd	Zd	d
 Zdd Zdd Z  ZS )ConvLSTMLinearr         皙?bilstmTc	              
      s.  t t|   || _|| _|| _tj|d| _g }	t	|D ](}
t
|
dkr&|n|||dt|d d ddd}tjjj|jdd}|	| qt|	| _| jsQ|}| jd	krd
}|}| jdkrgd}t|d }tj||dd|d| _tjj}|| jd| _| jdkr|| jd| _| jrt||| _d S d S )N)pr   r   r   relu)r   r   r   r   r%   r   )name Fr8   T)batch_firstbidirectionalweight_hh_l0weight_hh_l0_reverse)r   r4   r   out_dim	lstm_type
use_linearr   Dropoutdropoutranger   r   r   r    r!   r   append
ModuleListconvolutionsLSTMr8   spectral_normLineardense)r"   in_dimrA   n_layers
n_channelsr   	p_dropoutrB   rC   rI   i
conv_layer
use_bilstmlstm_channelslstm_norm_fn_pntrr'   r	   r   r   Y   sF   	


zConvLSTMLinear.__init__c                 C   s   g }t | d D ]0}|||d d d d || f  }| jD ]}| t||}q"||d dd q
t	j
jj|dd}|S )Nr   r   Tr=   )rF   sizeclonerI   rE   Fr:   rG   	transposer   r   rnnpad_sequence)r"   contextlenscontext_embeddedb_indcurr_contextr   r	   r	   r   run_padded_sequence   s   &
z"ConvLSTMLinear.run_padded_sequencec                 C   s   t j|dd\}}dg|d }tt|D ]}|||| < q|  }|| }tjj	j
||dd}||d }tjj	j|ddd }|| }|S )NT)
descendingr   rW   )torchsortrX   rF   lenlongcpur   r   r\   pack_padded_sequencepad_packed_sequence)r"   fnr^   r_   lens_sorted
ids_sorted
unsort_idsrR   r	   r	   r   run_unsorted_inputs   s   z"ConvLSTMLinear.run_unsorted_inputsc                 C   s   |  d dkr| ||}|dd}n| jD ]}| t||}q| jdkrO|dd}| j	  |d urB| 
| j||}n| |d }|dd}|}| jra| |dddd}|S )Nr   r   r   r<   )rX   rc   r[   rI   rE   rZ   r:   rB   r8   flatten_parametersrp   rC   rM   )r"   r^   r_   r   x_hatr	   r	   r   r.      s    


zConvLSTMLinear.forward)r   r5   r6   r7   r8   T)r/   r0   r1   r   rc   rp   r.   r3   r	   r	   r'   r   r4   X   s    4
r4   c                       sD   e Zd Zdedededef fddZdejdejfd	d
Z  ZS )DepthWiseConv1dr#   r$   r   r   c                    s$   t    tj|||||d| _d S )N)r   groupsr   r   r   r   r   r"   r#   r$   r   r   r'   r	   r   r      s   
zDepthWiseConv1d.__init__xr   c                 C   
   |  |S r)   r*   r"   rw   r	   r	   r   r.         
zDepthWiseConv1d.forward)	r/   r0   r1   r   r   re   Tensorr.   r3   r	   r	   r'   r   rs      s    rs   c                       sP   e Zd Z			ddededededef
 fd	d
ZdejdejfddZ  Z	S )PointwiseConv1dr   r   Tr#   r$   r   r   r   c                    s&   t    tj||d|||d| _d S )Nr   )r#   r$   r   r   r   r   ru   )r"   r#   r$   r   r   r   r'   r	   r   r      s   
zPointwiseConv1d.__init__rw   r   c                 C   rx   r)   r*   ry   r	   r	   r   r.      rz   zPointwiseConv1d.forward)r   r   T
r/   r0   r1   r   boolr   re   r{   r.   r3   r	   r	   r'   r   r|      s     r|   c                       H   e Zd ZdZdedededef fddZdejd	ejfd
dZ  Z	S )BSConv1d$https://arxiv.org/pdf/2003.13549.pdfchannels_inchannels_outr   r   c                    6   t    tj||dd| _tj|||||d| _d S Nr   )r   r   r   rt   )r   r   r   r   	pointwise	depthwiser"   r   r   r   r   r'   r	   r   r         
zBSConv1d.__init__rw   r   c                 C      |  |}| |}|S r)   r   r   r"   rw   x1x2r	   r	   r   r.         

zBSConv1d.forward
r/   r0   r1   r2   r   r   re   r{   r.   r3   r	   r	   r'   r   r          r   c                       r   )BSConv2dr   r   r   r   r   c                    r   r   )r   r   r   Conv2dr   r   r   r'   r	   r   r      r   zBSConv2d.__init__rw   r   c                 C   r   r)   r   r   r	   r	   r   r.     r   zBSConv2d.forwardr   r	   r	   r'   r   r      r   r   c                       sN   e Zd ZdZdedededef fddZdejd	ejd
ejfddZ  Z	S )	Conv1dGLUzFrom DeepVoice 3d_modelr   r   embedding_dimc                    s^   t    t|d| ||d| _t||| _| dt	t
dgd tj | _d S )Nr   r   r   sqrt      ?r   )r   r   r   r   r   rL   embedding_projregister_bufferre   r   FloatTensorsqueezeSoftsignsoftsign)r"   r   r   r   r   r'   r	   r   r     s
   
 zConv1dGLU.__init__rw   
embeddingsr   c                 C   s   | d}|}| |}d}|j||d |d\}}| |d}| |}||}|| }|t	| }|| }|| j
 }| d}|S )Nr   r   r   r   r   dim)permuter   splitrX   r   	unsqueezer   	expand_asre   sigmoidr   )r"   rw   r   residualsplitdimabr   r	   r	   r   r.     s   





zConv1dGLU.forwardr   r	   r	   r'   r   r     s    $r   c                	       sN   e Zd ZdZ		ddedededef fdd	Zd
ejdejfddZ  Z	S )ConvTransposeda  
    A 1D convolutional transposed layer for PyTorch.
    This layer applies a 1D convolutional transpose operation to its input tensor,
    where the number of channels of the input tensor is the same as the number of channels of the output tensor.

    Attributes:
        in_channels (int): The number of channels in the input tensor.
        out_channels (int): The number of channels in the output tensor.
        kernel_size (int): The size of the convolutional kernel. Default: 1.
        padding (int): The number of padding elements to add to the input tensor. Default: 0.
        conv (BSConv1d): The 1D convolutional transpose layer.
    r   r   r#   r$   r   r   c                    s    t    t||||d| _d S )Nr   )r   r   r   r   rv   r'   r	   r   r   1  s   
zConvTransposed.__init__rw   r   c                 C   s.   |  dd}| |}|  dd}|S )Nr   r   )
contiguousr[   r   ry   r	   r	   r   r.   @  s   
zConvTransposed.forward)r   r   r   r	   r	   r'   r   r   #  s    r   c                	       sF   e Zd Zddedededef fdd	Zd
ejdejfddZ  Z	S )DepthwiseConvModule      333333?r   r   	expansionlrelu_slopec                    sd   t    t|}tj||| ||d |d| _t|| _t|| |ddd| _t	|| _
d S )Nr   r   r   )r   r   r   r   r   r   	LeakyReLUactout	LayerNormln)r"   r   r   r   r   r   r'   r	   r   r   H  s   
zDepthwiseConvModule.__init__rw   r   c                 C   s@   |  |}|d}| |}| |}| |}|d}|S )Nr   )r   r   r   r   r   ry   r	   r	   r   r.   V  s   





zDepthwiseConvModule.forward)r   r   r   )
r/   r0   r1   r   floatr   re   r{   r.   r3   r	   r	   r'   r   r   G  s     r   c                       s>   e Zd Zd
dedef fddZdejdejfdd	Z  Z	S )	AddCoordsFrankwith_rc                    s   t    || _|| _d S r)   )r   r   r   r   )r"   r   r   r'   r	   r   r   a  s   

zAddCoords.__init__rw   r   c                    sJ  | j dkrY|j\}}}tj|tjd}|d d d d f }| |d  }|d d }||dd}||j}tj	||gdd}| j
rWtt|d d}tj	||gdd}|S | j dkr|j\}}}	}tjddd|gtjd}
tjddd|	gtjd}tj|	tjd}tj|tjd}|d d d d d f }|d d d d d f }t||
}t||}|dddd}| |	d  }| |d  }|d d }|d d }||ddd}||ddd}||j}||j}tj	|||gdd}| j
rtt|d dt|d d }tj	||gdd}|S | j dkr#|j\}}}}	}tjdddd|gtjd}
tjdddd|	gtjd}tjdddd|gtjd}tj|	tjd}|d d d d d d f }tj|tjd}|d d d d d d f }tj|tjd}|d d d d d d f }t||
 tj	 fdd	t|D dd}t||dddd
dtj	fdd	t|D d
d}t||ddd
ddtj	fdd	t|	D dd}||j}||j}||j}tj	||||gdd}| j
r!tt|d dt|d d t|d d }tj	||gdd}|S t)Nr   )dtyper   r   r   r   r6   c                       g | ]} | qS r	   r	   .0rR   )
xy_channelr	   r   
<listcomp>      z%AddCoords.forward.<locals>.<listcomp>r   c                    r   r	   r	   r   )
yz_channelr	   r   r     r   c                    r   r	   r	   r   )
zx_channelr	   r   r     r   )r   shapere   arangeint32r   repeattodevicecatr   r   powonesmatmulr   rF   NotImplementedError)r"   rw   batch_size_shapechannel_in_shapedim_xxx_range
xx_channelr   rrdim_yxx_onesyy_onesyy_range
yy_channeldim_zzz_onesxy_rangeyz_rangezx_range
zz_channelr	   )r   r   r   r   r.   f  s   
K&)   .zAddCoords.forward)Fr}   r	   r	   r'   r   r   `  s    r   c                       f   e Zd Z						ddedededed	ed
edededef fddZdejdejfddZ  Z	S )CoordConv1dr   r   TFr#   r$   r   r   r   r   rt   r   r   c
           
   
      Z   t  |||||||| d| _t| j|	| _t|| j t|	 |||||||| _d S )Nr   )	r   r   r   r   	addcoordsr   r   r   r   
r"   r#   r$   r   r   r   r   rt   r   r   r'   r	   r   r     ,   

zCoordConv1d.__init__rw   r   c                 C      |  |}| |}|S r)   r   r   ry   r	   r	   r   r.     r   zCoordConv1d.forwardr   r   r   r   TFr}   r	   r	   r'   r   r     6    	
#r   c                       r   )CoordConv2dr   r   TFr#   r$   r   r   r   r   rt   r   r   c
           
   
      r   )Nr   )	r   r   r   r   r   r   r   r   r   r   r'   r	   r   r     r   zCoordConv2d.__init__rw   r   c                 C   r   r)   r   ry   r	   r	   r   r.     r   zCoordConv2d.forwardr   r}   r	   r	   r'   r   r     r   r   c                       sN   e Zd ZdZg dddddddf fdd		Zd
d ZdddZdd Z  ZS )LVCBlockz"the location-variable convolutions)r   r6   	      g?r6   r5   @   g        c                    s   t    || _t|| _|| _t||d| t||||	|
d|id	| _t	t
|tjjtj||d| ||d |d  |d d| _t | _|D ]&}| jt	t
|tjjtj|||||d  d |dt
| qMd S )Nr   negative_slope)	cond_channelsconv_in_channelsconv_out_channelsconv_layersconv_kernel_sizekpnet_hidden_channelskpnet_conv_sizekpnet_dropout!kpnet_nonlinear_activation_params)r   r   output_paddingr   )r   r   )r   r   cond_hop_lengthrg   r   r   r   kernel_predictorr   
Sequentialr   r   r    r!   ConvTranspose1d	convt_prerH   conv_blocksrG   r   )r"   r#   r   r   	dilationslReLU_sloper   r   r   r   r   r   r'   r	   r   r     s\   


	zLVCBlock.__init__c              
   C   s   |j \}}}| |}| |\}}t| jD ]V\}}||}	|dd|ddddddddf }
|dd|ddddf }| j|	|
|| jd}	|t|	ddd|ddf t	|	dd|dddf   }q|S )aL  forward propagation of the location-variable convolutions.
        Args:
            x (Tensor): the input sequence (batch, in_channels, in_length)
            c (Tensor): the conditioning sequence (batch, cond_channels, cond_length)

        Returns:
            Tensor: the output sequence (batch, in_channels, in_length)
        N)hop_size)
r   r  r   	enumerater  location_variable_convolutionr   re   r   tanh)r"   rw   c_r#   kernelsr   rR   r   outputkr   r	   r	   r   r.   Y  s   	
(
$
zLVCBlock.forwardr   c                 C   s,  |j \}}}|j \}}}	}
}||| ksJ d|t|
d d  }t|||fdd}|d|d|  |}||k rEt|d|fdd}|d||}|ddddddddd|f }|dd}|d|
d}td	||}|jtj	d
}|
d
djtj	d
}|| }| ||	d}|S )u  perform location-variable convolution operation on the input sequence (x) using the local convolution kernl.
        Time: 414 μs ± 309 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each), test on NVIDIA V100.
        Args:
            x (Tensor): the input sequence (batch, in_channels, in_length).
            kernel (Tensor): the local convolution kernel (batch, in_channel, out_channels, kernel_size, kernel_length)
            bias (Tensor): the bias for the local convolution (batch, out_channels, kernel_length)
            dilation (int): the dilation of convolution.
            hop_size (int): the hop_size of the conditioning sequence.
        Returns:
            (Tensor): the output sequence after performing local convolution. (batch, out_channels, in_length).
        z$length of (x, kernel) is not matchedr   r   constantr   r6   Nr   zbildsk,biokl->bolsd)memory_format)r   r   rZ   r
   unfoldr[   re   einsumr   channels_last_3dr   r   view)r"   rw   kernelr   r   r  batchr  	in_lengthr$   r   kernel_lengthr   or	   r	   r   r  v  s(   &z&LVCBlock.location_variable_convolutionc                 C   s<   | j   t| jd d | jD ]
}t|d d qd S )Nr   r   )r   remove_weight_normr   remove_parametrizationsr  r  )r"   blockr	   r	   r   r    s
   

zLVCBlock.remove_weight_norm)r   r5   )	r/   r0   r1   r2   r   r.   r  r  r3   r	   r	   r'   r   r     s    ?
%r   ) typingr   re   torch.nnr   torch.nn.functional
functionalrZ   torch.nn.utilsr   .TTS.tts.layers.delightful_tts.kernel_predictorr   r   r   Moduler   r4   rs   r|   r   r   r   r   r   r   modulesr   r   r   r   r   r   r	   r	   r	   r   <module>   s(    Hh	$c**