o
    
j$                     @   sx   d dl Z d dl mZ d dlmZ d dlmZ G dd dejZ	G dd dejZ
G d	d
 d
ejZG dd dejZdS )    N)nn)MultivariateNormal)
functionalc                       s4   e Zd ZdZ				d	 fdd	Zd
ddZ  ZS )CapacitronVAEzoEffective Use of Variational Embedding Capacity for prosody transfer.

    See https://arxiv.org/abs/1906.03402      Nc                    s   t    tt|t|| _d | _t||d| _	tj
jtttdgd dd| _|}|d urAt||d| _||7 }|d urI||7 }t||| _d S )N)out_dimg      ?   T)requires_grad)encoder_output_dim)super__init__MVNtorchzeroseyeprior_distribution"approximate_posterior_distributionReferenceEncoderencoderr   	ParameterlogexpTensorbetaTextSummarytext_summary_netPostEncoderMLPpost_encoder_mlp)selfnum_melcapacitron_VAE_embedding_dimr   reference_encoder_out_dimspeaker_embedding_dimtext_summary_embedding_dimmlp_input_dimension	__class__ \/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/layers/tacotron/capacitron_layers.pyr      s   
	*zCapacitronVAE.__init__c                 C   s   |d ura|d }|d }|  ||}|d ur1|d }|d }| |||j}	tj||	gdd}|d urCt|}tj||gdd}| |\}
}|
 }
| }t	|
t
|| _| j }n| j d}|d| j| j| jfS )Nr   r	   )dim)r   r   todevicer   catsqueezer   cpur   
diag_embedr   rsampler   sample	unsqueezer   )r   reference_mel_info	text_infospeaker_embeddingreference_melsmel_lengthsenc_outtext_inputsinput_lengthstext_summary_outmusigmaVAE_embeddingr(   r(   r)   forward*   s&   
zCapacitronVAE.forward)r   r   NN)NNN)__name__
__module____qualname____doc__r   rA   __classcell__r(   r(   r&   r)   r      s    r   c                       s4   e Zd ZdZ fddZdd Zedd Z  ZS )r   zNN module creating a fixed size prosody embedding from a spectrogram.

    inputs: mel spectrograms [batch_size, num_spec_frames, num_mel]
    outputs: [batch_size, embedding_dim]
    c                    s   t    || _dgg d  t d } fddt|D }t|| _d| _tdd  dd  D | _	| 
|ddd|}tj d	 | |d
dd| _d S )Nr	   )    rG   @   rH   r   r   c              	      s,   g | ]}t j |  |d   ddddqS )r	   )   rI   )   rJ   )in_channelsout_channelskernel_sizestridepadding)r   Conv2d).0ifiltersr(   r)   
<listcomp>Z   s    z-ReferenceEncoder.__init__.<locals>.<listcomp>Fc                 S   s   g | ]}t j|d qS ))num_features)r   BatchNorm2d)rQ   filter_sizer(   r(   r)   rU   b   s    rI   rJ   r*   T)
input_sizehidden_sizebatch_firstbidirectional)r   r   r    lenranger   
ModuleListconvstrainingbnscalculate_post_conv_heightLSTM
recurrence)r   r    r   
num_layersr`   post_conv_heightr&   rS   r)   r   U   s   

zReferenceEncoder.__init__c                 C   sJ  | d}||dd| j}| }t| j| jD ]P\}}||}||}t|}|d  }t	
|jt	jdd }| d}t	||jt|||dk }	|	dddddddd}	||	 }q|dd}| d}
| ||
d}|}tjjj|| ddd}| j  | |\}\}}|d }||jS )	Nr   r	   r*   rJ   )dtypeTFr[   enforce_sorted)sizeviewr    floatzipr`   rb   Frelur   ceilr,   int64aranger-   expandr]   r4   	transpose
contiguousr   utilsrnnpack_padded_sequencetolistre   flatten_parameters)r   inputsr<   
batch_sizexvalid_lengthsconvbnpost_conv_max_widthmaskpost_conv_widthpost_conv_input_lengthspacked_seqs_htlast_outputr(   r(   r)   rA   i   s8   


 


zReferenceEncoder.forwardc                 C   s*   t |D ]}| | d|  | d } q| S )zAHeight of spec after n convolutions with fixed kernel/stride/pad.rJ   r	   )r^   )heightrM   rN   padn_convsr   r(   r(   r)   rc      s   z+ReferenceEncoder.calculate_post_conv_height)	rB   rC   rD   rE   r   rA   staticmethodrc   rF   r(   r(   r&   r)   r   N   s    5r   c                       $   e Zd Z fddZdd Z  ZS )r   c                    s"   t    tj||ddd| _d S )NTF)r[   r\   )r   r   r   rd   lstm)r   embedding_dimr   r&   r(   r)   r      s   
zTextSummary.__init__c                 C   sB   t jjj|| ddd}| j  | |\}\}}|d }|S )NTFri   r*   )r   rw   rx   ry   rz   r   r{   )r   r|   r<   r   r   r   r   r(   r(   r)   rA      s   
zTextSummary.forwardrB   rC   rD   r   rA   rF   r(   r(   r&   r)   r      s    	r   c                       r   )r   c                    sL   t    || _t||t t||d g}tj| | _t | _	d S )NrJ   )
r   r   rZ   r   LinearTanh
SequentialnetSoftplussoftplus)r   rY   rZ   modulesr&   r(   r)   r      s   

zPostEncoderMLP.__init__c                 C   sD   |  |}|d d d | jf }| |d d | jd f }||fS )N)r   rZ   r   )r   _input
mlp_outputr>   r?   r(   r(   r)   rA      s   
zPostEncoderMLP.forwardr   r(   r(   r&   r)   r      s    r   )r   r   'torch.distributions.multivariate_normalr   r   torch.nnr   ro   Moduler   r   r   r   r(   r(   r(   r)   <module>   s    GX