o
    
jU                     @   s  d dl mZmZmZmZmZ d dlZd dlZd dl	Z	d dl
mZ d dl	mZ d dlmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d dlm  m  m  mZ d dlm  m  m  mZ d d
lm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dlm'Z'm(Z( d dl)m*Z* d dl+m,Z- d dl.m/Z/ G dd dej0Z1G dd dej0Z2G dd de	jj0Z3G dd de	jj0Z4G dd de	jj0Z5G dd de	jj0Z6G dd de	jj0Z,G d d! d!e&Z7dS )"    )DictListOptionalTupleUnionN)Coqpit)nn)Conv1dConv2dConvTranspose1d)
functional)spectral_norm)weight_norm)remove_parametrizations)SpeakerManager)load_fsspec)FreeVCConfig)BaseVC)get_paddinginit_weights)mel_spectrogram_torch)SpeakerEncoder)	get_wavlmc                       s(   e Zd Zd	 fdd	Zd
ddZ  ZS )ResidualCouplingBlock   r   c           	         s   t    || _|| _|| _|| _|| _|| _|| _t	
 | _t|D ]}| jtj||||||dd | jt  q#d S )NT)gin_channels	mean_only)super__init__channelshidden_channelskernel_sizedilation_raten_layersn_flowsr   r   
ModuleListflowsrangeappendmodulesResidualCouplingLayerFlip)	selfr   r    r!   r"   r#   r$   r   i	__class__ G/home/kuhnn/.local/lib/python3.10/site-packages/TTS/vc/models/freevc.pyr      s.   

zResidualCouplingBlock.__init__NFc                 C   sL   |s| j D ]}|||||d\}}q|S t| j D ]
}|||||d}q|S )Ngreverse)r&   reversed)r,   xx_maskr3   r4   flow_r0   r0   r1   forward4   s   
zResidualCouplingBlock.forward)r   r   )NF__name__
__module____qualname__r   r:   __classcell__r0   r0   r.   r1   r      s    r   c                       s*   e Zd Z	d fdd	ZdddZ  ZS )	Encoderr   c                    sr   t    || _|| _|| _|| _|| _|| _|| _t	
||d| _tj|||||d| _t	
||d d| _d S )N   r      )r   r   in_channelsout_channelsr    r!   r"   r#   r   r   r	   prer)   WNencproj)r,   rD   rE   r    r!   r"   r#   r   r.   r0   r1   r   ?   s   
zEncoder.__init__Nc           	      C   s   t t||dd|j}| || }| j|||d}| 	|| }t j
|| jdd\}}|t |t |  | }||||fS )NrC   rA   r3   )dim)torch	unsqueezecommonssequence_masksizetodtyperF   rH   rI   splitrE   
randn_likeexp)	r,   r6   	x_lengthsr3   r7   statsmlogszr0   r0   r1   r:   O   s   "zEncoder.forwardr   Nr;   r0   r0   r.   r1   r@   >   s    r@   c                       s2   e Zd Z	d	 fdd	Zd
ddZdd Z  ZS )	Generatorr   c	                    sJ  t t|   t|| _t|| _t||dddd| _|dkr"tj	ntj
}t | _tt||D ]#\}	\}
}| jtt|d|	  |d|	d   ||
||
 d d q1t | _tt| jD ]"}	|d|	d   }tt||D ]\}\}}| j|||| qrqat|dddddd| _| jt |d	krt||d| _d S d S )
N   rA      padding1rC   F)ra   biasr   )r   r]   r   lennum_kernelsnum_upsamplesr	   conv_prer)   	ResBlock1	ResBlock2r   r%   ups	enumeratezipr(   r   r   	resblocksr'   	conv_postapplyr   cond)r,   initial_channelresblockresblock_kernel_sizesresblock_dilation_sizesupsample_ratesupsample_initial_channelupsample_kernel_sizesr   r-   ukchjdr.   r0   r1   r   Z   s:   





zGenerator.__init__Nc                 C   s   |  |}|d ur|| | }t| jD ]>}t|tj}| j| |}d }t| j	D ]!}|d u r?| j
|| j	 |  |}q,|| j
|| j	 |  |7 }q,|| j	 }qt|}| |}t|}|S r\   )rg   rp   r'   rf   F
leaky_relur)   LRELU_SLOPErj   re   rm   rn   rL   tanh)r,   r6   r3   r-   xsr{   r0   r0   r1   r:      s    



zGenerator.forwardc                 C   s8   t d | jD ]}t|d q| jD ]}t|d qd S )NzRemoving weight norm...weight)printrj   r   rm   )r,   lr0   r0   r1   remove_weight_norm   s   

zGenerator.remove_weight_normr[   r\   )r<   r=   r>   r   r:   r   r?   r0   r0   r.   r1   r]   Y   s
    

+r]   c                       s&   e Zd Zd fdd	Zdd Z  ZS )	DiscriminatorP   r_   Fc                    s  t t|   || _|| _|dkrtnt}t|t	dd|df|dft
|ddfd|t	dd|df|dft
|ddfd|t	dd|df|dft
|ddfd|t	dd|df|dft
|ddfd|t	dd|dfdt
|ddfdg| _|t	ddd	dd
d| _d S )NFrA       r   r`      i      )r_   rA   )rA   r   )r   r   r   perioduse_spectral_normr   r   r   r%   r
   r   convsrn   )r,   r   r!   strider   norm_fr.   r0   r1   r      s   &&&&"	zDiscriminatorP.__init__c                 C   s   g }|j \}}}|| j dkr$| j|| j  }t|d|fd}|| }||||| j | j}| jD ]}||}t|tj}|	| q3| 
|}|	| t|dd}||fS )Nr   reflectrA   )shaper   r}   padviewr   r~   r)   r   r(   rn   rL   flatten)r,   r6   fmapbctn_padr   r0   r0   r1   r:      s   


zDiscriminatorP.forward)r   r_   Fr;   r0   r0   r.   r1   r      s    r   c                       &   e Zd Zd fdd	Zdd Z  ZS )DiscriminatorSFc                    s   t t|   |dkrtnt}t|tdddddd|tdddd	d	d
d|tdddd	dd
d|tdddd	dd
d|tdddd	dd
d|tddddddg| _|tdddddd| _	d S )NFrA         r^   r`   @   )   r      )groupsra      r   r   rC   r_   )
r   r   r   r   r   r   r%   r	   r   rn   )r,   r   r   r.   r0   r1   r      s   
zDiscriminatorS.__init__c                 C   sZ   g }| j D ]}||}t|tj}|| q| |}|| t|dd}||fS )NrA   r   )	r   r}   r~   r)   r   r(   rn   rL   r   )r,   r6   r   r   r0   r0   r1   r:      s   


zDiscriminatorS.forwardFr;   r0   r0   r.   r1   r      s    r   c                       r   )MultiPeriodDiscriminatorFc                    sH   t t|   g d}t dg}| fdd|D  }t|| _d S )N)rC   r_   r   r^      r   c                    s   g | ]}t | d qS )r   )r   ).0r-   r   r0   r1   
<listcomp>   s    z5MultiPeriodDiscriminator.__init__.<locals>.<listcomp>)r   r   r   r   r   r%   discriminators)r,   r   periodsdiscsr.   r   r1   r      s
   z!MultiPeriodDiscriminator.__init__c                 C   sp   g }g }g }g }t | jD ]$\}}||\}	}
||\}}||	 || ||
 || q||||fS r\   )rk   r   r(   )r,   yy_haty_d_rsy_d_gsfmap_rsfmap_gsr-   r|   y_d_rfmap_ry_d_gfmap_gr0   r0   r1   r:      s   


z MultiPeriodDiscriminator.forwardr   r;   r0   r0   r.   r1   r      s    r   c                       s8   e Zd Zd fdd	Zdd Zdd	 ZdddZ  ZS )r   P   r_   r   c                    s>   t t|   tj|||dd| _t||| _t | _	d S )NT)batch_first)
r   r   r   r   LSTMlstmLinearlinearReLUrelu)r,   mel_n_channelsmodel_num_layersmodel_hidden_sizemodel_embedding_sizer.   r0   r1   r     s   zSpeakerEncoder.__init__c                 C   sD   | j   |  |\}\}}| | |d }|tj|ddd S )Nr   rA   T)rK   keepdim)r   flatten_parametersr   r   rL   norm)r,   melsr9   hidden
embeds_rawr0   r0   r1   r:     s   
zSpeakerEncoder.forwardc                 C   s8   g }t d|| |D ]}t||| }|| q
|S )Nr   )r'   rL   aranger(   )r,   total_framespartial_framespartial_hop
mel_slicesr-   	mel_ranger0   r0   r1   compute_partial_slices  s
   z%SpeakerEncoder.compute_partial_slicesr   r   c           
         s     d} d d | d f }||krZ| |||}t fdd|D }|| tt|dd}t  | |}W d    n1 sIw   Y  tj	|dd
d}	|	S t  | |}	W d    |	S 1 snw   Y  |	S )NrA   c                 3   s     | ]} d d |f V  qd S r\   r0   )r   smelr0   r1   	<genexpr>  s    z1SpeakerEncoder.embed_utterance.<locals>.<genexpr>r   )axis)rP   r   listr(   rL   stacktuplesqueezeno_gradmeanrM   )
r,   r   r   r   mel_lenlast_melr   r   partial_embedsembedr0   r   r1   embed_utterance  s$   






zSpeakerEncoder.embed_utterance)r   r_   r   r   )r   r   )r<   r=   r>   r   r:   r   r   r?   r0   r0   r.   r1   r      s
    r   c                       s<  e Zd ZdZd)dedef fddZedd Zd	d
 Z	defddZ
				d*dejdejdeej deej deej deej deejejejeejejejejejejf f fddZe d+ddZdd Zdd Ze dd Zdd Zed,ded!eee ee f fd"d#Zd-d%d&Zd'd( Z  ZS ).FreeVCa  

    Papaer::
        https://arxiv.org/abs/2210.15418#

    Paper Abstract::
        Voice conversion (VC) can be achieved by first extracting source content information and target speaker
        information, and then reconstructing waveform with these information. However, current approaches normally
        either extract dirty content information with speaker information leaked in, or demand a large amount of
        annotated data for training. Besides, the quality of reconstructed waveform can be degraded by the
        mismatch between conversion model and vocoder. In this paper, we adopt the end-to-end framework of VITS for
        high-quality waveform reconstruction, and propose strategies for clean content information extraction without
        text annotation. We disentangle content information by imposing an information bottleneck to WavLM features,
        and propose the spectrogram-resize based data augmentation to improve the purity of extracted content
        information. Experimental results show that the proposed method outperforms the latest VC models trained with
        annotated data and has greater robustness.

    Original Code::
        https://github.com/OlaWod/FreeVC

    Examples:
        >>> from TTS.vc.configs.freevc_config import FreeVCConfig
        >>> from TTS.vc.models.freevc import FreeVC
        >>> config = FreeVCConfig()
        >>> model = FreeVC(config)
    Nconfigspeaker_managerc              
      s  t  |d |d  | | | jj| _| jj| _| jj| _| jj| _| jj| _| jj	| _	| jj
| _
| jj| _| jj| _| jj| _| jj| _| jj| _| jj| _| jj| _| jj| _| jj| _| jj| _| jj| _t| jj| j| jddd| _t| j| j| j| j| j| j| j| jd| _t| j| j| jddd| jd| _t| j| jddd| jd| _| jst| j| jd| _n|   t  | _!d S )Nr   rA   r   rB   r   )r   r   )"r   r   init_multispeakerargsspec_channelsinter_channelsr    filter_channelsn_headsr#   r!   	p_dropoutrr   rs   rt   ru   rv   rw   segment_sizer   ssl_dimuse_spkr@   enc_pr]   decenc_qr   r8   r   enc_spkload_pretrained_speaker_encoderr   wavlm)r,   r   r   r.   r0   r1   r   F  sR   



















zFreeVC.__init__c                 C   s   t |  jS r\   )next
parametersdevicer,   r0   r0   r1   r   v  s   zFreeVC.devicec                 C   s   t d td| _dS )z@Load pretrained speaker encoder model as mentioned in the paper.z/ > Loading pretrained speaker encoder model ...zShttps://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/speaker_encoder.ptN)r   SpeakerEncoderEx
enc_spk_exr   r0   r0   r1   r   z  s   
z&FreeVC.load_pretrained_speaker_encoderc                 C   s"   | j j| _| jr| jj| _dS dS )a  Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer
        or with external `d_vectors` computed from a speaker encoder model.

        You must provide a `speaker_manager` at initialization to set up the multi-speaker modules.

        Args:
            config (Coqpit): Model configuration.
            data (List, optional): Dataset items to infer number of speakers. Defaults to None.
        N)r   num_spksr   )r,   r   r0   r0   r1   r     s   

zFreeVC.init_multispeakerr   specr3   r   	c_lengthsspec_lengthsreturnc              	   C   s   |du rt |d|d |j}|du r*t |d|d |j}d}| js7| |d}| ||\}}}	}| j	|
dd||d\}
}}}| j|
||d}t|
|| j\}}| j||d}||||
|||	||ffS )au  
        Forward pass of the model.

        Args:
            c: WavLM features. Shape: (batch_size, c_seq_len).
            spec: The input spectrogram. Shape: (batch_size, spec_seq_len, spec_dim).
            g: The speaker embedding. Shape: (batch_size, spk_emb_dim).
            mel: The input mel-spectrogram for the speaker encoder. Shape: (batch_size, mel_seq_len, mel_dim).
            c_lengths: The lengths of the WavLM features. Shape: (batch_size,).
            spec_lengths: The lengths of the spectrogram. Shape: (batch_size,).

        Returns:
            o: The output spectrogram. Shape: (batch_size, spec_seq_len, spec_dim).
            ids_slice: The slice indices. Shape: (batch_size, num_slices).
            spec_mask: The spectrogram mask. Shape: (batch_size, spec_seq_len).
            (z, z_p, m_p, logs_p, m_q, logs_q): A tuple of latent variables.
        Nr   r   rA   rC   rJ   )rL   onesrP   rQ   r   r   r   rM   r   r   	transposer8   rN   rand_slice_segmentsr   r   )r,   r   r   r3   r   r   r   r9   m_plogs_prZ   m_qlogs_q	spec_maskz_pz_slice	ids_sliceor0   r0   r1   r:     s   !"" zFreeVC.forwardc                 C   s   |dkrt |d|d |j}| js#| j|}|d}| 	||\}}}}| j
|||dd}	| j|	| |d}
|
S )a  
        Inference pass of the model

        Args:
            c (torch.Tensor): Input tensor. Shape: (batch_size, c_seq_len).
            g (torch.Tensor): Speaker embedding tensor. Shape: (batch_size, spk_emb_dim).
            mel (torch.Tensor): Mel-spectrogram tensor. Shape: (batch_size, mel_seq_len, mel_dim).
            c_lengths (torch.Tensor): Lengths of the input tensor. Shape: (batch_size,).

        Returns:
            torch.Tensor: Output tensor.
        Nr   r   Tr2   rJ   )rL   r   rP   rQ   r   r   r   r   rM   r   r8   r   )r,   r   r3   r   r   r  r  r  c_maskrZ   r	  r0   r0   r1   	inference  s   "
zFreeVC.inferencec                 C   sH   t   | j|d }W d   n1 sw   Y  |dd}|S )zExtract WavLM features from an audio tensor.

        Args:
            y (torch.Tensor): Audio tensor. Shape: (batch_size, audio_seq_len).
        r   NrA   rC   )rL   r   r   extract_featuresr   )r,   r   r   r0   r0   r1   extract_wavlm_features  s
   
zFreeVC.extract_wavlm_featuresc                 C   s   t |trtj|| jjjd\}}t |tjr t	
|| j}t |t	jr,|| j}t |tr=t	
t|| j}| S )z Read and format the input audio.)sr)
isinstancestrlibrosaloadr   audioinput_sample_ratenpndarrayrL   
from_numpyrQ   r   Tensorr   arrayfloat)r,   wavr9   r0   r0   r1   
load_audio  s   

zFreeVC.load_audioc           
   	   C   s$  |  |  }tjj|dd\}}| jjjr/| j	
|}t|ddddf | j}n,t|d| j}t|| jjj| jjj| jjj| jjj| jjj| jjj| jjj}|  |}| |dddf }| jjjrx| j||d}	n| j||ddd}	|	d d j   }	|	S )	z
        Voice conversion pass of the model.

        Args:
            src (str or torch.Tensor): Source utterance.
            tgt (str or torch.Tensor): Target utterance.

        Returns:
            torch.Tensor: Output tensor.
        r   )top_dbNr   rJ   rA   rC   r   )r  cpunumpyr  effectstrimr   
model_argsr   r   r   rL   r  rQ   r   rM   r   r  filter_lengthn_mel_channelsr  
hop_length
win_lengthmel_fminmel_fmaxr  r  r   datar  )
r,   srctgtwav_tgtr9   g_tgtmel_tgtwav_srcr   r  r0   r0   r1   voice_conversion  s.   
"

zFreeVC.voice_conversionc                   C      d S r\   r0   r0   r0   r0   r1   	eval_step#     zFreeVC.eval_stepTsamplesc                 C   s   t | }|S r\   )r   )r   r4  verbosemodelr0   r0   r1   init_from_config&  s   zFreeVC.init_from_configFc                 C   s:   t |td|d}| j|d |d |r|   d S d S )Nr  )map_locationcacher6  )strict)r   rL   r   load_state_dicteval)r,   r   checkpoint_pathr<  r:  r9  stater0   r0   r1   load_checkpoint+  s
   zFreeVC.load_checkpointc                   C   r1  r\   r0   r0   r0   r0   r1   
train_step1  r3  zFreeVC.train_stepr\   )NNNN)NNN)NT)FTF) r<   r=   r>   __doc__r   r   r   propertyr   r   r   rL   r  r   r   r:   r   r  r  r  inference_moder0  r2  staticmethodr   r   r   r   r7  r?  r@  r?   r0   r0   r.   r1   r   *  sV    0

:
)&
r   )8typingr   r   r   r   r   r  r  r  rL   coqpitr   r   torch.nnr	   r
   r   r   r}   torch.nn.utilsr   torch.nn.utils.parametrizationsr   torch.nn.utils.parametrizer   TTS.vc.modules.freevc.commonsvcr)   freevcrN   TTS.vc.modules.freevc.modulesTTS.tts.utils.speakersr   TTS.utils.ior   TTS.vc.configs.freevc_configr   TTS.vc.models.base_vcr   r   r   $TTS.vc.modules.freevc.mel_processingr   5TTS.vc.modules.freevc.speaker_encoder.speaker_encoderr   r   TTS.vc.modules.freevc.wavlmr   Moduler   r@   r]   r   r   r   r   r0   r0   r0   r1   <module>   s:    $I'*