o
    
j?                    @   s  d dl Z d dlZd dlmZmZmZ d dlmZ d dlm	Z	m
Z
mZmZ d dlZd dlZd dlmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lm Z  d dl!m"Z" d dl#m$Z$m%Z% d dl&m'Z'm(Z( d dl)m*Z* d dl+m,Z,m-Z- d dl.m/Z/ d dl0m1Z1 d dl2m3Z3m4Z4m5Z5 d dl6m7Z7 d dl8m9Z9 d dl:m;Z; d dl<m=Z=m>Z>m?Z?m@Z@mAZA d dlBmCZC d dlDmEZE d dlFmGZG d dlHmIZImJZJmKZKmLZLmMZMmNZN d dlOmPZP d dlQmRZR d dlSmTZT d dlUmVZV d dlWmXZX d d lYmZZZ i a[i a\e] d!ej^fd"d#Z_d$ej^fd%d&Z`d'd( ZadKd+d,ZbdLd-d.Zcd/d0 Zdd1d2 ZedMd4d5Zfd6d7 ZgdMd8d9ZheG d:d; d;eZidNd<ejd=ekd>elfd?d@ZmG dAdB dBe,ZneG dCdD dDeZoG dEdF dFe9ZpG dGdH dHeIZqG dIdJ dJeJZrdS )O    N)	dataclassfieldreplace)chain)DictListTupleUnion)Coqpit)mel)nn)autocast)
functional)
DataLoader)WeightedRandomSampler)DistributedSamplerDistributedSamplerWrapper)get_optimizerget_scheduler)CharactersConfig)
TTSDataset_parse_sample)DurationPredictor)VitsDiscriminator)PosteriorEncoderResidualCouplingBlocksTextEncoder)StochasticDurationPredictor)BaseTTS)rehash_fairseq_vits_checkpoint)generate_pathmaximum_pathrand_segmentssegmentsequence_mask)LanguageManager)SpeakerManager)	synthesis)BaseCharactersBaseVocabulary_characters_pad	_phonemes_punctuations)TTSTokenizer)plot_alignment)load_fsspec)BucketBatchSampler)HifiganGenerator)plot_resultsmc                 C   s$   t | dd }t|r|   d S d S )Nreset_parameters)getattrcallabler5   )r4   r5    r8   F/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/models/vits.pyweights_reset2   s   r:   mdlc                 C   s8   i }|   D ]\}}d|v r|j  }|||< q|S )Nweight)named_parametersdatasumitem)r;   	dict_sumsnamewvaluer8   r8   r9   get_module_weights_sum:   s   rE   c                 C   s6   t | \}}|dk |dk   dksJ ||fS )z^Load the audio file normalized in [-1, 1]

    Return Shapes:
        - x: :math:`[1, T]`
       r   )
torchaudioloadr?   )	file_pathxsrr8   r8   r9   
load_audioC   s    rM   rF   h㈵>c                 C   s   t t j| |d| S )N)min)torchlogclamp)rK   Cclip_valr8   r8   r9   
_amp_to_dbN   s   rU   c                 C   s   t | | S N)rP   exp)rK   rS   r8   r8   r9   
_db_to_ampR   s   rX   c                 C      t | }|S rV   )rU   
magnitudesoutputr8   r8   r9   	amp_to_dbV      r]   c                 C   rY   rV   )rX   rZ   r8   r8   r9   	db_to_amp[   r^   r_   Fc                 C   s  |  d} t| dk rtdt|  t| dkr#tdt|  t| jd t| j }t|d | }|tvrIt|j	| j| jdt|< tj
jj| dt|| d t|| d fd	d
} |  d} tj| |||t| |d	dddd
}t|ddd }|S )k
    Args Shapes:
        - y : :math:`[B, 1, T]`

    Return Shapes:
        - spec : :math:`[B,C,T]`
    rF         min value is       ?max value is _dtypedevice   reflectmodeFT
hop_length
win_lengthwindowcenterpad_mode
normalizedonesidedreturn_complexrG   ư>)squeezerP   rO   printmaxstrrg   rh   hann_windowtor   r   pad	unsqueezeintstftsqrtpowr?   )yn_fftrn   ro   rq   dtype_devicewnsize_dtype_devicespecr8   r8   r9   wav_to_spec`   s:   

r   c           	      C   sz   t | jd t | j }t |d | }|tvr/t|||||d}t|j| j| jdt|< tt| | }t	|}|S )zk
    Args Shapes:
        - spec : :math:`[B,C,T]`

    Return Shapes:
        - mel : :math:`[B,C,T]`
    re   rL   r   n_melsfminfmaxrf   )
rz   rg   rh   	mel_basislibrosa_mel_fnrP   
from_numpyr|   matmulr]   )	r   r   num_melssample_rater   r   r   fmax_dtype_devicer   r8   r8   r9   spec_to_mel   s   	r   c	                 C   sp  |  d} t| dk rtdt|  t| dkr#tdt|  t| jd t| j }	t|d |	 }
t|d |	 }|
tvrZt	|||||d}t
|j| j| jdt|
< |tvrlt|j| j| jdt|< tjjj| dt|| d	 t|| d	 fd
d} |  d} tj| |||t| |d
dddd
}t|d	dd }tt|
 |}t|}|S )r`   rF   ra   rb   rc   rd   re   r   rf   ri   rj   rk   FTrm   rG   rv   )rw   rP   rO   rx   ry   rz   rg   rh   r   r   r   r|   r{   r   r   r}   r~   r   r   r   r   r?   r   r]   )r   r   r   r   rn   ro   r   r   rq   r   r   r   r   r   r8   r8   r9   
wav_to_mel   sF   

r   c                   @   sb   e Zd ZU dZeed< dZeed< dZeed< dZeed< dZ	eed	< d
Z
eed< dZeed< dS )VitsAudioConfigi   fft_sizei"V  r   ro      rn   P   r   r   mel_fminNmel_fmax)__name__
__module____qualname__r   r   __annotations__r   ro   rn   r   r   r   r8   r8   r8   r9   r      s   
 r   items	attr_name
multi_dictc                    s   t  fdd| D t  fddD }t fddD }d| t fdd|D }|t j| }durgD ]}|v sUJ | d qFt  fd	d| D }||9 }t| t | fS )
zkCreate inverse frequency weights for balancing the dataset.
    Use `multi_dict` to scale relative weights.c                       g | ]}|  qS r8   r8   .0r@   )r   r8   r9   
<listcomp>       z2get_attribute_balancer_weights.<locals>.<listcomp>c                    s   g | ]}  |qS r8   )indexr   l)unique_attr_namesr8   r9   r          c                    s"   g | ]}t t |kd  qS )r   )lennpwherer   )attr_names_samplesr8   r9   r      s   " rc   c                    s   g | ]} | qS r8   r8   r   )weight_attrr8   r9   r      r   Nz not in c                    s   g | ]
} |  d qS )rc   )getr   )r   r   r8   r9   r      s    )	r   arrayuniquetolistlinalgnormrP   r   float)r   r   r   attr_idx
attr_countdataset_samples_weightkmultiplier_samplesr8   )r   r   r   r   r   r9   get_attribute_balancer_weights   s    r   c                       s8   e Zd Z fddZdd Zedd Zdd Z  ZS )	VitsDatasetc                    s(   t  j|i | | jjj| _|| _d S rV   )super__init__	tokenizer
characterspad_id
model_args)selfr   argskwargs	__class__r8   r9   r     s   
zVitsDataset.__init__c              	   C   s   | j | }|d }t|d \}}| jjd ur6|d| jj dkr6|d d d t|d| jj  f }tj|d }| 	||d }t
|| jksU|jd | jk rb|  jd7  _| | jS ||t
||||d |d |d dS )	Ntext
audio_filerF   r   speaker_namelanguageaudio_unique_name)raw_text	token_ids	token_lenwavwav_filer   language_namer   )samplesrM   r   encoder_sample_ratesizer   ospathbasenameget_token_idsr   max_text_lenshapemin_audio_lenrescue_item_idx__getitem__)r   idxr@   r   r   re   wav_filenamer   r8   r8   r9   r     s(   
(zVitsDataset.__getitem__c                 C   s@   g }| j D ]}t|^}}}tj|d d }|| q|S )N      )r   r   r   r   getsizeappend)r   lensr@   re   r   	audio_lenr8   r8   r9   lengths%  s   
zVitsDataset.lengthsc                    st  t  } fdd d D  tjtdd  d D ddd\}}td	d  d
 D }t d }||  }dd  d D }t|}t|}	||	 }
t||}t|d|	}| | j }| | j }tt |D ].} d
 | }t|||d d | f<  d | }t|||ddd|	df< qs||||||
 d  d  d  d  d dS )a  
        Return Shapes:
            - tokens: :math:`[B, T]`
            - token_lens :math:`[B]`
            - token_rel_lens :math:`[B]`
            - waveform: :math:`[B, 1, T]`
            - waveform_lens: :math:`[B]`
            - waveform_rel_lens: :math:`[B]`
            - speaker_names: :math:`[B]`
            - language_names: :math:`[B]`
            - audiofile_paths: :math:`[B]`
            - raw_texts: :math:`[B]`
            - audio_unique_names: :math:`[B]`
        c                    s    i | ]   fd dD qS )c                    r   r8   r8   )r   dicr   r8   r9   r   ?  r   z5VitsDataset.collate_fn.<locals>.<dictcomp>.<listcomp>r8   )r   batchr   r9   
<dictcomp>?  s     z*VitsDataset.collate_fn.<locals>.<dictcomp>r   c                 S   s   g | ]}| d qS rF   )r   r   rK   r8   r8   r9   r   B  r   z*VitsDataset.collate_fn.<locals>.<listcomp>r   T)dim
descendingc                 S   s   g | ]}t |qS r8   )r   r   r8   r8   r9   r   E  r   r   r   c                 S   s   g | ]}|j d  qS r   )r   r   rC   r8   r8   r9   r   I  r   rF   Nr   r   r   r   r   )tokens
token_lenstoken_rel_lenswaveformwaveform_lenswaveform_rel_lensspeaker_nameslanguage_namesaudio_filesr   audio_unique_names)
r   rP   sort
LongTensorry   FloatTensorzero_r   ranger   )r   r   Bre   ids_sorted_decreasingr   r   r   wav_lenswav_lens_maxwav_rel_lenstoken_padded
wav_paddedir   r   r8   r   r9   
collate_fn.  sB   


$zVitsDataset.collate_fn)	r   r   r   r   r   propertyr   r  __classcell__r8   r8   r   r9   r      s    
r   c                   @   s  e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed	< d
Z
eed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< d Zeed!< ed"d# d$Zee ed%< ed&d# d$Zeee  ed'< ed(d# d$Zee ed)< d*Zeed+< ed,d# d$Zee ed-< ed.d# d$Z ee ed/< d0Z!e"ed1< d2Z#eed3< d4Z$eed5< dZ%eed6< d2Z&eed7< d2Z'eed8< d9Z(eed:< d0Z)e"ed;< d<Z*e"ed=< d<Z+e"ed>< d?Z,eed@< d9Z-eedA< d9Z.ee edB< dCZ/eedD< d<Z0e"edE< d?Z1eedF< d0Z2e"edG< d<Z3e"edH< dZ4eedI< d?Z5eedJ< d9Z6eedK< d<Z7e"edL< dMZ8eedN< dMZ9eedO< d0Z:e"edP< d<Z;e"edQ< d<Z<e"edR< d<Z=e"edS< d<Z>e"edT< d<Z?e"edU< d9Z@eedV< d0ZAe"edW< d<ZBe"edX< d<ZCe"edY< d9S )ZVitsArgsau  VITS model arguments.

    Args:

        num_chars (int):
            Number of characters in the vocabulary. Defaults to 100.

        out_channels (int):
            Number of output channels of the decoder. Defaults to 513.

        spec_segment_size (int):
            Decoder input segment size. Defaults to 32 `(32 * hoplength = waveform length)`.

        hidden_channels (int):
            Number of hidden channels of the model. Defaults to 192.

        hidden_channels_ffn_text_encoder (int):
            Number of hidden channels of the feed-forward layers of the text encoder transformer. Defaults to 256.

        num_heads_text_encoder (int):
            Number of attention heads of the text encoder transformer. Defaults to 2.

        num_layers_text_encoder (int):
            Number of transformer layers in the text encoder. Defaults to 6.

        kernel_size_text_encoder (int):
            Kernel size of the text encoder transformer FFN layers. Defaults to 3.

        dropout_p_text_encoder (float):
            Dropout rate of the text encoder. Defaults to 0.1.

        dropout_p_duration_predictor (float):
            Dropout rate of the duration predictor. Defaults to 0.1.

        kernel_size_posterior_encoder (int):
            Kernel size of the posterior encoder's WaveNet layers. Defaults to 5.

        dilatation_posterior_encoder (int):
            Dilation rate of the posterior encoder's WaveNet layers. Defaults to 1.

        num_layers_posterior_encoder (int):
            Number of posterior encoder's WaveNet layers. Defaults to 16.

        kernel_size_flow (int):
            Kernel size of the Residual Coupling layers of the flow network. Defaults to 5.

        dilatation_flow (int):
            Dilation rate of the Residual Coupling WaveNet layers of the flow network. Defaults to 1.

        num_layers_flow (int):
            Number of Residual Coupling WaveNet layers of the flow network. Defaults to 6.

        resblock_type_decoder (str):
            Type of the residual block in the decoder network. Defaults to "1".

        resblock_kernel_sizes_decoder (List[int]):
            Kernel sizes of the residual blocks in the decoder network. Defaults to `[3, 7, 11]`.

        resblock_dilation_sizes_decoder (List[List[int]]):
            Dilation sizes of the residual blocks in the decoder network. Defaults to `[[1, 3, 5], [1, 3, 5], [1, 3, 5]]`.

        upsample_rates_decoder (List[int]):
            Upsampling rates for each concecutive upsampling layer in the decoder network. The multiply of these
            values must be equal to the kop length used for computing spectrograms. Defaults to `[8, 8, 2, 2]`.

        upsample_initial_channel_decoder (int):
            Number of hidden channels of the first upsampling convolution layer of the decoder network. Defaults to 512.

        upsample_kernel_sizes_decoder (List[int]):
            Kernel sizes for each upsampling layer of the decoder network. Defaults to `[16, 16, 4, 4]`.

        periods_multi_period_discriminator (List[int]):
            Periods values for Vits Multi-Period Discriminator. Defaults to `[2, 3, 5, 7, 11]`.

        use_sdp (bool):
            Use Stochastic Duration Predictor. Defaults to True.

        noise_scale (float):
            Noise scale used for the sample noise tensor in training. Defaults to 1.0.

        inference_noise_scale (float):
            Noise scale used for the sample noise tensor in inference. Defaults to 0.667.

        length_scale (float):
            Scale factor for the predicted duration values. Smaller values result faster speech. Defaults to 1.

        noise_scale_dp (float):
            Noise scale used by the Stochastic Duration Predictor sample noise in training. Defaults to 1.0.

        inference_noise_scale_dp (float):
            Noise scale for the Stochastic Duration Predictor in inference. Defaults to 0.8.

        max_inference_len (int):
            Maximum inference length to limit the memory use. Defaults to None.

        init_discriminator (bool):
            Initialize the disciminator network if set True. Set False for inference. Defaults to True.

        use_spectral_norm_disriminator (bool):
            Use spectral normalization over weight norm in the discriminator. Defaults to False.

        use_speaker_embedding (bool):
            Enable/Disable speaker embedding for multi-speaker models. Defaults to False.

        num_speakers (int):
            Number of speakers for the speaker embedding layer. Defaults to 0.

        speakers_file (str):
            Path to the speaker mapping file for the Speaker Manager. Defaults to None.

        speaker_embedding_channels (int):
            Number of speaker embedding channels. Defaults to 256.

        use_d_vector_file (bool):
            Enable/Disable the use of d-vectors for multi-speaker training. Defaults to False.

        d_vector_file (List[str]):
            List of paths to the files including pre-computed speaker embeddings. Defaults to None.

        d_vector_dim (int):
            Number of d-vector channels. Defaults to 0.

        detach_dp_input (bool):
            Detach duration predictor's input from the network for stopping the gradients. Defaults to True.

        use_language_embedding (bool):
            Enable/Disable language embedding for multilingual models. Defaults to False.

        embedded_language_dim (int):
            Number of language embedding channels. Defaults to 4.

        num_languages (int):
            Number of languages for the language embedding layer. Defaults to 0.

        language_ids_file (str):
            Path to the language mapping file for the Language Manager. Defaults to None.

        use_speaker_encoder_as_loss (bool):
            Enable/Disable Speaker Consistency Loss (SCL). Defaults to False.

        speaker_encoder_config_path (str):
            Path to the file speaker encoder config file, to use for SCL. Defaults to "".

        speaker_encoder_model_path (str):
            Path to the file speaker encoder checkpoint file, to use for SCL. Defaults to "".

        condition_dp_on_speaker (bool):
            Condition the duration predictor on the speaker embedding. Defaults to True.

        freeze_encoder (bool):
            Freeze the encoder weigths during training. Defaults to False.

        freeze_DP (bool):
            Freeze the duration predictor weigths during training. Defaults to False.

        freeze_PE (bool):
            Freeze the posterior encoder weigths during training. Defaults to False.

        freeze_flow_encoder (bool):
            Freeze the flow encoder weigths during training. Defaults to False.

        freeze_waveform_decoder (bool):
            Freeze the waveform decoder weigths during training. Defaults to False.

        encoder_sample_rate (int):
            If not None this sample rate will be used for training the Posterior Encoder,
            flow, text_encoder and duration predictor. The decoder part (vocoder) will be
            trained with the `config.audio.sample_rate`. Defaults to None.

        interpolate_z (bool):
            If `encoder_sample_rate` not None and  this parameter True the nearest interpolation
            will be used to upsampling the latent variable z with the sampling rate `encoder_sample_rate`
            to the `config.audio.sample_rate`. If it is False you will need to add extra
            `upsample_rates_decoder` to match the shape. Defaults to True.

    d   	num_charsi  out_channels    spec_segment_size   hidden_channelsi    hidden_channels_ffn_text_encoderri   num_heads_text_encoder   num_layers_text_encoder   kernel_size_text_encoderg?dropout_p_text_encoderg      ?dropout_p_duration_predictor   kernel_size_posterior_encoderrF   dilation_rate_posterior_encoderr   num_layers_posterior_encoderkernel_size_flowdilation_rate_flow   num_layers_flow1resblock_type_decoderc                   C      g dS )N)r        r8   r8   r8   r8   r9   <lambda>1      zVitsArgs.<lambda>)default_factoryresblock_kernel_sizes_decoderc                   C   s   g dg dg dgS )N)rF   r  r  r8   r8   r8   r8   r9   r,  2  r   resblock_dilation_sizes_decoderc                   C   r)  )N)r   r   ri   ri   r8   r8   r8   r8   r9   r,  3  r-  upsample_rates_decoderi    upsample_initial_channel_decoderc                   C   r)  )N)r   r   r%  r%  r8   r8   r8   r8   r9   r,  5  r-  upsample_kernel_sizes_decoderc                   C   r)  )N)ri   r  r  r*  r+  r8   r8   r8   r8   r9   r,  6  r-  "periods_multi_period_discriminatorTuse_sdprc   noise_scalegMbX?inference_noise_scalelength_scalenoise_scale_dpinference_noise_scale_dpNmax_inference_leninit_discriminatorFuse_spectral_norm_disriminatoruse_speaker_embeddingr   num_speakersspeakers_filed_vector_filer   speaker_embedding_channelsuse_d_vector_filed_vector_dimdetach_dp_inputuse_language_embeddingembedded_language_dimnum_languageslanguage_ids_fileuse_speaker_encoder_as_loss speaker_encoder_config_pathspeaker_encoder_model_pathcondition_dp_on_speakerfreeze_encoder	freeze_DP	freeze_PEfreeze_flow_decoderfreeze_waveform_decoderr   interpolate_z	reinit_DPreinit_text_encoder)Dr   r   r   __doc__r  r   r   r  r  r  r  r  r  r  r  r   r  r   r!  r"  r#  r$  r&  r(  rz   r   r/  r   r0  r1  r2  r3  r4  r5  boolr6  r7  r8  r9  r:  r;  r<  r=  r>  r?  r@  rA  rB  rC  rD  rE  rF  rG  rH  rI  rJ  rL  rM  rN  rO  rP  rQ  rR  rS  r   rT  rU  rV  r8   r8   r8   r9   r  m  sx   
  2r  c                       s6  e Zd ZdZ				d~deddddded	ef
 fd
dZedd Z	defddZ
dd Zdd ZdefddZdd Zdd Zdd ZdefddZdd  Zedefd!d"Zdefd#d$Zd%d& Zdd'd(Zdddd)fd*ejd+ejd,ejd-ejd.ejd/efd0d1Zed2d3 Ze dddddd4fd5d6Ze 	d~d7d8Zd9d: Z d;e!d<e"j#d=e$d/e%eef fd>d?Z&ddAdBZ'd;e!dCe!dDdEdFe!dGe$f
dHdIZ(e d;e!d<e"j#d=e$fdJdKZ)d;e!dCe!dDdEdFe!dGe$d/dfdLdMZ*dNdO Z+e d/e%eef fdPdQZ,dCe!dDdEdFe!dGe$d/df
dRdSZ-d;ed/efdTdUZ.dVdW Z/ddedZe0fd[d\Z1	ddedFed]e2d^e3e4e e4e4 f d_e2d`e$dae$d/dbfdcddZ5d/e4fdedfZ6d/e4fdgdhZ7d/e4fdidjZ8dkdl Z9	YddndoZ:	mddpdqZ;edddrd^e3e4e4 e4e f fdsdtZ<ddve=d_e2fdwdxZ>ddye=fdzd{Z?dd|d}Z@  ZAS )Vitsa  VITS TTS model

    Paper::
        https://arxiv.org/pdf/2106.06103.pdf

    Paper Abstract::
        Several recent end-to-end text-to-speech (TTS) models enabling single-stage training and parallel
        sampling have been proposed, but their sample quality does not match that of two-stage TTS systems.
        In this work, we present a parallel endto-end TTS method that generates more natural sounding audio than
        current two-stage models. Our method adopts variational inference augmented with normalizing flows and
        an adversarial training process, which improves the expressive power of generative modeling. We also propose a
        stochastic duration predictor to synthesize speech with diverse rhythms from input text. With the
        uncertainty modeling over latent variables and the stochastic duration predictor, our method expresses the
        natural one-to-many relationship in which a text input can be spoken in multiple ways
        with different pitches and rhythms. A subjective human evaluation (mean opinion score, or MOS)
        on the LJ Speech, a single speaker dataset, shows that our method outperforms the best publicly
        available TTS systems and achieves a MOS comparable to ground truth.

    Check :class:`TTS.tts.configs.vits_config.VitsConfig` for class arguments.

    Examples:
        >>> from TTS.tts.configs.vits_config import VitsConfig
        >>> from TTS.tts.models.vits import Vits
        >>> config = VitsConfig()
        >>> model = Vits(config)
    NconfigapAudioProcessorr   r.   speaker_managerlanguage_managerc                    s  t  ||||| | | | | |   | jj| _| jj| _| jj| _| jj	| _	| jj
| _
| jj| _| jj| _t| jj| jj| jj| jj| jj| jj| jj| jj| jd	| _t| jj| jj| jj| jj| jj| jj| jd| _t| jj| jj| jj| jj | jj!| jd| _"| jj#rt$| jjdd| jj%d| jj&r| jnd| jd| _'nt(| jjdd| jj%| j| jd| _'t)| jjd	| jj*| jj+| jj,| jj-| jj.| jj/d| jd
d
d
d| _0| jj1rt2| jj3| jj4d| _5d S d S )N)language_emb_dim)kernel_sizedilation_rate
num_layerscond_channelsr  r  r%  r   )rc  r_  r   rF   F)inference_paddingrc  conv_pre_weight_normconv_post_weight_normconv_post_bias)periodsuse_spectral_norm)6r   r   init_multispeakerinit_multilingualinit_upsamplingr   r8  r6  r7  r:  r9  r;  r  r   r  r  r  r  r  r  r  rG  text_encoderr   r  r   r!  r"  embedded_speaker_dimposterior_encoderr   r#  r$  r&  flowr5  r   r  rN  duration_predictorr   r2   r(  r0  r/  r3  r2  r1  waveform_decoderr<  r   r4  r=  disc)r   rZ  r[  r   r]  r^  r   r8   r9   r   w  s   









	

	zVits.__init__c                 C   s   t |  jS rV   )next
parametersrh   r   r8   r8   r9   rh        zVits.devicec                 C   s   d| _ | jj| _d| _| jr| jj| _| jjr|   | jjr#|   | jj	ri| jj
du r9| jjr5| jjs9td| jj
  td t| jj
drk| jjj| jj
jd krmtjj| jjj| jj
jd d| _dS dS dS dS )a  Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer
        or with external `d_vectors` computed from a speaker encoder model.

        You must provide a `speaker_manager` at initialization to set up the multi-speaker modules.

        Args:
            config (Coqpit): Model configuration.
            data (List, optional): Dataset items to infer number of speakers. Defaults to None.
        r   Nz [!] To use the speaker consistency loss (SCL) you need to specify speaker_encoder_model_path and speaker_encoder_config_path !!z% > External Speaker Encoder Loaded !!audio_configr   	orig_freqnew_freq)rn  r   r?  audio_transformr]  r>  _init_speaker_embeddingrC  _init_d_vectorrJ  encoderrM  rL  RuntimeErrorevalrx   hasattrrZ  audior   rx  rH   
transformsResampler   rZ  r8   r8   r9   rj    s<   


zVits.init_multispeakerc                 C   s6   | j dkrtd | jj| _t| j | j| _d S d S )Nr   z. > initialization of speaker-embedding layers.)r?  rx   r   rB  rn  r   	Embeddingemb_grv  r8   r8   r9   r}    s
   

zVits._init_speaker_embeddingc                 C   s    t | dr	td| jj| _d S )Nr  zI[!] Speaker embedding layer already initialized before d_vector settings.)r  
ValueErrorr   rD  rn  rv  r8   r8   r9   r~    s   
zVits._init_d_vectorc                 C   sv   | j jdurt|jd| _| j jr6| jr6td | jj| _| j j| _t	| j| j| _
tjj| j
j dS d| _dS )zuInitialize multilingual modules of a model.

        Args:
            config (Coqpit): Model configuration.
        N)language_ids_file_pathz/ > initialization of language-embedding layers.r   )r   rI  r%   r^  rF  rx   rH  rG  r   r  emb_lrP   initxavier_uniform_r<   r  r8   r8   r9   rk    s   


zVits.init_multilingualc                 C   sD   | j jr | jjd | j j | _tjj| jjd | j jd| _dS dS )z;
        Initialize upsampling modules of a model.
        r   ry  N)	r   r   rZ  r  interpolate_factorrH   r  r  audio_resamplerrv  r8   r8   r9   rl  %  s   zVits.init_upsamplingc                 C   s,   |    | jjr| jj| j| j_dS dS )z*Freeze layers at the beginning of an epochN)_freeze_layersr   rJ  r]  r  r|   rh   )r   trainerr8   r8   r9   on_epoch_start/  s   zVits.on_epoch_startc                 C   s   | j jr,t| j}| jjtd t| j}| D ]\}}||| kr'tdqtd | j j	rZt| j
}| j
jtd t| j
}| D ]\}}||| krStdqEtd dS dS )zReinit layes if needed)fnz@ [!] The weights of Duration Predictor was not reinit check it !z! > Duration Predictor was reinit.z: [!] The weights of Text Encoder was not reinit check it !z > Text Encoder was reinit.N)r   rU  rE   rq  applyr:   r   r  rx   rV  rm  )r   r  before_dict
after_dictkeyrD   r8   r8   r9   on_init_end6  s&   



zVits.on_init_end	aux_inputc                 C   s    |  |\}}}}|d ||dS )N)speaker_ids	style_wav	d_vectorslanguage_ids)_set_cond_input)r   r  sidglidre   r8   r8   r9   get_aux_inputL  s   zVits.get_aux_inputc                 C   s   | j jr| j D ]}d|_q	t| dr| j D ]}d|_q| j jr.| j D ]}d|_q(| j j	r=| j
 D ]}d|_q7| j jrL| j D ]}d|_qF| j jr[| j D ]}d|_qUd S d S )NFr  )r   rO  rm  ru  requires_gradr  r  rQ  ro  rP  rq  rR  rp  rS  rr  )r   paramr8   r8   r9   r  P  s(   
zVits._freeze_layersc                 C   s   d\}}}}d| v r| d dur| d }|j dkr|d}d| v r<| d dur<t| d d}|j dkr<|d}d| v rT| d durT| d }|j dkrT|d}d	| v rb| d	 durb| d	 }||||fS )
zCSet the speaker conditioning input based on the multi-speaker mode.NNNNr  Nr   r  rG   ri   r  	durations)ndim
unsqueeze_F	normalizer~   )r  r  r  r  r  r8   r8   r9   r  i  s    





zVits._set_cond_inputc                 C   sb   | dd }| dd }|d ur|d urtd|d ur%t| ds%td|d ur-|}|S |}|S )Nr  r  z2[!] Cannot use d-vectors and speaker-ids together.r  z>[!] Cannot use speaker-ids without enabling speaker embedding.)r   r  r  )r   r  r  r  r  r8   r8   r9   _set_speaker_input  s   zVits._set_speaker_inputc
                 C   s  t |dt |d }
t  ^ t d| }t dtdtj  | dgd}t d|d|d  g}t d|| |g}t d|d  | dgd}|| | | }t	||

dd }W d    n1 sqw   Y  |d}| jjr| j| jjr| n|||| jjr|d ur| n|| jjr|	d ur|	 n|	d}|t | }nCt |d	 | }| j| jjr| n||| jjr|d ur| n|| jjr|	d ur|	 n|	d}t || d ddgt | }||d
< ||fS )NrG   ri   g      rF   zklm, kln -> kmnr  r  lang_embrv   loss_duration)rP   r~   no_gradrW   r?   mathrQ   pieinsumr!   rw   detachr   r5  rq  rE  )r   outputsz_pm_plogs_prK   x_masky_maskr  r  	attn_masko_scalelogp1logp2logp3logp4logpattnattn_durationsr  attn_log_durationslog_durationsr8   r8   r9   forward_mas  s<   
( 

"zVits.forward_masc                 C   s   | j }| jjrE|d ur|t| j n|}|t| j }| jjrEtjjj	|| jgdd
d}|d urE|d urEt|| j d |jd}||||fS )Nlinear)scale_factorrl   r   rF   )r  r   r   r   r  rT  rP   r   r   interpolaterw   r$   r|   rg   r~   )r   z	slice_ids	y_lengthsr  r  r8   r8   r9   upsampling_z  s   zVits.upsampling_zr  r  r  rK   	x_lengthsr   r  r   returnc                 C   s  i }|  |\}}	}
}| jjr|dur| |d}	d}| jjr-|
dur-| |
d}| j|||d\}}}}| j|||	d\}}}}| j	|||	d}| j
||||||||	|d	\}}td||g}td||g}t||| jddd\}}| j||d	\}}}}| j||	d}t||| jjj || jjj dd
}| jjr| jjdurtj||fdd}| jdur| |}| jjj|dd}tj|ddd\}}nd\}}|||d||||||||||d |S )aC  Forward pass of the model.

        Args:
            x (torch.tensor): Batch of input character sequence IDs.
            x_lengths (torch.tensor): Batch of input character sequence lengths.
            y (torch.tensor): Batch of input spectrograms.
            y_lengths (torch.tensor): Batch of input spectrogram lengths.
            waveform (torch.tensor): Batch of ground truth waveforms per sample.
            aux_input (dict, optional): Auxiliary inputs for multi-speaker and multi-lingual training.
                Defaults to {"d_vectors": None, "speaker_ids": None, "language_ids": None}.

        Returns:
            Dict: model outputs keyed by the output name.

        Shapes:
            - x: :math:`[B, T_seq]`
            - x_lengths: :math:`[B]`
            - y: :math:`[B, C, T_spec]`
            - y_lengths: :math:`[B]`
            - waveform: :math:`[B, 1, T_wav]`
            - d_vectors: :math:`[B, C, 1]`
            - speaker_ids: :math:`[B]`
            - language_ids: :math:`[B]`

        Return Shapes:
            - model_outputs: :math:`[B, 1, T_wav]`
            - alignments: :math:`[B, T_seq, T_dec]`
            - z: :math:`[B, C, T_dec]`
            - z_p: :math:`[B, C, T_dec]`
            - m_p: :math:`[B, C, T_dec]`
            - logs_p: :math:`[B, C, T_dec]`
            - m_q: :math:`[B, C, T_dec]`
            - logs_q: :math:`[B, C, T_dec]`
            - waveform_seg: :math:`[B, 1, spec_seg_size * hop_length]`
            - gt_spk_emb: :math:`[B, 1, speaker_encoder.proj_dim]`
            - syn_spk_emb: :math:`[B, 1, speaker_encoder.proj_dim]`
        NrG   r  r  r  zklmn, kjm -> kjnT)let_short_samples	pad_short)r  r  r   )r   )l2_normri   NNrF   )model_outputs
alignmentsr  r  r  r  m_qlogs_qwaveform_seg
gt_spk_embsyn_spk_embr  )r  r   r>  r  r~   rF  r  rm  ro  rp  r  rP   r  r"   r  r  rr  r#   rZ  r  rn   rJ  r]  r  catr|  forwardchunkupdaterw   )r   rK   r  r   r  r   r  r  r  r  r  re   r  r  r  r  r  r  r  r  r  r  z_slicer  r  owav_seg
wavs_batch	pred_embsr  r  r8   r8   r9   r    sZ   . 

zVits.forwardc                 C   s8   d|v r|d d ur|d S t | jdd | jS )Nr  rF   ri   )rP   tensorr   r|   rh   )rK   r  r8   r8   r9   _set_x_lengths:  s   zVits._set_x_lengthsr  r  r  r  r  c              	   C   sL  |  |\}}}}| ||}| jjr|dur| |d}d}| jjr1|dur1| |d}| j|||d\}}	}
}|du rr| jj	rX| j
||| jjrO|ndd| j|d}n| j
||| jjrb|nd|d}t|| | j }n|jd |jd ks~J |d}t|}tt|dd	gd }t|d|jd}||dd	 }t|d|ddd	}t|dd	|	dd	dd	}	t|dd	|
dd	dd	}
|	t|	t|
 | j  }| j|||dd
}| j|||d\}}}}| j || ddddd| j!f |d}||d||||	|
|d}|S )aG  
        Note:
            To run in batch mode, provide `x_lengths` else model assumes that the batch size is 1.

        Shapes:
            - x: :math:`[B, T_seq]`
            - x_lengths: :math:`[B]`
            - d_vectors: :math:`[B, C]`
            - speaker_ids: :math:`[B]`

        Return Shapes:
            - model_outputs: :math:`[B, 1, T_wav]`
            - alignments: :math:`[B, T_seq, T_dec]`
            - z: :math:`[B, C, T_dec]`
            - z_p: :math:`[B, C, T_dec]`
            - m_p: :math:`[B, C, T_dec]`
            - logs_p: :math:`[B, C, T_dec]`
        NrG   r  T)r  reverser6  r  r  r   rF   ri   r  r  )r  r  r  )r  r  r  r  r  r  r  r  )"r  r  r   r>  r  r~   rF  r  rm  r5  rq  rN  r:  rP   rW   r8  r   ceil	clamp_minr?   longr$   r|   rg   	transposer    rw   r   
randn_liker7  rp  r  rr  r;  )r   rK   r  r  r  r  r  r  r  r  r  r  logwrC   w_ceilr  r  r  r  r  r  re   r  r  r8   r8   r9   	inference@  sZ   	

$$*
zVits.inferencec                 C   sx   t || jjj| jjj| jjjdd}t|dg	|j
}|dur%|n|}|dur-|n|}	| ||||	\}
}}|
S )a  Inference for voice conversion

        Args:
            reference_wav (Tensor): Reference wavform. Tensor of shape [B, T]
            speaker_id (Tensor): speaker_id of the target speaker. Tensor of shape [B]
            d_vector (Tensor): d_vector embedding of target speaker. Tensor of shape `[B, C]`
            reference_speaker_id (Tensor): speaker_id of the reference_wav speaker. Tensor of shape [B]
            reference_d_vector (Tensor): d_vector embedding of the reference_wav speaker. Tensor of shape `[B, C]`
        Frq   rG   N)r   rZ  r  r   rn   ro   rP   r  r   r|   rh   voice_conversion)r   reference_wav
speaker_idd_vectorreference_speaker_idreference_d_vectorr   r  speaker_cond_srcspeaker_cond_tgtr   re   r8   r8   r9   inference_voice_conversion  s   zVits.inference_voice_conversionc                 C   s   | j dks	J d| jjr4| jjs4| tt|	d	d}| tt|	d	d}n| jjsM| jjrMt
|	d}t
|	d}ntd| j|||d\}}}}	| j||	|d}
| j|
|	|dd}| j||	 |d}||	||
|ffS )a  Forward pass for voice conversion

        TODO: create an end-point for voice conversion

        Args:
            y (Tensor): Reference spectrograms. Tensor of shape [B, T, C]
            y_lengths (Tensor): Length of each reference spectrogram. Tensor of shape [B]
            speaker_cond_src (Tensor): Reference speaker ID. Tensor of shape [B,]
            speaker_cond_tgt (Tensor): Target speaker ID. Tensor of shape [B,]
        r   z&num_speakers have to be larger than 0.rG   z@ [!] Voice conversion is only supported on multi-speaker models.r  Tr  )r?  r   r>  rC  r  rP   r   r   r   r~   r  r  r  ro  rp  rr  )r   r   r  r  r  g_srcg_tgtr  re   r  r  z_hato_hatr8   r8   r9   r    s   "$zVits.voice_conversionr   	criterionoptimizer_idxc                 C   sb  |d }|dkri|d }|d }|d }|d }|d }	|d }
|d	 }| j |||||||	|
d
d}|| _| |d  |d \}}}}tdd || ||}W d   ||fS 1 s`w   Y  ||fS |dkr-|d }tddL | jjr| jt| j	 }n| j}t
| | jd |dd}t| jd  | jjj| jjj| jjj| jjj| jjj| jjj| jjjdd	}W d   n1 sw   Y  | | jd | jd \}}}}tdd? || | | | jd  | jd  | jd  | jd  ||||| jd | jj| jd | jd d}W d   n	1 s#w   Y  | j|fS td) a  Perform a single training step. Run the model forward pass and compute losses.

        Args:
            batch (Dict): Input tensors.
            criterion (nn.Module): Loss layer designed for the model.
            optimizer_idx (int): Index of optimizer to use. 0 for the generator and 1 for the discriminator networks.

        Returns:
            Tuple[Dict, Dict]: Model ouputs and computed losses.
        	spec_lensr   r   r   r   r  r  r  r   r  r  r  r  F)enabledNrF   r   r  Tr  )	r   r   r   r   rn   ro   r   r   rq   r  r  r  r  r  r  r  )mel_slice_hat	mel_slicer  r  r  r  z_lenscores_disc_fakefeats_disc_fakefeats_disc_realr  rJ  r  r  z  [!] Unexpected `optimizer_idx`.)r  model_outputs_cachers  r  r   r   r   r  r   r  r#   r   r   rZ  r  r   r   r   rn   ro   r   r   rJ  r  )r   r   r  r  r  r   token_lenghtsr   r  r  r  r   r  r  re   scores_disc_real	loss_dictr   r  r	  r  r  r  r8   r8   r9   
train_step  s   




zVits.train_steptrainc                 C   s   |d d }|d d }t ||||}|d d   }| d|i}	|d d }
|
d j  j}|dt|dd	i ||	fS )
NrF   r  r  r   z/audior  	alignmentF
output_fig)	r3   rw   r  cpunumpyr>   Tr  r/   )r   r[  r   r  name_prefixy_hatr   figuressample_voiceaudiosr  	align_imgr8   r8   r9   _log5  s   z	Vits._logr  loggerLoggerassetsstepsc                 C   s8   |  | j||d\}}||| |||| jj dS )a  Create visualizations and waveform examples.

        For example, here you can plot spectrograms and generate sample sample waveforms from these spectrograms to
        be projected onto Tensorboard.

        Args:
            ap (AudioProcessor): audio processor used at training.
            batch (Dict): Model inputs used at the previous training step.
            outputs (Dict): Model outputs generated at the previoud training step.

        Returns:
            Tuple[Dict, np.ndarray]: training plots and output waveform.
        r  N)r   r[  train_figurestrain_audiosr   r   r   r  r!  r#  r$  r  r  r8   r8   r9   	train_logF  s   zVits.train_logc                 C   s   |  |||S rV   )r  )r   r   r  r  r8   r8   r9   	eval_stepZ  rw  zVits.eval_stepc                 C   s8   |  | j||d\}}||| |||| jj d S )Nr  )r   r[  eval_figureseval_audiosr   r'  r8   r8   r9   eval_log^  s   zVits.eval_logc           
      C   s2  t | jdr| jj}n| j}d\}}}}t|trHt|dkr$|d }n&t|dkr/|\}}nt|dkr;|\}}}nt|dkrG|\}}}}n|}d\}}}	t | d	r~|jrk|d u ra| j }n| jj	|d d
d}n|j
r~|d u rx| j }n| jj| }t | dr|jr|d ur| jj| }	|||||	|dS )Nr   r  rF   r   ri   r  r%  NNNr]  F)num_samples	randomizer^  )r   r  r  r  language_idr   )r  rZ  r   
isinstancelistr   rC  r]  get_random_embeddingget_mean_embeddingr>  get_random_id
name_to_idrF  r^  )
r   sentence_inforZ  r   r   r  r   r  r  r0  r8   r8   r9   !get_aux_input_from_test_sentencesc  sD   





z&Vits.get_aux_input_from_test_sentencesc                 C   s   t d i }i }| jj}t|D ]D\}}| |}t| |d | jdtt|  j	v |d |d |d |d dd	d


 \}}	}
}
||d|< t|	jd	d|d|< q||dS )zGeneric test run for `tts` models used by `Trainer`.

        You can override this for a different behaviour.

        Returns:
            Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard.
        z! | > Synthesizing test sentences.r   cudar  r  r  r0  TF)r  r  r  r0  use_griffin_limdo_trim_silencez{}-audior  z{}-alignment)r  r  )rx   rZ  test_sentences	enumerater8  r'   rz   rt  ru  rh   valuesformatr/   r  )r   r#  test_audiostest_figuresr<  r   s_info
aux_inputsr   r  re   r8   r8   r9   test_run  s.   	


zVits.test_runc                 C   s*   | ||d | jj |||d  d S )Nr  r  )r@  r[  r   rA  )r   r  r!  r#  r$  r8   r8   r9   test_log  s   zVits.test_logc                    s   d}d}d}j durj jrjjrfdd|d D }|dur't|}j durHj jrHjjrHj j  fdd|d D }t|}j	dur`j	jr`jj
r`fdd|d D }|durit|}||d	< ||d
< ||d< |S )zGCompute speaker, langugage IDs and d_vector for the batch if necessary.Nc                       g | ]} j j| qS r8   )r]  r6  )r   snrv  r8   r9   r         z%Vits.format_batch.<locals>.<listcomp>r   c                    s   g | ]} | d  qS )	embeddingr8   r   )d_vector_mappingr8   r9   r     rH  r   c                    rF  r8   )r^  r6  )r   lnrv  r8   r9   r     rH  r   r  r  r  )r]  r6  r   r>  rP   r   
embeddingsrC  r  r^  rF  )r   r   r  r  r  r8   )rJ  r   r9   format_batch  s&   


zVits.format_batchc                 C   s~  | j j}| jjr| |d }n|d }t||j|j|jdd|d< | jjrwt|d |j|j|jdd}|	dt
|d 	d| j kr]|dddddt
|d 	d| j f }n|d dddddt
|	d| j f |d< n|d }t||j|j|j|j|jd|d< | jjr|d jd t
|d jd | j ksJ |d jd  d	|d jd  n!|d jd |d jd ksJ |d jd  d	|d jd  |d jd |d
  
 |d< |d jd |d
  
 |d< | jjr|d |d | j 
   dksJ n|d |d   dksJ |d t|d d |d< |d t|d d |d< |S )z#Compute spectrograms on the device.r   Fr  r   ri   N)r   r   r   r   r   r   r   z, r   r  mel_lensr   rF   )rZ  r  r   r   r  r   r   rn   ro   r   r   r  r   r   r   r   r   r   r?   r$   r~   )r   r   acr   spec_melr8   r8   r9   format_batch_on_device  sD   "04
	 B
*zVits.format_batch_on_devicerF   Fdatasetc                 C   s   d }|j }t|ddrC|j D ]2\}}td| d| d |j|d }	t|	 t|||	d\}}
}|| }td|
 d|  q|d ur`t|t	|}t
|||rV|jn|jd	d
 dd}nd }|d u rt|dkrpt|}|S d }|S |dkr|t|n|}|S )Nuse_weighted_samplerFz) > Using weighted sampler for attribute 'z' with alpha '')r   r   r   z > Attribute weights for 'z' 
 | > c                 S   s   t j| d S )Nr   )r   r   r   )rK   r8   r8   r9   r,  "  s    z"Vits.get_sampler.<locals>.<lambda>T)r>   
batch_sizesort_key	drop_lastrF   )r   r6   weighted_sampler_attrsr   rx   weighted_sampler_multipliersr   r   r   r   r1   eval_batch_sizerU  r   r   )r   rZ  rR  num_gpusis_evalweights
data_itemsr   alphar   
attr_namesattr_weights	w_samplerbatch_samplerr8   r8   r9   get_sampler  s<   zVits.get_samplerr\  r   verboser[  rankr   c                 C   s  |r	|j s	d }|S t| j||rdn|j|j |j|j|j|j|j	|j
|| j|jd}	|dkr2t  |	  | ||	|}
|
d u r[t|	|rH|jn|jd|	jd|rS|jn|jdd}|S |dkrxt|	|
|rg|jn|j|	j|rp|jn|jdd}|S t|	|
|	j|r|jn|jdd}|S )Nr   )r   r   batch_group_sizemin_text_lenr   r   max_audio_lenphoneme_cache_pathprecompute_num_workersre  r   start_by_longestrF   F)rU  shuffler  rW  num_workers
pin_memory)samplerrU  r  rn  ro  )rc  r  rn  ro  )run_evalr   r   rg  rU  rh  r   r   ri  rj  rk  r   rl  distbarrierpreprocess_samplesrd  r   rZ  r  num_eval_loader_workersnum_loader_workers)r   rZ  r#  r\  r   re  r[  rf  loaderrR  rp  r8   r8   r9   get_data_loader0  sd   

7zVits.get_data_loaderc                 C   sV   t | jj| jj| jj| j}tdd |  D }t | jj| jj| jj|d}||gS )zInitiate and return the GAN optimizers based on the config parameters.
        It returnes 2 optimizers in a list. First one is for the generator and the second one is for the discriminator.
        Returns:
            List: optimizers.
        c                 s   s"    | ]\}}| d s|V  qdS )zdisc.N)
startswith)r   r   paramsr8   r8   r9   	<genexpr>}  s     z%Vits.get_optimizer.<locals>.<genexpr>)ru  )	r   rZ  	optimizeroptimizer_paramslr_discrs  r   r=   lr_gen)r   
optimizer0gen_parameters
optimizer1r8   r8   r9   r   t  s   zVits.get_optimizerc                 C   s   | j j| j jgS )zSet the initial learning rates for each optimizer.

        Returns:
            List: learning rates for each optimizer.
        )rZ  r~  r  rv  r8   r8   r9   get_lr  s   zVits.get_lrc                 C   s8   t | jj| jj|d }t | jj| jj|d }||gS )zSet the schedulers for each optimizer.

        Args:
            optimizer (List[`torch.optim.Optimizer`]): List of optimizers.

        Returns:
            List: Schedulers, one for each optimizer.
        r   rF   )r   rZ  lr_scheduler_disclr_scheduler_disc_paramslr_scheduler_genlr_scheduler_gen_params)r   r|  scheduler_Dscheduler_Gr8   r8   r9   r     s   	zVits.get_schedulerc                 C   s$   ddl m}m} || j|| jgS )zxGet criterions for each optimizer. The index in the output list matches the optimizer idx used in
        `train_step()`r   )VitsDiscriminatorLossVitsGeneratorLoss)TTS.tts.layers.lossesr  r  rZ  )r   r  r  r8   r8   r9   get_criterion  s   zVits.get_criterionTc           
      C   s  t |td|d}dd |d  D |d< | jjdur"|r"d| _t| dri|d d j| j	j
jkri| j	j
jd	 |d d jd	  }td
| d |d d }t||jd }	tj||	gd	d}||d d< | j|d |d |r}|   | jrJ dS dS )z=Load the model checkpoint and setup for training or inferencer  )map_locationcachec                 S   s   i | ]\}}d |vr||qS )speaker_encoderr8   )r   r   vr8   r8   r9   r     s    z(Vits.load_checkpoint.<locals>.<dictcomp>modelNr  zemb_g.weightr   z > Loading checkpoint with z additional speakers.rF   )axisstrict)r0   rP   rh   r   r   r   r  r  r   r  r<   rx   randnr  load_state_dictr  training)
r   rZ  checkpoint_pathr  r  r  statenum_new_speakersr  new_rowr8   r8   r9   load_checkpoint  s"   " 
zVits.load_checkpointc                 C   s   ddl }ddlm} d| _tj|d}tj|d}tj|d}	t|ddd	}
||
}W d   n1 s9w   Y  |d
 d | j	j
_t|	}t|j|jj| j_td||d|d
 d dd| _t|}| j||d |r{|   | jr}J dS dS )u  Load VITS checkpoints released by fairseq here: https://github.com/facebookresearch/fairseq/tree/main/examples/mms
        Performs some changes for compatibility.

        Args:
            config (Coqpit): 🐸TTS model config.
            checkpoint_dir (str): Path to the checkpoint directory.
            eval (bool, optional): Set to True for evaluation. Defaults to False.
        r   N)basic_cleanerszconfig.jsonzG_100000.pthz	vocab.txtrutf-8encodingr>   sampling_rateF	add_blank)use_phonemestext_cleanerr   
phonemizerr  use_eos_bosr  )jsonTTS.tts.utils.text.cleanersr  rs  r   r   joinopenrI   rZ  r  r   FairseqVocabr   r  r  r   r  rm  embr.   r   r   r  r  r  )r   rZ  checkpoint_dirr  r  r  r  config_filecheckpoint_file
vocab_filefile
config_orgvocabnew_chkr8   r8   r9   load_fairseq_checkpoint  s6   
	
zVits.load_fairseq_checkpoint
VitsConfigc                 C   s   ddl m} tt| jj }| jjs)|| j	j
ks(J d| d| j	j
 n| j	j| jj }| j	j
| }||ksEJ d| d| |j| |d}t| \}}	t| |}
t| }| jjrl|
| jj| jj t|	|||
|S )zInitiate model from config

        Args:
            config (VitsConfig): Model config.
            samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training.
                Defaults to None.
        r   )r\  zA [!] Product of upsample rates must be equal to the hop length - z vs )re  )TTS.utils.audior\  rP   prod	as_tensorr   r1  r@   r   r  rn   r   init_from_configr.   r&   r%   rM  init_encoderrL  rY  )rZ  r   re  r\  upsample_rate$encoder_to_vocoder_upsampling_factoreffective_hop_lengthr[  r   
new_configr]  r^  r8   r8   r9   r    s(   	

zVits.init_from_configcoqui_vits.onnxoutput_pathc                    s`   j }d}t dr j} j}d _   d fdd	}| _ d}tjddd|ftjd	}t|	dg}	t
 j j jg}
||	|
f}g d
} jdkratdg}||f7 }|d t dr jdkr jdkrtdg}||f7 }|d tjj |d|||dgdddddidddddd | _ |r   |dur| _dS dS )zExport model to ONNX format for inference

        Args:
            output_path (str): Path to save the exported model.
            verbose (bool): Print verbose information. Defaults to True.
        Nrs  c                    sH   |d }|d }|d }| _ | _| _ j| |d ||d ddd S )Nr   rF   ri   r  r  r  )r6  r8  r9  r  )r   text_lengthsscalesr  langidr6  r8  r9  rv  r8   r9   onnx_inference!  s"   	z(Vits.export_onnx.<locals>.onnx_inferencer  r   ri   rF   )lowhighr   rg   inputinput_lengthsr  r  rH  r     r\   rU  phonemes)r   rF   time1time2)r   rF   ri   )r  r  r\   )r  r   opset_versionfre  input_namesoutput_namesdynamic_axesr  )r  r  rs  r  r  rP   randintr  r   r   r  r7  r8  r:  r?  r   rH  rG  onnxexportr  )r   r  re  _forwardrs  r  r  dummy_input_length	sequencessequence_lengthsr  dummy_inputr  r  r0  r8   rv  r9   export_onnx  sT   	








zVits.export_onnx
model_pathc                 C   s@   dd l }|du r
dndddifg}| }|j|||d| _d S )Nr   FCPUExecutionProviderCUDAExecutionProvidercudnn_conv_algo_searchDEFAULT)sess_options	providers)onnxruntimeSessionOptionsInferenceSession	onnx_sess)r   r  r9  ortr  r  r8   r8   r9   	load_onnx^  s   
zVits.load_onnxc                 C   s   t |tjr|  }|du rtj|jd gtjd}t |tjr(|  }tj| j	| j
| jgtjd}|||d}|durLt|g  |d< |dur\t|g  |d< | jdg|}|d d S )	zONNX inferenceNrF   )rg   r  r  r  r\   r   )r1  rP   Tensorr  r  r   r   r   int64r7  r8  r:  float32r  r  run)r   rK   r  r  r0  r  input_paramsr  r8   r8   r9   inference_onnxm  s(   zVits.inference_onnxr  r-  )r  )rF   FrV   )FTF)FT)NT)r  TF)Br   r   r   rW  r
   r&   r%   r   r  rh   rj  r}  r~  rk  rl  r  r  r   r  r  staticmethodr  r  r  r  rP   r  r  r  r  r  r  r  dictr   Moduler   r   r  r   r(  r)  r,  r8  rD  rE  rM  rQ  r   rd  rX  r	   r   rx  r   r  r   r  r  r  r  rz   r  r  r  r  r8   r8   r   r9   rY  [  s    _
,

#

y
V$
g
"0
5,	
D

,&"PrY  c                       sd   e Zd ZdZeeeefdededededdf
 fdd	Z	d
d Z
edefddZdddZ  ZS )VitsCharacterszICharacters class for VITs model for compatibility with pre-trained models	graphemespunctuationsr}   ipa_charactersr  Nc              
      s0   |d ur||7 }t  j|||d d dddd d S )Nz<BLNK>FT)	is_unique	is_sorted)r   r   )r   r  r  r}   r  r   r8   r9   r     s    zVitsCharacters.__init__c                 C   sV   | j gt| j t| j | jg | _dd t| jD | _dd t| jD | _	d S )Nc                 S      i | ]\}}||qS r8   r8   r   r   charr8   r8   r9   r     r   z0VitsCharacters._create_vocab.<locals>.<dictcomp>c                 S      i | ]\}}||qS r8   r8   r  r8   r8   r9   r     r   )
r+   r2  r-   r*   _blank_vocabr=  r  _char_to_id_id_to_charrv  r8   r8   r9   _create_vocab  s   &zVitsCharacters._create_vocabrZ  c                 C   sd   | j d ur#| j d }| j d }| j d }| j d }t||||d| fS t }t| | d}||fS )Nr}   r  r   r  )r  r  r  r}   )r   )r   r  r   	to_config)rZ  r+   r-   _letters_letters_ipar   r  r8   r8   r9   r    s   




zVitsCharacters.init_from_configr   c              
   C   s    t | j| j| jd d | jdddS )NFT)r   r  r}   eosbosblankr  r  )r   r*   r-   r+   r   rv  r8   r8   r9   r    s   zVitsCharacters.to_config)r  r   )r   r   r   rW  r*   r-   r+   r,   rz   r   r  r  r
   r  r  r  r8   r8   r   r9   r    s*    r  c                       s<   e Zd Zdef fddZedd Zejdd Z  ZS )r  r  c                    s   t t  || _d S rV   )r   r  r   r  )r   r  r   r8   r9   r     s   
zFairseqVocab.__init__c                 C   s   | j S )z!Return the vocabulary dictionary.)r  rv  r8   r8   r9   r    s   zFairseqVocab.vocabc                 C   s   t |dd}dd | D | _W d    n1 sw   Y  | jd | _d| _dd t| jD | _d	d t| jD | _d S )
Nr  r  c                 S   s   g | ]}| d dqS )
rK  )r   r   r8   r8   r9   r     rH  z&FairseqVocab.vocab.<locals>.<listcomp>r    c                 S   r  r8   r8   r   r  sr8   r8   r9   r     r   z&FairseqVocab.vocab.<locals>.<dictcomp>c                 S   r  r8   r8   r  r8   r8   r9   r     r   )r  	readlinesr  r
  r}   r=  r  r  )r   r  r  r8   r8   r9   r    s   )	r   r   r   rz   r   r  r  setterr  r8   r8   r   r9   r    s    
r  )rF   rN   r   r  rV   )sr  r   dataclassesr   r   r   	itertoolsr   typingr   r   r   r	   r  r   rP   torch.distributeddistributedrr  rH   coqpitr
   librosa.filtersr   r   r   torch.cuda.amp.autocast_moder   torch.nnr   r  torch.utils.datar   torch.utils.data.samplerr   trainer.torchr   r   trainer.trainer_utilsr   r   TTS.tts.configs.shared_configsr   TTS.tts.datasets.datasetr   r   *TTS.tts.layers.glow_tts.duration_predictorr   !TTS.tts.layers.vits.discriminatorr   TTS.tts.layers.vits.networksr   r   r   1TTS.tts.layers.vits.stochastic_duration_predictorr   TTS.tts.models.base_ttsr   TTS.tts.utils.fairseqr   TTS.tts.utils.helpersr    r!   r"   r#   r$   TTS.tts.utils.languagesr%   TTS.tts.utils.speakersr&   TTS.tts.utils.synthesisr'   TTS.tts.utils.text.charactersr(   r)   r*   r+   r,   r-   TTS.tts.utils.text.tokenizerr.   TTS.tts.utils.visualr/   TTS.utils.ior0   TTS.utils.samplersr1   $TTS.vocoder.models.hifigan_generatorr2   TTS.vocoder.utils.generic_utilsr3   r{   r   r  r  r:   rE   rM   rU   rX   r]   r_   r   r   r   r   r2  rz   r  r   r   r  rY  r  r  r8   r8   r8   r9   <module>   s     	


-
8m n          =0