o
    
j}                     @   s   d dl Z d dlmZ d dlZd dlZd dlm  mZ d dl	Z	d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZmZ d d	lmZ d d
lmZ e  ddedddddddd ddfddZdd Zdd ZeG dd deZ eG dd deZ!G dd deZ"dS )     N)	dataclass)Coqpit)GPT)HifiDecoder)init_stream_support)VoiceBpeTokenizersplit_sentence)SpeakerManagerLanguageManager)BaseTTS)load_fsspecz"../experiments/clips_mel_norms.pthcpu         F"V  @  P   c                 C   sz   t jj||||||	|
||dd
|}| |} || }ttj|dd}|du r1tj||d}||dd }|S )	a  
    Convert waveform to mel-spectrogram with hard-coded parameters for cloning.

    Args:
        wav (torch.Tensor): Input waveform tensor.
        mel_norms_file (str): Path to mel-spectrogram normalization file.
        mel_norms (torch.Tensor): Mel-spectrogram normalization tensor.
        device (torch.device): Device to use for computation.

    Returns:
        torch.Tensor: Mel-spectrogram tensor.
    slaney)
n_fft
hop_length
win_lengthpower
normalizedsample_ratef_minf_maxn_melsnormgh㈵>)minNmap_locationr   )	
torchaudio
transformsMelSpectrogramtotorchlogclampload	unsqueeze)wavmel_norms_file	mel_normsdevicer   r   r   r   r   r   r   r   r   mel_stftmel r2   F/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/models/xtts.pywav_to_mel_cloning   s*   
r4   c                 C   s   t | \}}|ddkrtj|ddd}||kr"t j|||}t|dks0t|dk sAtd|  d|	  d|
   |d	d |S )
Nr      T)dimkeepdim
   zError with z. Max=z min=r"   )r#   r*   sizer'   mean
functionalresampleanyprintmaxr   clip_)	audiopathsampling_rateaudiolsrr2   r2   r3   
load_audioE   s   "rE   c                 C   sR   | dd|f }| j d |kr| }|S | j d |k r't| d|| j d  f}|S )a>  
    Ensure a given tensor t has a specified sequence length by either padding it with zeros or clipping it.

    Args:
        t (torch.Tensor): The input tensor to be padded or truncated.
        length (int): The desired length of the tensor.

    Returns:
        torch.Tensor: The padded or truncated tensor.
    .Nr"   r   )shapeFpad)tlengthtpr2   r2   r3   pad_or_truncate[   s   rL   c                   @   s*   e Zd ZU dZdZeed< dZeed< dS )XttsAudioConfigz
    Configuration class for audio-related parameters in the XTTS model.

    Args:
        sample_rate (int): The sample rate in which the GPT operates.
        output_sample_rate (int): The sample rate of the output audio waveform.
    r   r   ]  output_sample_rateN)__name__
__module____qualname____doc__r   int__annotations__rO   r2   r2   r2   r3   rM   n   s   
 rM   c                   @   sz  e Zd ZU dZdZeed< dZeed< dZ	eed< dZ
eed	< dZeed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed < d!Zeed"< d#Zeed$< dZeed%< dZeed&< dZeed'< d(Zeed)< d*Z eed+< d,Z!eed-< dZ"eed.< d/Z#eed0< dZ$eed1< d2Z%eed3< dS )4XttsArgsa_	  A dataclass to represent XTTS model arguments that define the model structure.

    Args:
        gpt_batch_size (int): The size of the auto-regressive batch.
        enable_redaction (bool, optional): Whether to enable redaction. Defaults to True.
        kv_cache (bool, optional): Whether to use the kv_cache. Defaults to True.
        gpt_checkpoint (str, optional): The checkpoint for the autoregressive model. Defaults to None.
        clvp_checkpoint (str, optional): The checkpoint for the ConditionalLatentVariablePerseq model. Defaults to None.
        decoder_checkpoint (str, optional): The checkpoint for the DiffTTS model. Defaults to None.
        num_chars (int, optional): The maximum number of characters to generate. Defaults to 255.

        For GPT model:
        gpt_max_audio_tokens (int, optional): The maximum mel tokens for the autoregressive model. Defaults to 604.
        gpt_max_text_tokens (int, optional): The maximum text tokens for the autoregressive model. Defaults to 402.
        gpt_max_prompt_tokens (int, optional): The maximum prompt tokens or the autoregressive model. Defaults to 70.
        gpt_layers (int, optional): The number of layers for the autoregressive model. Defaults to 30.
        gpt_n_model_channels (int, optional): The model dimension for the autoregressive model. Defaults to 1024.
        gpt_n_heads (int, optional): The number of heads for the autoregressive model. Defaults to 16.
        gpt_number_text_tokens (int, optional): The number of text tokens for the autoregressive model. Defaults to 255.
        gpt_start_text_token (int, optional): The start text token for the autoregressive model. Defaults to 255.
        gpt_checkpointing (bool, optional): Whether to use checkpointing for the autoregressive model. Defaults to False.
        gpt_train_solo_embeddings (bool, optional): Whether to train embeddings for the autoregressive model. Defaults to False.
        gpt_code_stride_len (int, optional): The hop_size of dvae and consequently of the gpt output. Defaults to 1024.
        gpt_use_masking_gt_prompt_approach (bool, optional):  If True, it will use ground truth as prompt and it will mask the loss to avoid repetition. Defaults to True.
        gpt_use_perceiver_resampler (bool, optional):  If True, it will use perceiver resampler from flamingo paper - https://arxiv.org/abs/2204.14198. Defaults to False.
    r5   gpt_batch_sizeFenable_redactionTkv_cacheNgpt_checkpointclvp_checkpointdecoder_checkpoint   	num_chars tokenizer_filei]  gpt_max_audio_tokensi  gpt_max_text_tokensF   gpt_max_prompt_tokens   
gpt_layersr   gpt_n_model_channels   gpt_n_headsgpt_number_text_tokensgpt_start_text_tokengpt_stop_text_tokeni   gpt_num_audio_tokensi    gpt_start_audio_tokeni   gpt_stop_audio_tokengpt_code_stride_len"gpt_use_masking_gt_prompt_approachgpt_use_perceiver_resamplerr   input_sample_raterN   rO      output_hop_lengthdecoder_input_dimi   d_vector_dim&cond_d_vector_in_each_upsampling_layeri  duration_const)&rP   rQ   rR   rS   rW   rT   rU   rX   boolrY   rZ   strr[   r\   r^   r`   ra   rb   rd   rf   rg   ri   rj   rk   rl   rm   rn   ro   rp   rq   rr   rs   rO   ru   rv   rw   rx   ry   r2   r2   r2   r3   rV   |   s@   
 rV   c                       sX  e Zd ZdZdef fddZdd Zedd Ze	
 d9dedefddZe	
 dd Ze	
 			
	
			d:ddZd;ddZe	
 									
		d<dd Ze	
 							!		d=d"d#Zd$d% Ze	
 	&	'								d>d(d)Zd*d+ Zd,d- Zed?d/d0Z fd1d2Zd3d4 Z							d@d5d6Zd7d8 Z  ZS )AXttsu  ⓍTTS model implementation.

    ❗ Currently it only supports inference.

    Examples:
        >>> from TTS.tts.configs.xtts_config import XttsConfig
        >>> from TTS.tts.models.xtts import Xtts
        >>> config = XttsConfig()
        >>> model = Xtts.inif_from_config(config)
        >>> model.load_checkpoint(config, checkpoint_dir="paths/to/models_dir/", eval=True)
    configc                    sp   t  j|d d d d | _|| _| jj| _| jj| _|j| _| jj	| _	t
 | _d | _|   | dtd d S )N)ap	tokenizer	mel_statsr   )super__init__mel_stats_pathr}   argsrZ   r\   	model_dir
models_dirrW   r   r   gptinit_modelsregister_bufferr'   ones)selfr}   	__class__r2   r3   r      s   


zXtts.__init__c                 C   s   | j j dur| j  | j_| j j d| j_| j j d| j_| jjrRt| jj| jj	| jj| jj| jj
| jj| jj| jj| jj| jj| jj| jj| jj| jjd| _t| jj| jj| jj| jj| jj| jj| jjd| _dS )zOInitialize the models. We do it here since we need to load the tokenizer first.Nz[START]z[STOP])layers	model_dimstart_text_tokenstop_text_tokenheadsmax_text_tokensmax_mel_tokensmax_prompt_tokensnumber_text_tokensnum_audio_tokensstart_audio_tokenstop_audio_tokenuse_perceiver_resamplercode_stride_len)rs   rO   ru   ar_mel_length_compressionrv   rw   rx   )r   get_number_tokensr   rj   token_to_idrk   rl   r   rf   rg   ri   rb   ra   rd   rm   rn   ro   rr   rp   r   r   rs   rO   ru   rv   rw   rx   hifigan_decoderr   r2   r2   r3   r      s<   zXtts.init_modelsc                 C   s   t |  jS N)next
parametersr/   r   r2   r2   r3   r/      s   zXtts.devicere      rJ   chunk_lengthc                 C   s(  |dkrt j||d}|dkr|dddd| f }| jjrrg }td|jd d| D ]:}|dd||d|  f }|ddk rEq-t|| j	
 ddd	d
dddddd}| j|| jd}	||	 q-t|jdd}
nt|| j	
 dd	dd
dddddd}| j|| j}
|
dd
S )a  Compute the conditioning latents for the GPT model from the given audio.

        Args:
            audio (tensor): audio tensor.
            sr (int): Sample rate of the audio.
            length (int): Length of the audio in seconds. If < 0, use the whole audio. Defaults to 30.
            chunk_length (int): Length of the audio chunks in seconds. When `length == chunk_length`, the whole audio
                is being used without chunking. It must be < `length`. Defaults to 6.
        r   r   Nr5   r"   g    l@i   rt   r   r   Fr   r   )
r.   r   r   r   r   r   r   r   r   r   r6   r   )r#   r;   r<   r   rr   rangerF   r9   r4   r   r   r   get_style_embr&   r/   appendr'   stackr:   	transpose)r   rC   srrJ   r   
style_embsiaudio_chunk	mel_chunk	style_embcond_latentr1   r2   r2   r3   get_gpt_cond_latents  sR   zXtts.get_gpt_cond_latentsc                 C   s8   t j||d}| jjj|| jddd| jS )Ni>  T)l2_normr"   )	r#   r;   r<   r   speaker_encoderforwardr&   r/   r+   )r   rC   r   	audio_16kr2   r2   r3   get_speaker_embedding<  s
   zXtts.get_speaker_embeddingNFr   c                 C   s   t |ts	|g}n|}g }	g }
d}|D ]B}t||}|ddd|| f | j}|r7|t|  d }|durEtj	j
||dd }| ||}|	| |
| qtj|
dd}| j||||d}|	rst|	}|jdd}||fS )a	  Get the conditioning latents for the GPT model from the given audio.

        Args:
            audio_path (str or List[str]): Path to reference audio file(s).
            max_ref_length (int): Maximum length of each reference audio in seconds. Defaults to 30.
            gpt_cond_len (int): Length of the audio used for gpt latents. Defaults to 6.
            gpt_cond_chunk_len (int): Chunk length used for gpt latents. It must be <= gpt_conf_len. Defaults to 6.
            librosa_trim_db (int, optional): Trim the audio using this value. If None, not trimming. Defaults to None.
            sound_norm_refs (bool, optional): Whether to normalize the audio. Defaults to False.
            load_sr (int, optional): Sample rate to load the audio. Defaults to 24000.
        N      ?)top_dbr   r"   r   )rJ   r   )
isinstancelistrE   r&   r/   r'   absr?   librosaeffectstrimr   r   catr   r   r:   )r   
audio_pathmax_ref_lengthgpt_cond_lengpt_cond_chunk_lenlibrosa_trim_dbsound_norm_refsload_sraudio_pathsspeaker_embeddingsaudiosspeaker_embedding	file_pathrC   
full_audiogpt_cond_latentsr2   r2   r3   get_conditioning_latentsE  s0   

 

zXtts.get_conditioning_latentsc           
      K   s   |dkrn|| j jv sJ d| d| j j |j|j|j|j|jd}|| |durC| jj	| 
 \}}	| j||||	fi |S ||j|j|j|jd | j|||fi |S )a  Synthesize speech with the given input text.

        Args:
            text (str): Input text.
            config (XttsConfig): Config with inference parameters.
            speaker_wav (list): List of paths to the speaker audio files to be used for cloning.
            language (str): Language ID of the speaker.
            **kwargs: Inference settings. See `inference()`.

        Returns:
            A dictionary of the output values with `wav` as output waveform, `deterministic_seed` as seed used at inference,
            `text_input` as text token IDs after tokenizer, `voice_samples` as samples used for cloning, `conditioning_latents`
            as latents used at inference.

        zhzzh-cnu    ❗ Language z+ is not supported. Supported languages are )temperaturelength_penaltyrepetition_penaltytop_ktop_pN)r   r   max_ref_lenr   )r}   	languagesr   r   r   r   r   updatespeaker_managerspeakersvalues	inferencer   r   r   r   full_inference)
r   textr}   speaker_wavlanguage
speaker_idkwargssettingsgpt_cond_latentr   r2   r2   r3   
synthesize~  s(   
zXtts.synthesizer         ?      $@2   333333?Tr8   c              	   K   s>   | j ||
|||d\}}| j||||f||||||	d|S )aq  
        This function produces an audio clip of the given text being spoken with the given reference voice.

        Args:
            text: (str) Text to be spoken.

            ref_audio_path: (str) Path to a reference audio file to be used for cloning. This audio file should be >3
                seconds long.

            language: (str) Language of the voice to be generated.

            temperature: (float) The softmax temperature of the autoregressive model. Defaults to 0.65.

            length_penalty: (float) A length penalty applied to the autoregressive decoder. Higher settings causes the
                model to produce more terse outputs. Defaults to 1.0.

            repetition_penalty: (float) A penalty that prevents the autoregressive decoder from repeating itself during
                decoding. Can be used to reduce the incidence of long silences or "uhhhhhhs", etc. Defaults to 2.0.

            top_k: (int) K value used in top-k sampling. [0,inf]. Lower values mean the decoder produces more "likely"
                (aka boring) outputs. Defaults to 50.

            top_p: (float) P value used in nucleus sampling. (0,1]. Lower values mean the decoder produces more "likely"
                (aka boring) outputs. Defaults to 0.8.

            gpt_cond_len: (int) Length of the audio used for cloning. If audio is shorter, then audio length is used
                else the first `gpt_cond_len` secs is used. Defaults to 30 seconds.

            gpt_cond_chunk_len: (int) Chunk length used for cloning. It must be <= `gpt_cond_len`.
                If gpt_cond_len == gpt_cond_chunk_len, no chunking. Defaults to 6 seconds.

            hf_generate_kwargs: (**kwargs) The huggingface Transformers generate API is used for the autoregressive
                transformer. Extra keyword args fed to this function get forwarded directly to that API. Documentation
                here: https://huggingface.co/docs/transformers/internal/generation_utils

        Returns:
            Generated audio clip(s) as a torch tensor. Shape 1,S if k=1 else, (k,1,S) where S is the sample length.
            Sample rate is 24kHz.
        )r   r   r   r   r   )r   r   r   r   r   	do_sample)r   r   )r   r   ref_audio_pathr   r   r   r   r   r   r   r   r   r   r   hf_generate_kwargsr   r   r2   r2   r3   r     s,   ;
zXtts.full_inferencer5   c                 K   s  | dd }dt|d }|| j}|| j}|r't||| jj| }n|g}g }g }|D ]}|  }t	
| jj||dd| j}|jd | jjk sWJ dt	 p | jjd||d |
|	||| j|||dd	|}t	j|jd | jj g|jd
}t	j|jd g| jd
}| j|||||ddd}|dkrtj|dd|dddd}||  || j||d   W d    n1 sw   Y  q0t	j|dd t	j|dd |dS )N-r   r   皙?langr"   >    ❗ XTTS can only generate text with a maximum of 400 tokens.F)cond_latentstext_inputsinput_tokensr   r   r   r   num_return_sequences	num_beamsr   r   output_attentions)r/   T)r   return_attentionsreturn_latentr5   r   linearscale_factormodegr   )r,   gpt_latentsr   r2   )splitr?   r&   r/   r   r   char_limitsstriplowerr'   	IntTensorencoder+   rF   r   rb   no_gradr   generaterW   tensorr   rG   interpolater   r   r   r   squeezer   numpy)r   r   r   r   r   r   r   r   r   r   r   r   speedenable_text_splittingr   length_scalewavsgpt_latents_listsenttext_tokens	gpt_codesexpected_output_lentext_lenr  r2   r2   r3   r     sv   $

(zXtts.inferencec                 C   s   |d|  }|dur||j d | |  }|durk|t|kr=|dur1||j d | d }n|| d }||dfS |d| }|tdd||j }|tdd||j |d|< |d|  |7  < || d }|}|||fS )z)Handle chunk formatting in streaming modeNr   g        r   )rF   lenr'   linspacer&   r/   )r   wav_genwav_gen_prevwav_overlapoverlap_len	wav_chunkcrossfade_wavr2   r2   r3   handle_chunksI  s    
"
zXtts.handle_chunks   r   c                 k   s   | dd }dt|d }|| j}|| j}|r(t||| jj| }n|g}|D ]}|  }t	
| jj||dd| j}|jd | jjk sTJ d| j|| j|}| jjd||
|||ddt|t|	d	d
d|}g }g }d }d }d	}|szt|\}}||g7 }||g7 }W n ty   d
}Y nw |s|dkrt||krt	j|ddd d d f }|dkrtj|dd|dddd}| j||| jd}| | |||\}}}g }|V  |rq-d S )Nr   r   r   r   r   r"   r   r5   FT)fake_inputsr   r   r   r   r   r   r   r   r   output_hidden_statesr   r   r   r   r   r2   )r  r?   r&   r/   r   r   r  r  r  r'   r  r  r+   rF   r   rb   r   compute_embeddingsget_generatorfloatr   StopIterationr  r   rG   r  r   r   r   r  )r   r   r   r   r   stream_chunk_sizeoverlap_wav_lenr   r   r   r   r   r   r  r  r   r  r  r  r"  gpt_generatorlast_tokensall_latentsr  r  is_endxlatentr  r  r  r2   r2   r3   inference_streamb  s~   $


zXtts.inference_streamc                 C      t dNzuXTTS has a dedicated trainer, please check the XTTS docs: https://tts.readthedocs.io/en/dev/models/xtts.html#trainingNotImplementedErrorr   r2   r2   r3   r        zXtts.forwardc                 C   r1  r2  r3  r   r2   r2   r3   	eval_step  r5  zXtts.eval_step
XttsConfigc                 K   s   t | S r   )r|   )r}   r   r2   r2   r3   init_from_config  s   zXtts.init_from_configc                    s   | j   t   dS )znSets the model to evaluation mode. Overrides the default eval() method to also set the GPT model to eval mode.N)r   init_gpt_for_inferencer   evalr   r   r2   r3   r:    s   
z	Xtts.evalc                 C   sx   t |tddd }g d}t| D ]$}|dr-|dd}|| ||< ||= |}|dd |v r9||= q|S )	Nr   r    model)#torch_mel_spectrogram_style_encodertorch_mel_spectrogram_dvaedvaezxtts.r_   .r   )r   r'   r/   r   keys
startswithreplacer  )r   
model_path
checkpointignore_keyskeynew_keyr2   r2   r3   $get_compatible_checkpoint_state_dict  s   
z)Xtts.get_compatible_checkpoint_state_dictc	                 C   s   |pt j|d}	|pt j|d}|pt j|d}t|| _d| _t j|r.t|| _t j|r:t|d| _	| 
  | |	}
z	| j|
|d W n   |r[| jj| jjd | j|
|d Y |r{| j  | jj| jj|d | j  dS dS )	a  
        Loads a checkpoint from disk and initializes the model's state and tokenizer.

        Args:
            config (dict): The configuration dictionary for the model.
            checkpoint_dir (str, optional): The directory where the checkpoint is stored. Defaults to None.
            checkpoint_path (str, optional): The path to the checkpoint file. Defaults to None.
            vocab_path (str, optional): The path to the vocabulary file. Defaults to None.
            eval (bool, optional): Whether to set the model to evaluation mode. Defaults to True.
            strict (bool, optional): Whether to strictly enforce that the keys in the checkpoint match the keys in the model. Defaults to True.

        Returns:
            None
        z	model.pthz
vocab.jsonzspeakers_xtts.pthN)
vocab_file)strict)rY   )rY   use_deepspeed)ospathjoinr
   language_managerr   existsr	   r   r   r   rH  load_state_dictr   r9  r   rY   r   r:  )r   r}   checkpoint_dircheckpoint_path
vocab_pathr:  rJ  rK  speaker_file_pathrC  rD  r2   r2   r3   load_checkpoint  s,   



zXtts.load_checkpointc                 C   r1  r2  r3  r   r2   r2   r3   
train_step  r5  zXtts.train_step)re   r   )re   r   r   NFr   r   )
r   r   r   r   r   Tre   r   r8   F)	r   r   r   r   r   Tr5   r   F)
r!  r   r   r   r   r   r   Tr   F)r}   r7  )NNNTTFN)rP   rQ   rR   rS   r   r   r   propertyr/   r'   inference_moderT   r   r   r   r   r   r   r   r0  r   r6  staticmethodr8  r:  rH  rV  rW  __classcell__r2   r2   r   r3   r|      s    #
:

8'PRS
7r|   )#rL  dataclassesr   r   r'   torch.nn.functionalnnr;   rG   r#   coqpitr   TTS.tts.layers.xtts.gptr   #TTS.tts.layers.xtts.hifigan_decoderr   $TTS.tts.layers.xtts.stream_generatorr   TTS.tts.layers.xtts.tokenizerr   r    TTS.tts.layers.xtts.xtts_managerr	   r
   TTS.tts.models.base_ttsr   TTS.utils.ior   r/   r4   rE   rL   rM   rV   r|   r2   r2   r2   r3   <module>   sF    
0B