o
    
jkY                     @   s   d dl Z d dlZd dlmZ d dlZd dlZd dlZd dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZmZmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ G dd dej Z!dS )    N)List)nn)load_config)
VitsConfig)setup_model)Vits)	synthesistransfer_voicetrim_silence)AudioProcessor)save_wav)interpolate_vocoder_inputc                       s  e Zd Z													d7dededededed	ed
edededededededdf fddZedefddZdedededdfddZdededdfddZ	dededdfddZ
dedededdfddZd d! Zd"ed#ededdfd$d%Zdee fd&d'Zd8d(ee d)eddfd*d+Zd,ed-edee fd.d/Z									0d9d1ed2ed3ed4edee f
d5d6Z  ZS ):Synthesizer NFtts_checkpointtts_config_pathtts_speakers_filetts_languages_filevocoder_checkpointvocoder_configencoder_checkpointencoder_configvc_checkpoint	vc_config	model_dir	voice_diruse_cudareturnc                    s^  t    || _|| _|| _|| _|| _|| _|| _|| _	|	| _
|
| _|| _d| _d| _d| _d| _i | _d| _d| _i | _d| _| d| _|| _|| _| jrYtj sYJ d|ri| ||| | jjd | _|ry|  ||| | jjd | _|	r| !|	|
| | jjd | _|rd|v r| "|| | jjd | _dS | #|| | jjd | _dS dS )u  General 🐸 TTS interface for inference. It takes a tts and a vocoder
        model and synthesize speech from the provided text.

        The text is divided into a list of sentences using `pysbd` and synthesize
        speech on each sentence separately.

        If you have certain special characters in your text, you need to handle
        them before providing the text to Synthesizer.

        TODO: set the segmenter based on the source language

        Args:
            tts_checkpoint (str, optional): path to the tts model file.
            tts_config_path (str, optional): path to the tts config file.
            vocoder_checkpoint (str, optional): path to the vocoder model file. Defaults to None.
            vocoder_config (str, optional): path to the vocoder config file. Defaults to None.
            encoder_checkpoint (str, optional): path to the speaker encoder model file. Defaults to `""`,
            encoder_config (str, optional): path to the speaker encoder config file. Defaults to `""`,
            vc_checkpoint (str, optional): path to the voice conversion model file. Defaults to `""`,
            vc_config (str, optional): path to the voice conversion config file. Defaults to `""`,
            use_cuda (bool, optional): enable/disable cuda. Defaults to False.
        Nr   enz%CUDA is not availabe on this machine.sample_rateoutput_sample_ratefairseq)$super__init__r   r   r   r   r   r   r   r   r   r   r   	tts_modelvocoder_modelvc_modelspeaker_managertts_speakerslanguage_managernum_languagestts_languagesd_vector_dim_get_segmentersegr   torchcudais_available	_load_tts
tts_configaudior    _load_vocoder_load_vc_load_fairseq_from_dir_load_tts_from_dir)selfr   r   r   r   r   r   r   r   r   r   r   r   r   	__class__ H/home/kuhnn/.local/lib/python3.10/site-packages/TTS/utils/synthesizer.pyr#      sT   
&zSynthesizer.__init__langc                 C   s   t j| ddS )zget the sentence segmenter for the given language.

        Args:
            lang (str): target language code.

        Returns:
            [type]: [description]
        T)languageclean)pysbd	Segmenter)r>   r<   r<   r=   r-   p   s   
zSynthesizer._get_segmentervc_config_pathc                 C   s>   t || _t| jd| _| j| j| |r| j  dS dS )a  Load the voice conversion model.

        1. Load the model config.
        2. Init the model from the config.
        3. Load the model weights.
        4. Move the model to the GPU if CUDA is enabled.

        Args:
            vc_checkpoint (str): path to the model checkpoint.
            tts_config_path (str): path to the model config file.
            use_cuda (bool): enable/disable CUDA use.
        configN)r   r   setup_vc_modelr&   load_checkpointr0   )r9   r   rC   r   r<   r<   r=   r6   |   s   
zSynthesizer._load_vcc                 C   sJ   t  | _t| j| _| jj| j|dd | jj| _|r#| j  dS dS )zLoad the fairseq model from a directory.

        We assume it is VITS and the model knows how to load itself from the directory and there is a config.json file in the directory.
        Tcheckpoint_direvalN)r   r3   r   init_from_configr$   load_fairseq_checkpointrE   r0   )r9   r   r   r<   r<   r=   r7      s   
z"Synthesizer._load_fairseq_from_dirc                 C   sJ   t tj|d}|| _t|| _| jj||dd |r#| j  dS dS )zLoad the TTS model from a directory.

        We assume the model knows how to load itself from the directory and there is a config.json file in the directory.
        zconfig.jsonTrH   N)	r   ospathjoinr3   setup_tts_modelr$   rG   r0   )r9   r   r   rE   r<   r<   r=   r8      s   
zSynthesizer._load_tts_from_dirc                 C   s   t || _| jd r| jd du rtdt| jd| _| js#|   | jj| j|dd |r4| j  | jrJt	| jdrL| jj
| j| j| dS dS dS )	a  Load the TTS model.

        1. Load the model config.
        2. Init the model from the config.
        3. Load the model weights.
        4. Move the model to the GPU if CUDA is enabled.
        5. Init the speaker manager in the model.

        Args:
            tts_checkpoint (str): path to the model checkpoint.
            tts_config_path (str): path to the model config file.
            use_cuda (bool): enable/disable CUDA use.
        use_phonemes
phonemizerNz,Phonemizer is not defined in the TTS config.rD   TrJ   r'   )r   r3   
ValueErrorrP   r$   r   *_set_speaker_encoder_paths_from_tts_configrG   r0   hasattrr'   init_encoderr   )r9   r   r   r   r<   r<   r=   r2      s   

zSynthesizer._load_ttsc                 C   s>   t | jdrt | jjdr| jjj| _| jjj| _dS dS dS )zQSet the encoder paths from the tts model config for models with speaker encoders.
model_argsspeaker_encoder_config_pathN)rV   r3   rX   speaker_encoder_model_pathr   rY   r   )r9   r<   r<   r=   rU      s   z6Synthesizer._set_speaker_encoder_paths_from_tts_config
model_filemodel_configc                 C   sX   t || _tdddi| jj| _t| j| _| jj| j|dd |r*| j  dS dS )a  Load the vocoder model.

        1. Load the vocoder config.
        2. Init the AudioProcessor for the vocoder.
        3. Init the vocoder model from the config.
        4. Move the model to the GPU if CUDA is enabled.

        Args:
            model_file (str): path to the model checkpoint.
            model_config (str): path to the model config file.
            use_cuda (bool): enable/disable CUDA use.
        verboseFTrS   Nr<   )	r   r   r   r4   
vocoder_apsetup_vocoder_modelr%   rG   r0   )r9   r[   r\   r   r<   r<   r=   r5      s   
zSynthesizer._load_vocoderc                 C   s   | j |S )zSplit give text into sentences.

        Args:
            text (str): input text in string format.

        Returns:
            List[str]: list of sentences.
        )r.   segment)r9   textr<   r<   r=   split_into_sentences   s   	z Synthesizer.split_into_sentenceswavrN   c                 C   s@   t |r|  }t|trt|}t||| j	|d dS )a  Save the waveform as a file.

        Args:
            wav (List[int]): waveform as a list of values.
            path (str): output path to save the waveform.
            pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe.
        )rc   rN   r   pipe_outN)
r/   	is_tensorcpunumpy
isinstancelistnparrayr   r    )r9   rc   rN   rd   r<   r<   r=   r      s
   
	

zSynthesizer.save_wav
source_wav
target_wavc                 C   s   | j ||}|S N)r&   voice_conversion)r9   rl   rm   
output_wavr<   r<   r=   ro      s   zSynthesizer.voice_conversionTra   speaker_namelanguage_namesplit_sentencesc
                 K   s  t   }g }|s|std|r"|g}|	rtd | |}t| d|
v r0|
d | _|
d d}d}| js>t| jj	dr|rmt
|trm| jjdksm| jjre| jj	j|ddd}t|dddf }n7| jj	j| }n/t| jj	jd	krt| jj	j d
 }n|s|stdd}n|r| jdu rtd| dd}| jst| jdr| jjdur| jjdkst| jjjd	krt| jjj d
 }n<|rt
|trz	| jjj| }W n+ ty } ztd| d| jjj  d|d}~ww |stdtd| d|dur*| jj	dur*t| jj	dr*| jj	jdur*| jj	|}d}| jdu }|s<t| j j}| j rBd}|s|D ]}t| jdrd| jj!d&|| j|| j|||d|
}nt"| j|| j| j ||||||d
}|d }|s|d d d
 # $ % }| jj&'|j(j(}| j)*|j(}d	| j+d d | jj&j, g}|d	 d	krtd  t-||}nt./|0d
}| j1|2|}t.3|r|jt.dkr|s|$ }|s|% }|4 }d!| jj5v r | jj5d! r t6|| jj&}|t|7 }|d
gd" 7 }qGnd}d}| js t| jj	drR|rKt
|trK| jjrC| jj	7|d
 }t|dddf }n| jj	j| }n| jj	|}t8| j| j| j ||||||d#	}|}|s|d
 # $ % }| jj&'|j(j(}| j)*|j(}d	| j+d d | jj&j, g}|d	 d	krtd  t-||}nt./|0d
}| j1|2|}t.3|r|jt.dkr|$ }|s|% }|4 }t   | }t|| jj5d  }td$|  td%||   |S )'u  🐸 TTS magic. Run all the models and generate speech.

        Args:
            text (str): input text.
            speaker_name (str, optional): speaker id for multi-speaker models. Defaults to "".
            language_name (str, optional): language id for multi-language models. Defaults to "".
            speaker_wav (Union[str, List[str]], optional): path to the speaker wav for voice cloning. Defaults to None.
            style_wav ([type], optional): style waveform for GST. Defaults to None.
            style_text ([type], optional): transcription of style_wav for Capacitron. Defaults to None.
            reference_wav ([type], optional): reference waveform for voice conversion. Defaults to None.
            reference_speaker_name ([type], optional): speaker id of reference waveform. Defaults to None.
            split_sentences (bool, optional): split the input text into sentences. Defaults to True.
            **kwargs: additional arguments to pass to the TTS model.
        Returns:
            List[int]: [description]
        zuYou need to define either `text` (for sythesis) or a `reference_wav` (for voice conversion) to use the Coqui TTS API.z > Text splitted to sentences.r   N
name_to_idxttsF)num_samples	randomize   r   z [!] Looks like you are using a multi-speaker model. You need to define either a `speaker_idx` or a `speaker_wav` to use a multi-speaker model.z; [!] Missing speakers.json file path for selecting speaker z\.Define path for speaker.json if it is a multi-speaker model or remove defined speaker idx. r)   z8 [!] Looks like you use a multi-lingual model. Language z$ is not in the available languages: .z [!] Look like you use a multi-lingual model. You need to define either a `language_name` or a `style_wav` to use a multi-lingual model.z@ [!] Missing language_ids.json file path for selecting language zb.Define path for language_ids.json if it is a multi-lingual model or remove defined language idx. 
encoder_aprf   r0   
synthesize)ra   rE   
speaker_id
voice_dirsd_vectorspeaker_wavr?   )
modelra   CONFIGr   r|   	style_wav
style_textuse_griffin_limr~   language_idrc   outputsmodel_outputsr4   r   z" > interpolating tts model output.do_trim_silencei'  )	r   r   r   reference_wavr|   r~   r   reference_speaker_idreference_d_vectorz > Processing time: z > Real-time factor: r<   )9timerT   printrb   r   popr   rV   r$   r'   rh   strr3   r   use_d_vector_fileget_mean_embeddingrj   rk   rt   lenri   valuesr   r)   KeyErrorkeysrz   compute_embedding_from_clipr%   next
parametersdevicer   r{   r   detachrf   rg   apdenormalizeTr^   	normalizer   r   r   r/   tensor	unsqueeze	inferencetore   squeezer4   r
   get_embeddings_by_namer	   )r9   ra   rq   rr   r   r   r   r   reference_speaker_namers   kwargs
start_timewavssensspeaker_embeddingr|   r   evocoder_deviceuse_glsenr   waveformmel_postnet_specvocoder_inputscale_factorreference_speaker_embeddingr   process_time
audio_timer<   r<   r=   tts  sd  







$<

zSynthesizer.tts)r   r   r   r   r   r   r   r   r   r   r   NFrn   )	r   r   r   NNNNNT)__name__
__module____qualname__r   boolr#   staticmethodr-   r6   r7   r8   r2   rU   r5   r   rb   intr   ro   r   __classcell__r<   r<   r:   r=   r      s    	
V
r   )"rM   r   typingr   rg   rj   rA   r/   r   
TTS.configr   TTS.tts.configs.vits_configr   TTS.tts.modelsr   rP   TTS.tts.models.vitsr   TTS.tts.utils.synthesisr   r	   r
   TTS.utils.audior    TTS.utils.audio.numpy_transformsr   TTS.vc.modelsrF   TTS.vocoder.modelsr_   TTS.vocoder.utils.generic_utilsr   Moduler   r<   r<   r<   r=   <module>   s$    