o
    
j&I                     @   s   d dl Z d dlZd dlmZmZmZmZ d dlZd dlm	Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZm Z  d dl!m"Z" d dl#m$Z$m%Z% G dd deZ&dS )    N)DictListTupleUnion)Coqpit)nn)
DataLoader)WeightedRandomSampler)DistributedSamplerDistributedSamplerWrapper)BaseTrainerModel)
TTSDataset)get_length_balancer_weights)LanguageManagerget_language_balancer_weights)SpeakerManagerget_speaker_balancer_weights)	synthesis)plot_alignmentplot_spectrogramc                       s  e Zd ZdZdZ		d+dedddedef fd	d
ZdefddZ	d,dede
fddZdefddZdd ZdedefddZd-dedefddZ	d,dedededee
e e
e
 f ded ed!edd"fd#d$Zdefd%d&Zdedeeef fd'd(Zd)d* Z  ZS ).BaseVCzBase `vc` class. Every new `vc` model must inherit this.

    It defines common `vc` specific functions on top of `Model` implementation.
    vcNconfigapAudioProcessorspeaker_managerlanguage_managerc                    s0   t    || _|| _|| _|| _| | d S N)super__init__r   r   r   r   _set_model_args)selfr   r   r   r   	__class__ H/home/kuhnn/.local/lib/python3.10/site-packages/TTS/vc/models/base_vc.pyr       s   
zBaseVC.__init__c                 C   s<   d|j jv r|| _|j| _dS d|j jv r|| _dS td)a  Setup model args based on the config type (`ModelConfig` or `ModelArgs`).

        `ModelArgs` has all the fields reuqired to initialize the model architecture.

        `ModelConfig` has all the fields required for training, inference and containes `ModelArgs`.

        If the config is for training with a name like "*Config", then the model args are embeded in the
        config.model_args

        If the config is for the model with a name like "*Args", then we assign the directly.
        ConfigArgsz(config must be either a *Config or *ArgsN)r#   __name__r   
model_argsargs
ValueError)r!   r   r$   r$   r%   r    .   s   
zBaseVC._set_model_argsdatac                 C   s   | j dur| j j| _n	t|dr|j| _|js|jr)d|v r&|jdur&|jnd| _|jrG|jsItd t	| j| j| _
| j
jjdd dS dS dS )a  Initialize a speaker embedding layer if needen and define expected embedding channel size for defining
        `in_channels` size of the connected layers.

        This implementation yields 3 possible outcomes:

        1. If `config.use_speaker_embedding` and `config.use_d_vector_file are False, do nothing.
        2. If `config.use_d_vector_file` is True, set expected embedding channel size to `config.d_vector_dim` or 512.
        3. If `config.use_speaker_embedding`, initialize a speaker embedding layer with channel size of
        `config.d_vector_dim` or 512.

        You can override this function for new models.

        Args:
            config (Coqpit): Model configuration.
        Nnum_speakersd_vector_dimi   z  > Init speaker_embedding layer.r   g333333?)r   r-   hasattruse_speaker_embeddinguse_d_vector_filer.   embedded_speaker_dimprintr   	Embeddingspeaker_embeddingweightr,   normal_)r!   r   r,   r$   r$   r%   init_multispeakerC   s   

zBaseVC.init_multispeakerreturnc                 K   s   dddddS )z2Prepare and return `aux_input` used by `forward()`N)
speaker_id	style_wavd_vectorlanguage_idr$   )r!   kwargsr$   r$   r%   get_aux_inputd   s   zBaseVC.get_aux_inputc           
      C   s*  t | jdr| jj}n| j}d\}}}}t|trHt|dkr$|d }n&t|dkr/|\}}nt|dkr;|\}}}nt|dkrG|\}}}}n|}d\}}}	| jd ur{|jrh|d u ra| j }n| j	|}n|j
r{|d u ru| j }n| jj| }| jd ur|jr|d ur| jj| }	|||||	d	S )
Nr)   )NNNN   r            )NNN)textr:   r;   r<   r=   )r/   r   r)   
isinstancelistlenr   r1   get_random_embeddingget_d_vector_by_namer0   get_random_id
name_to_idr   use_language_embedding)
r!   sentence_infor   rD   speaker_namer;   language_namer:   r<   r=   r$   r$   r%   !get_aux_input_from_test_sentencesh   sB   





z(BaseVC.get_aux_input_from_test_sentencesbatchc                 C   sr  |d }|d }|d }|d }|d }|d }|d }|d }	|d	 }
|d
 }|d }|d }|d }|d }|d }t | }t | }d}|durt |jd |jd }t|D ]m\}}|ddd|| d|| f dd }t j|dd\}}t || g|j	}|||< |
 ||  }t | d| }||  d8  < |
 || ksJ d|
  d||  |||d|| f< q`||jd |d| jj d}|
ddkd d}t || jj }i d|d|d|d|d|d|d|d|d|d |d
|d	|
d!t|d"t|d#|	d|d||||d$ d%S )&zGeneric batch formatting for `VCDataset`.

        You must override this if you use a custom dataset.

        Args:
            batch (Dict): [description]

        Returns:
            Dict: [description]
        token_idtoken_id_lengthsspeaker_nameslinearmelmel_lengthsstop_targets	item_idxs	d_vectorsspeaker_idsattnswaveformpitchenergylanguage_idsNr   rA   r@   T)return_countsz [!] total duration z vs spectrogram length g        
text_inputtext_lengths	mel_inputlinear_inputstop_target_lengths	attn_mask	durationsmax_text_lengthmax_spec_lengthitem_idxaudio_unique_names)r_   r`   rm   )torchmaxfloatzerosshape	enumerateuniqueonestodtypesumargsortviewsizer   r	unsqueezesqueezedivideceil_)r!   rQ   rc   rd   rT   rf   re   rW   rX   rl   rZ   r[   rh   r]   r^   r_   r`   rj   rk   ri   idxamc_idxscountsdurextra_frameslargest_idxsrg   r$   r$   r%   format_batch   s   ,"	
zBaseVC.format_batchr@   datasetc                 C   s   d }|j }t|ddrt|dd}td| t|| }t|ddr@t|dd}td| |d ur:|t|| 7 }nt|| }t|d	drdt|d
d}td| |d ur^|t|| 7 }nt|| }|d urpt|t|}nd }|d u r|dkrt|}|S d }|S |dkrt	|n|}|S )Nuse_language_weighted_samplerFlanguage_weighted_sampler_alphag      ?z. > Using Language weighted sampler with alpha:use_speaker_weighted_samplerspeaker_weighted_sampler_alphaz- > Using Speaker weighted sampler with alpha:use_length_weighted_samplerlength_weighted_sampler_alphaz, > Using Length weighted sampler with alpha:r@   )
samplesgetattrr3   r   r   r   r	   rG   r
   r   )r!   r   r   num_gpusweights
data_itemsalphasamplerr$   r$   r%   get_sampler   s8   


zBaseVC.get_samplerassetsis_evalr   verboser   rankr   c              
   C   s0  |r	|j s	d }|S | jd ur@t|dr-|jjr| jjnd }	|jjr%| jjnd }
|jj|_n|jr4| jjnd }	|jr=| jjnd }
nd }	d }
| jd urT| j	j
rQ| jjnd }nd }tdi dd|v ra|jndd|j dkpm|jd|ddd	|d	d d
|d
dd|dd d|d| jdd|v r|jndd|rdn|j|j d|jd|jd|jd|jd|jd|jd|rdn|jd|d|	d|jr|
nd dd d|jd|}|dkrt  |  |  |||}t!||r|j"n|j|d u r|j#nd|j$|j%||r|j&n|j'dd}|S ) Nr)   outputs_per_stepr|   r@   compute_linear_spectacotron
compute_f0Ff0_cache_pathcompute_energyenergy_cache_pathr   r   
return_wavbatch_group_sizer   min_text_lenmax_text_lenmin_audio_lenmax_audio_lenphoneme_cache_pathprecompute_num_workersuse_noise_augmentr   speaker_id_mappingd_vector_mapping	tokenizerstart_by_longestlanguage_id_mapping)
batch_sizeshuffle
collate_fn	drop_lastr   num_workers
pin_memoryr$   )(run_evalr   r/   r)   r0   rK   r1   
embeddingsr   r*   rL   r   r|   modellowerr   getr   r   r   r   r   r   r   r   r   r   r   r   distbarrierpreprocess_samplesr   r   eval_batch_sizer   r   r   num_eval_loader_workersnum_loader_workers)r!   r   r   r   r   r   r   r   loaderr   r   r   r   r   r$   r$   r%   get_data_loader  s   

F


	

zBaseVC.get_data_loaderc                    sd   d } j jr fdd jjD }tt|df} j js d ntt jj	 d|d d}|S )Nc                    s   g | ]
} j j| d  qS )	embedding)r   r   ).0namer!   r$   r%   
<listcomp>d  s    z.BaseVC._get_test_aux_input.<locals>.<listcomp>r@   )r:   r<   r;   )
r   r1   r   r   randomsamplesortedr0   rK   values)r!   r<   
aux_inputsr$   r   r%   _get_test_aux_input_  s   zBaseVC._get_test_aux_inputc           	      C   s   t d i }i }| jj}|  }t|D ]X\}}t|tr&| |}|d }t| || jdt	t
|  jv |d |d |d ddd		}|d
 |d|< t|d d | jdd|d|< t|d d dd|d|< q||fS )a`  Generic test run for `vc` models used by `Trainer`.

        You can override this for a different behaviour.

        Args:
            assets (dict): A dict of training assets. For `vc` models, it must include `{'audio_processor': ap}`.

        Returns:
            Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard.
        z! | > Synthesizing test sentences.rD   cudar:   r<   r;   TF)r:   r<   r;   use_griffin_limdo_trim_silencewavz{}-audiooutputsmodel_outputs)
output_figz{}-prediction
alignmentsz{}-alignment)r3   r   test_sentencesr   rs   rE   rF   rP   r   strnext
parametersdeviceformatr   r   r   )	r!   r   test_audiostest_figuresr   r   r   senoutputs_dictr$   r$   r%   test_runp  s8   

zBaseVC.test_runc                 C   s   | j dur:tj|jd}| j | ||j_t|jdr"||jj	_|j
tj|jd td| d td | jdurvtj|jd}| j| ||j_t|jdr\||jj	_|j
tj|jd td	| d td
 dS dS )zdSave the speaker.pth and language_ids.json at the beginning of the training. Also update both paths.Nzspeakers.pthr)   zconfig.jsonz > `speakers.pth` is saved to .z1 > `speakers_file` is updated in the config.json.zlanguage_ids.jsonz# > `language_ids.json` is saved to z5 > `language_ids_file` is updated in the config.json.)r   ospathjoinoutput_pathsave_ids_to_filer   speakers_filer/   r)   	save_jsonr3   r   language_ids_file)r!   trainerr   r$   r$   r%   on_init_start  s&   



zBaseVC.on_init_start)NNr   )r@   )r(   
__module____qualname____doc__
MODEL_TYPEr   r   r   r   r    r   r8   r   r?   rP   r   r   r   boolr   intr   r   r   r   r   __classcell__r$   r$   r"   r%   r      sV    !/O.	
S
(r   )'r   r   typingr   r   r   r   rn   torch.distributeddistributedr   coqpitr   r   torch.utils.datar   torch.utils.data.samplerr	   trainer.torchr
   r   	TTS.modelr   TTS.tts.datasets.datasetr   TTS.tts.utils.datar   TTS.tts.utils.languagesr   r   TTS.tts.utils.speakersr   r   TTS.tts.utils.synthesisr   TTS.tts.utils.visualr   r   r   r$   r$   r$   r%   <module>   s$    