o
    
jV/                     @   s   d dl Z d dlmZ d dlmZmZ d dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZ G dd deZdS )    N)abstractmethod)DictTuple)Coqpit)nn)TacotronLoss)BaseTTS)sequence_mask)SpeakerManager)	synthesis)TTSTokenizer)plot_alignmentplot_spectrogramformat_aux_input)load_fsspec)gradual_training_schedulerc                
       s&  e Zd ZdZ	d8ddddddd	ef fd
dZededefddZdd Z	dd Z
edd Zedd Z	d9ddZdejfddZedefddZdedeeef fd d!Zd"ed#d$ded%eddf
d&d'Zd(d) Zd*d+ Zd,d- Zd8d.d/Zd:d0d1Zed2d3 Zed4d5 Zd6d7 Z   Z!S );BaseTacotronz+Base class shared by Tacotron and Tacotron2NconfigTacotronConfigapAudioProcessor	tokenizerr   speaker_managerc                    s   t  |||| |D ]
}t| |||  qd | _d | _d | _d | _d | _d | _| j	r:| j
r:|  j| j	j7  _d | _| jrL| jrL|  j| jj7  _d | _d | _d | _d S N)super__init__setattr	embeddingencoderdecoderpostnetembedded_speakersembedded_speakers_projectedgstuse_gstdecoder_in_featuresgst_embedding_dim	gst_layercapacitron_vaeuse_capacitron_vaecapacitron_VAE_embedding_dimcapacitron_vae_layerdecoder_backwardcoarse_decoder)selfr   r   r   r   key	__class__ O/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/models/base_tacotron.pyr      s"   
zBaseTacotron.__init__	aux_inputreturnc                 C   s   | r
t ddd| S dS )z*Set missing fields to their default valuesN)	d_vectorsspeaker_idsr   )r5   r3   r3   r4   _format_aux_input=   s   zBaseTacotron._format_aux_inputc                 C   s   t | j| _dS )z8Init the backward decoder for Forward-Backward decoding.N)copydeepcopyr    r-   r/   r3   r3   r4   _init_backward_decoderH   s   z#BaseTacotron._init_backward_decoderc                 C   s*   t | j| _| j| j_| j| j dS )z7Init the coarse decoder for Double-Decoder Consistency.N)r:   r;   r    r.   ddc_rr_initset_rr<   r3   r3   r4   _init_coarse_decoderL   s   
z!BaseTacotron._init_coarse_decoderc                 C      d S r   r3   r<   r3   r3   r4   forwardV      zBaseTacotron.forwardc                 C   rB   r   r3   r<   r3   r3   r4   	inferenceZ   rD   zBaseTacotron.inferenceFc                 C   s   t |td|d}| |d  d|v r| j|d  nd|v r-| j|d d  n| j|j |rH|   td| jj  | j	rJJ dS dS )a  Load model checkpoint and set up internals.

        Args:
            config (Coqpi): model configuration.
            checkpoint_path (str): path to checkpoint file.
            eval (bool, optional): whether to load model for evaluation.
            cache (bool, optional): If True, cache the file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to False.
        cpu)map_locationcachemodelrr   z) > Model's reduction rate `r` is set to: N)
r   torchdeviceload_state_dictr    r@   rJ   evalprinttraining)r/   r   checkpoint_pathrN   rH   stater3   r3   r4   load_checkpoint^   s   
zBaseTacotron.load_checkpointc                 C   s
   t | jS )z)Get the model criterion used in training.)r   r   r<   r3   r3   r4   get_criterionz   s   
zBaseTacotron.get_criterionc                 C   s8   ddl m} || }t| }t| }t| |||S )zInitialize model from config.r   )r   )TTS.utils.audior   init_from_configr   r
   r   )r   r   r   r   r   r3   r3   r4   rV   ~   s
   


zBaseTacotron.init_from_configassetsc           	      C   s   t d i }i }| jj}|  }t|D ]J\}}t| || jdtt|  j	v |d |d |d ddd	}|d	 |d

|< t|d d | jdd|d
|< t|d d dd|d
|< q||dS )ab  Generic test run for `tts` models used by `Trainer`.

        You can override this for a different behaviour.

        Args:
            assets (dict): A dict of training assets. For `tts` models, it must include `{'audio_processor': ap}`.

        Returns:
            Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard.
        z! | > Synthesizing test sentences.cuda
speaker_idd_vector	style_wavTF)rY   rZ   r[   use_griffin_limdo_trim_silencewavz{}-audiooutputsmodel_outputs)
output_figz{}-prediction
alignmentsz{}-alignment)figuresaudios)rO   r   test_sentences_get_test_aux_input	enumerater   strnext
parametersrL   formatr   r   r   )	r/   rW   test_audiostest_figuresre   
aux_inputsidxsenoutputs_dictr3   r3   r4   test_run   s2   
zBaseTacotron.test_runr_   loggerLoggerstepsc                 C   s*   | ||d | jj |||d  d S )Nrd   rc   )rl   r   sample_raterm   )r/   r_   rs   rW   ru   r3   r3   r4   test_log   s   zBaseTacotron.test_logc                 C   sX   t |}d}|dur(| }| jj}|| dkr ||||   n|}t ||d}||fS )z)Compute masks  against sequence paddings.Nr   )max_len)r	   maxr    rJ   )r/   text_lengthsmel_lengths
input_maskoutput_maskrx   rJ   r3   r3   r4   compute_masks   s    zBaseTacotron.compute_masksc                 C   s6   |  |tj|dd|\}}}|dd }||fS )zRun backwards decoder)   )dimsr      )r-   rK   flip	transpose
contiguous)r/   	mel_specsencoder_outputsmaskdecoder_outputs_balignments_b_r3   r3   r4   _backward_pass   s
   
zBaseTacotron._backward_passc           
   	   C   s   |j d }|| jj dkr%| jj|| jj  }tjj|ddd|ddf}| | ||\}}}	tjjj|	dd|j d dd	dd}|	dd}|ddd|ddf }||fS )zDouble Decoder Consistencyr   r   r   nearest)sizemodeN)
shaper.   rJ   rK   r   
functionalpaddetachinterpolater   )
r/   r   r   rb   r|   Tpadding_sizedecoder_outputs_backwardalignments_backwardr   r3   r3   r4   _coarse_decoder_pass   s"   



z!BaseTacotron._coarse_decoder_passc                 C   s   t |tr_tdd| jjd |}|dur%tj||dddgdd}t	| j
jj}tdd| jj|}| D ] \}}|t| dddd}	| j
j||	}
||
|  }q=n|du rptdd| jj|}n| 
||}| ||}|S )zCompute global style tokenr   r   Ndimr   )
isinstancedictrK   zerosr$   r'   type_ascatreshapetanhr(   style_token_layerstyle_tokensitemsint	unsqueezeexpand	attention_concat_speaker_embedding)r/   inputsstyle_inputspeaker_embeddingquery_GSTgst_outputsk_tokenv_amplifierr0   gst_outputs_attr3   r3   r4   compute_gst   s    
zBaseTacotron.compute_gstc           
      C   s:   |  |||\}}}}||j}| ||}	|	|||fS )z"Capacitron Variational Autoencoder)r,   torL   r   )
r/   r   reference_mel_info	text_infor   VAE_outputsposterior_distributionprior_distributioncapacitron_betaencoder_outputr3   r3   r4    compute_capacitron_VAE_embedding   s&   z-BaseTacotron.compute_capacitron_VAE_embeddingc                 C   s&   | | d| dd}| | } | S )Nr   r   r   )r   r   r_   r"   embedded_speakers_r3   r3   r4   _add_speaker_embedding  s   z#BaseTacotron._add_speaker_embeddingc                 C   s0   | | d| dd}tj| |gdd} | S )Nr   r   r   r   )r   r   rK   r   r   r3   r3   r4   r     s   z&BaseTacotron._concat_speaker_embeddingc                 C   s`   | j r.t|j|j\}|j_||j_| j| |jjr#|j	j
| td| jj  dS dS )zCallback for setting values wrt gradual training schedule.

        Args:
            trainer (TrainerTTS): TTS trainer object that is used to train this model.
        z
 > Number of output frames: N)gradual_trainingr   total_steps_doner   
batch_sizerJ   r    r@   bidirectional_decoderrI   r-   rO   )r/   trainerrJ   r3   r3   r4   on_epoch_start%  s   zBaseTacotron.on_epoch_startr   )FF)NN)"__name__
__module____qualname____doc__r
   r   staticmethodr   r9   r=   rA   r   rC   rE   rS   r   ModulerT   r   rV   r   rr   r   r   rw   r~   r   r   r   r   r   r   r   __classcell__r3   r3   r1   r4   r      s^    %




%





	r   ) r:   abcr   typingr   r   rK   coqpitr   r   TTS.tts.layers.lossesr   TTS.tts.models.base_ttsr   TTS.tts.utils.helpersr	   TTS.tts.utils.speakersr
   TTS.tts.utils.synthesisr   TTS.tts.utils.text.tokenizerr   TTS.tts.utils.visualr   r   TTS.utils.generic_utilsr   TTS.utils.ior   TTS.utils.trainingr   r   r3   r3   r3   r4   <module>   s"    