o
    
jEL                     @   s   d dl mZmZmZ d dlZd dlmZ d dlmZ d dlm	Z	m
Z
 d dlmZ d dlmZ d dlmZmZmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZmZ d dlmZ G dd deZ dS )    )DictListUnionN)nn)autocast)get_optimizerget_scheduler)CapacitronVAE)GST)DecoderEncoderPostnet)BaseTacotron)alignment_diagonal_score)SpeakerManager)TTSTokenizer)plot_alignmentplot_spectrogram)CapacitronOptimizerc                       s:  e Zd ZdZ			d1ddddddd	ef fd
dZedd ZdddddfddZe	
 d2ddZd3ddZdede	jjfddZdefddZdefddZdd  Zd!d" Zded#ed$d%d&ed'eddfd(d)Zdedejfd*d+Zded#ed$d%d&ed'eddfd,d-Zed2ddd.eee ee f fd/d0Z  ZS )4	Tacotron2a  Tacotron2 model implementation inherited from :class:`TTS.tts.models.base_tacotron.BaseTacotron`.

    Paper::
        https://arxiv.org/abs/1712.05884

    Paper abstract::
        This paper describes Tacotron 2, a neural network architecture for speech synthesis directly from text.
        The system is composed of a recurrent sequence-to-sequence feature prediction network that maps character
        embeddings to mel-scale spectrograms, followed by a modified WaveNet model acting as a vocoder to synthesize
        timedomain waveforms from those spectrograms. Our model achieves a mean opinion score (MOS) of 4.53 comparable
        to a MOS of 4.58 for professionally recorded speech. To validate our design choices, we present ablation
        studies of key components of our system and evaluate the impact of using mel spectrograms as the input to
        WaveNet instead of linguistic, duration, and F0 features. We further demonstrate that using a compact acoustic
        intermediate representation enables significant simplification of the WaveNet architecture.

    Check :class:`TTS.tts.configs.tacotron2_config.Tacotron2Config` for model arguments.

    Args:
        config (TacotronConfig):
            Configuration for the Tacotron2 model.
        speaker_manager (SpeakerManager):
            Speaker manager for multi-speaker training. Uuse only for multi-speaker training. Defaults to None.
    NconfigTacotron2ConfigapAudioProcessor	tokenizerr   speaker_managerc                    s  t  |||| |j| _|D ]
}t| |||  q| js | jr-| | |  j| j	7  _| j
r9|  j| jj7  _| jrE|  j| jj7  _tj| jddd| _t| j| _t| j| j| j| j| j| j| j| j| j| j| j | j!| j"| j#| j$| _%t&| j| _'| j(| j%j)_*| jr| j
rt+| j| jj,| jj-| jjd| _.| jr| jrt/| j| j| jj| jj0r| j	nd | jj1r| jj2nd d| _3| j4r| 5  | j6rt| j| j| j7| j| j| j| j| j| j| j| j | j!| j"| j#| j$| _8d S d S )Ni   r   )padding_idx)num_mel	num_headsnum_style_tokensgst_embedding_dim)r   encoder_output_dimcapacitron_VAE_embedding_dimspeaker_embedding_dimtext_summary_embedding_dim)9super__init__out_channelsdecoder_output_dimsetattruse_speaker_embeddinguse_d_vector_fileinit_multispeakerdecoder_in_featuresembedded_speaker_dimuse_gstgstr    use_capacitron_vaecapacitron_vaer"   r   	Embedding	num_chars	embeddingr   encoder_in_featuresencoderr   rattention_typeattention_winattention_normprenet_typeprenet_dropoutuse_forward_attntransition_agentforward_attn_masklocation_attnattention_headsseparate_stopnetmax_decoder_stepsdecoderr   postnetprenet_dropout_at_inferenceprenetdropout_at_inferencer
   gst_num_headsgst_num_style_tokens	gst_layerr	    capacitron_use_speaker_embedding&capacitron_use_text_summary_embeddings%capacitron_text_summary_embedding_dimcapacitron_vae_layerbidirectional_decoder_init_backward_decoderdouble_decoder_consistencyddc_rcoarse_decoder)selfr   r   r   r   key	__class__ K/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/models/tacotron2.pyr&   .   s   


zTacotron2.__init__c                 C   s"   |  dd} | dd}| ||fS )z*Final reshape of the model output tensors.      )	transpose)mel_outputsmel_outputs_postnet
alignmentsrZ   rZ   r[   shape_outputs   s   
zTacotron2.shape_outputsspeaker_ids	d_vectorsc                 C   s  |  |}ddd}| ||\}}| |dd}	| |	|}
| jr-| jr-| |
|}
| js3| j	rR| j	sD| 
|d dddf }nt|d d}| |
|}
| jrx| jrx| j|
||g| jjrj|	dd|gnd| jjrq|ndd^}
}nd}|
|d|
 }
| |
||\}}}|dur||d| }| |}|| }|dur||d| }| |||\}}}| jr| ||
|\}}||d< ||d	< | jr| ||
||\}}||d< ||d	< ||||||d
 |S )a;  Forward pass for training with Teacher Forcing.

        Shapes:
            text: :math:`[B, T_in]`
            text_lengths: :math:`[B]`
            mel_specs: :math:`[B, T_out, C]`
            mel_lengths: :math:`[B]`
            aux_input: 'speaker_ids': :math:`[B, 1]` and  'd_vectors': :math:`[B, C]`
        N)alignments_backwarddecoder_outputs_backwardr\   r]   rd   re   reference_mel_info	text_infospeaker_embeddingrf   rg   )model_outputsdecoder_outputsra   stop_tokenscapacitron_vae_outputs)_format_aux_inputcompute_masksr5   r^   r7   r0   r/   compute_gstr*   r+   rk   torch	unsqueeze_concat_speaker_embeddingr2   r1    compute_capacitron_VAE_embeddingrN   rM   	expand_asrE   rF   rb   rQ   _backward_passrS   _coarse_decoder_passupdate)rV   texttext_lengths	mel_specsmel_lengths	aux_inputoutputs
input_maskoutput_maskembedded_inputsencoder_outputsembedded_speakersro   rm   ra   rn   postnet_outputsrg   rf   rZ   rZ   r[   forward   sd   

	
	zTacotron2.forwardc                 C   s  |  |}| |dd}| j|}| jr%| jr%| ||d |d }| jr| j	r|d durH| |d }t
j|dgt
jd|j}|d dur`t
j|d dgt
jd|jnd}| j||d durq|d |gnd|d dur|||gnd| jjr|d ndd^}}| jdkr| js| |d	 d }	|	jdkr|	ddddf }	n|	jdkr|	dddf }	n|d }	| ||	}| j|\}
}}| |
}|
| }| |
||\}
}}||
||d
}|S )zForward pass for inference with no Teacher-Forcing.

        Shapes:
           text: :math:`[B, T_in]`
           text_lengths: :math:`[B]`
        r\   r]   	style_melre   
style_textN)dtyperh   rd   )rl   rm   ra   rn   )rp   r5   r^   r7   	inferencer0   r/   rr   r2   r1   rs   tensorsizeint64todevicerv   rM   num_speakersr+   rk   ndimru   rE   rF   rb   )rV   r{   r   r   r   style_text_embeddingstyle_text_lengthreference_mel_length_r   rm   ra   rn   r   r   rZ   rZ   r[   r      sZ   
$





zTacotron2.inferencereturnc                 C   s"   | j r|d   |  d S d S )Ncapacitron_vae_beta_loss)r1   backward
first_step)rV   	loss_dict	optimizerrZ   rZ   r[   before_backward_pass.  s   zTacotron2.before_backward_passbatch	criterionc                 C   st  |d }|d }|d }|d }|d }|d }|d }	|d }
|	|
d	}|  |||||}| | jj d
krK|| jj| | jj   | jj }n|| jj }tddL ||d  |d  | d|d  | || jrv|d nd||d du rdn|d  |d  ||d du rdn|d  |}W d   n1 sw   Y  dt|d  }||d< ||fS )zA single training step. Forward pass and loss computation.

        Args:
            batch ([Dict]): A dictionary of input tensors.
            criterion ([type]): Callable criterion to compute model loss.
        
text_inputr|   	mel_inputr~   stop_targetsstop_target_lengthsrd   re   rc   r   F)enabledrl   rm   Nrn   ro   rg   ra   rf   r\   align_error)r   maxrE   r8   r   floatr2   r   )rV   r   r   r   r|   r   r~   r   r   rd   re   r   r   alignment_lengthsr   r   rZ   rZ   r[   
train_step5  sH   




zTacotron2.train_stepc                 C   s0   | j rt| j|  S t| jj| jj| jj| S N)r1   r   r   named_parametersr   r   optimizer_paramslr)rV   rZ   rZ   r[   r   h  s   zTacotron2.get_optimizerr   c                 C   s$   | j r|jn|}t| jj| jj|S r   )r1   primary_optimizerr   r   lr_schedulerlr_scheduler_params)rV   r   optrZ   rZ   r[   r   m  s   zTacotron2.get_schedulerc                 C   sP   | j r&g }|  D ]\}}|jr|dkr|| q	tjj|| jj	 d S d S )Nzcapacitron_vae_layer.beta)
r1   r   requires_gradappendrs   r   utilsclip_grad_norm_r2   capacitron_grad_clip)rV   model_params_to_clipnameparamrZ   rZ   r[   before_gradient_clippingq  s   
z"Tacotron2.before_gradient_clippingc                 C   s   |d }|d }|d }|d }|d j   }|d j   }	|d j   }
t||ddt|	|ddt|
ddd}| jsE| jrTt|d j   dd|d	< ||j}|d
|ifS )z!Create dashboard log information.rl   ra   rf   r   r   F)
output_fig)
predictionground_truth	alignmentalignment_backwardaudio)	datacpunumpyr   r   rQ   rS   inv_melspectrogramT)rV   r   r   r   r   ra   rf   r   	pred_specgt_spec	align_imgfiguresr   rZ   rZ   r[   _create_logs{  s   
zTacotron2._create_logsr   loggerLoggerassetsstepsc                 C   s6   |  ||| j\}}||| |||| jj dS )zLog training progress.N)r   r   train_figurestrain_audiossample_raterV   r   r   r   r   r   r   audiosrZ   rZ   r[   	train_log  s   zTacotron2.train_logc                 C   s   |  ||S r   )r   )rV   r   r   rZ   rZ   r[   	eval_step  s   zTacotron2.eval_stepc                 C   s6   |  ||| j\}}||| |||| jj d S r   )r   r   eval_figureseval_audiosr   r   rZ   rZ   r[   eval_log  s   zTacotron2.eval_logsamplesc                 C   s>   ddl m} || }t| \}}t||}t||||S )zInitiate model from config

        Args:
            config (Tacotron2Config): Model config.
            samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training.
                Defaults to None.
        r   )r   )TTS.utils.audior   init_from_configr   r   r   )r   r   r   r   r   
new_configr   rZ   rZ   r[   r     s
   	
zTacotron2.init_from_config)NNNr   )r   N)__name__
__module____qualname____doc__r   r&   staticmethodrb   r   rs   no_gradr   r   r   r   Moduler   r   r   objectr   r   r   dictintr   r   r   r   r   __classcell__rZ   rZ   rX   r[   r      sV    e

T
?3

".r   )!typingr   r   r   rs   r   torch.cuda.amp.autocast_moder   trainer.trainer_utilsr   r   )TTS.tts.layers.tacotron.capacitron_layersr	   "TTS.tts.layers.tacotron.gst_layersr
   !TTS.tts.layers.tacotron.tacotron2r   r   r   TTS.tts.models.base_tacotronr   TTS.tts.utils.measuresr   TTS.tts.utils.speakersr   TTS.tts.utils.text.tokenizerr   TTS.tts.utils.visualr   r   TTS.utils.capacitron_optimizerr   r   rZ   rZ   rZ   r[   <module>   s   