o
    
j[I                     @   s   d dl mZmZmZmZ d dlZd dlmZ d dlmZ d dl	m
Z
mZ d dlmZ d dlmZ d dlmZmZmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZmZ d dlm Z  G dd deZ!dS )    )DictListTupleUnionN)nn)autocast)get_optimizerget_scheduler)CapacitronVAE)GST)DecoderEncoderPostCBHG)BaseTacotron)alignment_diagonal_score)SpeakerManager)TTSTokenizer)plot_alignmentplot_spectrogram)CapacitronOptimizerc                       s:  e Zd ZdZ			d/ddddddd	ef fd
dZdddddfddZe d0ddZ	d1ddZ
dedejjdeeef fddZdefddZdefddZdd Zdd  Zded!ed"d#d$ed%eddfd&d'Zdedejfd(d)Zded!ed"d#d$ed%eddfd*d+Zed0ddd,eee ee f fd-d.Z  ZS )2Tacotrona  Tacotron as in https://arxiv.org/abs/1703.10135
    It's an autoregressive encoder-attention-decoder-postnet architecture.
    Check `TacotronConfig` for the arguments.

    Args:
        config (TacotronConfig): Configuration for the Tacotron model.
        speaker_manager (SpeakerManager): Speaker manager to handle multi-speaker settings. Only use if the model is
            a multi-speaker model. Defaults to None.
    NconfigTacotronConfigapAudioProcessor	tokenizerr   speaker_managerc                    s  t  |||| |D ]
}t| |||  q| js| jr)| | |  j| j7  _| jr5|  j| j	j
7  _| jrA|  j| jj7  _tj| jddd| _| jjjdd t| j| _t| j| j| j| j| j| j| j| j| j | j!| j"| j#| j$| j%| j&| j'| _(t)| j| _*t+| j*j,j-d | j.| _/| j0| j(j1_2| j	r| jrt3| j| j	j4| j	j5| j	j
d| _6| jr| jrt7| j| j| jj| jr| jj8r| jnd | jj9r| jj:nd d| _;| j<r| =  | j>rt| j| j| j?| j| j| j| j| j| j | j!| j"| j#| j$| j%| j&| j'| _@d S d S )N   r   )padding_idxg333333?   )num_mel	num_headsnum_style_tokensgst_embedding_dim)r    encoder_output_dimcapacitron_VAE_embedding_dimspeaker_embedding_dimtext_summary_embedding_dim)Asuper__init__setattruse_speaker_embeddinguse_d_vector_fileinit_multispeakerdecoder_in_featuresembedded_speaker_dimuse_gstgstr#   use_capacitron_vaecapacitron_vaer%   r   	Embedding	num_chars	embeddingweightdatanormal_r   encoder_in_featuresencoderr   decoder_output_dimrmemory_sizeattention_type	windowingattention_normprenet_typeprenet_dropoutuse_forward_attntransition_agentforward_attn_masklocation_attnattention_headsseparate_stopnetmax_decoder_stepsdecoderr   postnetLinearcbhggru_featuresout_channelslast_linearprenet_dropout_at_inferenceprenetdropout_at_inferencer   gst_num_headsgst_num_style_tokens	gst_layerr
    capacitron_use_speaker_embedding&capacitron_use_text_summary_embeddings%capacitron_text_summary_embedding_dimcapacitron_vae_layerbidirectional_decoder_init_backward_decoderdouble_decoder_consistencyddc_rcoarse_decoder)selfr   r   r   r   key	__class__ J/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/models/tacotron.pyr)       s   


zTacotron.__init__speaker_ids	d_vectorsc                 C   s  |  |}ddd}| |}| ||\}}	| |}
|
|d|
 }
| jr2| jr2| |
|}
| j	s8| j
rW| j
sI| |d dddf }nt|d d}| |
|}
| jry| jry| j|
||g| jjrk||gnd| jjrr|ndd^}
}nd}| |
||\}}}|	dur||	d| }| |}|	dur||	d| }| |}|dd }| jr| ||
|\}}||d< ||d	< | jr| ||
||\}}||d< ||d	< ||||||d
 |S )z
        Shapes:
            text: [B, T_in]
            text_lengths: [B]
            mel_specs: [B, T_out, C]
            mel_lengths: [B]
            aux_input: 'speaker_ids': [B, 1] and  'd_vectors':[B, C]
        N)alignments_backwarddecoder_outputs_backwardr   rh   ri      reference_mel_info	text_infospeaker_embeddingrj   rk   )model_outputsdecoder_outputs
alignmentsstop_tokenscapacitron_vae_outputs)_format_aux_inputr6   compute_masksr;   	unsqueeze	expand_asr1   r0   compute_gstr+   r,   rp   torch_concat_speaker_embeddingr3   r2    compute_capacitron_VAE_embeddingrY   rX   rK   rL   rQ   	transpose
contiguousr\   _backward_passr^   _coarse_decoder_passupdate)ra   texttext_lengths	mel_specsmel_lengths	aux_inputoutputsinputs
input_maskoutput_maskencoder_outputsembedded_speakersru   rr   rs   rt   postnet_outputsrk   rj   re   re   rf   forward   sd   




	

	zTacotron.forwardc                 C   s  |  |}| |}| |}| jr | jr | ||d |d }| jr| jr|d d urC| |d }tj	|
dgtjd|j}|d d ur[tj	|d 
dgtjd|jnd }| j||d d url|d |gnd |d d urw||gnd | jjr|d nd d^}}| jdkr| js| |d }	|	jdkr|	d d d d f }	n|	jdkr|	d d d f }	nt|d d}	| ||	}| j|\}
}}| |
}| |}|
dd}
||
||d	}|S )
N	style_melri   
style_textrl   )dtyperm   rh   r   )rq   rr   rs   rt   )rv   r6   r;   r1   r0   rz   r3   r2   r{   tensorsizeint64todevicer}   rX   num_speakersr,   rp   ndimrx   r|   rK   	inferencerL   rQ   r~   )ra   
text_inputr   r   r   style_text_embeddingstyle_text_lengthreference_mel_length_r   rr   rs   rt   r   r   re   re   rf   r      sZ   


$







zTacotron.inferencereturnc                 C   s"   | j r|d   |  d S d S )Ncapacitron_vae_beta_loss)r2   backward
first_step)ra   	loss_dict	optimizerre   re   rf   before_backward_pass  s   zTacotron.before_backward_passbatch	criterionc                 C   s  |d }|d }|d }|d }|d }|d }|d }	|d }
|d	 }|
|d
}|  |||||}| | jj dkrO|| jj| | jj   | jj }n|| jj }tddN ||d  |d  | | |d  | |	| jr||d nd||d du rdn|d  |d  ||d du rdn|d  |}W d   n1 sw   Y  dt|d  }||d< ||fS )zPerform a single training step by fetching the right set of samples from the batch.

        Args:
            batch ([Dict]): A dictionary of input tensors.
            criterion ([torch.nn.Module]): Callable criterion to compute model loss.
        r   r   	mel_inputr   linear_inputstop_targetsstop_target_lengthsrh   ri   rg   r   F)enabledrq   rr   rt   ru   Nrk   rs   rj   rl   align_error)r   maxrK   r=   r   floatr3   r   )ra   r   r   r   r   r   r   r   r   r   rh   ri   r   r   alignment_lengthsr   r   re   re   rf   
train_step  sJ   




zTacotron.train_stepc                 C   s0   | j rt| j|  S t| jj| jj| jj| S N)r2   r   r   named_parametersr   r   optimizer_paramslr)ra   re   re   rf   r   L  s   zTacotron.get_optimizerr   c                 C   s$   | j r|jn|}t| jj| jj|S r   )r2   primary_optimizerr	   r   lr_schedulerlr_scheduler_params)ra   r   optre   re   rf   r	   Q  s   zTacotron.get_schedulerc                 C   sP   | j r&g }|  D ]\}}|jr|dkr|| q	tjj|| jj	 d S d S )Nzcapacitron_vae_layer.beta)
r2   r   requires_gradappendr{   r   utilsclip_grad_norm_r3   capacitron_grad_clip)ra   model_params_to_clipnameparamre   re   rf   before_gradient_clippingU  s   
z!Tacotron.before_gradient_clippingc                 C   s  |d }|d }|d }|d }|d }|d }	|d j   }
|d j   }|	d j   }|d j   }|d j   }t|
|dd	t||dd	t||dd	t||dd	t|dd	d
}| jsk| jrzt|d j   dd	|d< ||
j}|d|ifS )Nrq   rr   rs   rj   r   r   r   F)
output_fig)pred_linear_specreal_linear_specpred_mel_specreal_mel_spec	alignmentalignment_backwardaudio)	r8   cpunumpyr   r   r\   r^   inv_spectrogramT)ra   r   r   r   r   rr   rs   rj   r   r   r   r   gt_linear_specgt_mel_spec	align_imgfiguresr   re   re   rf   _create_logs_  s*   
zTacotron._create_logsr   loggerLoggerassetsstepsc                 C   6   |  ||| j\}}||| |||| jj d S r   )r   r   train_figurestrain_audiossample_ratera   r   r   r   r   r   r   audiosre   re   rf   	train_log|  s   zTacotron.train_logc                 C   s   |  ||S r   )r   )ra   r   r   re   re   rf   	eval_step  s   zTacotron.eval_stepc                 C   r   r   )r   r   eval_figureseval_audiosr   r   re   re   rf   eval_log  s   zTacotron.eval_logsamplesc                 C   s>   ddl m} || }t| \}}t| |}t||||S )zInitiate model from config

        Args:
            config (TacotronConfig): Model config.
            samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training.
                Defaults to None.
        r   )r   )TTS.utils.audior   init_from_configr   r   r   )r   r   r   r   r   
new_configr   re   re   rf   r     s
   	
zTacotron.init_from_config)NNNr   )r   N)__name__
__module____qualname____doc__r   r)   r   r{   no_gradr   r   r   r   Moduler   r   r   r   objectr	   r   r   dictintr   r   r   staticmethodr   r   __classcell__re   re   rc   rf   r      sR    i
R
6"4

".r   )"typingr   r   r   r   r{   r   torch.cuda.amp.autocast_moder   trainer.trainer_utilsr   r	   )TTS.tts.layers.tacotron.capacitron_layersr
   "TTS.tts.layers.tacotron.gst_layersr    TTS.tts.layers.tacotron.tacotronr   r   r   TTS.tts.models.base_tacotronr   TTS.tts.utils.measuresr   TTS.tts.utils.speakersr   TTS.tts.utils.text.tokenizerr   TTS.tts.utils.visualr   r   TTS.utils.capacitron_optimizerr   r   re   re   re   rf   <module>   s   