o
    
j[                     @   s   d dl mZmZmZ d dlZd dlm  mZ d dl	m
Z
 d dlmZ d dlmZ d dlmZmZmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZm Z m!Z! G dd dejj"Z#dS )    )CallableDictTupleN)Coqpit)nn)	Conformer)PhonemeLevelProsodyEncoderUtteranceLevelProsodyEncoderget_mask_from_lengths)EnergyAdaptor)EmbeddingPaddedpositional_encoding)PhonemeProsodyPredictor)PitchAdaptor)VariancePredictor)AlignmentNetwork)generate_pathmaximum_pathsequence_maskc                       s  e Zd Z		d<					d= fdd	Zdefd
dZedefddZdefddZ	defddZ
dd Zdd Zed>ddZdejdejdejdejfddZdejd ejdejd!ejd"ejd#eejejejejf fd$d%Zd&ejd'ejd#ejfd(d)Z	*		d?d+ejd,ejd-ejd.ejd/ejd0ejd"ejd1ed2ejd3ejd#eeejf fd4d5Ze 					d@d+ejd3ejd6ed7ed2ejd8ed9ed#ejfd:d;Z  ZS )AAcousticModelNargs	ModelArgs	tokenizerTTSTokenizerspeaker_managerSpeakerManagerc                    s  t    || _|| _|| _| | t| jjtr t	| jjn| jj| _|j
| _t| jj
| jj| jj| j| jj| jj| jjd| _t| jj
| jjd| jj| jj| jj| jjd| _t| jj
| jjd| jj| jj| jj| jjd| _t| jj| jj
d| _t| jj
| jjd| jj| jj| jjd| _ t!| jj"| jj#| jj$| jj%| jj&| jj
| jj| jj'| jj(d	| _)t*| jj
| jj+| jj| jj'| jjd| _,t-| jj"| jj#| jj$| jj%| jj&| jj
| jj| jj.| jjd		| _/t*| jj
| jj+| jj| jj.| jjd| _0t12| jj'| jj
| _3t14| jj'| _5t12| jj.| jj
| _6t14| jj.| _7t| jj8| jj9| jj:| j| jj;| jj<| jjd| _=| jj>j?}t@| jjA| jj
|d
| _Bt12| jj8| jj"| _CtDj1jEdddd d| _F| jFGd d S )N)dimn_layersn_headsspeaker_embedding_dim	p_dropoutkernel_size_conv_modlrelu_slope   )n_inputn_hiddenn_outkernel_sizeemb_kernel_sizer    r"   )channels_inchannels_hiddenchannels_outr'   r(   dropoutr"   )in_query_channelsin_key_channels)r)   channelsr+   r'   r    r"   )	num_melsref_enc_filtersref_enc_sizeref_enc_gru_sizeref_enc_stridesr%   r,   bottleneck_size_u	token_num)hidden_sizer'   r,   bottleneck_sizer"   )	r0   r1   r2   r3   r4   r%   r,   bottleneck_size_pr   )padding_idxFT)affinetrack_running_statsmomentum)Hsuper__init__r   r   r   init_multispeaker
isinstancelength_scaleintfloatn_hidden_conformer_encoderemb_dimr   n_layers_conformer_encodern_heads_conformer_encoderembedded_speaker_dimdropout_conformer_encoder&kernel_size_conv_mod_conformer_encoderr"   encoderr   n_hidden_variance_adaptorkernel_size_variance_adaptor emb_kernel_size_variance_adaptordropout_variance_adaptorpitch_adaptorr   energy_adaptorr   out_channelsalignerr   duration_predictorr	   r0   !ref_enc_filters_reference_encoderref_enc_size_reference_encoder"ref_enc_gru_size_reference_encoder!ref_enc_strides_reference_encoder#bottleneck_size_u_reference_encodertoken_num_reference_encoderutterance_prosody_encoderr   'predictor_kernel_size_reference_encoderutterance_prosody_predictorr   #bottleneck_size_p_reference_encoderphoneme_prosody_encoderphoneme_prosody_predictorr   Linearu_bottle_outInstanceNorm1du_normp_bottle_outp_normn_hidden_conformer_decodern_layers_conformer_decodern_heads_conformer_decoderdropout_conformer_decoder&kernel_size_conv_mod_conformer_decoderdecoder
characterspad_idr   	num_charssrc_word_embto_meltorchBatchNorm1denergy_scalerrequires_grad_)selfr   r   r   r:   	__class__ _/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/layers/delightful_tts/acoustic_model.pyr?      s   

 		
	

zAcousticModel.__init__c                 C   sN   d| _ | jj| _d| _| jr| jj| _| jjr|   | jjr%|   dS dS )z Init for multi-speaker training.r   N)	rI   r   num_speakersaudio_transformr   use_speaker_embedding_init_speaker_embeddinguse_d_vector_file_init_d_vector)rw   r   rz   rz   r{   r@      s   

zAcousticModel.init_multispeaker	aux_inputc                 C   s   d\}}}}d| v r| d dur| d }|j dkr|d}d| v r6| d dur6t| d }|j dkr6|}d| v rD| d durD| d }||||fS )zCSet the speaker conditioning input based on the multi-speaker mode.)NNNNspeaker_idsNr   	d_vectors   	durations)ndim
unsqueeze_F	normalize)r   sidglidr   rz   rz   r{   _set_cond_input   s   


zAcousticModel._set_cond_inputc                 C   s    |  |\}}}}|d ||dS )N)r   	style_wavr   language_ids)r   )rw   r   r   r   r   _rz   rz   r{   get_aux_input   s   zAcousticModel.get_aux_inputc                 C   sb   | dd }| dd }|d ur|d urtd|d ur%t| ds%td|d ur-|}|S |}|S )Nr   r   z2[!] Cannot use d-vectors and speaker-ids together.emb_gz>[!] Cannot use speaker-ids without enabling speaker embedding.)get
ValueErrorhasattr)rw   r   r   r   r   rz   rz   r{   _set_speaker_input   s   z AcousticModel._set_speaker_inputc                 C   s6   | j dkrtd | jj| _t| j | j| _d S d S )Nr   z. > initialization of speaker-embedding layers.)r|   printr   speaker_embedding_channelsrI   r   	Embeddingr   rw   rz   rz   r{   r      s
   

z%AcousticModel._init_speaker_embeddingc                 C   s    t | dr	td| jj| _d S )Nr   zI[!] Speaker embedding layer already initialized before d_vector settings.)r   r   r   d_vector_dimrI   r   rz   rz   r{   r      s   
zAcousticModel._init_d_vectorc                 C   sp   |du r|  d }d||dk < tt|dd| j}t|dt|d }t| |d| j}|S )a  Generate an attention mask from the linear scale durations.

        Args:
            dr (Tensor): Linear scale durations.
            x_mask (Tensor): Mask for the input (character) sequence.
            y_mask (Tensor): Mask for the output (spectrogram) sequence. Compute it from the predicted durations
                if None. Defaults to None.

        Shapes
           - dr: :math:`(B, T_{en})`
           - x_mask: :math:`(B, T_{en})`
           - y_mask: :math:`(B, T_{de})`
        Nr#   r   )	sumlongrs   	unsqueezer   todtyper   squeeze)drx_masky_mask	y_lengths	attn_maskattnrz   rz   r{   generate_attn   s   zAcousticModel.generate_attno_enr   r   r   c                 C   sN   t t|d d|j}| |||}t d| |g}|||ddfS )Nr#   zkmn, kjm -> kjnr   )	rs   r   r   r   r   r   einsumrD   	transpose)rw   r   r   r   r   r   r   o_en_exrz   rz   r{   _expand_encoder_with_durations   s   z,AcousticModel._expand_encoder_with_durationsxyr   attn_priorsreturnc                 C   s   t |dt |d }| |dd|dd||\}}t|ddd |d }	t |	d }
|d}|	dd}	|
|||	fS )a  Aligner forward pass.

        1. Compute a mask to apply to the attention map.
        2. Run the alignment network.
        3. Apply MAS to compute the hard alignment map.
        4. Compute the durations from the hard alignment map.

        Args:
            x (torch.FloatTensor): Input sequence.
            y (torch.FloatTensor): Output sequence.
            x_mask (torch.IntTensor): Input sequence mask.
            y_mask (torch.IntTensor): Output sequence mask.
            attn_priors (torch.FloatTensor): Prior for the aligner network map.

        Returns:
            Tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
                Durations from the hard alignment map, soft alignment potentials, log scale alignment potentials,
                hard alignment map.

        Shapes:
            - x: :math:`[B, T_en, C_en]`
            - y: :math:`[B, T_de, C_de]`
            - x_mask: :math:`[B, 1, T_en]`
            - y_mask: :math:`[B, 1, T_de]`
            - attn_priors: :math:`[B, T_de, T_en]`

            - aligner_durations: :math:`[B, T_en]`
            - aligner_soft: :math:`[B, T_de, T_en]`
            - aligner_logprob: :math:`[B, 1, T_de, T_en]`
            - aligner_mas: :math:`[B, T_de, T_en]`
        r   r   r#   )	rs   r   rT   r   r   r   
contiguousr   rC   )rw   r   r   r   r   r   r   aligner_softaligner_logprobaligner_masaligner_durationsrz   rz   r{   _forward_aligner
  s   '$ 
zAcousticModel._forward_aligneru_prosody_predsrc_maskc                 C   s0   | d  d}|j ddd|ddd }|S )N      ?r#   T)keepdimr   )r   view)rw   r   r   lengthsrz   rz   r{   average_utterance_prosody;  s   z'AcousticModel.average_utterance_prosodyTtokenssrc_lensmelsmel_lenspitchesenergiesuse_ground_truthr   speaker_idxc           ,   	   C   s  |  |	|
d\}}}}t|}t|}| |}||dd}| j||dd|d d d f  |d d d f  |d\}}}}|}d }|	d urM|}n|
d urYt| 	|}t
| jt|jd t||jd}| j||||d}| | j||d	}| | j| j||d
|d}|r|| | }n|| | }| | j|||||d}| | j||d
}|r|| | }n|| | }|}| jj||||d\}} }!| jj||||d\}"}#}$|dd|! |$ }| j| |d
}%| j||||d d d f  d\}&}'}(| j |'dd|||d})| !|)})t"#|d }t"$|%d }*| %|*|d|&}+i d|)d|d| d|"d|#d|d|d|d|d|+d|(d|d|d|d|d|%&dd|&dd |iS )!Nr   r   r           r#   r   )r   r   r   r   r   devicespeaker_embeddingencoding)r   r   r   maskr   r   )r   r   r   r   r   )r   targetr   r   r   r   r   r   model_outputs
pitch_predpitch_targetenergy_predenergy_targetr   u_prosody_refp_prosody_predp_prosody_refalignments_dp
alignmentsr   r   r   r   dr_log_preddr_log_targetspk_emb)'r   r
   rq   masked_fillr   r   r   r   r   r   r   rF   maxshaper   rL   re   r\   r   r^   rc   rg   r`   ra   rf   rQ   get_pitch_embedding_trainrR   get_energy_embedding_trainrU   detachr   rm   rr   rs   logexpr   r   ),rw   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   mel_masktoken_embeddingsr   r   r   r   r   r   pos_encodingencoder_outputsr   r   r   r   encoder_outputs_resr   avg_pitch_target	pitch_embr   avg_energy_target
energy_emblog_duration_predictionmel_pred_maskencoder_outputs_exr   r   dr_predr   rz   rz   r{   forwardB  s   




	


zAcousticModel.forward	p_control	d_controlpitch_transformenergy_transformc           "      C   s  t tj|jd gtj|jd}t|jdd |j}	| ||d\}
}}}| |}|	|
dd}d }|d urA|}n|d urMt| |
}t| j|jd |jd}| j||||d}| | j| j||d	|d
}|| || }| | j||d	}|| || }|}| jj|||t| dr| jnd t| dr| jnd d\}}| jj|||d\}}| dd| | }| j!|" |d	}t#|d |  | j$ }d||dk < t%|}|&d}| j'|||(d|d d d f  d\}}}t tj|jd gtj|jd}|jd |jd kr&t| j|jd |jd}| j)| dd|||d} | *| } | |||||d}!|!S )Nr#   )r   r   r   r   r   r   r   r   r   r   
pitch_mean	pitch_std)r   r   r   r   r   )r   r   r   r   r   )r   r   r   pitchenergyr   )+r
   rs   tensorr   int64r   r   r   rq   r   r   r   r   r   r   rF   rL   re   r   r^   rc   	expand_asrg   ra   rf   rQ   get_pitch_embeddingr   r   r   rR   get_energy_embeddingr   rU   r   r   rB   roundr   r   r   rm   rr   )"rw   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   pitch_emb_predr   energy_emb_predr   log_duration_predduration_predr   r   r   r   r   r   outputsrz   rz   r{   	inference  s    






	zAcousticModel.inference)NN)r   r   r   r   r   r   )N)TNN)NNNNN)__name__
__module____qualname__r?   r   r@   staticmethodr   r   r   r   r   r   r   rs   FloatTensor	IntTensorr   r   r   Tensorr   boolstrr   no_gradrD   r   r  __classcell__rz   rz   rx   r{   r      s     	

1
	

 	r   )$typingr   r   r   rs   torch.nn.functionalr   
functionalr   coqpitr   'TTS.tts.layers.delightful_tts.conformerr   &TTS.tts.layers.delightful_tts.encodersr   r	   r
   ,TTS.tts.layers.delightful_tts.energy_adaptorr   &TTS.tts.layers.delightful_tts.networksr   r   7TTS.tts.layers.delightful_tts.phoneme_prosody_predictorr   +TTS.tts.layers.delightful_tts.pitch_adaptorr   0TTS.tts.layers.delightful_tts.variance_predictorr   TTS.tts.layers.generic.alignerr   TTS.tts.utils.helpersr   r   r   Moduler   rz   rz   rz   r{   <module>   s   