o
    ´‹
jlJ  ã                   @   sü   d dl mZmZ d dlmZmZmZ d dlZd dlm	Z	 d dlm
Z
 d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZmZmZ d dlmZ d dlmZ d dlm Z m!Z! d dl"m#Z# eG dd„ de	ƒƒZ$G dd„ deƒZ%dS )é    )Ú	dataclassÚfield)ÚDictÚListÚUnionN)ÚCoqpit)Únn)ÚMDNBlock)ÚDecoder)ÚDurationPredictor)ÚEncoder)ÚPositionalEncoding)ÚBaseTTS)Úgenerate_pathÚmaximum_pathÚsequence_mask)ÚSpeakerManager)ÚTTSTokenizer)Úplot_alignmentÚplot_spectrogram)Úload_fsspecc                   @   sÂ   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed< d	Z
eed
< edd„ dZeed< d	Zeed< edd„ dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dS )ÚAlignTTSArgsa¾  
    Args:
        num_chars (int):
            number of unique input to characters
        out_channels (int):
            number of output tensor channels. It is equal to the expected spectrogram size.
        hidden_channels (int):
            number of channels in all the model layers.
        hidden_channels_ffn (int):
            number of channels in transformer's conv layers.
        hidden_channels_dp (int):
            number of channels in duration predictor network.
        num_heads (int):
            number of attention heads in transformer networks.
        num_transformer_layers (int):
            number of layers in encoder and decoder transformer blocks.
        dropout_p (int):
            dropout rate in transformer layers.
        length_scale (int, optional):
            coefficient to set the speech speed. <1 slower, >1 faster. Defaults to 1.
        num_speakers (int, optional):
            number of speakers for multi-speaker training. Defaults to 0.
        external_c (bool, optional):
            enable external speaker embeddings. Defaults to False.
        c_in_channels (int, optional):
            number of channels in speaker embedding vectors. Defaults to 0.
    NÚ	num_charséP   Úout_channelsé   Úhidden_channelsÚhidden_channels_dpÚfftransformerÚencoder_typec                   C   ó   dddddœS ©Ni   é   é   gš™™™™™¹?)Úhidden_channels_ffnÚ	num_headsÚ
num_layersÚ	dropout_p© r(   r(   r(   úK/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/models/align_tts.pyÚ<lambda>9   ó    zAlignTTSArgs.<lambda>)Údefault_factoryÚencoder_paramsÚdecoder_typec                   C   r    r!   r(   r(   r(   r(   r)   r*   =   r+   Údecoder_paramsç      ð?Úlength_scaler   Únum_speakersFÚuse_speaker_embeddingÚuse_d_vector_fileÚd_vector_dim)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   ÚintÚ__annotations__r   r   r   r   Ústrr   r-   Údictr.   r/   r1   Úfloatr2   r3   Úboolr4   r5   r(   r(   r(   r)   r      s&   
 ÿÿr   c                       sˆ  e Zd ZdZ			dCddddddd	ef‡ fd
d„Zedd„ ƒZdd„ ZedDdd„ƒZ	dd„ Z
dd„ Zedd„ ƒZdd„ ZdDdd„Zdd„ Zdd„ Zd didfd!d"„Ze ¡ d difd#d$„ƒZd%ed&ejfd'd(„Zd)d*„ Zd%ed+ed,d-d.ed/ed0dfd1d2„Zd%ed&ejfd3d4„Zd%ed+ed,d-d.ed/ed0dfd5d6„Z	7dEd8d9„Zd:d;„ Zed<d=„ ƒZ d>d?„ Z!edDddd@e"e#e# e#e$ f fdAdB„ƒZ%‡  Z&S )FÚAlignTTSa!  AlignTTS with modified duration predictor.
    https://arxiv.org/pdf/2003.01950.pdf

    Encoder -> DurationPredictor -> Decoder

    Check :class:`AlignTTSArgs` for the class arguments.

    Paper Abstract:
        Targeting at both high efficiency and performance, we propose AlignTTS to predict the
        mel-spectrum in parallel. AlignTTS is based on a Feed-Forward Transformer which generates mel-spectrum from a
        sequence of characters, and the duration of each character is determined by a duration predictor.Instead of
        adopting the attention mechanism in Transformer TTS to align text to mel-spectrum, the alignment loss is presented
        to consider all possible alignments in training by use of dynamic programming. Experiments on the LJSpeech dataset s
        how that our model achieves not only state-of-the-art performance which outperforms Transformer TTS by 0.03 in mean
        option score (MOS), but also a high efficiency which is more than 50 times faster than real-time.

    Note:
        Original model uses a separate character embedding layer for duration predictor. However, it causes the
        duration predictor to overfit and prevents learning higher level interactions among characters. Therefore,
        we predict durations based on encoder outputs which has higher level information about input characters. This
        enables training without phases as in the original paper.

        Original model uses Transormers in encoder and decoder layers. However, here you can set the architecture
        differently based on your requirements using ```encoder_type``` and ```decoder_type``` parameters.

    Examples:
        >>> from TTS.tts.configs.align_tts_config import AlignTTSConfig
        >>> config = AlignTTSConfig()
        >>> model = AlignTTS(config)

    NÚconfigÚAlignTTSConfigÚapÚAudioProcessorÚ	tokenizerr   Úspeaker_managerc                    s6  t ƒ  ||||¡ || _d| _t|jjtƒrt|jjƒn|jj| _t	 
| jjj| jjj¡| _d| _|  |¡ t|jjƒ| _t|jj|jj|jj|jj| jƒ| _t|jj|jj|jj|jjƒ| _t|jjƒ| _t	 |jj|jjd¡| _ t!|jjd|jj ƒ| _"| jdkr—| j|jjkr™t	 | j|jjd¡| _#d S d S d S )Néÿÿÿÿr   é   r"   )$ÚsuperÚ__init__rF   ÚphaseÚ
isinstanceÚ
model_argsr1   r:   r>   r   Ú	EmbeddingrA   r   r   ÚembÚembedded_speaker_dimÚinit_multispeakerr   Úpos_encoderr   r   r-   Úencoderr
   r   r.   r/   Údecoderr   r   Úduration_predictorÚConv1dÚ	mod_layerr	   Ú	mdn_blockÚproj_g)ÚselfrA   rC   rE   rF   ©Ú	__class__r(   r)   rJ   i   s<   ÿý
ûüÿzAlignTTS.__init__c                 C   sŽ   |  dd¡ d¡}|   dd¡ d¡} |  dd¡ d¡}t || ¡\}}dtjtjj ||d¡t | 	¡ d¡ dd }|d|jdd  }|S )NrH   r"   g      à¿r   rG   )Údimg      à?)
Ú	transposeÚ	unsqueezeÚtorchÚbroadcast_tensorsÚmeanÚ_CÚ_nnÚmse_lossÚpowÚexp)ÚmuÚ	log_sigmaÚyÚ
expanded_yÚexpanded_muÚexponentialÚlogpr(   r(   r)   Úcompute_log_probs•   s   "ÿzAlignTTS.compute_log_probsc           
      C   sV   t  |d¡t  |d¡ }|  |||¡}t|| d¡ƒ d¡}t  |d¡}	|	 d¡|fS )NrG   r"   rH   )r`   r_   ro   r   ÚsqueezeÚsum)
rZ   rh   ri   rj   Úx_maskÚy_maskÚ	attn_maskÚlog_pÚattnÚdr_masr(   r(   r)   Úcompute_align_path¢   s
   zAlignTTS.compute_align_pathc                 C   sp   |d u r|   d¡ ¡ }d||dk < t t|d ƒd¡ | j¡}t |d¡t |d¡ }t| | d¡ƒ | j¡}|S )NrH   rG   r"   )	rq   Úlongr`   r_   r   ÚtoÚdtyper   rp   )Údrrr   rs   Ú	y_lengthsrt   rv   r(   r(   r)   Úgenerate_attn«   s   zAlignTTS.generate_attnc                 C   s@   |   |||¡}t | d¡ dd¡| dd¡¡ dd¡}||fS )a½  Generate attention alignment map from durations and
        expand encoder outputs

        Examples::
            - encoder output: [a,b,c,d]
            - durations: [1, 3, 2, 1]

            - expanded: [a, b, b, b, c, c, d]
            - attention map: [[0, 0, 0, 0, 0, 0, 1],
                             [0, 0, 0, 0, 1, 1, 0],
                             [0, 1, 1, 1, 0, 0, 0],
                             [1, 0, 0, 0, 0, 0, 0]]
        rH   r"   )r~   r`   Úmatmulrp   r^   )rZ   Úenr|   rr   rs   rv   Úo_en_exr(   r(   r)   Úexpand_encoder_outputs¶   s   *zAlignTTS.expand_encoder_outputsc                 C   s2   t  |¡d | | j }d||dk < t  |¡}|S )NrH   r0   )r`   rg   r1   Úround)rZ   Úo_dr_logrr   Úo_drr(   r(   r)   Úformat_durationsÈ   s   
zAlignTTS.format_durationsc                 C   s(   |  dd|  d¡¡}t | |gd¡} | S )NrG   rH   )ÚexpandÚsizer`   Úcat)Úo_enÚgÚg_expr(   r(   r)   Ú_concat_speaker_embeddingÎ   s   z"AlignTTS._concat_speaker_embeddingc                 C   s   t | dƒr
|  |¡}|| S )NrY   )ÚhasattrrY   )rZ   Úxr‹   r(   r(   r)   Ú_sum_speaker_embeddingÔ   s   

zAlignTTS._sum_speaker_embeddingc                 C   s˜   t | dƒrtj |  |¡¡}|d ur| d¡}|  |¡}t |dd¡}t t	||j
d ƒd¡ |j¡}|  ||¡}|d urD|  ||¡}n|}||||fS )NÚemb_grG   rH   )rŽ   r   Ú
functionalÚ	normalizeÚspeaker_embeddingr_   rO   r`   r^   r   Úshaperz   r{   rS   r   )rZ   r   Ú	x_lengthsr‹   Úx_embrr   rŠ   Úo_en_dpr(   r(   r)   Ú_forward_encoderÛ   s   


 zAlignTTS._forward_encoderc                 C   sx   t  t|d ƒd¡ |j¡}|  ||||¡\}}	t| dƒr"|  ||¡}|d ur,|  ||¡}| j	|||d}
|
|	 
dd¡fS )NrH   rR   ©r‹   r"   )r`   r_   r   rz   r{   r‚   rŽ   rR   r   rT   r^   )rZ   rŠ   r˜   r|   rr   r}   r‹   rs   r   rv   Úo_der(   r(   r)   Ú_forward_decoderô   s   
zAlignTTS._forward_decoderc           
      C   sJ   |   |¡\}}t t|d ƒd¡ |j¡}|  |||||¡\}}	||||	fS )NrH   )rX   r`   r_   r   rz   r{   rx   )
rZ   rŠ   rj   r}   rr   rh   ri   rs   rw   rn   r(   r(   r)   Ú_forward_mdn  s   zAlignTTS._forward_mdnÚ	d_vectorsc                 C   s@  |  dd¡}d|v r|d nd}d\}}	}
}}}}|dkrI|  |||¡\}}}}|  ||||¡\}}}}t t|dƒd¡ |j¡}|  |||¡}n½|dkrx|  |||¡\}}}}|  ||||¡\}}}}| j	| 
¡ | 
¡ | 
¡ |||d\}}nŽ|dkr¡|  |||¡\}}}}|  ||||¡\}}}}| j	||||||d\}}ne|dkrÕ|  |||¡\}}}}|  ||¡}	|  ||||¡\}}}}| j	||||||d\}}|	 d¡}	n1|  |||¡\}}}}|  | 
¡ |¡}	|  ||||¡\}}}}| j	||||||d\}}|	 d¡}	t |d ¡ d¡}
|  dd¡||	|
|||d	œ}|S )
zÌ
        Shapes:
            - x: :math:`[B, T_max]`
            - x_lengths: :math:`[B]`
            - y_lengths: :math:`[B]`
            - dr: :math:`[B, T_max]`
            - g: :math:`[B, C]`
        rH   r"   rž   N)NNNNNNNr   rš   é   )Úmodel_outputsÚ
alignmentsÚdurations_logÚdurations_mas_logrh   ri   rn   )r^   r™   r   r`   r_   r   rz   r{   r~   rœ   ÚdetachrU   rp   Úlog)rZ   r   r–   rj   r}   Ú	aux_inputrK   r‹   r›   r„   Ú
dr_mas_logrv   rh   ri   rn   rŠ   r˜   rr   rw   rs   Ú_Úoutputsr(   r(   r)   Úforward	  sJ   (

ù	zAlignTTS.forwardc                 C   sž   d|v r|d nd}t  |jdd… ¡ |j¡}|  |||¡\}}}}|  ||¡}|  ||¡ d¡}	|	 	d¡}
| j
|||	||
|d\}}| dd¡|dœ}|S )z‚
        Shapes:
            - x: :math:`[B, T_max]`
            - x_lengths: :math:`[B]`
            - g: :math:`[B, C]`
        rž   NrH   r"   rš   )r    r¡   )r`   Útensorr•   rz   Údevicer™   rU   r†   rp   rq   rœ   r^   )rZ   r   r¦   r‹   r–   rŠ   r˜   rr   r„   r…   r}   r›   rv   r©   r(   r(   r)   Ú	inference@  s   
zAlignTTS.inferenceÚbatchÚ	criterionc              
   C   s‚   |d }|d }|d }|d }|d }|d }||dœ}	|   |||||	| j¡}
||
d |
d	 |||
d
 |
d || jd}|
|fS )NÚ
text_inputÚtext_lengthsÚ	mel_inputÚmel_lengthsrž   Úspeaker_ids)rž   r´   rn   r    r¢   r£   )rK   )rª   rK   )rZ   r®   r¯   r°   r±   r²   r³   rž   r´   r¦   r©   Ú	loss_dictr(   r(   r)   Ú
train_stepV  s&   
øzAlignTTS.train_stepc                 C   sŽ   |d }|d }|d }|d j  ¡  ¡ }|d j  ¡  ¡ }|d j  ¡  ¡ }	t||ddt||ddt|	dddœ}
| |j¡}|
d|ifS )	Nr    r¡   r²   r   F)Ú
output_fig)Ú
predictionÚground_truthÚ	alignmentÚaudio)ÚdataÚcpuÚnumpyr   r   Úinv_melspectrogramÚT)rZ   r®   r©   rC   r    r¡   r²   Ú	pred_specÚgt_specÚ	align_imgÚfiguresÚtrain_audior(   r(   r)   Ú_create_logsm  s   
ýzAlignTTS._create_logsr©   ÚloggerÚLoggerÚassetsÚstepsÚreturnc                 C   ó6   |   ||| j¡\}}| ||¡ | ||| jj¡ d S ©N)rÆ   rC   Útrain_figuresÚtrain_audiosÚsample_rate©rZ   r®   r©   rÇ   rÉ   rÊ   rÄ   Úaudiosr(   r(   r)   Ú	train_log€  s   zAlignTTS.train_logc                 C   s   |   ||¡S rÍ   )r¶   )rZ   r®   r¯   r(   r(   r)   Ú	eval_step‡  s   zAlignTTS.eval_stepc                 C   rÌ   rÍ   )rÆ   rC   Úeval_figuresÚeval_audiosrÐ   rÑ   r(   r(   r)   Úeval_logŠ  s   zAlignTTS.eval_logFc                 C   s@   t |t d¡|d}|  |d ¡ |r|  ¡  | jrJ ‚d S d S )Nr½   )Úmap_locationÚcacheÚmodel)r   r`   r¬   Úload_state_dictÚevalÚtraining)rZ   rA   Úcheckpoint_pathrÜ   rÙ   Ústater(   r(   r)   Úload_checkpoint  s   
þzAlignTTS.load_checkpointc                 C   s   ddl m} || jƒS )Nr   )ÚAlignTTSLoss)ÚTTS.tts.layers.lossesrá   rA   )rZ   rá   r(   r(   r)   Úget_criterion˜  s   
zAlignTTS.get_criterionc                    sn   t | jtƒr3‡ fdd„| jD ƒ}d|vrd}|S t| jƒ‡ fdd„| jD ƒddd…  d¡ d }|S d}|S )	zDecide AlignTTS training phasec                    ó   g | ]}|ˆ k ‘qS r(   r(   ©Ú.0Úi©Úglobal_stepr(   r)   Ú
<listcomp>¡  ó    z'AlignTTS._set_phase.<locals>.<listcomp>Tr   c                    rä   r(   r(   rå   rè   r(   r)   rê   §  rë   NrG   rH   )rL   Úphase_start_stepsÚlistÚlenÚindex)rA   ré   ÚvalsrK   r(   rè   r)   Ú
_set_phase  s   	ú"ÿþÿÿzAlignTTS._set_phasec                 C   s   |   |j|j¡| _dS )z+Set AlignTTS training phase on epoch start.N)rñ   rA   Útotal_steps_donerK   )rZ   Útrainerr(   r(   r)   Úon_epoch_start®  s   zAlignTTS.on_epoch_startÚsamplesc                 C   s>   ddl m} | | ¡}t | ¡\}}t | |¡}t||||ƒS )zðInitiate model from config

        Args:
            config (AlignTTSConfig): Model config.
            samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training.
                Defaults to None.
        r   )rD   )ÚTTS.utils.audiorD   Úinit_from_configr   r   r@   )rA   rõ   rD   rC   rE   Ú
new_configrF   r(   r(   r)   r÷   ²  s
   	
zAlignTTS.init_from_config)NNNrÍ   )FF)'r6   r7   r8   r9   r   rJ   Ústaticmethodro   rx   r~   r‚   r†   r   r   r™   rœ   r   rª   r`   Úno_gradr­   r=   r   ÚModuler¶   rÆ   r:   rÓ   rÔ   r×   rà   rã   rñ   rô   r   r   r   r÷   Ú__classcell__r(   r(   r[   r)   r@   F   sp    %ûþýüû,
	



ÿ7ÿÿÿÿÿ
þ"
ÿ	
.r@   )&Údataclassesr   r   Útypingr   r   r   r`   Úcoqpitr   r   ÚTTS.tts.layers.align_tts.mdnr	   Ú#TTS.tts.layers.feed_forward.decoderr
   Ú.TTS.tts.layers.feed_forward.duration_predictorr   Ú#TTS.tts.layers.feed_forward.encoderr   Ú#TTS.tts.layers.generic.pos_encodingr   ÚTTS.tts.models.base_ttsr   ÚTTS.tts.utils.helpersr   r   r   ÚTTS.tts.utils.speakersr   ÚTTS.tts.utils.text.tokenizerr   ÚTTS.tts.utils.visualr   r   ÚTTS.utils.ior   r   r@   r(   r(   r(   r)   Ú<module>   s&    0