o
    ´‹
jú^  ã                   @   sð   d dl Z d dlmZmZmZmZ d dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZmZ d dlmZ d dlmZ d dlm Z  d dl!m"Z"m#Z# d dl$m%Z% G dd„ deƒZ&dS )é    N)ÚDictÚListÚTupleÚUnion)ÚCoqpit)Únn)Úautocast)Ú
functional)ÚGlowTTSConfig)ÚDecoder)ÚEncoder)ÚBaseTTS)Úgenerate_pathÚmaximum_pathÚsequence_mask)ÚSpeakerManager)Ú	synthesis)ÚTTSTokenizer)Úplot_alignmentÚplot_spectrogram)Úload_fsspecc                       sð  e Zd ZdZ			dFdedddddef‡ fd	d
„Zdefdd„Ze	dd„ ƒZ
dd„ Zdd„ Zdefdd„Zdedeejdf fdd„Zddddœfdd„Ze ¡ dddddœfdd„ƒZe ¡ ddddœfdd„ƒZe ¡ dddd œfd!d"„ƒZd#ed$ejfd%d&„Zd'd(„ Zd#ed)ed*d+d,ed-eddfd.d/„Ze ¡ d#ed$ejfd0d1„ƒZd#ed)ed*d+d,ed-eddfd2d3„Z e ¡ d,ede!eef fd4d5„ƒZ"dGd6d7„Z#d8d9„ Z$	:dHd;d<„Z%e	d=d>„ ƒZ&d?d@„ Z'e	dIddBdCee(e( e(e f fdDdE„ƒZ)‡  Z*S )JÚGlowTTSa¬  GlowTTS model.

    Paper::
        https://arxiv.org/abs/2005.11129

    Paper abstract::
        Recently, text-to-speech (TTS) models such as FastSpeech and ParaNet have been proposed to generate
        mel-spectrograms from text in parallel. Despite the advantage, the parallel TTS models cannot be trained
        without guidance from autoregressive TTS models as their external aligners. In this work, we propose Glow-TTS,
        a flow-based generative model for parallel TTS that does not require any external aligner. By combining the
        properties of flows and dynamic programming, the proposed model searches for the most probable monotonic
        alignment between text and the latent representation of speech on its own. We demonstrate that enforcing hard
        monotonic alignments enables robust TTS, which generalizes to long utterances, and employing generative flows
        enables fast, diverse, and controllable speech synthesis. Glow-TTS obtains an order-of-magnitude speed-up over
        the autoregressive model, Tacotron 2, at synthesis with comparable speech quality. We further show that our
        model can be easily extended to a multi-speaker setting.

    Check :class:`TTS.tts.configs.glow_tts_config.GlowTTSConfig` for class arguments.

    Examples:
        Init only model layers.

        >>> from TTS.tts.configs.glow_tts_config import GlowTTSConfig
        >>> from TTS.tts.models.glow_tts import GlowTTS
        >>> config = GlowTTSConfig(num_chars=2)
        >>> model = GlowTTS(config)

        Fully init a model ready for action. All the class attributes and class members
        (e.g Tokenizer, AudioProcessor, etc.). are initialized internally based on config values.

        >>> from TTS.tts.configs.glow_tts_config import GlowTTSConfig
        >>> from TTS.tts.models.glow_tts import GlowTTS
        >>> config = GlowTTSConfig()
        >>> model = GlowTTS.init_from_config(config, verbose=False)
    NÚconfigÚapÚAudioProcessorÚ	tokenizerr   Úspeaker_managerc                    s¼   t ƒ  ||||¡ || _|D ]
}t| ||| ƒ q|j| _|  |¡ |jdk| _t	| j
| j| j| j| j| j| j| j| j| jd
| _t| j| j| j| j| j| j| j| j| j| j| jd| _d S )Nr   )	Úout_channelsÚhidden_channelsÚhidden_channels_dpÚencoder_typeÚencoder_paramsÚ	mean_onlyÚ
use_prenetÚdropout_p_dpÚc_in_channels)Ú	dropout_pÚ
num_splitsÚnum_squeezeÚsigmoid_scaler%   )ÚsuperÚ__init__r   Úsetattrr   Údecoder_output_dimÚinit_multispeakerÚdata_dep_init_stepsÚrun_data_dep_initr   Ú	num_charsÚhidden_channels_encr   r    r!   r"   Úuse_encoder_prenetr$   r%   Úencoderr   Úhidden_channels_decÚkernel_size_decÚdilation_rateÚnum_flow_blocks_decÚnum_block_layersÚdropout_p_decr'   r(   r)   Údecoder)Úselfr   r   r   r   Úkey©Ú	__class__© úJ/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/models/glow_tts.pyr+   ;   s@   
öõzGlowTTS.__init__c                 C   s¬   d| _ | jdur| jj| _|jr/d|v r|jdur|jnd| _ | jdur/|j| jjks/J dƒ‚|jrP|jsPtdƒ | j| _ t	 
| j| j¡| _t	j | jjdd¡ | j | _dS )	am  Init speaker embedding layer if `use_speaker_embedding` is True and set the expected speaker embedding
        vector dimension to the encoder layer channel size. If model uses d-vectors, then it only sets
        speaker embedding vector dimension to the d-vector dimension from the config.

        Args:
            config (Coqpit): Model configuration.
        r   NÚd_vector_dimi   z@ [!] d-vector dimension mismatch b/w config and speaker manager.z  > Init speaker_embedding layer.gš™™™™™¹¿gš™™™™™¹?)Úembedded_speaker_dimr   Únum_speakersÚuse_d_vector_filerB   Úembedding_dimÚuse_speaker_embeddingÚprintr2   r   Ú	EmbeddingÚemb_gÚinitÚuniform_Úweightr%   )r<   r   r@   r@   rA   r.   k   s    

ÿ
ÿzGlowTTS.init_multispeakerc                 C   sx   t  |  d¡ dd¡| dd¡¡ dd¡}t  |  d¡ dd¡| dd¡¡ dd¡}t  dt  | d¡ ¡| }|||fS )z@Compute and format the mode outputs with the given alignment mapé   é   éÿÿÿÿ)ÚtorchÚmatmulÚsqueezeÚ	transposeÚlogÚsum)ÚattnÚo_meanÚo_log_scaleÚx_maskÚy_meanÚy_log_scaleÚ
o_attn_durr@   r@   rA   Úcompute_outputs‰   s   "ÿ"ÿ
zGlowTTS.compute_outputsc                 C   s(   | j jD ]}t|ddƒr| d¡ qdS )zGUnlock activation normalization layers for data depended initalization.Úset_ddiFTN©r;   ÚflowsÚgetattrr_   ©r<   Úfr@   r@   rA   Úunlock_act_norm_layers–   ó
   
€þzGlowTTS.unlock_act_norm_layersc                 C   s(   | j jD ]}t|ddƒr| d¡ qdS )z%Lock activation normalization layers.r_   FNr`   rc   r@   r@   rA   Úlock_act_norm_layersœ   rf   zGlowTTS.lock_act_norm_layersÚ	aux_inputc                 C   st   |d u r	d }d }n|  dd ¡}|  dd ¡}|d ur!|d ur!tdƒ‚|d ur.t| dƒs.tdƒ‚|d ur6|}|S |}|S )NÚ	d_vectorsÚspeaker_idsz2[!] Cannot use d-vectors and speaker-ids together.rJ   z>[!] Cannot use speaker-ids without enabling speaker embedding.)ÚgetÚ
ValueErrorÚhasattr)r<   rh   ri   rj   Úgr@   r@   rA   Ú_set_speaker_input¢   s   ÿzGlowTTS._set_speaker_inputÚreturnc                 C   s\   |   |¡}|d ur,t| dƒr$| ¡ s| d¡}t |  |¡¡ d¡}|S t |¡ d¡}|S )NrJ   r   rP   )ro   rm   ÚsizeÚ	unsqueezeÚFÚ	normalizerJ   )r<   rh   rn   r@   r@   rA   Ú_speaker_embedding³   s   


ÿzGlowTTS._speaker_embedding©ri   rj   c              
   C   sÞ  |  dd¡}| d¡}|  |¡}| j|||d\}}	}
}|  |||d¡\}}}}t t||ƒd¡ |j	¡}t |d¡t |d¡ }| j
|||dd\}}t ¡ b t d|	 ¡}t d	t dtj ¡ |	 dg¡ d¡}t |  dd¡d	|d  ¡}t ||   dd¡|¡}t d	|d  | dg¡ d¡}|| | | }t|| d¡ƒ d¡ ¡ }W d  ƒ n1 sµw   Y  |  |||	|¡\}}}| d¡ d
dd¡}|  dd¡||  dd¡|  dd¡||
  dd¡|  dd¡dœ}|S )a?  
        Args:
            x (torch.Tensor):
                Input text sequence ids. :math:`[B, T_en]`

            x_lengths (torch.Tensor):
                Lengths of input text sequences. :math:`[B]`

            y (torch.Tensor):
                Target mel-spectrogram frames. :math:`[B, T_de, C_mel]`

            y_lengths (torch.Tensor):
                Lengths of target mel-spectrogram frames. :math:`[B]`

            aux_input (Dict):
                Auxiliary inputs. `d_vectors` is speaker embedding vectors for a multi-speaker model.
                :math:`[B, D_vec]`. `speaker_ids` is speaker ids for a multi-speaker model usind speaker-embedding
                layer. :math:`B`

        Returns:
            Dict:
                - z: :math: `[B, T_de, C]`
                - logdet: :math:`B`
                - y_mean: :math:`[B, T_de, C]`
                - y_log_scale: :math:`[B, T_de, C]`
                - alignments: :math:`[B, T_en, T_de]`
                - durations_log: :math:`[B, T_en, 1]`
                - total_durations_log: :math:`[B, T_en, 1]`
        rN   rO   ©rn   NrP   F©rn   Úreverseéþÿÿÿç      à¿r   )ÚzÚlogdetr[   r\   Ú
alignmentsÚdurations_logÚtotal_durations_log)rT   rq   ru   r4   Ú
preprocessrQ   rr   r   ÚtoÚdtyper;   Úno_gradÚexprV   ÚmathrU   ÚpirR   r   rS   Údetachr^   Úpermute©r<   ÚxÚ	x_lengthsÚyÚ	y_lengthsrh   Úy_max_lengthrn   rX   rY   Ú	o_dur_logrZ   rW   Úy_maskÚ	attn_maskr|   r}   Úo_scaleÚlogp1Úlogp2Úlogp3Úlogp4Úlogpr[   r\   r]   Úoutputsr@   r@   rA   ÚforwardÁ   s8   !


( ù




ù	zGlowTTS.forwardc              
   C   sÔ  |  dd¡}| d¡}|  |¡}| j|||d\}}	}
}|  |||d¡\}}}}t t||ƒd¡ |j	¡}t |d¡t |d¡ }| j
|||dd\}}t d|	 ¡}t d	t dtj ¡ |	 dg¡ d¡}t |  dd¡d	|d  ¡}t ||   dd¡|¡}t d	|d  | dg¡ d¡}|| | | }t|| d¡ƒ d¡ ¡ }|  |||	|¡\}}}| d¡ d
dd¡}|| }| j
|||dd\}}|  dd¡||  dd¡|  dd¡||
  dd¡|  dd¡dœ}|S )a>  
        It's similar to the teacher forcing in Tacotron.
        It was proposed in: https://arxiv.org/abs/2104.05557

        Shapes:
            - x: :math:`[B, T]`
            - x_lenghts: :math:`B`
            - y: :math:`[B, T, C]`
            - y_lengths: :math:`B`
            - g: :math:`[B, C] or B`
        rN   rO   rw   NrP   Frx   rz   r{   r   T©Úmodel_outputsr}   r[   r\   r~   r   r€   )rT   rq   ru   r4   r   rQ   rr   r   r‚   rƒ   r;   r…   rV   r†   rU   r‡   rR   r   rS   rˆ   r^   r‰   rŠ   r@   r@   rA   Úinference_with_MAS  s8   

( 




ù	zGlowTTS.inference_with_MASc           
      C   s†   |  dd¡}| d¡}|  |¡}t t||ƒd¡ |j¡}| j|||dd\}}| j|||dd\}}i }	|  dd¡|	d< ||	d< |	S )z„
        Shapes:
            - y: :math:`[B, T, C]`
            - y_lengths: :math:`B`
            - g: :math:`[B, C] or B`
        rN   rO   Frx   Trœ   r}   )	rT   rq   ru   rQ   rr   r   r‚   rƒ   r;   )
r<   r   rŽ   rh   r   rn   r‘   r|   r}   r™   r@   r@   rA   Údecoder_inference>  s   


zGlowTTS.decoder_inference©rŒ   ri   rj   c              
   C   sd  |d }|   |¡}| j|||d\}}}}t |¡d | | j }	t t |	¡d¡}
t t |
ddg¡d¡ ¡ }d }t 	t
||ƒd¡ |j¡}t 	|d¡t 	|d¡ }t|
 d¡| d¡ƒ 	d¡}|  ||||¡\}}}|t |¡t |¡ | j  | }| j|||dd\}}| d¡ ddd¡}| dd¡|| dd¡| dd¡|| dd¡| dd¡d	œ}|S )
NrŒ   rw   rN   rO   rP   Trx   r   r›   )ru   r4   rQ   r…   Úlength_scaleÚ	clamp_minÚceilrV   Úlongrr   r   r‚   rƒ   r   rS   r^   Ú
randn_likeÚinference_noise_scaler;   r‰   rT   )r<   r‹   rh   rŒ   rn   rX   rY   r   rZ   ÚwÚw_ceilrŽ   r   r‘   r’   rW   r[   r\   r]   r|   r   r}   r™   r@   r@   rA   Ú	inferenceU  s.   
"




ù	zGlowTTS.inferenceÚbatchÚ	criterionc              
   C   s6  |d }|d }|d }|d }|d }|d }| j rO| jrO|  ¡  t ¡  | j||||||dœd}	W d	  ƒ n1 s>w   Y  d	}
d	}|  ¡  |
|fS | j||||||dœd}
td
d. ||
d  ¡ |
d  ¡ |
d  ¡ |
d  ¡ ||
d  ¡ |
d  ¡ |ƒ}W d	  ƒ |
|fS 1 s’w   Y  |
|fS )a  A single training step. Forward pass and loss computation. Run data depended initialization for the
        first `config.data_dep_init_steps` steps.

        Args:
            batch (dict): [description]
            criterion (nn.Module): [description]
        Ú
text_inputÚtext_lengthsÚ	mel_inputÚmel_lengthsri   rj   rv   ©rh   NF)Úenabledr|   r[   r\   r}   r   r€   )	r0   Útrainingre   rQ   r„   rš   rg   r   Úfloat)r<   r©   rª   r«   r¬   r­   r®   ri   rj   Ú_r™   Ú	loss_dictr@   r@   rA   Ú
train_stepx  sV   
ûÿíû





ø
ÿõzGlowTTS.train_stepc                 C   s  |d }|d d ur|d d d… nd }|d }|d }|d d ur*|d d d… nd }|d d ur:|d d d… nd }	| j ||d d… ||	dœd	}
|
d
 }|d j ¡  ¡ }|d j ¡  ¡ }|d j ¡  ¡ }t||ddt||ddt|dddœ}| |j¡}|d|ifS )Nr~   r«   rN   r¬   r­   ri   rj   rŸ   r¯   rœ   r   F©Ú
output_fig)Ú
predictionÚground_truthÚ	alignmentÚaudio)r¨   ÚdataÚcpuÚnumpyr   r   Úinv_melspectrogramÚT)r<   r©   r™   r   r~   r«   r¬   r­   ri   rj   Úpred_outputsrœ   Ú	pred_specÚgt_specÚ	align_imgÚfiguresÚtrain_audior@   r@   rA   Ú_create_logs¬  s(      þ
ýzGlowTTS._create_logsr™   ÚloggerÚLoggerÚassetsÚstepsc                 C   ó6   |   ||| j¡\}}| ||¡ | ||| jj¡ d S ©N)rÇ   r   Útrain_figuresÚtrain_audiosÚsample_rate©r<   r©   r™   rÈ   rÊ   rË   rÅ   Úaudiosr@   r@   rA   Ú	train_logÉ  s   zGlowTTS.train_logc                 C   s   |   ||¡S rÍ   )rµ   )r<   r©   rª   r@   r@   rA   Ú	eval_stepÐ  s   zGlowTTS.eval_stepc                 C   rÌ   rÍ   )rÇ   r   Úeval_figuresÚeval_audiosrÐ   rÑ   r@   r@   rA   Úeval_logÔ  s   zGlowTTS.eval_logc           	      C   sÞ   t dƒ i }i }| jj}|  ¡ }t|ƒdkrt dƒ ||fS t|ƒD ]H\}}t| || jdtt|  	¡ ƒj
ƒv |d |d |d dd	d
	}|d |d |¡< t|d d | jd	d|d |¡< t|d d	d|d |¡< q"||fS )zàGeneric test run for `tts` models used by `Trainer`.

        You can override this for a different behaviour.

        Returns:
            Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard.
        z! | > Synthesizing test sentences.r   z" | [!] No test sentences provided.ÚcudaÚ
speaker_idÚd_vectorÚ	style_wavTF)rÙ   rÚ   rÛ   Úuse_griffin_limÚdo_trim_silenceÚwavz{}-audior™   rœ   r¶   z{}-predictionr~   z{}-alignment)rH   r   Útest_sentencesÚ_get_test_aux_inputÚlenÚ	enumerater   ÚstrÚnextÚ
parametersÚdeviceÚformatr   r   r   )	r<   rÊ   Útest_audiosÚtest_figuresrß   Ú
aux_inputsÚidxÚsenr™   r@   r@   rA   Útest_runÙ  s4   	î÷ÿzGlowTTS.test_runc                 C   s~   |d ur-|| j  | j  }|d d …d d …d |…f }|d ur-|d d …d d …d d …d |…f }tj|| j dd| j  }||||fS )NÚfloor)Úrounding_mode)r(   rQ   Údiv)r<   r   rŽ   r   rW   r@   r@   rA   r   þ  s    zGlowTTS.preprocessc                 C   s   | j  ¡  d S rÍ   )r;   Ústore_inverse)r<   r@   r@   rA   rñ     s   zGlowTTS.store_inverseFc                 C   sF   t |t d¡d}|  |d ¡ |r|  ¡  |  ¡  | jr!J ‚d S d S )Nr½   )Úmap_locationÚmodel)r   rQ   ræ   Úload_state_dictÚevalrñ   r±   )r<   r   Úcheckpoint_pathrõ   Ústater@   r@   rA   Úload_checkpoint
  s   
ýzGlowTTS.load_checkpointc                  C   s   ddl m}  | ƒ S )Nr   ©ÚGlowTTSLoss)ÚTTS.tts.layers.lossesrú   rù   r@   r@   rA   Úget_criterion  s   zGlowTTS.get_criterionc                 C   s   |j | jk | _dS )zQDecide on every training step wheter enable/disable data depended initialization.N)Útotal_steps_doner/   r0   )r<   Útrainerr@   r@   rA   Úon_train_step_start  s   zGlowTTS.on_train_step_startTr
   Úsamplesc                 C   s@   ddl m} | | |¡}t | ¡\}}t | |¡}t||||ƒS )a8  Initiate model from config

        Args:
            config (VitsConfig): Model config.
            samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training.
                Defaults to None.
            verbose (bool): If True, print init messages. Defaults to True.
        r   )r   )ÚTTS.utils.audior   Úinit_from_configr   r   r   )r   r   Úverboser   r   r   Ú
new_configr   r@   r@   rA   r    s
   
zGlowTTS.init_from_config)NNNrÍ   )F)NT)+Ú__name__Ú
__module__Ú__qualname__Ú__doc__r
   r   r+   r   r.   Ústaticmethodr^   re   rg   r   ro   r   rQ   Útensorru   rš   r„   r   rž   r¨   Údictr   ÚModulerµ   rÇ   ÚintrÓ   rÔ   r×   r   rí   r   rñ   rø   rü   rÿ   r   r  Ú__classcell__r@   r@   r>   rA   r      sz    'ûþýüû0


ÿEÿ7
ÿ
ÿ"4ÿÿÿÿÿ
þ"
$	
ÿ

.r   )'r†   Útypingr   r   r   r   rQ   Úcoqpitr   r   Útorch.cuda.amp.autocast_moder   Útorch.nnr	   rs   ÚTTS.tts.configs.glow_tts_configr
   ÚTTS.tts.layers.glow_tts.decoderr   ÚTTS.tts.layers.glow_tts.encoderr   ÚTTS.tts.models.base_ttsr   ÚTTS.tts.utils.helpersr   r   r   ÚTTS.tts.utils.speakersr   ÚTTS.tts.utils.synthesisr   ÚTTS.tts.utils.text.tokenizerr   ÚTTS.tts.utils.visualr   r   ÚTTS.utils.ior   r   r@   r@   r@   rA   Ú<module>   s$    