o
    
j                     @   s  d dl mZmZ d dlmZmZmZmZ d dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZmZmZmZ d dlm Z  d dl!m"Z" d dl#m$Z$m%Z%m&Z&m'Z' d dl(m)Z) eG dd de
Z*G dd deZ+dS )    )	dataclassfield)DictListTupleUnionN)Coqpit)nn)autocast)Decoder)Encoder)AlignmentNetwork)PositionalEncoding)DurationPredictor)BaseTTS)average_over_durationsgenerate_pathmaximum_pathsequence_mask)SpeakerManager)TTSTokenizer)plot_alignmentplot_avg_energyplot_avg_pitchplot_spectrogram)load_fsspecc                   @   s  e Zd ZU dZdZeed< dZeed< dZeed< dZ	e
ed	< dZe
ed
< dZeed< dZeed< dZeed< dZeed< dZe
ed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZe
ed< dZe
ed< dZeed< dZeed < ed!d" d#Ze ed$< dZ!eed%< ed&d" d#Z"e ed'< dZ#e
ed(< d)Z$eed*< dZ%eed+< dZ&e
ed,< dZ'eed-< dZ(e
ed.< dZ)eed/< dZ*eed0< dS )1ForwardTTSArgsa  ForwardTTS Model arguments.

    Args:

        num_chars (int):
            Number of characters in the vocabulary. Defaults to 100.

        out_channels (int):
            Number of output channels. Defaults to 80.

        hidden_channels (int):
            Number of base hidden channels of the model. Defaults to 512.

        use_aligner (bool):
            Whether to use aligner network to learn the text to speech alignment or use pre-computed durations.
            If set False, durations should be computed by `TTS/bin/compute_attention_masks.py` and path to the
            pre-computed durations must be provided to `config.datasets[0].meta_file_attn_mask`. Defaults to True.

        use_pitch (bool):
            Use pitch predictor to learn the pitch. Defaults to True.

        use_energy (bool):
            Use energy predictor to learn the energy. Defaults to True.

        duration_predictor_hidden_channels (int):
            Number of hidden channels in the duration predictor. Defaults to 256.

        duration_predictor_dropout_p (float):
            Dropout rate for the duration predictor. Defaults to 0.1.

        duration_predictor_kernel_size (int):
            Kernel size of conv layers in the duration predictor. Defaults to 3.

        pitch_predictor_hidden_channels (int):
            Number of hidden channels in the pitch predictor. Defaults to 256.

        pitch_predictor_dropout_p (float):
            Dropout rate for the pitch predictor. Defaults to 0.1.

        pitch_predictor_kernel_size (int):
            Kernel size of conv layers in the pitch predictor. Defaults to 3.

        pitch_embedding_kernel_size (int):
            Kernel size of the projection layer in the pitch predictor. Defaults to 3.

        energy_predictor_hidden_channels (int):
            Number of hidden channels in the energy predictor. Defaults to 256.

        energy_predictor_dropout_p (float):
            Dropout rate for the energy predictor. Defaults to 0.1.

        energy_predictor_kernel_size (int):
            Kernel size of conv layers in the energy predictor. Defaults to 3.

        energy_embedding_kernel_size (int):
            Kernel size of the projection layer in the energy predictor. Defaults to 3.

        positional_encoding (bool):
            Whether to use positional encoding. Defaults to True.

        positional_encoding_use_scale (bool):
            Whether to use a learnable scale coeff in the positional encoding. Defaults to True.

        length_scale (int):
            Length scale that multiplies the predicted durations. Larger values result slower speech. Defaults to 1.0.

        encoder_type (str):
            Type of the encoder module. One of the encoders available in :class:`TTS.tts.layers.feed_forward.encoder`.
            Defaults to `fftransformer` as in the paper.

        encoder_params (dict):
            Parameters of the encoder module. Defaults to ```{"hidden_channels_ffn": 1024, "num_heads": 1, "num_layers": 6, "dropout_p": 0.1}```

        decoder_type (str):
            Type of the decoder module. One of the decoders available in :class:`TTS.tts.layers.feed_forward.decoder`.
            Defaults to `fftransformer` as in the paper.

        decoder_params (str):
            Parameters of the decoder module. Defaults to ```{"hidden_channels_ffn": 1024, "num_heads": 1, "num_layers": 6, "dropout_p": 0.1}```

        detach_duration_predictor (bool):
            Detach the input to the duration predictor from the earlier computation graph so that the duraiton loss
            does not pass to the earlier layers. Defaults to True.

        max_duration (int):
            Maximum duration accepted by the model. Defaults to 75.

        num_speakers (int):
            Number of speakers for the speaker embedding layer. Defaults to 0.

        speakers_file (str):
            Path to the speaker mapping file for the Speaker Manager. Defaults to None.

        speaker_embedding_channels (int):
            Number of speaker embedding channels. Defaults to 256.

        use_d_vector_file (bool):
            Enable/Disable the use of d-vectors for multi-speaker training. Defaults to False.

        d_vector_dim (int):
            Number of d-vector channels. Defaults to 0.

    N	num_charsP   out_channelsi  hidden_channelsTuse_aligner	use_pitch   pitch_predictor_hidden_channels   pitch_predictor_kernel_size皙?pitch_predictor_dropout_ppitch_embedding_kernel_sizeF
use_energy energy_predictor_hidden_channelsenergy_predictor_kernel_sizeenergy_predictor_dropout_penergy_embedding_kernel_size"duration_predictor_hidden_channelsduration_predictor_kernel_sizeduration_predictor_dropout_ppositional_encodingpoisitonal_encoding_use_scale   length_scalefftransformerencoder_typec                   C      dddddS Ni   r4      r'   )hidden_channels_ffn	num_heads
num_layers	dropout_p r?   r?   r?   M/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/models/forward_tts.py<lambda>       zForwardTTSArgs.<lambda>)default_factoryencoder_paramsdecoder_typec                   C   r8   r9   r?   r?   r?   r?   r@   rA      rB   decoder_paramsdetach_duration_predictorK   max_durationnum_speakersuse_speaker_embeddingspeakers_fileuse_d_vector_filed_vector_dimd_vector_file)+__name__
__module____qualname____doc__r   int__annotations__r   r    r!   boolr"   r$   r&   r(   floatr)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r5   r7   strr   rD   dictrE   rF   rG   rI   rJ   rK   rL   rM   rN   rO   r?   r?   r?   r@   r      sL   
 hr   c                       s  e Zd ZdZ			dMdedddddef fd	d
ZdefddZedNddZ	dd Z
dd Z	dNdejdejdejdeejejejejejf fddZdejdejdejdejdejdeejejf fddZ		dOdejdejdejdejdeejejf f
dd Z		dOdejdejd!ejdejdeejejf f
d"d#Zdejd$ejdejd%ejdeejejejejf f
d&d'Zd(efd)d*Zddddddd+fdejd,ejdejd$ejdejdejd!ejd(edefd-d.Ze ddd+fd/d0Zd1ed2ejfd3d4Zd5d6 Zd1ed7ed8d9d:ed;e ddfd<d=Z!d1ed2ejfd>d?Z"d1ed7ed8d9d:ed;e ddfd@dAZ#	BdPdCdDZ$dEdF Z%dGdH Z&edNddIdJe'e(e( e(e f fdKdLZ)  Z*S )Q
ForwardTTSa  General forward TTS model implementation that uses an encoder-decoder architecture with an optional alignment
    network and a pitch predictor.

    If the alignment network is used, the model learns the text-to-speech alignment
    from the data instead of using pre-computed durations.

    If the pitch predictor is used, the model trains a pitch predictor that predicts average pitch value for each
    input character as in the FastPitch model.

    `ForwardTTS` can be configured to one of these architectures,

        - FastPitch
        - SpeedySpeech
        - FastSpeech
        - FastSpeech2 (requires average speech energy predictor)

    Args:
        config (Coqpit): Model coqpit class.
        speaker_manager (SpeakerManager): Speaker manager for multi-speaker training. Only used for multi-speaker models.
            Defaults to None.

    Examples:
        >>> from TTS.tts.models.fast_pitch import ForwardTTS, ForwardTTSArgs
        >>> config = ForwardTTSArgs()
        >>> model = ForwardTTS(config)
    NconfigapAudioProcessor	tokenizerr   speaker_managerc                    s  t  |||| | | | | | jj| _| jj| _| jj| _| jj| _d| _	t
| jjtr7t| jjn| jj| _t| jj| jj| _t| jj| jj| jj| jj| j| _| jjrdt| jj| _t| jj| jj| jj| jj| _t | jj| jj!| jj"| jj#| _$| jjrt | jj| jj%| jj&| jj'| _(tj)d| jj| jj*t| jj*d d d| _+| jjrt | jj| jj,| jj-| jj.| _/tj)d| jj| jj0t| jj0d d d| _1| jjrt2| jj| jjd| _3d S d S )Ng        r4      )kernel_sizepadding)in_query_channelsin_key_channels)4super__init___set_model_argsinit_multispeakerargsrI   r!   r"   r*   binary_loss_weight
isinstancer5   rT   rW   r	   	Embeddingr   r    embr   r7   rD   embedded_speaker_dimencoderr2   r   pos_encoderr   r   rE   rF   decoderr   r/   r0   r1   duration_predictorr$   r&   r(   pitch_predictorConv1dr)   	pitch_embr+   r,   r-   energy_predictorr.   
energy_embr   aligner)selfr[   r\   r^   r_   	__class__r?   r@   rf      s~   





 zForwardTTS.__init__c                 C   s   d| _ | jdu r|js|jrtd| jdur| jj| _|jr7|j| _ | jj| jjkr7t	j
| jj| jjd| _|jrW|jsYtd t	| j| jj| _t	j| jjdd dS dS dS )zjInit for multi-speaker training.

        Args:
            config (Coqpit): Model configuration.
        r   Nzq > SpeakerManager is not provided. You must provide the SpeakerManager before initializing a multi-speaker model.)in_featuresout_featuresz  > Init speaker_embedding layer.gr'   )rn   r_   rM   rK   
ValueErrorrJ   rN   ri   r    r	   Linearproj_gprintrl   emb_ginituniform_weight)ry   r[   r?   r?   r@   rh     s    

zForwardTTS.init_multispeakerc                 C   sp   |du r|  d }d||dk < tt|dd| j}t|dt|d }t| |d| j}|S )zGenerate an attention mask from the durations.

        Shapes
           - dr: :math:`(B, T_{en})`
           - x_mask: :math:`(B, T_{en})`
           - y_mask: :math:`(B, T_{de})`
        Nr4   r`   )	sumlongtorch	unsqueezer   todtyper   squeeze)drx_masky_mask	y_lengths	attn_maskattnr?   r?   r@   generate_attn6  s   
zForwardTTS.generate_attnc                 C   sH   |  |||}t|ddd|j|dddd}||fS )al  Generate attention alignment map from durations and
        expand encoder outputs

        Shapes:
            - en: :math:`(B, D_{en}, T_{en})`
            - dr: :math:`(B, T_{en})`
            - x_mask: :math:`(B, T_{en})`
            - y_mask: :math:`(B, T_{de})`

        Examples::

            encoder output: [a,b,c,d]
            durations: [1, 3, 2, 1]

            expanded: [a, b, b, b, c, c, d]
            attention map: [[0, 0, 0, 0, 0, 0, 1],
                            [0, 0, 0, 0, 1, 1, 0],
                            [0, 1, 1, 1, 0, 0, 0],
                            [1, 0, 0, 0, 0, 0, 0]]
        r4   r`   )r   r   matmulr   	transposer   r   )ry   enr   r   r   r   o_en_exr?   r?   r@   expand_encoder_outputsH  s   2z!ForwardTTS.expand_encoder_outputsc                 C   s2   t |d | | j }d||dk < t |}|S )a  Format predicted durations.
        1. Convert to linear scale from log scale
        2. Apply the length scale for speed adjustment
        3. Apply masking.
        4. Cast 0 durations to 1.
        5. Round the duration values.

        Args:
            o_dr_log: Log scale durations.
            x_mask: Input text mask.

        Shapes:
            - o_dr_log: :math:`(B, T_{de})`
            - x_mask: :math:`(B, T_{en})`
        r4         ?)r   expr5   round)ry   o_dr_logr   o_drr?   r?   r@   format_durationsa  s   
zForwardTTS.format_durationsxr   greturnc                 C   s   t | dr|tj}| |}|dur|d}| |}| t|dd||}|durFt | drB| 	|
|jd dd}|| }||||fS )aP  Encoding forward pass.

        1. Embed speaker IDs if multi-speaker mode.
        2. Embed character sequences.
        3. Run the encoder network.
        4. Sum encoder outputs and speaker embeddings

        Args:
            x (torch.LongTensor): Input sequence IDs.
            x_mask (torch.FloatTensor): Input squence mask.
            g (torch.FloatTensor, optional): Conditioning vectors. In general speaker embeddings. Defaults to None.

        Returns:
            Tuple[torch.tensor, torch.tensor, torch.tensor, torch.tensor, torch.tensor]:
                encoder output, encoder output for the duration predictor, input sequence mask, speaker embeddings,
                character embeddings

        Shapes:
            - x: :math:`(B, T_{en})`
            - x_mask: :math:`(B, 1, T_{en})`
            - g: :math:`(B, C)`
        r   Nr   r4   r   r   )hasattrtyper   
LongTensorr   r   rm   ro   r   r   viewshape)ry   r   r   r   x_embo_enr?   r?   r@   _forward_encoderv  s   




zForwardTTS._forward_encoderr   r   r   c           
      C   sl   t t|dd|j}| ||||\}}t| dr"| ||}| j|||d}	|		dd|	ddfS )a  Decoding forward pass.

        1. Compute the decoder output mask
        2. Expand encoder output with the durations.
        3. Apply position encoding.
        4. Add speaker embeddings if multi-speaker mode.
        5. Run the decoder.

        Args:
            o_en (torch.FloatTensor): Encoder output.
            dr (torch.IntTensor): Ground truth durations or alignment network durations.
            x_mask (torch.IntTensor): Input sequence mask.
            y_lengths (torch.IntTensor): Output sequence lengths.
            g (torch.FloatTensor): Conditioning vectors. In general speaker embeddings.

        Returns:
            Tuple[torch.FloatTensor, torch.FloatTensor]: Decoder output, attention map from durations.
        Nr4   rp   r   r`   )
r   r   r   r   r   r   r   rp   rq   r   )
ry   r   r   r   r   r   r   r   r   o_der?   r?   r@   _forward_decoder  s   
zForwardTTS._forward_decoderpitchc                 C   D   |  ||}|durt||}| |}|||fS | |}||fS )aM  Pitch predictor forward pass.

        1. Predict pitch from encoder outputs.
        2. In training - Compute average pitch values for each input character from the ground truth pitch values.
        3. Embed average pitch values.

        Args:
            o_en (torch.FloatTensor): Encoder output.
            x_mask (torch.IntTensor): Input sequence mask.
            pitch (torch.FloatTensor, optional): Ground truth pitch values. Defaults to None.
            dr (torch.IntTensor, optional): Ground truth durations. Defaults to None.

        Returns:
            Tuple[torch.FloatTensor, torch.FloatTensor]: Pitch embedding, pitch prediction.

        Shapes:
            - o_en: :math:`(B, C, T_{en})`
            - x_mask: :math:`(B, 1, T_{en})`
            - pitch: :math:`(B, 1, T_{de})`
            - dr: :math:`(B, T_{en})`
        N)rs   r   ru   )ry   r   r   r   r   o_pitch	avg_pitcho_pitch_embr?   r?   r@   _forward_pitch_predictor     



z#ForwardTTS._forward_pitch_predictorenergyc                 C   r   )aT  Energy predictor forward pass.

        1. Predict energy from encoder outputs.
        2. In training - Compute average pitch values for each input character from the ground truth pitch values.
        3. Embed average energy values.

        Args:
            o_en (torch.FloatTensor): Encoder output.
            x_mask (torch.IntTensor): Input sequence mask.
            energy (torch.FloatTensor, optional): Ground truth energy values. Defaults to None.
            dr (torch.IntTensor, optional): Ground truth durations. Defaults to None.

        Returns:
            Tuple[torch.FloatTensor, torch.FloatTensor]: Energy embedding, energy prediction.

        Shapes:
            - o_en: :math:`(B, C, T_{en})`
            - x_mask: :math:`(B, 1, T_{en})`
            - pitch: :math:`(B, 1, T_{de})`
            - dr: :math:`(B, T_{en})`
        N)rv   r   rw   )ry   r   r   r   r   o_energy
avg_energyo_energy_embr?   r?   r@   _forward_energy_predictor  r   z$ForwardTTS._forward_energy_predictoryr   c           
      C   s   t |dt |d }| |dd|dd|d\}}t|ddd |d }t |d }	|ddd}|	|||fS )ax  Aligner forward pass.

        1. Compute a mask to apply to the attention map.
        2. Run the alignment network.
        3. Apply MAS to compute the hard alignment map.
        4. Compute the durations from the hard alignment map.

        Args:
            x (torch.FloatTensor): Input sequence.
            y (torch.FloatTensor): Output sequence.
            x_mask (torch.IntTensor): Input sequence mask.
            y_mask (torch.IntTensor): Output sequence mask.

        Returns:
            Tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
                Durations from the hard alignment map, soft alignment potentials, log scale alignment potentials,
                hard alignment map.

        Shapes:
            - x: :math:`[B, T_en, C_en]`
            - y: :math:`[B, T_de, C_de]`
            - x_mask: :math:`[B, 1, T_en]`
            - y_mask: :math:`[B, 1, T_de]`

            - o_alignment_dur: :math:`[B, T_en]`
            - alignment_soft: :math:`[B, T_en, T_de]`
            - alignment_logprob: :math:`[B, 1, T_de, T_en]`
            - alignment_mas: :math:`[B, T_en, T_de]`
        r   r`   r4   N)	r   r   rx   r   r   r   
contiguousr   rT   )
ry   r   r   r   r   r   alignment_softalignment_logprobalignment_maso_alignment_durr?   r?   r@   _forward_aligner  s    $ zForwardTTS._forward_aligner	aux_inputc                 C   sb   | dd }| dd }|d ur|d urtd|d ur%t| ds%td|d ur-|}|S |}|S )N	d_vectorsspeaker_idsz2[!] Cannot use d-vectors and speaker-ids together.r   z>[!] Cannot use speaker-ids without enabling speaker embedding.)getr~   r   )ry   r   r   r   r   r?   r?   r@   _set_speaker_input6  s   zForwardTTS._set_speaker_inputr   r   	x_lengthsc	                 C   s  |  |}	tt|dd }
tt||jd d }| |||	\}}}	}| jjr6| 	|
 |}n| 	||}tt|d d| j}| |d|}d}d}d}d}| jrw| ||||
\}}}}|dd}|dd}|}d}d}| jjr| ||||\}}}|| }d}d}| jjr| ||||\}}}|| }| j||||dd\}}||d|d||||||||||||
d}|S )a  Model's forward pass.

        Args:
            x (torch.LongTensor): Input character sequences.
            x_lengths (torch.LongTensor): Input sequence lengths.
            y_lengths (torch.LongTensor): Output sequnce lengths. Defaults to None.
            y (torch.FloatTensor): Spectrogram frames. Only used when the alignment network is on. Defaults to None.
            dr (torch.IntTensor): Character durations over the spectrogram frames. Only used when the alignment network is off. Defaults to None.
            pitch (torch.FloatTensor): Pitch values for each spectrogram frame. Only used when the pitch predictor is on. Defaults to None.
            energy (torch.FloatTensor): energy values for each spectrogram frame. Only used when the energy predictor is on. Defaults to None.
            aux_input (Dict): Auxiliary model inputs for multi-speaker training. Defaults to `{"d_vectors": 0, "speaker_ids": None}`.

        Shapes:
            - x: :math:`[B, T_max]`
            - x_lengths: :math:`[B]`
            - y_lengths: :math:`[B]`
            - y: :math:`[B, T_max2]`
            - dr: :math:`[B, T_max]`
            - g: :math:`[B, C]`
            - pitch: :math:`[B, 1, T]`
        Nr4   r   r`   r   )model_outputsdurations_log	durationsattn_durations	pitch_avgpitch_avg_gt
energy_avgenergy_avg_gt
alignmentsr   r   r   r   r   r   )r   r   r   r   rW   r   r   ri   rG   rr   detachclampr   rI   r   r   r!   r   r   r"   r   r*   r   r   )ry   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   o_attnr   r   r   r   r   r   r   r   r   r   r   r   outputsr?   r?   r@   forwardC  sd   
 

zForwardTTS.forwardc                 C   s   |  |}t|jdd |j}tt||jd d|j	 }| 
|||\}}}}| | |}| ||d}	|	d}
d}| jjrX| ||\}}|| }d}| jjrj| ||\}}|| }| j||	||
dd\}}|||||d}|S )a;  Model's inference pass.

        Args:
            x (torch.LongTensor): Input character sequence.
            aux_input (Dict): Auxiliary model inputs. Defaults to `{"d_vectors": None, "speaker_ids": None}`.

        Shapes:
            - x: [B, T_max]
            - x_lengths: [B]
            - g: [B, C]
        r4   r`   Nr   )r   r   r   r   r   )r   r   tensorr   r   devicer   r   r   rW   r   rr   r   r   r   ri   r"   r   r*   r   r   )ry   r   r   r   r   r   r   _r   r   r   r   r   r   r   r   r   r   r?   r?   r@   	inference  s.   
$
zForwardTTS.inferencebatch	criterionc                 C   sp  |d }|d }|d }|d }| j jr|d nd }| j jr"|d nd }|d }	|d }
|d	 }|	|
d
}| j||||||||d}| jrI|d }tdd^ ||d |||d || jr`|d nd | jrh|d nd | jrp|d nd | jrx|d nd || jr|d nd |d |d | jd}|d	 }t|| 	 |	  }||d< W d    ||fS 1 sw   Y  ||fS )N
text_inputtext_lengths	mel_inputmel_lengthsr   r   r   r   r   r   )r   r   r   r   r   r   F)enabledr   r   r   r   r   r   r   r   r   )decoder_outputdecoder_targetdecoder_output_lens
dur_output
dur_targetpitch_outputpitch_targetenergy_outputenergy_target
input_lensr   r   alignment_hardrj   duration_error)
ri   r"   r*   r   r!   r
   rj   r   absr   )ry   r   r   r   r   r   r   r   r   r   r   r   r   r   	loss_dictdurations_predr   r?   r?   r@   
train_step  s\   


zForwardTTS.train_stepc                 C   s  |d }|d }|d }|d j   }|d j   }|d j   }	t||ddt||ddt|	ddd}
| jjr|t|d d	 j   }t|d
 d	 j   }| j	|d d j   }t
||ddt
||ddd}|
| | jjrt|d d	 j   }t|d d	 j   }| j	|d d j   }t||ddt||ddd}|
| d|v r|d d j   }t|jdd|
d< ||j}|
d|ifS )zCreate common logger outputs.r   r   r   r   F)
output_fig)
predictionground_truth	alignmentr   )r   r   r   r   )pitch_ground_truthpitch_avg_predictedr   r   )energy_ground_truthenergy_avg_predictedr   alignment_hataudio)datacpunumpyr   r   ri   r"   r   r^   decoder   updater*   r   Tinv_melspectrogram)ry   r   r   r\   r   r   r   	pred_specgt_spec	align_imgfiguresr   pitch_avg_hatcharspitch_figuresr   energy_avg_hatenergy_figuresalignments_hattrain_audior?   r?   r@   _create_logs   s>   


zForwardTTS._create_logsr   loggerLoggerassetsstepsc                 C   6   |  ||| j\}}||| |||| jj d S N)r  r\   train_figurestrain_audiossample_ratery   r   r   r  r  r  r  audiosr?   r?   r@   	train_log/  s   zForwardTTS.train_logc                 C   s   |  ||S r  )r   )ry   r   r   r?   r?   r@   	eval_step6  s   zForwardTTS.eval_stepc                 C   r  r  )r  r\   eval_figureseval_audiosr  r  r?   r?   r@   eval_log9  s   zForwardTTS.eval_logFc                 C   s@   t |td|d}| |d  |r|   | jrJ d S d S )Nr   )map_locationcachemodel)r   r   r   load_state_dictevaltraining)ry   r[   checkpoint_pathr"  r  stater?   r?   r@   load_checkpoint>  s   
zForwardTTS.load_checkpointc                 C   s   ddl m} || jS )Nr   )ForwardTTSLoss)TTS.tts.layers.lossesr'  r[   )ry   r'  r?   r?   r@   get_criterionG  s   
zForwardTTS.get_criterionc                 C   s   t |j| jj dd | _dS )zSchedule binary loss weight.r   N)minepochs_doner[   binary_loss_warmup_epochsrj   )ry   trainerr?   r?   r@   on_train_step_startL  s   zForwardTTS.on_train_step_startForwardTTSConfigsamplesc                 C   s>   ddl m} || }t| \}}t| |}t||||S )zInitiate model from config

        Args:
            config (ForwardTTSConfig): Model config.
            samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training.
                Defaults to None.
        r   )r]   )TTS.utils.audior]   init_from_configr   r   rZ   )r[   r0  r]   r\   r^   
new_configr_   r?   r?   r@   r2  P  s
   	
zForwardTTS.init_from_config)NNNr  )NN)FF)+rP   rQ   rR   rS   r   r   rf   rh   staticmethodr   r   r   r   r   FloatTensorr   r   	IntTensorr   r   r   r   r   r   r   no_gradr   rY   r	   Moduler   r  rT   r  r  r  r&  r)  r.  r   r   r2  __classcell__r?   r?   rz   r@   rZ      s   R
+
(
(
$
)	

]+4/
"
	.rZ   ),dataclassesr   r   typingr   r   r   r   r   coqpitr   r	   torch.cuda.amp.autocast_moder
   #TTS.tts.layers.feed_forward.decoderr   #TTS.tts.layers.feed_forward.encoderr   TTS.tts.layers.generic.alignerr   #TTS.tts.layers.generic.pos_encodingr   *TTS.tts.layers.glow_tts.duration_predictorr   TTS.tts.models.base_ttsr   TTS.tts.utils.helpersr   r   r   r   TTS.tts.utils.speakersr   TTS.tts.utils.text.tokenizerr   TTS.tts.utils.visualr   r   r   r   TTS.utils.ior   r   rZ   r?   r?   r?   r@   <module>   s*     