o
    
j                    @   s  d dl Z d dlmZmZ d dlmZ d dlmZ d dlm	Z	m
Z
mZmZmZ d dlZd dlZd dlmZ d dlZd dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dl m!Z! d dl"m#Z# d dl$m%Z%m&Z& d dl'm(Z(m)Z) d dl*m+Z+m,Z,m-Z- d dl.m/Z/ d dl0m1Z1m2Z2 d dl3m4Z4 d dl5m6Z6 d dl7m8Z8m9Z9m:Z:m;Z;m<Z< d dl=m>Z> d dl?m@Z@ d dlAmBZBmCZCmDZDmEZE d dlFmGZGmHZH d dlFmIZJ d dlFmKZL d dlMmNZN d dlOmPZP d dlQmRZR d dlSmTZT d dlUmVZV drd!d"ZWdrd#d$ZXdrd%d&ZYd'ejZd(ejZfd)d*Z[d+e
ejZ d,e\d(ejZfd-d.Z]dsd1ej^d2e_d3e_fd4d5Z`dtd7ejZd8e\d(ejZfd9d:Zad;ee\ d(ejZfd<d=Zbd>e\d(ee\e\f fd?d@Zci adi aeef d1ej^fdAdBZgdCej^fdDdEZhdFeifdGdHZjdudKdLZkdvdMdNZldOdP ZmdQdR ZIdrdSdTZndrdUdVZodrdWdXZpdYdZ Zqd[d\ Zrdrd]d^Zsdwd_etd`eidaeufdbdcZvG ddde dee+ZwG dfdg dge,ZxeG dhdi dieZyeG djdk dkeZzeG dldm dmeZ{G dndo doe6Z|G dpdq dqej^Z}dS )x    N)	dataclassfield)chain)Path)DictListOptionalTupleUnion)Coqpit)mel)nn)autocast)
functional)
DataLoader)WeightedRandomSampler)DistributedSamplerDistributedSamplerWrapper)get_optimizerget_scheduler)	F0Dataset
TTSDataset_parse_sample)AcousticModel)ForwardSumLossVitsDiscriminatorLoss)VitsDiscriminator)
BaseTTSE2E)average_over_durationscompute_attn_priorrand_segmentssegmentsequence_mask)SpeakerManager)TTSTokenizer)plot_alignmentplot_avg_pitch
plot_pitchplot_spectrogram)build_mel_basis
compute_f0)	db_to_amp)
mel_to_wav)AudioProcessor)load_fsspec)MultiScaleSTFTLoss)HifiganGenerator)plot_resultsFc                 C   s,   | d urt | } t| } |r|  S | S N)npasarraytorch
from_numpycuda)aux_idr7    r9   P/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/models/delightful_tts.pyid_to_torch(   s   

r;   c                 C   s>   | d urt | } t|  } |  d} |r|  S | S )Nr   )r3   r4   r5   r6   floatsqueeze	unsqueezer7   )d_vectorr7   r9   r9   r:   embedding_to_torch1   s   
r@   c                 C   s*   | d u rd S t j| |d}|r| S |S )Ndtype)r5   	as_tensorr7   )np_arrayrB   r7   tensorr9   r9   r:   numpy_to_torch;   s   rF   lengthsreturnc                 C   sR   | j d }t|  }tjd|| jdd|d}|| dd|k}|S )Nr   device   )shaper5   maxitemarangerJ   r>   expand)rG   
batch_sizemax_lenidsmaskr9   r9   r:   get_mask_from_lengthsD   s
   
 rV   	input_elerS   c                 C   s   t jtt j g }| D ]/}t|jdkr%t|d||	d fdd}nt|ddd||	d fdd}|
| qt |}|S )NrL   r   constant        )r5   jitannotater   TensorlenrM   Fpadsizeappendstack)rW   rS   out_listbatchone_batch_padded
out_paddedr9   r9   r:   r_   L   s    "
r_   rY   {Gz?mmeanstdc                 C   s.   | j j}|ddkr| jj|| d S d S )NConvrK   )	__class____name__findweightdatanormal_)rh   ri   rj   	classnamer9   r9   r:   init_weightsX   s   rs      lensstridec                 C   s   t | |  S r2   )r5   ceilint)ru   rv   r9   r9   r:   stride_lens^   s   ry   rM   c                 C   s0   t | dks
J dt| td| d   S )Nrt   z.Can only initialize 2-D embedding matrices ...rL   )r]   r5   randnr3   sqrtrM   r9   r9   r:   initialize_embeddingsb   s   r}   kernel_sizec                 C   s   | d }||| d d  fS )Nrt   rL   r9   )r~   r_   r9   r9   r:   calc_same_paddingh   s   r   c                 C   s$   t | dd }t|r|   d S d S )Nreset_parameters)getattrcallabler   )rh   r   r9   r9   r:   weights_resetq   s   r   mdlc                 C   s8   i }|   D ]\}}d|v r|j  }|||< q|S )Nro   )named_parametersrp   sumrO   )r   	dict_sumsnamewvaluer9   r9   r:   get_module_weights_sumy   s   r   	file_pathc                 C   s6   t | \}}|dk |dk   dksJ ||fS )z^Load the audio file normalized in [-1, 1]

    Return Shapes:
        - x: :math:`[1, T]`
    rL   rK   r   )
torchaudioloadr   )r   xsrr9   r9   r:   
load_audio   s
    r   rL   h㈵>c                 C   s   t t j| |d| S )Nmin)r5   logclamp)r   Cclip_valr9   r9   r:   
_amp_to_db   s   r   c                 C   s   t | | S r2   )r5   exp)r   r   r9   r9   r:   
_db_to_amp      r   c                 C      t | }|S r2   )r   
magnitudesoutputr9   r9   r:   	amp_to_db      r   c                 C   r   r2   )r   r   r9   r9   r:   r+      r   r+   c                 C   s   |  d} t| dk rtdt|  t| dkr#tdt|  t| jd t| j }t|d | }|tvrIt|j	| j| jdt|< tj
jj| dt|| d t|| d fd	d
} |  d} tj| |||t| |d	dddd
}|S )NrL         min value is       ?max value is _rB   rJ   rt   reflectmodeFT
hop_length
win_lengthwindowcenterpad_mode
normalizedonesidedreturn_complex)r=   r5   r   printrN   strrB   rJ   hann_windowtor   r   r_   r>   rx   stft)yn_fftr   r   r   dtype_devicewnsize_dtype_devicespecr9   r9   r:   _wav_to_spec   s8   

r   c                 C   s0   t | ||||d}t|ddd }|S )zk
    Args Shapes:
        - y : :math:`[B, 1, T]`

    Return Shapes:
        - spec : :math:`[B,C,T]`
    r   rt   rK   ư>)r   r5   r{   powr   r   r   r   r   r   r   r9   r9   r:   wav_to_spec   s   r   c                 C   s<   t | ||||d}t|ddd }tj|dddS )Nr   rt   rK   r   rL   T)dimkeepdim)r   r5   r{   r   r   normr   r9   r9   r:   wav_to_energy   s   r   c                 C   s"   | d| d| j  d| j }|S )Nr   r   )r   r   fmax	n_fft_lenr9   r9   r:   name_mel_basis   s   r   c                 C   s\   t | ||}|tvr t|||||}t|j| j| jdt|< tt| | }t	|}|S )zk
    Args Shapes:
        - spec : :math:`[B,C,T]`

    Return Shapes:
        - mel : :math:`[B,C,T]`
    r   )
r   	mel_basislibrosa_mel_fnr5   r6   r   rB   rJ   matmulr   )r   r   num_melssample_ratefminr   mel_basis_keyr   r9   r9   r:   spec_to_mel   s   	r   c	                 C   sh  |  d} t| dk rtdt|  t| dkr#tdt|  t| ||}	t|d t| j d t| j }
|	t	vrVt
|||||d}t|j| j| jdt	|	< |
tvrht|j| j| jdt|
< tjjj| dt|| d	 t|| d	 fd
d} |  d} tj| |||t|
 |d
dddd
}t|d	dd }tt	|	 |}t|}|S )zr
    Args Shapes:
        - y : :math:`[B, 1, T_y]`

    Return Shapes:
        - spec : :math:`[B,C,T_spec]`
    rL   r   r   r   r   r   )r   r   n_melsr   r   r   rt   r   r   FTr   rK   r   )r=   r5   r   r   rN   r   r   rB   rJ   r   r   r6   r   r   r   r   r_   r>   rx   r   r{   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r9   r9   r:   
wav_to_mel   sH   
$

r   items	attr_name
multi_dictc                    s   t  fdd| D t  fddD }t fddD }d| t fdd|D }|t j| }durUt  fdd| D }||9 }t| t | fS )	z0Create balancer weight for torch WeightedSamplerc                       g | ]}|  qS r9   r9   .0rO   )r   r9   r:   
<listcomp>/      z2get_attribute_balancer_weights.<locals>.<listcomp>c                    s   g | ]}  |qS r9   )indexr   l)unique_attr_namesr9   r:   r   1      c                    s"   g | ]}t t |kd  qS r   )r]   r3   wherer   )attr_names_samplesr9   r:   r   2  s   " r   c                    s   g | ]} | qS r9   r9   r   )weight_attrr9   r:   r   4  r   Nc                    s   g | ]
} |  d qS )r   )getr   )r   r   r9   r:   r   7  s    )	r3   arrayuniquetolistlinalgr   r5   r6   r<   )r   r   r   attr_idx
attr_countdataset_samples_weightmultiplier_samplesr9   )r   r   r   r   r   r:   get_attribute_balancer_weights-  s   r   c                       sV   e Zd ZdZ				ddeee ee f def fdd	Zdd
dZ	dd Z
  ZS )ForwardTTSE2eF0Datasetz5Override F0Dataset to avoid slow computing of pitchesFNr   Tsamples
cache_pathc                    s   t  j||||||d d S )N)r   apverboser   precompute_num_workersnormalize_f0)super__init__)selfr   r   r   r   r   r   rl   r9   r:   r   C  s   	
zForwardTTSE2eF0Dataset.__init__c                 C   sr   t |\}}t| d | jj| jj| jj| jj| jjd}|j	d | jj dkr/|d d }|r7t
|| |S )Nr   )r   r   r   
pitch_fmax
pitch_fminr   rL   rK   )r   r*   numpyr   r   r   r   r   r   rM   r3   save)r   wav_file
pitch_filewavr   f0r9   r9   r:   _compute_and_save_pitchU  s   
	z.ForwardTTSE2eF0Dataset._compute_and_save_pitchc                 C   s@   |  || j}tj|s| j||d}nt|}|tj	S )zH
        compute pitch and return a numpy array of pitch values
        )r   r  )
create_pitch_file_pathr   ospathexistsr  r3   r   astypefloat32)r   r   
audio_namer  pitchr9   r9   r:   compute_or_loadf  s
   
z&ForwardTTSE2eF0Dataset.compute_or_load)FNr   Tr2   )rm   
__module____qualname____doc__r
   r   r   r   r   r  r  __classcell__r9   r9   r   r:   r   @  s    
r   c                       s@   e Zd Z fddZdd Zdd Zedd Zd	d
 Z  Z	S )ForwardTTSE2eDatasetc                    s   | dd}d|d< | d| _t j|i | || _| jjj| _|d | _| jr9t	| j| j
|d |d d| _| jd urHtj| jdd	 d S d S )
Nr*   Fattn_prior_cache_pathr   f0_cache_pathr   )r   r   r   r   T)exist_ok)popr  r   r   r*   	tokenizer
characterspad_idr   r   r   
f0_datasetr  makedirs)r   argskwargsr*   r   r9   r:   r   s  s"   

zForwardTTSE2eDataset.__init__c                 C   s  | j | }t|d |d d}t|dd}|d }t|d \}}tj	|d }z
| 
||d }W n
   t|| td }	| jrP| |d }	t|| jks_|jd | jk rl|  jd7  _| | jS d }
| jd urz| |||}
||t|||	||d	 |d
 |
|d d
S )N
audio_file	root_path /r   textr  rL   speaker_namelanguageaudio_unique_name)
raw_text	token_ids	token_lenr  r  r   r#  language_name
attn_priorr%  )r   r   relative_towith_suffixr   replacer   r  r  basenameget_token_idsr   OSErrorr*   get_f0r]   max_text_lenrM   min_audio_lenrescue_item_idx__getitem__r  load_or_compute_attn_prior)r   idxrO   rel_wav_pathr&  r  r   wav_filenamer'  r  r*  r9   r9   r:   r5    s>   


z ForwardTTSE2eDataset.__getitem__c                 C   s`   t j| j| d}t j|rt|S t|}|jd | j	j
 }t||}t|| |S )z-Load or compute and save the attention prior.z.npyrL   )r  r  joinr  r  r3   r   r]   rM   r   r   r   r   )r   r'  r  r8  attn_prior_filer(  mel_lenr*  r9   r9   r:   r6    s   

z/ForwardTTSE2eDataset.load_or_compute_attn_priorc                 C   s@   g }| j D ]}t|^}}}tj|d d }|| q|S )N      )r   r   r  r  getsizera   )r   ru   rO   r   r   	audio_lenr9   r9   r:   rG     s   
zForwardTTSE2eDataset.lengthsc                    s  t  } fdd d D  tdd  d D }t d }||  }dd  d	 D }t|}t|}|| }d
}	| jrddd  d D }
t|
}
t|
}t|d|}	|	 | j }	t||}t|d|}| | j }| | j }t|D ]E} d | }t|||d
 d | f<  d	 | }t|||d
d
d
|	df< | jrȈ d | }t||	|dd
t |f< q||||	||| d  d  d  d  d  d d d
ur d dS d
dS )a  
        Return Shapes:
            - tokens: :math:`[B, T]`
            - token_lens :math:`[B]`
            - token_rel_lens :math:`[B]`
            - pitch :math:`[B, T]`
            - waveform: :math:`[B, 1, T]`
            - waveform_lens: :math:`[B]`
            - waveform_rel_lens: :math:`[B]`
            - speaker_names: :math:`[B]`
            - language_names: :math:`[B]`
            - audiofile_paths: :math:`[B]`
            - raw_texts: :math:`[B]`
            - attn_prior: :math:`[[T_token, T_mel]]`
        c                    s    i | ]   fd dD qS )c                    r   r9   r9   )r   dickr9   r:   r     r   z>ForwardTTSE2eDataset.collate_fn.<locals>.<dictcomp>.<listcomp>r9   )r   rd   rB  r:   
<dictcomp>  s     z3ForwardTTSE2eDataset.collate_fn.<locals>.<dictcomp>r   c                 S   s   g | ]}t |qS r9   )r]   )r   r   r9   r9   r:   r     r   z3ForwardTTSE2eDataset.collate_fn.<locals>.<listcomp>r'  r(  c                 S      g | ]}|j d  qS rL   r|   r   r   r9   r9   r:   r     r   r  Nc                 S   rF  r   r|   )r   pr9   r9   r:   r     r   r  rL   r#  r)  r%  r   r&  r*  )
text_inputtext_lengthstext_rel_lensr  waveformwaveform_lenswaveform_rel_lensspeaker_nameslanguage_namesaudio_unique_namesaudio_filesr&  attn_priors)
r]   rN   r5   
LongTensorr*   FloatTensorzero_r  ranger`   )r   rd   Br2  
token_lenstoken_rel_lenswav_lenswav_lens_maxwav_rel_lenspitch_padded
pitch_lenspitch_lens_maxtoken_padded
wav_paddedir'  r  r  r9   rD  r:   
collate_fn  sZ   



"zForwardTTSE2eDataset.collate_fn)
rm   r  r  r   r5  r6  propertyrG   re  r  r9   r9   r   r:   r  r  s    ,
r  c                   @   s   e Zd ZU dZeed< edd dZee	 ed< edd dZ
eee	  ed< ed	d dZee	 ed
< dZe	ed< edd dZee	 ed< dZeed< edd dZee	 ed< edd dZee	 ed< dZee ed< dS )VocoderConfig1resblock_type_decoderc                   C      g dS )N)         r9   r9   r9   r9   r:   <lambda>      zVocoderConfig.<lambda>)default_factoryresblock_kernel_sizes_decoderc                   C   s   g dg dg dgS )N)rL   rk     r9   r9   r9   r9   r:   rn    r   resblock_dilation_sizes_decoderc                   C   rj  )N)r>  r>  rt   rt   r9   r9   r9   r9   r:   rn    ro  upsample_rates_decoder    upsample_initial_channel_decoderc                   C   rj  )N)r=  r=     rw  r9   r9   r9   r9   r:   rn    ro  upsample_kernel_sizes_decoderFuse_spectral_norm_discriminatorc                   C   rj  )N)rw  rw  rw  rw  r9   r9   r9   r9   r:   rn     ro  upsampling_rates_discriminatorc                   C   rj  )N)rt   rk  rr  rl  rm  r9   r9   r9   r9   r:   rn  !  ro  periods_discriminatorNpretrained_model_path)rm   r  r  ri  r   __annotations__r   rq  r   rx   rs  rt  rv  rx  ry  boolrz  r{  r|  r   r9   r9   r9   r:   rg    s   
 rg  c                   @   s:  e Zd ZU dZeed< dZeed< dZeed< dZeed< dZ	e
ed	< d
Ze
ed< dZeed< dZe
ed< dZe
ed< dZeed< dZe
ed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZe
ed < d!Ze
ed"< d#Zeed$< dZeed%< dZeed&< dZeed'< d(Zeed)< d*Z e
ed+< dS ),DelightfulTtsAudioConfigi"V  r      r   i   r   fft_sizerY   mel_fmini@  mel_fmaxd   r   g      @r   r   r   Fresamplepreemphasis   ref_level_dbdo_sound_normznp.log10log_funcTdo_trim_silence-   trim_dbdo_rms_normNdb_levelg      ?power<   griffin_lim_iters	spec_gaindo_amp_to_db_lineardo_amp_to_db_melimin_level_dbg      @max_norm)!rm   r  r  r   rx   r}  r   r   r  r  r<   r  r   r   r   r  r~  r  r  r  r  r   r  r  r  r  r  r  r  r  r  r  r  r9   r9   r9   r:   r  %  s4   
 r  c                   @   s  e Zd ZU dZeed< dZeed< dZeed< dZeed< d	Z	eed
< dZ
eed< dZeed< dZeed< dZeed< dZeed< dZeed< d	Zeed< dZeed< dZeed< dZeed< dZeed< dZeed< g dZdZeed< g dZd d gZdZeed!< d"Zeed#< dZeed$< d%Zeed&< dZeed'< d%Z eed(< d)Z!eed*< d+Z"eed,< dZ#eed-< d.Z$e%ed/< d0Z&eed1< d2Z'e(ed3< d2Z)e(ed4< d5Z*eed6< d.Z+e%ed7< d0Z,eed8< d.Z-e%ed9< d.Z.e%ed:< d.Z/e%ed;< d.Z0e%ed<< d.Z1e%ed=< d.Z2e%ed>< d.Z3e%ed?< d@Z4eedA< d2S )BDelightfulTtsArgsr  	num_chars    spec_segment_sizeru  n_hidden_conformer_encoder   n_layers_conformer_encoderr>  n_heads_conformer_encoderg?dropout_conformer_encoderrl  &kernel_size_conv_mod_conformer_encoder'kernel_size_depthwise_conformer_encoderg333333?lrelu_slopen_hidden_conformer_decodern_layers_conformer_decodern_heads_conformer_decoderdropout_conformer_decoderrm  &kernel_size_conv_mod_conformer_decoder'kernel_size_depthwise_conformer_decoderrw  #bottleneck_size_p_reference_encoder#bottleneck_size_u_reference_encoder)r  r  @   r     r  rk  ref_enc_size_reference_encoder)rL   rt   rL   rt   rL   rL   "ref_enc_gru_size_reference_encoderg?'ref_attention_dropout_reference_encodertoken_num_reference_encoderrr  'predictor_kernel_size_reference_encodern_hidden_variance_adaptorkernel_size_variance_adaptor      ?dropout_variance_adaptorr  n_bins_variance_adaptor emb_kernel_size_variance_adaptorFuse_speaker_embeddingr   num_speakersNspeakers_filed_vector_filei  speaker_embedding_channelsuse_d_vector_filed_vector_dimfreeze_vocoderfreeze_text_encoderfreeze_duration_predictorfreeze_pitch_predictorfreeze_energy_predictorfreeze_basis_vectors_predictorfreeze_decoderr   length_scale)5rm   r  r  r  rx   r}  r  r  r  r  r  r<   r  r  r  r  r  r  r  r  r  r  r  !ref_enc_filters_reference_encoderr  !ref_enc_strides_reference_encoderref_enc_pad_reference_encoderr  r  r  r  r  r  r  r  r  r  r~  r  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r9   r9   r9   r:   r  B  s\   
 r  c                       sL  e Zd ZdZ		d|dedddef fddZed	d
 Zedd Z	edd Z
e
jdd Z
edd Zejdd Zedd Zejdd Zedd Zd}ddZdefddZdd Zdd  Zd!d" Z					d~d#ejd$ejd%ejd&ejd'ejd(ejd)ejd*ejd+ejd,ejdefd-d.Ze ddd/ddfd0d1Ze ddd/fd2d3Zd4ed5ejd6efd7d8Z d4ed5ejd6efd9d:Z!dd<d=Z"d4ed>ed?d@dAedBef
dCdDZ#d4ed>ed?d@dAedBeddfdEdFZ$dGdH Z%dIdJ Z&			ddKe'dLe'dMej(fdNdOZ)dKe'fdPdQZ*e de+eef fdRdSZ,d>ed?d@dAedBeddf
dTdUZ-d4edefdVdWZ.dXdY Z/dded[e0fd\d]Z1	ddedAed^e2d_e3e4e e4e4 f d`e2daedbeddcfdddeZ5dfdg Z6de4fdhdiZ7de4fdjdkZ8de4fdldmZ9dndo Z:e;	pdddqd_e3e4e4 e4e f fdrdsZ<ddtduZ=dvdw Z>dxdy Z?d}dzd{Z@  ZAS )DelightfulTTSa  
    Paper::
        https://arxiv.org/pdf/2110.12612.pdf

    Paper Abstract::
        This paper describes the Microsoft end-to-end neural text to speech (TTS) system: DelightfulTTS for Blizzard Challenge 2021.
        The goal of this challenge is to synthesize natural and high-quality speech from text, and we approach this goal in two perspectives:
        The first is to directly model and generate waveform in 48 kHz sampling rate, which brings higher perception quality than previous systems
        with 16 kHz or 24 kHz sampling rate; The second is to model the variation information in speech through a systematic design, which improves
        the prosody and naturalness. Specifically, for 48 kHz modeling, we predict 16 kHz mel-spectrogram in acoustic model, and
        propose a vocoder called HiFiNet to directly generate 48 kHz waveform from predicted 16 kHz mel-spectrogram, which can better trade off training
        efficiency, modelling stability and voice quality. We model variation information systematically from both explicit (speaker ID, language ID, pitch and duration) and
        implicit (utterance-level and phoneme-level prosody) perspectives: 1) For speaker and language ID, we use lookup embedding in training and
        inference; 2) For pitch and duration, we extract the values from paired text-speech data in training and use two predictors to predict the values in inference; 3)
        For utterance-level and phoneme-level prosody, we use two reference encoders to extract the values in training, and use two separate predictors to predict the values in inference.
        Additionally, we introduce an improved Conformer block to better model the local and global dependency in acoustic model. For task SH1, DelightfulTTS achieves 4.17 mean score in MOS test
        and 4.35 in SMOS test, which indicates the effectiveness of our proposed system


    Model training::
        text --> ForwardTTS() --> spec_hat --> rand_seg_select()--> GANVocoder() --> waveform_seg
        spec --------^

    Examples:
        >>> from TTS.tts.models.forward_tts_e2e import ForwardTTSE2e, ForwardTTSE2eConfig
        >>> config = ForwardTTSE2eConfig()
        >>> model = ForwardTTSE2e(config)
    Nconfigr  r$   speaker_managerc                    s   t  j||||d || _| | | | d | _| jjj| j	_
| jjj| j	_t| j	||d| _t| jjjd| jjj| jjj| jjj| jjj| jjj| jjjddddd| _| jjrjt| jjj| jjjd| _d S d S )N)r  r   r  r  )r  r  r  rL   r   F)inference_paddingconv_pre_weight_normconv_post_weight_normconv_post_bias)use_spectral_normperiods)r   r   r   _set_model_argsinit_multispeakerbinary_loss_weightr  audior   r  out_channelsr   acoustic_modelr0   vocoderri  rs  rq  rx  rv  rt  waveform_decoderinit_discriminatorr   ry  r{  disc)r   r  r   r  r  r   r9   r:   r     s8   

zDelightfulTTS.__init__c                 C   s   t |  jS r2   )next
parametersrJ   r   r9   r9   r:   rJ     s   zDelightfulTTS.devicec                 C      | j jS r2   )r  energy_scalerr  r9   r9   r:   r       zDelightfulTTS.energy_scalerc                 C   r  r2   r  r  r  r9   r9   r:   r    r  zDelightfulTTS.length_scalec                 C      || j _d S r2   r  r   r   r9   r9   r:   r       c                 C   r  r2   r  
pitch_meanr  r9   r9   r:   r    r  zDelightfulTTS.pitch_meanc                 C   r  r2   r  r  r9   r9   r:   r    r  c                 C   r  r2   r  	pitch_stdr  r9   r9   r:   r    r  zDelightfulTTS.pitch_stdc                 C   r  r2   r  r  r9   r9   r:   r    r  c                 C   s&   t | jj| jj| jj| jj| jjdS )N)r   r  r   r  r  )r)   r   r   r  r   r  r  r  r9   r9   r:   r     s   zDelightfulTTS.mel_basisrH   c                 C   s   | j jdk| _d| _d S )Nr   T)r  steps_to_start_discriminator
train_discupdate_energy_scalerr  r9   r9   r:   init_for_training  s   

zDelightfulTTS.init_for_trainingc                 C   sZ   d| _ | jj| _d| _| jr| jj| _| jj| j_| jjr!|   | jjr+|   dS dS )zjInit for multi-speaker training.

        Args:
            config (Coqpit): Model configuration.
        r   N)	embedded_speaker_dimr  r  audio_transformr  r  _init_speaker_embeddingr  _init_d_vectorr   r  r9   r9   r:   r    s   

zDelightfulTTS.init_multispeakerc                 C   s0   | j dkrtd | jj| _| jj| j_d S d S )Nr   z. > initialization of speaker-embedding layers.)r  r   r  r  r  r  r9   r9   r:   r    s
   

z%DelightfulTTS._init_speaker_embeddingc                 C   s,   t | dr	td| jj| _| jj| j_d S )Nemb_gzI[!] Speaker embedding layer already initialized before d_vector settings.)hasattr
ValueErrorr  r  r  r  r9   r9   r:   r    s   

zDelightfulTTS._init_d_vectorc                 C   s   | j jr| j D ]}d|_q	| j jr| j D ]}d|_q| j jr-| j	 D ]}d|_q'| j j
r<| j D ]}d|_q6| j jrK| j D ]}d|_qE| j jrZ| j D ]}d|_qTd S d S NF)r  r  r  paramsetersrequires_gradr  text_encoderr  r  durarion_predictorr  pitch_predictorr  energy_predictorr  decoder)r   paramr9   r9   r:   _freeze_layers  s(   zDelightfulTTS._freeze_layersr   	x_lengthsspec_lengthsr   rM  r  energyrT  	d_vectorsspeaker_idxc                 C   s   | j ||||||||	|
d	}|d }t|dd|| jjddd\}}|d dur1|d d	}nd}| j| |d
}t||| j	j
 | jj| j	j
 dd}i |}|d |d< ||d< ||d< ||d< |S )aC  Model's forward pass.

        Args:
            x (torch.LongTensor): Input character sequences.
            x_lengths (torch.LongTensor): Input sequence lengths.
            spec_lengths (torch.LongTensor): Spectrogram sequnce lengths. Defaults to None.
            spec (torch.FloatTensor): Spectrogram frames. Only used when the alignment network is on. Defaults to None.
            waveform (torch.FloatTensor): Waveform. Defaults to None.
            pitch (torch.FloatTensor): Pitch values for each spectrogram frame. Only used when the pitch predictor is on. Defaults to None.
            energy (torch.FloatTensor): Spectral energy values for each spectrogram frame. Only used when the energy predictor is on. Defaults to None.
            attn_priors (torch.FloatTentrasor): Attention priors for the aligner network. Defaults to None.
            aux_input (Dict): Auxiliary model inputs for multi-speaker training. Defaults to `{"d_vectors": 0, "speaker_ids": None}`.

        Shapes:
            - x: :math:`[B, T_max]`
            - x_lengths: :math:`[B]`
            - spec_lengths: :math:`[B]`
            - spec: :math:`[B, T_max2, C_spec]`
            - waveform: :math:`[B, 1, T_max2 * hop_length]`
            - g: :math:`[B, C]`
            - pitch: :math:`[B, 1, T_max2]`
            - energy: :math:`[B, 1, T_max2]`
        )	tokenssrc_lensmel_lensmelspitchesenergiesrT  r  r  model_outputsrL   rt   T)r   r  segment_sizelet_short_samples	pad_shortspk_embNrK   r   gr  acoustic_model_outputswaveform_seg	slice_ids)r  r    	transposer  r  r>   r  detachr!   r   r   )r   r   r  r  r   rM  r  r  rT  r  r  encoder_outputsvocoder_inputvocoder_input_slicesr  r  vocoder_outputwav_segr  r9   r9   r:   forward(  sF   $


zDelightfulTTS.forwardr  speaker_idsc           
   	   C   st   | j j||d |d ||d d d}|d dd}|d d ur'|d d}nd }| j||d	}i |}	||	d< |	S )
Nr  r!  )r  r  r  pitch_transformenergy_transform	p_control	d_controlr  rL   rt   r  rK   r  )r  	inferencer  r>   r  )
r   r   	aux_inputr"  r#  r  r  r  r  r  r9   r9   r:   r&  u  s"   	zDelightfulTTS.inferencec                 C   s&   | j j||d |d d}i |}|S )Nr  r!  )r  r  r  )r  r&  )r   r   r'  r  r  r9   r9   r:   inference_spec_decoder  s   z$DelightfulTTS.inference_spec_decoderrd   	criterionoptimizer_idxc                 C   s  |dkrw|d }|d }|d }|d }|d }|d }	|d }
|d	 }|d
 }|d }| j ||||||	|||
|d
}|| _| jru| |d  |d \}}}}tdd || ||d}W d    ||fS 1 slw   Y  ||fS dS |dkr|d }tdd= t| | jd | jj	dd}t
| jd  | jj| jj| jj| jj| jj| jj| jjdd	}d }d }d }W d    n1 sw   Y  | jr| | jd | jd \}}}}tdd || d=i d| jd ddd|d d|d d| jd d| jd   d!| jd" d#| jd# d$| jd% d&| jd& d'|d d| jd d(| jd d)| jd) d*| jd* d+| jd+ d,| jd, d-| jd- d.| jd/ d0| jd0 d1| jd2|d3|d4|d5|d6|d7| j }|d   |d8< |d   |d9< |d  |d     |d:< |d  |d     |d;< W d    n	1 sw   Y  | j|fS td<)>Nr   rJ  rK  	mel_inputmel_lengthsrM  r  r  r!  rT  r  )
r   r  r  r   rM  r  r  rT  r  r  r  r  F)enabled)scores_disc_fakescores_disc_realNNrL   r  Tr  	r   r   r   r   r   r   r   r   r   
mel_outputr  rt   
mel_targetr	  
dur_outputdr_log_pred
dur_targetdr_log_targetpitch_output
pitch_predpitch_targetenergy_outputenergy_predenergy_targetr  waveform_hatp_prosody_refp_prosody_predu_prosody_refu_prosody_predaligner_logprobaligner_hardaligner_masaligner_softr  
feats_fake
feats_realscores_fake
spec_slicespec_slice_hat	skip_discavg_text_lengthavg_mel_lengthavg_text_batch_occupancyavg_mel_batch_occupancyz  [!] Unexpected `optimizer_idx`.r9   )r  model_outputs_cacher  r  r  r   r!   r<   r  r  r   r   r  r   r   r   r   r  r  r  r  ri   rN   r  )r   rd   r)  r*  r  token_lenghtsr   r	  rM  r  r  r!  rT  r  outputsscores_d_faker   scores_d_real	loss_dict	mel_slicemel_slice_hatfeats_d_fakefeats_d_realr9   r9   r:   
train_step  s   






	










'zDelightfulTTS.train_stepc                 C   s   |  |||S r2   )r[  )r   rd   r)  r*  r9   r9   r:   	eval_step  r   zDelightfulTTS.eval_steptrainc                 C   sR  i i }}|d d }|d d }|d }|d j   }	|d j   }
|d j   }t|	d ddt|
jd ddt|ddd}t|d d	 d
 j   }t|d d d
 j   }| j|d d j   }t	||ddt	||ddd}|
| t|d d d
 j   }t|d d d
 j   }| j|d d j   }t	||ddt	||ddd}|
| |d d d j   }t|jdd|d< tdt|	jdd d| jd| jj}||| d< |d d }|d d }t||| j|d}|
| |d d   }||| d< ||fS )NrL   r  
alignmentsr+  r   F
output_fig)
predictionground_truth	alignmentr:  )r   r   r9  rJ  )pitch_ground_truthpitch_avg_predictedr=  r<  )energy_ground_truthenergy_avg_predictedalignments_dpalignment_hatr   gainbaser   r   z/encoder_audior  r  )y_hatr   r   name_prefixz/vocoder_audior9   )rp   cpur   r(   Tr%   absr  decoder&   updatemel_to_wav_numpydb_to_amp_numpyr   r  r  r1   r   r=   r  )r   rd   rS  ro  figuresaudiosr  r^  r+  	pred_specgt_spec	align_img	pitch_avgpitch_avg_hatcharspitch_figures
energy_avgenergy_avg_hatenergy_figuresalignments_hatencoder_audiorn  r   vocoder_figuressample_voicer9   r9   r:   _log  sP   




zDelightfulTTS._logrS  loggerLoggerassetsstepsc                 C   s6   | j ||dd\}}||| |||| jj dS )a  Create visualizations and waveform examples.

        For example, here you can plot spectrograms and generate sample sample waveforms from these spectrograms to
        be projected onto Tensorboard.

        Args:
            batch (Dict): Model inputs used at the previous training step.
            outputs (Dict): Model outputs generated at the previous training step.

        Returns:
            Tuple[Dict, np.ndarray]: training plots and output waveform.
        vocoder/rd   rS  ro  N)r  train_figurestrain_audiosr   r   r   rd   rS  r  r  r  rw  rx  r9   r9   r:   	train_logK  s   zDelightfulTTS.train_logc                 C   s6   | j ||dd\}}||| |||| jj d S )Nr  r  )r  eval_figureseval_audiosr   r   r  r9   r9   r:   eval_log^  s   zDelightfulTTS.eval_logc                 C   s   t | jdr| jj}n| j}d\}}}t|tr:t|dkr#|d }nt|dkr.|\}}nt|dkr9|\}}}n|}d\}}t | dro|jr\|d u rR| j }n| jj	|d d	d
}n|j
ro|d u ri| j }n| jj| }||||dS )N
model_argsNNNrL   r   rt   rk  r0  r  Fnum_samples	randomize)r"  
speaker_id	style_wavr?   )r  r  r  
isinstancelistr]   r  r  get_random_embeddingget_mean_embeddingr  get_random_id
name_to_id)r   sentence_infor  r"  r#  r  r  r?   r9   r9   r:   !get_aux_input_from_test_sentencesc  s0   






z/DelightfulTTS.get_aux_input_from_test_sentencesc                 C   sZ  i }|d   }|d   }tt|d d d f | jj| jj| jj| jj| jj	| jj
| jjdd	d dd}t|d | jj| jj| jjd}	| j| jj|dd	}
|
d
d}
|d }tt|	d d d d f |  }|| j | j }t|dddd|d< t||d< t|	||d< t| |
|d< t| |
|d< t| |
|d< |S )Nr  r  Fr1  r   rL   )r   r   r   r   enr$  z<BLNK>r   	durationsrt   r_  rc  spectrogrampitch_from_wavpitch_avg_from_wavpitch_avg_predenergy_avg_pred)rp  r   r5   r6   r   r  r   r   r   r   r  r  r  r*   r   r  ids_to_texttext_to_idsr-  r   r  r  r%   r(   r'   r&   r=   )r   r"  r  rc  rS  rw  r  r  r   r  
input_textr  r|  pitch_avg_pred_denormr9   r9   r:   plot_outputs  sH   

"zDelightfulTTS.plot_outputsr"  r  r?   c                 K   s   t |  j}tj| jj|d dtjd}d }|d ur4| jj	r4t
|tr4| jj	r4| jj| }t||d}|d urE| jjrE| jj|d dd}t||d}t|tj|d}|d}| j|||d|d}	|	d	 d j  }
|	d
 }|
|||	d}|S )Nr  rA   r7   Fr  r   r   )r'  r"  r  r^  r  r^  text_inputsrS  )r  r  is_cudar3   r4   r  r  int32r  r  r  r   r  r  r;   r  r  r@   rF   r5   longr>   r&  rp   rp  r   )r   r"  r  r?   r"  r  r  r  _speaker_idrS  r  r^  return_dictr9   r9   r:   
synthesize  s8   	
zDelightfulTTS.synthesizec                 C   s   t |  j}tj| jj|d dtjd}|d urt||d}|d ur)t	||d}t
|tj|d}|d}| j|||dd}|d   d j}t|dd d	}td|| jd
| jj}|d }	|d d d f |	||d}
|
S )Nr  rA   r  r   r   )r   r'  r  rL   rj  rm  r^  r  r9   )r  r  r  r3   r4   r  r  r  r;   r@   rF   r5   r  r>   r(  rp  r   rq  rv  ru  r   r  r  )r   r"  r  r?   r  r  rS  Sr  r^  r  r9   r9   r:   synthesize_with_gl  s2   
z DelightfulTTS.synthesize_with_glc           
      C   s   t d i }i }| jj}t|D ]H\}}| |}| j|d | j|d |d d}| j|d |d |d d}	|d j|d|< |	d j|d	|< t	|d
 dd|d|< q||dS )zGeneric test run for `tts` models used by `Trainer`.

        You can override this for a different behaviour.

        Returns:
            Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard.
        z! | > Synthesizing test sentences.r"  r  r?   )r  r  r?   )r  r?   r  z{}-audioz{}-audio_encoderr^  Fr_  z{}-alignment)rw  rx  )
r   r  test_sentences	enumerater  r  r  rq  formatr%   )
r   r  test_audiostest_figuresr  r7  s_info
aux_inputsrS  
outputs_glr9   r9   r:   test_run  s*   	

zDelightfulTTS.test_runc                 C   s,   | ||d | jjj |||d  d S )Nrx  rw  )r  r  r  r   r  )r   rS  r  r  r  r9   r9   r:   test_log!  s   zDelightfulTTS.test_logc                    s   d}d}j durj jrjjrfdd|d D }|dur)t|}||d< j durJj jrJjjrJj j  fdd|d D }t|}||d< ||d< |S )	zGCompute speaker, langugage IDs and d_vector for the batch if necessary.Nc                    s   g | ]} j j| qS r9   )r  r  )r   snr  r9   r:   r   .      z.DelightfulTTS.format_batch.<locals>.<listcomp>rP  r!  c                    s   g | ]} | d  qS )	embeddingr9   rH  )d_vector_mappingr9   r:   r   7  r  rR  r  )	r  rP  r  r  r5   rU  
embeddingsr  rV  )r   rd   r!  r  r9   )r  r   r:   format_batch'  s   

zDelightfulTTS.format_batchc                 C   s  | j }t|d |j|j|j|j|j|j|jdd	|d< |d dur5|d ddddd|d j	d f nd|d< |d j	d |d  
 |d	< |d t|d	 d
 |d< | jjr|d }tj|d j	d |d	  |d  |d jd|d< t|d j	d D ]}t|| |d |d|| j	d d|| j	d
 f< qd|d< t|d |j|j|jdd|d< | |d |d< |S )z#Compute spectrograms on the device.rM  F)r   r   r   r   r   r   r   r   r+  r  Nrt   rO  r,  rL   rT  r   rK  rI   r  )r   r   r   r   )r   r   r   r   r  r   r   r  r  rM   rx   r"   r>   r  use_attn_priorsr5   zerosrN   rJ   rX  r6   r   r  )r   rd   acattn_priors_nprd  r9   r9   r:   format_batch_on_device>  sJ   
<


0
z$DelightfulTTS.format_batch_on_devicerL   datasetc                 C   s   d }|j }t|ddrC|j D ]2\}}td| d| d |j|d }t| t|||d\}}	}
|| }td|	 d|
  q|d urOt|t	|}nd }|d u rc|d	kr_t
|}|S d }|S |d	krkt|n|}|S )
Nuse_weighted_samplerFz) > Using weighted sampler for attribute 'z' with alpha '')r   r   r   z > Attribute weights for 'z' 
 | > rL   )r   r   weighted_sampler_attrsr   r   weighted_sampler_multipliersr   r   r   r]   r   r   )r   r  r  num_gpusweights
data_itemsr   alphar   
attr_namesattr_weightssamplerr9   r9   r:   get_samplerw  s,   zDelightfulTTS.get_sampleris_evalr   r   r  rankr   c                 C   s   |r	|j s	d }|S t|| j|rdn|j|j |j|j|j|j|j	|j
|j|j|jr,|jnd || j|jd}	|dkr=t  |	  | ||	|}
t|	|rO|jn|jdd|
|	j|r[|jn|jdd}|	jj| _|	jj| _|S )Nr   )r   r   batch_group_sizemin_text_lenr2  r3  max_audio_lenphoneme_cache_pathr   r*   r  r  r   r  start_by_longestrL   FT)rR   shuffle	drop_lastr  re  num_workers
pin_memory)run_evalr  r   r  rR   r  r2  r3  r  r  r   r*   r  r  r  r  r  distbarrierpreprocess_samplesr  r   eval_batch_sizere  num_eval_loader_workersnum_loader_workersr  ri   r  rj   r  )r   r  r  r  r   r   r  r  loaderr  r  r9   r9   r:   get_data_loader  sJ   

-

zDelightfulTTS.get_data_loaderc                 C   s   t | jt| jgS r2   )r   r  DelightfulTTSLossr  r9   r9   r:   get_criterion  s   zDelightfulTTS.get_criterionc                 C   sV   t | jj| jj| jj| j}tdd |  D }t | jj| jj| jj|d}||gS )zInitiate and return the GAN optimizers based on the config parameters.
        It returnes 2 optimizers in a list. First one is for the generator and the second one is for the discriminator.
        Returns:
            List: optimizers.
        c                 s   s"    | ]\}}| d s|V  qdS )zdisc.N)
startswith)r   rC  paramsr9   r9   r:   	<genexpr>  s     z.DelightfulTTS.get_optimizer.<locals>.<genexpr>)r  )	r   r  	optimizeroptimizer_paramslr_discr  r   r   lr_gen)r   optimizer_discgen_parametersoptimizer_genr9   r9   r:   r     s   zDelightfulTTS.get_optimizerc                 C   s   | j j| j jgS )zSet the initial learning rates for each optimizer.

        Returns:
            List: learning rates for each optimizer.
        )r  r  r  r  r9   r9   r:   get_lr  s   zDelightfulTTS.get_lrc                 C   s8   t | jj| jj|d }t | jj| jj|d }||gS )zSet the schedulers for each optimizer.

        Args:
            optimizer (List[`torch.optim.Optimizer`]): List of optimizers.

        Returns:
            List: Schedulers, one for each optimizer.
        r   rL   )r   r  lr_scheduler_genlr_scheduler_gen_paramslr_scheduler_disclr_scheduler_disc_params)r   r  scheduler_Dscheduler_Gr9   r9   r:   r     s   	zDelightfulTTS.get_schedulerc                 C   s   | j   d S r2   )r  evalr   trainerr9   r9   r:   on_epoch_end  s   zDelightfulTTS.on_epoch_endFDelightfulTTSConfigc                 C   s8   t | \}}t| j|}tj| d}t||||dS )zInitiate model from config

        Args:
            config (ForwardTTSE2eConfig): Model config.
            samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training.
                Defaults to None.
        )r  )r  r  r  r   )r$   init_from_configr#   r  r-   r  )r  r   r   r  
new_configr  r   r9   r9   r:   r
    s   zDelightfulTTS.init_from_configc                 C   s>   t |tdd}| |d  |r|   | jrJ dS dS )u0   Load model from a checkpoint created by the 👟rp  )map_locationmodelN)r.   r5   rJ   load_state_dictr  training)r   r  checkpoint_pathr  stater9   r9   r:   load_checkpoint  s   
zDelightfulTTS.load_checkpointc                 C   s>   | j  | j | jd}t| dr| jj|d< | jjr	 |S )zOCustom state dict of the model with all the necessary components for inference.)r  r  r  r  r!  )r  to_dictr  
state_dictr  r  rP  r  )r   
save_stater9   r9   r:   get_state_dict  s   
zDelightfulTTS.get_state_dictc                 C   s0   |  ||}| j|d< | j|d< t|| dS )zSave model to a file.r  r  N)r  r  r  r5   r   )r   r  r  r  r9   r9   r:   r     s   

zDelightfulTTS.savec                 C   s.   t |j| jj dd | _|j| jjk| _dS )zEnable the discriminator training based on `steps_to_start_discriminator`

        Args:
            trainer (Trainer): Trainer object.
        r   N)r   epochs_doner  binary_loss_warmup_epochsr  total_steps_doner  r  r  r9   r9   r:   on_train_step_start#  s   z!DelightfulTTS.on_train_step_startr0  )rH   N)NNNNN)r]  r  rG  r2   r  F)Brm   r  r  r  r   r#   r   rf  rJ   r  r  setterr  r  r   r  r  r  r  r  r5   rU  rV  r   r  no_gradr&  r(  dictr   Modulerx   r[  r\  r  r  r  r  r  r   rE   r  r  r	   r  r  r  r  r   r  r~  r
   r   r  r  r   r  r   r  staticmethodr
  r  r  r   r  r  r9   r9   r   r:   r  v  s"   "(









	 	

M	v
;
"#%
4%
9!	
:
	r  c                       sX   e Zd Z fddZedd Zedd Zedd Z													
dddZ  Z	S )r  c                    s   t    t | _t | _t | _t	di |j
| _|j| _|j| _|j| _|j| _|j| _|j| _|j| _|j| _|j| _|j| _|j| _|j| _|j| _d S )Nr9   )r   r   r   MSELossmse_lossL1Lossmae_lossr   forward_sum_lossr/   multi_scale_stft_loss_paramsmulti_scale_stft_lossmel_loss_alphaaligner_loss_alphapitch_loss_alphaenergy_loss_alphau_prosody_loss_alphap_prosody_loss_alphadur_loss_alphachar_dur_loss_alphabinary_align_loss_alphabinary_alignment_loss_alphavocoder_mel_loss_alphafeat_loss_alphagen_loss_alphamulti_scale_stft_loss_alphar  r   r9   r:   r   0  s$   


zDelightfulTTSLoss.__init__c                 C   s.   t t j|| dk dd }| |   S )zBinary loss that forces soft alignments to match the hard alignments as
        explained in `https://arxiv.org/pdf/2108.10447.pdf`.
        rL   g-q=r   )r5   r   r   r   )alignment_hardalignment_softlog_sumr9   r9   r:   _binary_alignment_lossG  s    z(DelightfulTTSLoss._binary_alignment_lossc              	   C   s`   d}t | |D ]$\}}t ||D ]\}}|  }| }|tt|| 7 }qq|d S )Nr   rt   )zipr<   r  r5   ri   rr  )rH  feats_generatedlossdrdgrlglr9   r9   r:   feature_lossO  s   zDelightfulTTSLoss.feature_lossc                 C   sF   d}g }| D ]}|  }td| d }|| ||7 }q||fS )Nr   rL   rt   )r<   r5   ri   ra   )rI  r<  
gen_lossesr>  r   r9   r9   r:   generator_lossY  s   

z DelightfulTTSLoss.generator_lossNFc           .   	   C   s
  i }t |
|j}t ||j}d|_d|_d|_||dddf }||dddf }| ||} | }d| ||d||d }!| }d| || }"| ||}#||dddf }||dddf }| ||}$||dddf }|	|dddf }	| ||	}%| 	||
|}&| | j
 |#| j  |"| j  |!| j  |$| j  |%| j  |&| j  }'| jdkr|dur| ||}(|'| j|( |  }'|r| j|( | |d< n| j|( |d< | j|& |d< | j
|  |d< | j|# |d	< | j|" |d
< | j|! |d< | j|$ |d< | j|% |d< |'|d< |sM| j||d| j })| j|dd | j }*|)|d< |*|d< |d |) |* |d< tjj||| j }+| j||d\},}-|,| j },|-| j }-|+|d< |,|d< |-|d< |d |+ |- |, |d< |S )a)  
        Shapes:
            - mel_output: :math:`(B, C_mel, T_mel)`
            - mel_target: :math:`(B, C_mel, T_mel)`
            - mel_lens: :math:`(B)`
            - dur_output: :math:`(B, T_src)`
            - dur_target: :math:`(B, T_src)`
            - pitch_output: :math:`(B, 1, T_src)`
            - pitch_target: :math:`(B, 1, T_src)`
            - energy_output: :math:`(B, 1, T_src)`
            - energy_target: :math:`(B, 1, T_src)`
            - src_lens: :math:`(B)`
            - waveform: :math:`(B, 1, T_wav)`
            - waveform_hat: :math:`(B, 1, T_wav)`
            - p_prosody_ref: :math:`(B, T_src, 4)`
            - p_prosody_pred: :math:`(B, T_src, 4)`
            - u_prosody_ref: :math:`(B, 1, 256)
            - u_prosody_pred: :math:`(B, 1, 256)
            - aligner_logprob: :math:`(B, 1, T_mel, T_src)`
            - aligner_hard: :math:`(B, T_mel, T_src)`
            - aligner_soft: :math:`(B, T_mel, T_src)`
            - spec_slice: :math:`(B, C_mel, T_mel)`
            - spec_slice_hat: :math:`(B, C_mel, T_mel)`
        FNr  rK   r   loss_binary_alignmentloss_alignerloss_melloss_durationloss_u_prosodyloss_p_prosody
loss_pitchloss_energyr<  )rH  r;  )rI  vocoder_loss_featvocoder_loss_gen)rn  r   vocoder_loss_melvocoder_loss_stft_mgvocoder_loss_stft_sc)r"   r   rJ   r  masked_selectr$  r  r>   r"  r%  r(  r.  r,  r-  r*  r+  r)  r1  r9  rA  r3  rC  r4  r5   r   r   l1_lossr2  r'  r5  ).r   r2  r3  r	  r4  r6  r8  r:  r;  r=  r  rM  r>  r?  r@  rA  rB  rC  rD  rF  r  rG  rH  rI  rJ  rK  rL  rV  src_maskmel_maskmasked_mel_predictionsmel_targetsmel_lossp_prosody_lossu_prosody_lossduration_loss
pitch_lossenergy_lossr%  
total_lossbinary_alignment_loss	loss_featloss_genrF  loss_stft_mgloss_stft_scr9   r9   r:   r  e  s   5


zDelightfulTTSLoss.forward)NNNNNNF)
rm   r  r  r   r   r9  rA  rC  r  r  r9   r9   r   r:   r  /  s     

	
 r  r  )rY   rg   )rt   )rL   r   rG  r2   )~r  dataclassesr   r   	itertoolsr   pathlibr   typingr   r   r   r	   r
   r   r3   r5   torch.distributeddistributedr  r   coqpitr   librosa.filtersr   r   r   torch.cuda.amp.autocast_moder   torch.nnr   r^   torch.utils.datar   torch.utils.data.samplerr   trainer.torchr   r   trainer.trainer_utilsr   r   TTS.tts.datasets.datasetr   r   r   ,TTS.tts.layers.delightful_tts.acoustic_modelr   TTS.tts.layers.lossesr   r   !TTS.tts.layers.vits.discriminatorr   TTS.tts.models.base_ttsr   TTS.tts.utils.helpersr   r   r    r!   r"   TTS.tts.utils.speakersr#   TTS.tts.utils.text.tokenizerr$   TTS.tts.utils.visualr%   r&   r'   r(    TTS.utils.audio.numpy_transformsr)   r*   r+   rv  r,   ru  TTS.utils.audio.processorr-   TTS.utils.ior.   TTS.vocoder.layers.lossesr/   $TTS.vocoder.models.hifigan_generatorr0   TTS.vocoder.utils.generic_utilsr1   r;   r@   rF   r\   rV   rx   r_   r  r<   rs   ry   r}   r   r   r   r  r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r   r   r  rg  r  r  r  r  r9   r9   r9   r:   <module>   s    

	

		



%

:2 &3       @