o
    
j                     @   sT   d dl mZmZ d dlmZ d dlmZ d dlmZm	Z	m
Z
 eG dd deZdS )    )	dataclassfield)List)BaseTTSConfig)DelightfulTtsArgsDelightfulTtsAudioConfigVocoderConfigc                   @   s  e Zd ZU dZdZeed< eedZ	eed< ee
dZe
ed< dZeed< eedZeed	< dZeed
< dZeed< edd dZee ed< dZeed< dZeed< dZeed< edd dZeed< dZeed< edd dZeed< dZeed< dZeed< edd dZeed< dZ eed < dZ!eed!< dZ"eed"< dZ#eed#< dZ$eed$< d%Z%eed&< d%Z&eed'< dZ'eed(< d)Z(eed*< d+Z)eed,< d-Z*eed.< dZ+eed/< dZ,eed0< dZ-eed1< d2Z.eed3< d4Z/eed5< ed6d dZ0eed7< dZ1eed8< d9Z2eed:< ed;d dZ3eed<< ed=d dZ4eed>< d?Z5eed@< dZ6eedA< dBZ7eedC< dBZ8eedD< dEZ9eedF< d9Z:eedG< dBZ;eedH< dIZ<eedJ< dBZ=eedK< d9Z>eedL< d9Z?eedM< dBZ@eedN< dBZAeedO< edPd dZBeee  edQ< dRdS ZCdBS )TDelightfulTTSConfiga  
    Configuration class for the DelightfulTTS model.

    Attributes:
        model (str): Name of the model ("delightful_tts").
        audio (DelightfulTtsAudioConfig): Configuration for audio settings.
        model_args (DelightfulTtsArgs): Configuration for model arguments.
        use_attn_priors (bool): Whether to use attention priors.
        vocoder (VocoderConfig): Configuration for the vocoder.
        init_discriminator (bool): Whether to initialize the discriminator.
        steps_to_start_discriminator (int): Number of steps to start the discriminator.
        grad_clip (List[float]): Gradient clipping values.
        lr_gen (float): Learning rate for the  gan generator.
        lr_disc (float): Learning rate for the gan discriminator.
        lr_scheduler_gen (str): Name of the learning rate scheduler for the generator.
        lr_scheduler_gen_params (dict): Parameters for the learning rate scheduler for the generator.
        lr_scheduler_disc (str): Name of the learning rate scheduler for the discriminator.
        lr_scheduler_disc_params (dict): Parameters for the learning rate scheduler for the discriminator.
        scheduler_after_epoch (bool): Whether to schedule after each epoch.
        optimizer (str): Name of the optimizer.
        optimizer_params (dict): Parameters for the optimizer.
        ssim_loss_alpha (float): Alpha value for the SSIM loss.
        mel_loss_alpha (float): Alpha value for the mel loss.
        aligner_loss_alpha (float): Alpha value for the aligner loss.
        pitch_loss_alpha (float): Alpha value for the pitch loss.
        energy_loss_alpha (float): Alpha value for the energy loss.
        u_prosody_loss_alpha (float): Alpha value for the utterance prosody loss.
        p_prosody_loss_alpha (float): Alpha value for the phoneme prosody loss.
        dur_loss_alpha (float): Alpha value for the duration loss.
        char_dur_loss_alpha (float): Alpha value for the character duration loss.
        binary_align_loss_alpha (float): Alpha value for the binary alignment loss.
        binary_loss_warmup_epochs (int): Number of warm-up epochs for the binary loss.
        disc_loss_alpha (float): Alpha value for the discriminator loss.
        gen_loss_alpha (float): Alpha value for the generator loss.
        feat_loss_alpha (float): Alpha value for the feature loss.
        vocoder_mel_loss_alpha (float): Alpha value for the vocoder mel loss.
        multi_scale_stft_loss_alpha (float): Alpha value for the multi-scale STFT loss.
        multi_scale_stft_loss_params (dict): Parameters for the multi-scale STFT loss.
        return_wav (bool): Whether to return audio waveforms.
        use_weighted_sampler (bool): Whether to use a weighted sampler.
        weighted_sampler_attrs (dict): Attributes for the weighted sampler.
        weighted_sampler_multipliers (dict): Multipliers for the weighted sampler.
        r (int): Value for the `r` override.
        compute_f0 (bool): Whether to compute F0 values.
        f0_cache_path (str): Path to the F0 cache.
        attn_prior_cache_path (str): Path to the attention prior cache.
        num_speakers (int): Number of speakers.
        use_speaker_embedding (bool): Whether to use speaker embedding.
        speakers_file (str): Path to the speaker file.
        speaker_embedding_channels (int): Number of channels for the speaker embedding.
        language_ids_file (str): Path to the language IDs file.
    delightful_ttsmodel)default_factoryaudio
model_argsTuse_attn_priorsvocoderinit_discriminatori@ steps_to_start_discriminatorc                   C   s   ddgS )Ni   r   r   r   X/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/configs/delightful_tts_config.py<lambda>L   s    zDelightfulTTSConfig.<lambda>	grad_clipg-C6*?lr_genlr_discExponentialLRlr_scheduler_genc                   C   
   dddS Ng"?)gamma
last_epochr   r   r   r   r   r   P      
 lr_scheduler_gen_paramslr_scheduler_discc                   C   r   r   r   r   r   r   r   r   R   r    lr_scheduler_disc_paramsscheduler_after_epochAdamW	optimizerc                   C   s   ddgdddS )Ng?gGz?g&.>{Gz?)betasepsweight_decayr   r   r   r   r   r   U   s    optimizer_paramsg      ?ssim_loss_alphamel_loss_alphaaligner_loss_alphapitch_loss_alphaenergy_loss_alphag      ?u_prosody_loss_alphap_prosody_loss_alphadur_loss_alphar'   char_dur_loss_alphag?binary_align_loss_alpha
   binary_loss_warmup_epochsdisc_loss_alphagen_loss_alphafeat_loss_alphag      $@vocoder_mel_loss_alphag      @multi_scale_stft_loss_alphac                   C   s   g dg dg ddS )N)i   i   i   )x      2   )iX  i  r>   )n_fftshop_lengthswin_lengthsr   r   r   r   r   r   k   s   multi_scale_stft_loss_params
return_wavFuse_weighted_samplerc                   C      i S Nr   r   r   r   r   r   u       weighted_sampler_attrsc                   C   rF   rG   r   r   r   r   r   r   v   rH   weighted_sampler_multipliers   r
compute_f0Nf0_cache_pathattn_prior_cache_pathr   num_speakersuse_speaker_embeddingspeakers_file   speaker_embedding_channelslanguage_ids_fileuse_language_embeddinguse_d_vector_filed_vector_filed_vector_dimc                   C   s   dgdgdgdgdggS )NzcIt took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.zBe a voice, not an echo.z+I'm sorry Dave. I'm afraid I can't do that.z0This cake is great. It's so delicious and moist.zPrior to November 22, 1963.r   r   r   r   r   r      s   test_sentencesc                 C   sv   | j dkr
| j | j_ | jrd| j_| jr| j| j_| jr d| j_| jd ur/| jdkr/| j| j_| jr9| j| j_d S d S )Nr   T)rP   r   rQ   rR   rW   rY   rX   )selfr   r   r   __post_init__   s   



z!DelightfulTTSConfig.__post_init__)D__name__
__module____qualname____doc__r   str__annotations__r   r   r   r   r   r   boolr   r   r   r   intr   r   floatr   r   r   r!   dictr"   r#   r$   r&   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r7   r8   r9   r:   r;   r<   rC   rD   rE   rI   rJ   rL   rM   rN   rO   rP   rQ   rR   rT   rU   rV   rW   rX   rY   rZ   r\   r   r   r   r   r	      sv   
 5	
r	   N)dataclassesr   r   typingr   TTS.tts.configs.shared_configsr   TTS.tts.models.delightful_ttsr   r   r   r	   r   r   r   r   <module>   s    