o
    
j                     @   sP   d dl mZmZ d dlmZ d dlmZ d dlmZm	Z	 eG dd deZ
dS )    )	dataclassfield)List)BaseTTSConfig)VitsArgsVitsAudioConfigc                   @   s4  e Zd ZU dZdZeed< eedZ	eed< ee
dZe
ed< edd dZee ed	< d
Zeed< d
Zeed< dZeed< edd dZeed< dZeed< edd dZeed< dZeed< dZeed< edd dZeed< dZeed< dZeed< dZeed< dZeed< dZeed < dZeed!< dZ eed"< dZ!eed#< dZ"eed$< d%Z#eed&< ed'd dZ$eed(< ed)d dZ%eed*< d+Z&e'ed,< dZ(eed-< ed.d dZ)ee ed/< d0Z*e'ed1< d%Z+eed2< d3Z,eed4< d5Z-e'ed6< d3Z.eed7< d%Z/eed8< d%Z0eed9< d3Z1ee ed:< d3Z2e'ed;< d<d= Z3d3S )>
VitsConfigad  Defines parameters for VITS End2End TTS model.

    Args:
        model (str):
            Model name. Do not change unless you know what you are doing.

        model_args (VitsArgs):
            Model architecture arguments. Defaults to `VitsArgs()`.

        audio (VitsAudioConfig):
            Audio processing configuration. Defaults to `VitsAudioConfig()`.

        grad_clip (List):
            Gradient clipping thresholds for each optimizer. Defaults to `[1000.0, 1000.0]`.

        lr_gen (float):
            Initial learning rate for the generator. Defaults to 0.0002.

        lr_disc (float):
            Initial learning rate for the discriminator. Defaults to 0.0002.

        lr_scheduler_gen (str):
            Name of the learning rate scheduler for the generator. One of the `torch.optim.lr_scheduler.*`. Defaults to
            `ExponentialLR`.

        lr_scheduler_gen_params (dict):
            Parameters for the learning rate scheduler of the generator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`.

        lr_scheduler_disc (str):
            Name of the learning rate scheduler for the discriminator. One of the `torch.optim.lr_scheduler.*`. Defaults to
            `ExponentialLR`.

        lr_scheduler_disc_params (dict):
            Parameters for the learning rate scheduler of the discriminator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`.

        scheduler_after_epoch (bool):
            If true, step the schedulers after each epoch else after each step. Defaults to `False`.

        optimizer (str):
            Name of the optimizer to use with both the generator and the discriminator networks. One of the
            `torch.optim.*`. Defaults to `AdamW`.

        kl_loss_alpha (float):
            Loss weight for KL loss. Defaults to 1.0.

        disc_loss_alpha (float):
            Loss weight for the discriminator loss. Defaults to 1.0.

        gen_loss_alpha (float):
            Loss weight for the generator loss. Defaults to 1.0.

        feat_loss_alpha (float):
            Loss weight for the feature matching loss. Defaults to 1.0.

        mel_loss_alpha (float):
            Loss weight for the mel loss. Defaults to 45.0.

        return_wav (bool):
            If true, data loader returns the waveform as well as the other outputs. Do not change. Defaults to `True`.

        compute_linear_spec (bool):
            If true, the linear spectrogram is computed and returned alongside the mel output. Do not change. Defaults to `True`.

        use_weighted_sampler (bool):
            If true, use weighted sampler with bucketing for balancing samples between datasets used in training. Defaults to `False`.

        weighted_sampler_attrs (dict):
            Key retuned by the formatter to be used for weighted sampler. For example `{"root_path": 2.0, "speaker_name": 1.0}` sets sample probabilities
            by overweighting `root_path` by 2.0. Defaults to `{}`.

        weighted_sampler_multipliers (dict):
            Weight each unique value of a key returned by the formatter for weighted sampling.
            For example `{"root_path":{"/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-100/":1.0, "/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-360/": 0.5}`.
            It will sample instances from `train-clean-100` 2 times more than `train-clean-360`. Defaults to `{}`.

        r (int):
            Number of spectrogram frames to be generated at a time. Do not change. Defaults to `1`.

        add_blank (bool):
            If true, a blank token is added in between every character. Defaults to `True`.

        test_sentences (List[List]):
            List of sentences with speaker and language information to be used for testing.

        language_ids_file (str):
            Path to the language ids file.

        use_language_embedding (bool):
            If true, language embedding is used. Defaults to `False`.

    Note:
        Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.

    Example:

        >>> from TTS.tts.configs.vits_config import VitsConfig
        >>> config = VitsConfig()
    vitsmodel)default_factory
model_argsaudioc                   C   s   ddgS )Ni   r   r   r   N/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/configs/vits_config.py<lambda>s   s    zVitsConfig.<lambda>	grad_clipg-C6*?lr_genlr_discExponentialLRlr_scheduler_genc                   C   
   dddS Ng"?)gamma
last_epochr   r   r   r   r   r   w      
 lr_scheduler_gen_paramslr_scheduler_discc                   C   r   r   r   r   r   r   r   r   y   r   lr_scheduler_disc_paramsTscheduler_after_epochAdamW	optimizerc                   C   s   ddgdddS )Ng?gGz?g&.>g{Gz?)betasepsweight_decayr   r   r   r   r   r   |   s    optimizer_paramsg      ?kl_loss_alphadisc_loss_alphagen_loss_alphafeat_loss_alphag     F@mel_loss_alphadur_loss_alphaspeaker_encoder_loss_alpha
return_wavcompute_linear_specFuse_weighted_samplerc                   C      i S Nr   r   r   r   r   r          weighted_sampler_attrsc                   C   r0   r1   r   r   r   r   r   r      r2   weighted_sampler_multipliers   r	add_blankc                   C   s   dgdgdgdgdggS )NzcIt took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.zBe a voice, not an echo.z+I'm sorry Dave. I'm afraid I can't do that.z0This cake is great. It's so delicious and moist.zPrior to November 22, 1963.r   r   r   r   r   r      s   test_sentencesr   num_speakersuse_speaker_embeddingNspeakers_file   speaker_embedding_channelslanguage_ids_fileuse_language_embeddinguse_d_vector_filed_vector_filed_vector_dimc                 C   s*   | j  D ]\}}t| |r|| |< qd S r1   )r   itemshasattr)selfkeyvalr   r   r   __post_init__   s
   
zVitsConfig.__post_init__)4__name__
__module____qualname____doc__r
   str__annotations__r   r   r   r   r   r   r   floatr   r   r   r   dictr   r   r   boolr!   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r3   r4   r6   intr7   r8   r9   r:   r;   r=   r>   r?   r@   rA   rB   rH   r   r   r   r   r      sT   
 cr   N)dataclassesr   r   typingr   TTS.tts.configs.shared_configsr   TTS.tts.models.vitsr   r   r   r   r   r   r   <module>   s    