o
    
j/%                     @   sx   d dl mZmZ d dlmZmZ d dlmZ d dlm	Z	 eG dd deZ
eG dd deZeG d	d
 d
e	ZdS )    )	dataclassfield)ListOptional)Coqpit)BaseVCConfigc                   @   s   e Zd ZU dZeddZeed< eddZe	ed< eddZ
e	ed< ed	dZe	ed
< eddZe	ed< ed	dZe	ed< eddZe	ed< eddZeed< eddZee ed< dS )FreeVCAudioConfiga  Audio configuration

    Args:
        max_wav_value (float):
            The maximum value of the waveform.

        input_sample_rate (int):
            The sampling rate of the input waveform.

        output_sample_rate (int):
            The sampling rate of the output waveform.

        filter_length (int):
            The length of the filter.

        hop_length (int):
            The hop length.

        win_length (int):
            The window length.

        n_mel_channels (int):
            The number of mel channels.

        mel_fmin (float):
            The minimum frequency of the mel filterbank.

        mel_fmax (Optional[float]):
            The maximum frequency of the mel filterbank.
    g      @defaultmax_wav_valuei>  input_sample_ratei]  output_sample_ratei   filter_lengthi@  
hop_length
win_lengthP   n_mel_channelsg        mel_fminNmel_fmax)__name__
__module____qualname____doc__r   r   float__annotations__r   intr   r   r   r   r   r   r   r    r   r   O/home/kuhnn/.local/lib/python3.10/site-packages/TTS/vc/configs/freevc_config.pyr   	   s   
 r   c                   @   s  e Zd ZU dZeddZeed< eddZeed< eddZ	eed< eddZ
eed	< ed
dZeed< eddZeed< eddZeed< eddZeed< eddZeed< edd dZee ed< edd dZeee  ed< edd dZee ed< eddZeed< edd dZee ed< eddZeed < ed!dZeed"< ed#dZeed$< ed%dZeed&< ed!dZeed'< ed(dZeed)< ed*dZeed+< d,S )-
FreeVCArgsag  FreeVC model arguments

    Args:
        spec_channels (int):
            The number of channels in the spectrogram.

        inter_channels (int):
            The number of channels in the intermediate layers.

        hidden_channels (int):
            The number of channels in the hidden layers.

        filter_channels (int):
            The number of channels in the filter layers.

        n_heads (int):
            The number of attention heads.

        n_layers (int):
            The number of layers.

        kernel_size (int):
            The size of the kernel.

        p_dropout (float):
            The dropout probability.

        resblock (str):
            The type of residual block.

        resblock_kernel_sizes (List[int]):
            The kernel sizes for the residual blocks.

        resblock_dilation_sizes (List[List[int]]):
            The dilation sizes for the residual blocks.

        upsample_rates (List[int]):
            The upsample rates.

        upsample_initial_channel (int):
            The number of channels in the initial upsample layer.

        upsample_kernel_sizes (List[int]):
            The kernel sizes for the upsample layers.

        n_layers_q (int):
            The number of layers in the quantization network.

        use_spectral_norm (bool):
            Whether to use spectral normalization.

        gin_channels (int):
            The number of channels in the global conditioning vector.

        ssl_dim (int):
            The dimension of the self-supervised learning embedding.

        use_spk (bool):
            Whether to use external speaker encoder.
    i  r	   spec_channels   inter_channelshidden_channelsi   filter_channels   n_heads   n_layers   kernel_sizeg?	p_dropout1resblockc                   C      g dS )N)r(         r   r   r   r   r   <lambda>}       zFreeVCArgs.<lambda>default_factoryresblock_kernel_sizesc                   C   s   g dg dg dgS )N)   r(      r   r   r   r   r   r0   ~   s    resblock_dilation_sizesc                   C   r-   )N)
      r$   r$   r   r   r   r   r   r0      r1   upsample_ratesi   upsample_initial_channelc                   C   r-   )N)   r<      r=   r   r   r   r   r   r0      r1   upsample_kernel_sizes
n_layers_qFuse_spectral_norm   gin_channelsi   ssl_dimuse_spkr   num_spksi #  segment_sizeN) r   r   r   r   r   r   r   r   r!   r"   r#   r%   r'   r)   r*   r   r,   strr4   r   r7   r:   r;   r>   r?   r@   boolrB   rC   rD   rE   rF   r   r   r   r   r   5   s.   
 =r   c                   @   s   e Zd ZU dZdZeed< eedZ	eed< ee
dZe
ed< dZeed< dZeed	< d
Zeed< edd dZeed< edd dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< d
Zeed< dZee ed< dZeed< dd ZdS )FreeVCConfigas  Defines parameters for FreeVC End2End TTS model.

    Args:
        model (str):
            Model name. Do not change unless you know what you are doing.

        model_args (FreeVCArgs):
            Model architecture arguments. Defaults to `FreeVCArgs()`.

        audio (FreeVCAudioConfig):
            Audio processing configuration. Defaults to `FreeVCAudioConfig()`.

        grad_clip (List):
            Gradient clipping thresholds for each optimizer. Defaults to `[1000.0, 1000.0]`.

        lr_gen (float):
            Initial learning rate for the generator. Defaults to 0.0002.

        lr_disc (float):
            Initial learning rate for the discriminator. Defaults to 0.0002.

        lr_scheduler_gen (str):
            Name of the learning rate scheduler for the generator. One of the `torch.optim.lr_scheduler.*`. Defaults to
            `ExponentialLR`.

        lr_scheduler_gen_params (dict):
            Parameters for the learning rate scheduler of the generator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`.

        lr_scheduler_disc (str):
            Name of the learning rate scheduler for the discriminator. One of the `torch.optim.lr_scheduler.*`. Defaults to
            `ExponentialLR`.

        lr_scheduler_disc_params (dict):
            Parameters for the learning rate scheduler of the discriminator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`.

        scheduler_after_epoch (bool):
            If true, step the schedulers after each epoch else after each step. Defaults to `False`.

        optimizer (str):
            Name of the optimizer to use with both the generator and the discriminator networks. One of the
            `torch.optim.*`. Defaults to `AdamW`.

        kl_loss_alpha (float):
            Loss weight for KL loss. Defaults to 1.0.

        disc_loss_alpha (float):
            Loss weight for the discriminator loss. Defaults to 1.0.

        gen_loss_alpha (float):
            Loss weight for the generator loss. Defaults to 1.0.

        feat_loss_alpha (float):
            Loss weight for the feature matching loss. Defaults to 1.0.

        mel_loss_alpha (float):
            Loss weight for the mel loss. Defaults to 45.0.

        return_wav (bool):
            If true, data loader returns the waveform as well as the other outputs. Do not change. Defaults to `True`.

        compute_linear_spec (bool):
            If true, the linear spectrogram is computed and returned alongside the mel output. Do not change. Defaults to `True`.

        use_weighted_sampler (bool):
            If true, use weighted sampler with bucketing for balancing samples between datasets used in training. Defaults to `False`.

        weighted_sampler_attrs (dict):
            Key retuned by the formatter to be used for weighted sampler. For example `{"root_path": 2.0, "speaker_name": 1.0}` sets sample probabilities
            by overweighting `root_path` by 2.0. Defaults to `{}`.

        weighted_sampler_multipliers (dict):
            Weight each unique value of a key returned by the formatter for weighted sampling.
            For example `{"root_path":{"/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-100/":1.0, "/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-360/": 0.5}`.
            It will sample instances from `train-clean-100` 2 times more than `train-clean-360`. Defaults to `{}`.

        r (int):
            Number of spectrogram frames to be generated at a time. Do not change. Defaults to `1`.

        add_blank (bool):
            If true, a blank token is added in between every character. Defaults to `True`.

        test_sentences (List[List]):
            List of sentences with speaker and language information to be used for testing.

        language_ids_file (str):
            Path to the language ids file.

        use_language_embedding (bool):
            If true, language embedding is used. Defaults to `False`.

    Note:
        Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.

    Example:

        >>> from TTS.vc.configs.freevc_config import FreeVCConfig
        >>> config = FreeVCConfig()
    freevcmodelr2   
model_argsaudioT
return_wavcompute_linear_specFuse_weighted_samplerc                   C      i S Nr   r   r   r   r   r0         zFreeVCConfig.<lambda>weighted_sampler_attrsc                   C   rQ   rR   r   r   r   r   r   r0     rS   weighted_sampler_multipliersr5   r	add_blankr   num_speakersNspeakers_filerA   speaker_embedding_channelsuse_d_vector_filed_vector_filed_vector_dimc                 C   s*   | j  D ]\}}t| |r|| |< qd S rR   )rL   itemshasattr)selfkeyvalr   r   r   __post_init__  s
   
zFreeVCConfig.__post_init__)r   r   r   r   rK   rG   r   r   r   rL   r   rM   rN   rH   rO   rP   rT   dictrU   rV   r   rW   rX   rY   rZ   r[   r\   r   r]   rc   r   r   r   r   rI      s&   
 c	rI   N)dataclassesr   r   typingr   r   coqpitr   TTS.vc.configs.shared_configsr   r   r   rI   r   r   r   r   <module>   s    +U