o
    
jy&                     @   sx   d dl mZmZ d dlmZ d dlmZmZ d dlm	Z	 eG dd deZ
eG dd deZeG d	d
 d
e	ZdS )    )asdict	dataclass)List)Coqpitcheck_argument)TrainerConfigc                   @   s  e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed< dZ
eed	< d
Zeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed < d!Zeed"< dZeed#< dZeed$< dZeed%< dZeed&< dZ eed'< d(Z!eed)< d*Z"eed+< dZ#eed,< d-Z$eed.< dZ%eed/< d0Z&eed1< dZ'eed2< dZ(eed3< d4d5 Z)dS )6BaseAudioConfigam  Base config to definge audio processing parameters. It is used to initialize
    ```TTS.utils.audio.AudioProcessor.```

    Args:
        fft_size (int):
            Number of STFT frequency levels aka.size of the linear spectogram frame. Defaults to 1024.

        win_length (int):
            Each frame of audio is windowed by window of length ```win_length``` and then padded with zeros to match
            ```fft_size```. Defaults to 1024.

        hop_length (int):
            Number of audio samples between adjacent STFT columns. Defaults to 1024.

        frame_shift_ms (int):
            Set ```hop_length``` based on milliseconds and sampling rate.

        frame_length_ms (int):
            Set ```win_length``` based on milliseconds and sampling rate.

        stft_pad_mode (str):
            Padding method used in STFT. 'reflect' or 'center'. Defaults to 'reflect'.

        sample_rate (int):
            Audio sampling rate. Defaults to 22050.

        resample (bool):
            Enable / Disable resampling audio to ```sample_rate```. Defaults to ```False```.

        preemphasis (float):
            Preemphasis coefficient. Defaults to 0.0.

        ref_level_db (int): 20
            Reference Db level to rebase the audio signal and ignore the level below. 20Db is assumed the sound of air.
            Defaults to 20.

        do_sound_norm (bool):
            Enable / Disable sound normalization to reconcile the volume differences among samples. Defaults to False.

        log_func (str):
            Numpy log function used for amplitude to DB conversion. Defaults to 'np.log10'.

        do_trim_silence (bool):
            Enable / Disable trimming silences at the beginning and the end of the audio clip. Defaults to ```True```.

        do_amp_to_db_linear (bool, optional):
            enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True.

        do_amp_to_db_mel (bool, optional):
            enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.

        pitch_fmax (float, optional):
            Maximum frequency of the F0 frames. Defaults to ```640```.

        pitch_fmin (float, optional):
            Minimum frequency of the F0 frames. Defaults to ```1```.

        trim_db (int):
            Silence threshold used for silence trimming. Defaults to 45.

        do_rms_norm (bool, optional):
            enable/disable RMS volume normalization when loading an audio file. Defaults to False.

        db_level (int, optional):
            dB level used for rms normalization. The range is -99 to 0. Defaults to None.

        power (float):
            Exponent used for expanding spectrogra levels before running Griffin Lim. It helps to reduce the
            artifacts in the synthesized voice. Defaults to 1.5.

        griffin_lim_iters (int):
            Number of Griffing Lim iterations. Defaults to 60.

        num_mels (int):
            Number of mel-basis frames that defines the frame lengths of each mel-spectrogram frame. Defaults to 80.

        mel_fmin (float): Min frequency level used for the mel-basis filters. ~50 for male and ~95 for female voices.
            It needs to be adjusted for a dataset. Defaults to 0.

        mel_fmax (float):
            Max frequency level used for the mel-basis filters. It needs to be adjusted for a dataset.

        spec_gain (int):
            Gain applied when converting amplitude to DB. Defaults to 20.

        signal_norm (bool):
            enable/disable signal normalization. Defaults to True.

        min_level_db (int):
            minimum db threshold for the computed melspectrograms. Defaults to -100.

        symmetric_norm (bool):
            enable/disable symmetric normalization. If set True normalization is performed in the range [-k, k] else
            [0, k], Defaults to True.

        max_norm (float):
            ```k``` defining the normalization range. Defaults to 4.0.

        clip_norm (bool):
            enable/disable clipping the our of range values in the normalized audio signal. Defaults to True.

        stats_path (str):
            Path to the computed stats file. Defaults to None.
    i   fft_size
win_length   
hop_lengthNframe_shift_msframe_length_msreflectstft_pad_modei"V  sample_rateFresample        preemphasis   ref_level_dbdo_sound_normznp.log10log_funcTdo_trim_silence-   trim_dbdo_rms_normdb_levelg      ?power<   griffin_lim_itersP   num_melsmel_fminmel_fmax	spec_gaindo_amp_to_db_lineardo_amp_to_db_melg      @
pitch_fmaxg      ?
pitch_fminsignal_normimin_level_dbsymmetric_normg      @max_norm	clip_norm
stats_pathc                 C   sR  t | }td|dddd td|dddd td	|dd
dd td|ddddd td|ddddd td|dddd td|dddd td|dddd td|dddd td|dddd td|dd td|dd td|dddd td |dd td!|dd"dd td#|dd$dd% td&|ddd'd td(|dd td)|dd d*S )+Check config fieldsr"   T
   i  )
restrictedmin_valmax_valr	      i  r   i   i r   i  r
   )r2   r3   r4   alternativer      r   r   r   r+   ir   r      r    r*   r2   r,   r-   g?r.   r#   r   r$   g     @@)r2   r3   
allow_noner%   d   r   r   Nr   r   selfc r@   L/home/kuhnn/.local/lib/python3.10/site-packages/TTS/config/shared_configs.pycheck_values   s6   zBaseAudioConfig.check_values)*__name__
__module____qualname____doc__r	   int__annotations__r
   r   r   r   r   strr   r   boolr   floatr   r   r   r   r   r   r   r   r    r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   rB   r@   r@   r@   rA   r      sF   
 jr   c                   @   s   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed< dZ
ee ed< dZeed	< dZeed
< dZeed< dZeed< dd ZdS )BaseDatasetConfiga  Base config for TTS datasets.

    Args:
        formatter (str):
            Formatter name that defines used formatter in ```TTS.tts.datasets.formatter```. Defaults to `""`.

        dataset_name (str):
            Unique name for the dataset. Defaults to `""`.

        path (str):
            Root path to the dataset files. Defaults to `""`.

        meta_file_train (str):
            Name of the dataset meta file. Or a list of speakers to be ignored at training for multi-speaker datasets.
            Defaults to `""`.

        ignored_speakers (List):
            List of speakers IDs that are not used at the training. Default None.

        language (str):
            Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to `""`.

        phonemizer (str):
            Phonemizer used for that dataset's language. By default it uses `DEF_LANG_TO_PHONEMIZER`. Defaults to `""`.

        meta_file_val (str):
            Name of the dataset meta file that defines the instances used at validation.

        meta_file_attn_mask (str):
            Path to the file that lists the attention mask files used with models that require attention masks to
            train the duration predictor.
     	formatterdataset_namepathmeta_file_trainNignored_speakerslanguage
phonemizermeta_file_valmeta_file_attn_maskc                 C   sR   t | }td|dd td|dd td|dd td|dd td|dd d	S )
r0   rN   Tr9   rP   rQ   rU   FrV   Nr<   r=   r@   r@   rA   rB      s   zBaseDatasetConfig.check_values)rC   rD   rE   rF   rN   rI   rH   rO   rP   rQ   rR   r   rS   rT   rU   rV   rB   r@   r@   r@   rA   rL      s   
 !rL   c                   @   sB   e Zd ZU dZdZeed< dZeed< dZ	eed< dZ
eed< dS )	BaseTrainingConfigu  Base config to define the basic 🐸TTS training parameters that are shared
    among all the models. It is based on ```Trainer.TrainingConfig```.

    Args:
        model (str):
            Name of the model that is used in the training.

        num_loader_workers (int):
            Number of workers for training time dataloader.

        num_eval_loader_workers (int):
            Number of workers for evaluation time dataloader.
    Nmodelr   num_loader_workersnum_eval_loader_workersFuse_noise_augment)rC   rD   rE   rF   rX   rI   rH   rY   rG   rZ   r[   rJ   r@   r@   r@   rA   rW      s   
 rW   N)dataclassesr   r   typingr   coqpitr   r   trainerr   r   rL   rW   r@   r@   r@   rA   <module>   s     78