o
    
jS\                     @   s   d dl mZ d dlmZmZ d dlZd dlZd dlZ	d dl
Z	d dlmZ d dlmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZ G dd deZdS )    )BytesIO)DictTupleN)StandardScaler)	amp_to_dbbuild_mel_basis
compute_f0	db_to_amp
deemphasisfind_endpointgriffin_limload_wavmel_to_specmillisec_to_lengthpreemphasisrms_volume_normspec_to_melstfttrim_silencevolume_normc                
   @   s<  e Zd ZdZ																											
										dKddZedLdMddZdejdejfddZ	dejdejfddZ
dedeejejejejef fddZdejdejdejdejddf
ddZd ejdejfd!d"Zd ejdejfd#d$Zd%ejdejfd&d'Zd%ejdejfd(d)Zd*ejdejfd+d,Zd-ejdejfd.d/Zd0ejdejfd1d2Zd3d4 Zd ejdejfd5d6ZdNd8ejdefd9d:Zd;d< Zed ejdejfd=d>ZdOd ejd?edejfd@dAZdOdBedCedejfdDdEZ dPd8ejdFedCeddfdGdHZ!dBedefdIdJZ"dS )QAudioProcessora  Audio Processor for TTS.

    Note:
        All the class arguments are set to default values to enable a flexible initialization
        of the class with the model config. They are not meaningful for all the arguments.

    Args:
        sample_rate (int, optional):
            target audio sampling rate. Defaults to None.

        resample (bool, optional):
            enable/disable resampling of the audio clips when the target sampling rate does not match the original sampling rate. Defaults to False.

        num_mels (int, optional):
            number of melspectrogram dimensions. Defaults to None.

        log_func (int, optional):
            log exponent used for converting spectrogram aplitude to DB.

        min_level_db (int, optional):
            minimum db threshold for the computed melspectrograms. Defaults to None.

        frame_shift_ms (int, optional):
            milliseconds of frames between STFT columns. Defaults to None.

        frame_length_ms (int, optional):
            milliseconds of STFT window length. Defaults to None.

        hop_length (int, optional):
            number of frames between STFT columns. Used if ```frame_shift_ms``` is None. Defaults to None.

        win_length (int, optional):
            STFT window length. Used if ```frame_length_ms``` is None. Defaults to None.

        ref_level_db (int, optional):
            reference DB level to avoid background noise. In general <20DB corresponds to the air noise. Defaults to None.

        fft_size (int, optional):
            FFT window size for STFT. Defaults to 1024.

        power (int, optional):
            Exponent value applied to the spectrogram before GriffinLim. Defaults to None.

        preemphasis (float, optional):
            Preemphasis coefficient. Preemphasis is disabled if == 0.0. Defaults to 0.0.

        signal_norm (bool, optional):
            enable/disable signal normalization. Defaults to None.

        symmetric_norm (bool, optional):
            enable/disable symmetric normalization. If set True normalization is performed in the range [-k, k] else [0, k], Defaults to None.

        max_norm (float, optional):
            ```k``` defining the normalization range. Defaults to None.

        mel_fmin (int, optional):
            minimum filter frequency for computing melspectrograms. Defaults to None.

        mel_fmax (int, optional):
            maximum filter frequency for computing melspectrograms. Defaults to None.

        pitch_fmin (int, optional):
            minimum filter frequency for computing pitch. Defaults to None.

        pitch_fmax (int, optional):
            maximum filter frequency for computing pitch. Defaults to None.

        spec_gain (int, optional):
            gain applied when converting amplitude to DB. Defaults to 20.

        stft_pad_mode (str, optional):
            Padding mode for STFT. Defaults to 'reflect'.

        clip_norm (bool, optional):
            enable/disable clipping the our of range values in the normalized audio signal. Defaults to True.

        griffin_lim_iters (int, optional):
            Number of GriffinLim iterations. Defaults to None.

        do_trim_silence (bool, optional):
            enable/disable silence trimming when loading the audio signal. Defaults to False.

        trim_db (int, optional):
            DB threshold used for silence trimming. Defaults to 60.

        do_sound_norm (bool, optional):
            enable/disable signal normalization. Defaults to False.

        do_amp_to_db_linear (bool, optional):
            enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True.

        do_amp_to_db_mel (bool, optional):
            enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.

        do_rms_norm (bool, optional):
            enable/disable RMS volume normalization when loading an audio file. Defaults to False.

        db_level (int, optional):
            dB level used for rms normalization. The range is -99 to 0. Defaults to None.

        stats_path (str, optional):
            Path to the computed stats file. Defaults to None.

        verbose (bool, optional):
            enable/disable logging. Defaults to True.

    NFnp.log10              reflectT<   c"           *      K   s  || _ || _|| _|| _|pd| _|| _|| _|
| _|| _|| _	|| _
|| _|| _|| _|p/d| _|| _|| _|| _t|| _|| _|d u rHdnt|| _|| _|| _|| _|| _|| _|| _|| _|| _| | _|dkrqtj | _!n|dkryd| _!nt"d|d u rt#| j| j| j d\| _$| _%n|| _%|	| _$|dksJ d	| j$| jksJ d
| j$ d| j t&| }#|!rt'd |#( D ]\}$}%t'd)|$|% qt*| j | j| j| j| jd| _+| r|r| ,| \}&}'}(})}"| -|&|'|(|) d| _d | _d | _d | _d S d S d S )Nr   g      ?znp.logr   
   z [!] unknown `log_func` value.)frame_length_msframe_shift_mssample_rater   z [!] min_level_db is 0z1 [!] win_length cannot be larger than fft_size -  vs z  > Setting up Audio Processor...z
 | > {}:{})r    fft_sizenum_melsmel_fmaxmel_fminT).r    resampler#   log_funcmin_level_dbr   r   ref_level_dbr"   powerr   griffin_lim_iterssignal_normsymmetric_normr%   r$   
pitch_fmin
pitch_fmaxfloat	spec_gainstft_pad_modemax_norm	clip_normdo_trim_silencetrim_dbdo_sound_normdo_amp_to_db_lineardo_amp_to_db_meldo_rms_normdb_level
stats_pathnpebase
ValueErrorr   
win_length
hop_lengthvarsprintitemsformatr   	mel_basis
load_statssetup_scaler)*selfr    r&   r#   r'   r(   r   r   rB   rA   r)   r"   r*   r   r,   r-   r3   r%   r$   r/   r.   r1   r2   r4   r+   r5   r6   r7   r8   r9   r:   r;   r<   verbose_memberskeyvaluemel_meanmel_stdlinear_mean
linear_std rT   L/home/kuhnn/.local/lib/python3.10/site-packages/TTS/utils/audio/processor.py__init__   s   &





zAudioProcessor.__init__configCoqpitc                 C   s.   d| v rt dd|i| jS t dd|i| S )NaudiorK   rT   )r   rY   )rW   rK   rT   rT   rU   init_from_config   s   zAudioProcessor.init_from_configSreturnc                 C   s   |  }| jrnt| dr2|jd | jkr| j|jjS |jd | jd kr.| j	|jjS t
d|| j8 }|| j | j  }| jr\d| j | | j }| jrZt|| j | j}|S | j| }| jrlt|d| j}|S |S )aF  Normalize values into `[0, self.max_norm]` or `[-self.max_norm, self.max_norm]`

        Args:
            S (np.ndarray): Spectrogram to normalize.

        Raises:
            RuntimeError: Mean and variance is computed from incompatible parameters.

        Returns:
            np.ndarray: Normalized spectrogram.
        
mel_scalerr      @ [!] Mean-Var stats does not match the given feature dimensions.)copyr,   hasattrshaper#   r]   	transformTr"   linear_scalerRuntimeErrorr)   r(   r-   r3   r4   r=   clip)rJ   r[   S_normrT   rT   rU   	normalize  s,   


zAudioProcessor.normalizec                 C   s   |  }| jrtt| dr2|jd | jkr| j|jjS |jd | jd kr.| j	|jjS t
d| jrX| jrBt|| j | j}|| j | j  d| j  | j }|| j S | jrct|d| j}|| j  | j | j }|| j S |S )a  Denormalize spectrogram values.

        Args:
            S (np.ndarray): Spectrogram to denormalize.

        Raises:
            RuntimeError: Mean and variance are incompatible.

        Returns:
            np.ndarray: Denormalized spectrogram.
        r]   r   r^   r_   )r`   r,   ra   rb   r#   r]   inverse_transformrd   r"   re   rf   r-   r4   r=   rg   r3   r(   r)   )rJ   r[   S_denormrT   rT   rU   denormalize,  s(   
"

zAudioProcessor.denormalizer<   c           
   	   C   s   t j|dd }|d }|d }|d }|d }|d }g d}| D ]&}	|	|v r,q%|	d	vrK||	 | j|	 ksKJ d
|	 d||	  d| j|	  q%|||||fS )a)  Loading mean and variance statistics from a `npy` file.

        Args:
            stats_path (str): Path to the `npy` file containing

        Returns:
            Tuple[np.array, np.array, np.array, np.array, Dict]: loaded statistics and the config used to
                compute them.
        T)allow_picklerP   rQ   rR   rS   audio_config)r+   r<   r5   r)   r*   )r    r6   z [!] Audio param z= does not match the value used for computing mean-var stats. r!   )r=   loaditemkeys__dict__)
rJ   r<   statsrP   rQ   rR   rS   stats_configskip_parametersrN   rT   rT   rU   rH   S  s    
zAudioProcessor.load_statsrP   rQ   rR   rS   c                 C   s0   t  | _| j|| t  | _| j|| dS )aW  Initialize scaler objects used in mean-std normalization.

        Args:
            mel_mean (np.ndarray): Mean for melspectrograms.
            mel_std (np.ndarray): STD for melspectrograms.
            linear_mean (np.ndarray): Mean for full scale spectrograms.
            linear_std (np.ndarray): STD for full scale spectrograms.
        N)r   r]   	set_statsre   )rJ   rP   rQ   rR   rS   rT   rT   rU   rI   o  s   zAudioProcessor.setup_scalerxc                 C   s   t || j dS )a=  Apply pre-emphasis to the audio signal. Useful to reduce the correlation between neighbouring signal values.

        Args:
            x (np.ndarray): Audio signal.

        Raises:
            RuntimeError: Preemphasis coeff is set to 0.

        Returns:
            np.ndarray: Decorrelated audio signal.
        rw   coef)r   rJ   rw   rT   rT   rU   apply_preemphasis  s   z AudioProcessor.apply_preemphasisc                 C   s   t || jdS )zReverse pre-emphasis.rx   )r
   r   rz   rT   rT   rU   apply_inv_preemphasis  s   z$AudioProcessor.apply_inv_preemphasisyc                 C   sj   | j dkr
| |}t|| j| j| j| jd}| jr'tt	
|| j| jd}nt	
|}| |t	jS )zCompute a spectrogram from a waveform.

        Args:
            y (np.ndarray): Waveform.

        Returns:
            np.ndarray: Spectrogram.
        r   r}   r"   rB   rA   pad_moderw   gainr?   )r   r{   r   r"   rB   rA   r2   r8   r   r=   absr1   r?   ri   astypefloat32rJ   r}   Dr[   rT   rT   rU   spectrogram  s   
	

zAudioProcessor.spectrogramc                 C   sl   | j dkr
| |}t|| j| j| j| jd}tt	|| j
d}| jr-t|| j| jd}| |tjS )z)Compute a melspectrogram from a waveform.r   r~   specrG   r   )r   r{   r   r"   rB   rA   r2   r   r=   r   rG   r9   r   r1   r?   ri   r   r   r   rT   rT   rU   melspectrogram  s   

zAudioProcessor.melspectrogramr   c                 C   sD   |  |}t|| j| jd}| || j }| jdkr | |S |S )z=Convert a spectrogram to a waveform using Griffi-Lim vocoder.r   r   )rl   r	   r1   r?   _griffin_limr*   r   r|   )rJ   r   r[   WrT   rT   rU   inv_spectrogram  s   
zAudioProcessor.inv_spectrogrammel_spectrogramc                 C   sR   |  |}t|| j| jd}t|| jd}| || j }| jdkr'| 	|S |S )z@Convert a melspectrogram to a waveform using Griffi-Lim vocoder.r   )melrG   r   )
rl   r	   r1   r?   r   rG   r   r*   r   r|   )rJ   r   r   r[   r   rT   rT   rU   inv_melspectrogram  s
   
z!AudioProcessor.inv_melspectrogramlinear_specc                 C   sP   |  |}t|| j| jd}tt|| jd}t|| j| jd}| 	|}|S )zConvert a full scale linear spectrogram output of a network to a melspectrogram.

        Args:
            linear_spec (np.ndarray): Normalized full scale linear spectrogram.

        Returns:
            np.ndarray: Normalized melspectrogram.
        r   r   )
rl   r	   r1   r?   r   r=   r   rG   r   ri   )rJ   r   r[   r   rT   rT   rU   out_linear_to_mel  s   
	
z AudioProcessor.out_linear_to_melc                 C   s   t || j| j| j| j| jdS )N)r   num_iterrB   rA   r"   r   )r   r+   rB   rA   r"   r2   )rJ   r[   rT   rT   rU   r     s   zAudioProcessor._griffin_limc              
   C   sV   t || j dkrtj|d| jd f| jd}t|| j| j| j| j| j	| jdd}|S )a  Compute pitch (f0) of a waveform using the same parameters used for computing melspectrogram.

        Args:
            x (np.ndarray): Waveform.

        Returns:
            np.ndarray: Pitch.

        Examples:
            >>> WAV_FILE = filename = librosa.example('vibeace')
            >>> from TTS.config import BaseAudioConfig
            >>> from TTS.utils.audio import AudioProcessor
            >>> conf = BaseAudioConfig(pitch_fmax=640, pitch_fmin=1)
            >>> ap = AudioProcessor(**conf)
            >>> wav = ap.load_wav(WAV_FILE, sr=ap.sample_rate)[:5 * ap.sample_rate]
            >>> pitch = ap.compute_f0(wav)
        r   r^   )modeT)rw   r/   r.   rB   rA   r    r2   center)
lenrB   r=   padr2   r   r/   r.   rA   r    )rJ   rw   f0rT   rT   rU   r     s   zAudioProcessor.compute_f0皙?wavc                 C   s   t || j| j|| j| jdS )a  Find the last point without silence at the end of a audio signal.

        Args:
            wav (np.ndarray): Audio signal.
            threshold_db (int, optional): Silence threshold in decibels. Defaults to -40.
            min_silence_sec (float, optional): Ignore silences that are shorter then this in secs. Defaults to 0.8.

        Returns:
            int: Last point without silence.
        )r   r6   r    min_silence_secr   r?   )r   r6   r    r1   r?   )rJ   r   r   rT   rT   rU   r   
  s   zAudioProcessor.find_endpointc                 C   s   t || j| j| j| jdS )z6Trim silent parts with a threshold and 0.01 sec margin)r   r    r6   rA   rB   )r   r    r6   rA   rB   )rJ   r   rT   rT   rU   r     s   zAudioProcessor.trim_silencec                 C   s
   t | dS )zNormalize the volume of an audio signal.

        Args:
            x (np.ndarray): Raw waveform.

        Returns:
            np.ndarray: Volume normalized waveform.
        rw   )r   r   rT   rT   rU   
sound_norm(  s   

zAudioProcessor.sound_normr;   c                 C   s   |du r| j }t||dS )zNormalize the volume based on RMS of the signal.

        Args:
            x (np.ndarray): Raw waveform.

        Returns:
            np.ndarray: RMS normalized waveform.
        N)rw   r;   )r;   r   )rJ   rw   r;   rT   rT   rU   r   4  s   	zAudioProcessor.rms_volume_normfilenamesrc                 C   s   |durt ||dd}n	t || j| jd}| jr0z| |}W n ty/   td|  Y nw | jr8| |}| j	rB| 
|| j}|S )a  Read a wav file using Librosa and optionally resample, silence trim, volume normalize.

        Resampling slows down loading the file significantly. Therefore it is recommended to resample the file before.

        Args:
            filename (str): Path to the wav file.
            sr (int, optional): Sampling rate for resampling. Defaults to None.

        Returns:
            np.ndarray: Loaded waveform.
        NT)r   r    r&   z* [!] File cannot be trimmed for silence - )r   r    r&   r5   r   r@   rD   r7   r   r:   r   r;   )rJ   r   r   rw   rT   rT   rU   r   B  s   
zAudioProcessor.load_wavpathc              	   C   s   | j r| || jd }n|dtdtt|  }|tj}|rBt }t	j
j||r0|n| j| |d |j|  t	j
j||rK|n| j| dS )a^  Save a waveform to a file using Scipy.

        Args:
            wav (np.ndarray): Waveform to save.
            path (str): Path to a output file.
            sr (int, optional): Sampling rate used for saving to the file. Defaults to None.
            pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe.
        i  g{Gz?r   N)r:   r   r;   maxr=   r   r   int16r   scipyiowavfilewriter    seekbufferread)rJ   r   r   r   pipe_outwav_norm
wav_bufferrT   rT   rU   save_wav]  s   	
 zAudioProcessor.save_wavc                 C   s   t j|dS )zwGet the duration of a wav file using Librosa.

        Args:
            filename (str): Path to the wav file.
        )r   )librosaget_duration)rJ   r   rT   rT   rU   r   s  s   zAudioProcessor.get_duration)!NFNr   NNNNNNr   Nr   NNNNNNNr   r   TNFr   FTTFNNT)T)rW   rX   )r   )N)NN)#__name__
__module____qualname____doc__rV   staticmethodrZ   r=   ndarrayri   rl   strr   arrayr   rH   rI   r{   r|   r   r   r   r   r   r   r   intr   r   r   r0   r   r   r   r   rT   rT   rT   rU   r       s    n
o)('

$
r   ) r   r   typingr   r   r   numpyr=   scipy.io.wavfiler   scipy.signalTTS.tts.utils.helpersr    TTS.utils.audio.numpy_transformsr   r   r   r	   r
   r   r   r   r   r   r   r   r   r   r   r   objectr   rT   rT   rT   rU   <module>   s    H