o
    
j<                     @   s  d dl mZ d dlmZ d dlZd dlZd dlZd dlZ	d dlm
Z
mZ dddddddededed	ed
edejfddZdddddedededeeef fddZdd Zdd ZdddddejdededejfddZdddddejdededejfdd Zd!d"dejd#edejfd$d%Zdd!d&dejd#edejfd'd(Zdd)d*ejd+ejdejfd,d-Zddd.d/ejd+ejdejfd0d1Zdd2d3ejdejfd4d5Zddd6d3ejdejfd7d8Zd9d:d*ejd;edejfd<d=Zdd9d>d/ejd;edejfd?d@ZdddddAdBdCdDdEejdedFedGedHedIedJedejfdKdLZddddBdCdMdEejdFedGedIedJedejfdNdOZ ddPdQd*ejdejfdRdSZ!dddTdUdejdFedVedeeef fdWdXZ"dddddddAdCdYdejdZed[edFedGeded\edJedejfd]d^Z#dEejdejfd_d`Z$ddaddbdddcd3ejddededededefdedfZ%ddddddgd3ejdeddedGedFedejfdhdiZ&ddjd&dejd#edejfdkdlZ'ddmdnd3ejdoedejfdpdqZ(dmdrdejdoedejfdsdtZ)ddTdudvededwedejfdxdyZ*dddzd3ejd{ededdfd|d}Z+d3ejd~edejfddZ,d~edejfddZ-dejdejfddZ.dejdedejfddZ/dejfddZ0dS )    )BytesIO)TupleN)magphasepyin)sample_ratefft_sizenum_melsmel_fmaxmel_fminr   r   r   r	   r
   returnc                 K   s>   |dur|| d ksJ || dksJ t jj| ||||dS )zXBuild melspectrogram basis.

    Returns:
        np.ndarray: melspectrogram basis.
    N   r   )srn_fftn_melsfminfmax)librosafiltersmel)r   r   r   r	   r
   kwargs r   S/home/kuhnn/.local/lib/python3.10/site-packages/TTS/utils/audio/numpy_transforms.pybuild_mel_basis   s   r   )frame_length_msframe_shift_msr   r   r   c                 K   s@   | | }|  sJ dt| d | }t|t| }||fS )zCompute hop and window length from milliseconds.

    Returns:
        Tuple[int, int]: hop length and window length for STFT.
    z1 [!] frame_shift_ms should divide frame_length_msg     @@)
is_integerintfloat)r   r   r   r   factor
win_length
hop_lengthr   r   r   millisec_to_length"   s
   r!   c                 C   s   |dkr	t | S t | S N
   )nplog10logxbaser   r   r   _log1   s   

r*   c                 C   s   |dkr
t d| S t | S r"   )r$   powerexpr'   r   r   r   _exp7   s   
r-      r#   r(   gainr)   r(   r0   r)   c                 K   s.   | dk   dksJ d|ttd| | S )a  Convert amplitude values to decibels.

    Args:
        x (np.ndarray): Amplitude spectrogram.
        gain (float): Gain factor. Defaults to 1.
        base (int): Logarithm base. Defaults to 10.

    Returns:
        np.ndarray: Decibels spectrogram.
    r   ' [!] Input values must be non-negative.g:0yE>)sumr*   r$   maximumr(   r0   r)   r   r   r   r   	amp_to_db=   s   r5   c                 K   s   t | | |S )a  Convert decibels spectrogram to amplitude spectrogram.

    Args:
        x (np.ndarray): Decibels spectrogram.
        gain (float): Gain factor. Defaults to 1.
        base (int): Logarithm base. Defaults to 10.

    Returns:
        np.ndarray: Amplitude spectrogram.
    )r-   r4   r   r   r   	db_to_ampM   s   r6   g
ףp=
?)coefr7   c                 K   s(   |dkrt dtjd| gdg| S )a!  Apply pre-emphasis to the audio signal. Useful to reduce the correlation between neighbouring signal values.

    Args:
        x (np.ndarray): Audio signal.

    Raises:
        RuntimeError: Preemphasis coeff is set to 0.

    Returns:
        np.ndarray: Decorrelated audio signal.
    r    [!] Preemphasis is set 0.0.r.   RuntimeErrorscipysignallfilterr(   r7   r   r   r   r   preemphasis[   s   r?   )r(   r7   c                 K   s(   |dkrt dtjdgd| g| S )zReverse pre-emphasis.r   r8   r.   r9   r>   r   r   r   
deemphasisl   s   r@   )	mel_basisspecrA   c                 K   s   t || S )a  Convert a full scale linear spectrogram output of a network to a melspectrogram.

    Args:
        spec (np.ndarray): Normalized full scale linear spectrogram.

    Shapes:
        - spec: :math:`[C, T]`

    Returns:
        np.ndarray: Normalized melspectrogram.
    )r$   dot)rB   rA   r   r   r   r   spec_to_mels   s   rD   r   rA   r   c                 K   s8   | dk   dksJ dtj|}tdt|| S )z3Convert a melspectrogram to full scale spectrogram.r   r1   g|=)r2   r$   linalgpinvr3   rC   )r   rA   r   inv_mel_basisr   r   r   mel_to_spec   s   rI   )wavrJ   c                 K   s(   t dd| i|}t|}|tjS )zCompute a spectrogram from a waveform.

    Args:
        wav (np.ndarray): Waveform. Shape :math:`[T_wav,]`

    Returns:
        np.ndarray: Spectrogram. Shape :math:`[C, T_spec]`. :math:`T_spec == T_wav / hop_length`
    yNr   )stftr$   absastypefloat32)rJ   r   DSr   r   r   wav_to_spec   s   	
rR   )rJ   rA   c                 K   s8   t dd| i|}tdt||d|}|tjS )z)Compute a melspectrogram from a waveform.rK   )rB   rA   Nr   )rL   rD   r$   rM   rN   rO   )rJ   rA   r   rP   rQ   r   r   r   
wav_to_mel   s   rS   g      ?)r+   r+   c                 K   s   |   }tdd|| i|S )z=Convert a spectrogram to a waveform using Griffi-Lim vocoder.rB   Nr   )copygriffin_lim)rB   r+   r   rQ   r   r   r   spec_to_wav   s   rV   )r   r+   c                 K   s.   |   }t||d d}tdd|| i|S )z@Convert a melspectrogram to a waveform using Griffi-Lim vocoder.rA   rE   rB   Nr   )rT   rI   rU   )r   r+   r   rQ   r   r   r   
mel_to_wav   s   rW   reflecthannT)rK   r   r    r   pad_modewindowcenterrK   r    r   rZ   r[   r\   c              	   K   s   t j| ||||||dS )zLibrosa STFT wrapper.

    Check http://librosa.org/doc/main/generated/librosa.stft.html argument details.

    Returns:
        np.ndarray: Complex number array.
    )rK   r   r    r   rZ   r[   r\   )r   rL   )rK   r   r    r   rZ   r[   r\   r   r   r   r   rL      s   rL   )rK   r    r   r[   r\   c                 K   s   t j| ||||dS )zLibrosa iSTFT wrapper.

    Check http://librosa.org/doc/main/generated/librosa.istft.html argument details.

    Returns:
        np.ndarray: Complex number array.
    )r    r   r\   r[   )r   istft)rK   r    r   r[   r\   r   r   r   r   r]      s   r]   <   )rB   num_iterc              
   K   s   t dt j t jj| j  }t | t}t	dd|| i|}t 
| s3td t dgS t|D ]}t dt tdd|i| }t	dd|| i|}q7|S )Ny               @rK   z8 [!] Waveform is not finite everywhere. Skipping the GL.        y              ?r   )r$   r,   pirandomrandshaperM   rN   complexr]   isfiniteallprintarrayrangeanglerL   )rB   r_   r   angles	S_complexrK   _r   r   r   rU      s   "rU   F)r(   r    pad_two_sidesro   c                 K   sD   | j d | d | | j d  }|sd|fS |d |d |d  fS )zCompute paddings used by Librosa's STFT. Compute right padding (final frame) or both sides padding
    (first and final frames)r   r.   r   )rd   )r(   r    ro   r   padr   r   r   compute_stft_paddings   s    rq   )r(   
pitch_fmax
pitch_fminr    r   r   stft_pad_moder\   rr   rs   rt   c                 K   s   |dusJ d|dusJ dt di d| tjd|d|d|d|d	|d
 d|d|d|dddddd
dddddddd\}	}
}d|	|
 < |	S )a  Compute pitch (f0) of a waveform using the same parameters used for computing melspectrogram.

    Args:
        x (np.ndarray): Waveform. Shape :math:`[T_wav,]`
        pitch_fmax (float): Pitch max value.
        pitch_fmin (float): Pitch min value.
        hop_length (int): Number of frames between STFT columns.
        win_length (int): STFT window length.
        sample_rate (int): Audio sampling rate.
        stft_pad_mode (str): Padding mode for STFT.
        center (bool): Centered padding.

    Returns:
        np.ndarray: Pitch. Shape :math:`[T_pitch,]`. :math:`T_pitch == T_wav / hop_length`

    Examples:
        >>> WAV_FILE = filename = librosa.example('vibeace')
        >>> from TTS.config import BaseAudioConfig
        >>> from TTS.utils.audio import AudioProcessor
        >>> conf = BaseAudioConfig(pitch_fmax=640, pitch_fmin=1)
        >>> ap = AudioProcessor(**conf)
        >>> wav = ap.load_wav(WAV_FILE, sr=ap.sample_rate)[:5 * ap.sample_rate]
        >>> pitch = ap.compute_f0(wav)
    Nz1 [!] Set `pitch_fmax` before caling `compute_f0`.z1 [!] Set `pitch_fmin` before caling `compute_f0`.rK   r   r   r   frame_lengthr   r   r    rZ   r\   n_thresholdsd   beta_parameters)r      boltzmann_parameter
resolutiong?max_transition_rateg(\A@switch_prob{Gz?no_trough_probr`   r   )r   rN   r$   double)r(   rr   rs   r    r   r   rt   r\   r   f0voiced_maskrn   r   r   r   
compute_f0   sJ   $	


r   c                 K   s:   t dd| i|}t|\}}ttj|d dd}|S )ax  Compute energy of a waveform using the same parameters used for computing melspectrogram.
    Args:
      x (np.ndarray): Waveform. Shape :math:`[T_wav,]`
    Returns:
      np.ndarray: energy. Shape :math:`[T_energy,]`. :math:`T_energy == T_wav / hop_length`
    Examples:
      >>> WAV_FILE = filename = librosa.example('vibeace')
      >>> from TTS.config import BaseAudioConfig
      >>> from TTS.utils.audio import AudioProcessor
      >>> conf = BaseAudioConfig()
      >>> ap = AudioProcessor(**conf)
      >>> wav = ap.load_wav(WAV_FILE, sr=ap.sample_rate)[:5 * ap.sample_rate]
      >>> energy = ap.compute_energy(wav)
    rK   r   r   )axisNr   )rL   r   r$   sqrtr2   )rK   r   r(   magrn   energyr   r   r   compute_energy2  s   r   ig?)rJ   trim_dbr   min_silence_secr0   r)   r   c                 K   sp   t || }t |d }t| ||d}	t|t| | |D ]}
t| |
|
|  |	k r3|
|   S qt| S )aA  Find the last point without silence at the end of a audio signal.

    Args:
        wav (np.ndarray): Audio signal.
        threshold_db (int, optional): Silence threshold in decibels. Defaults to -40.
        min_silence_sec (float, optional): Ignore silences that are shorter then this in secs. Defaults to 0.8.
        gian (float, optional): Gain to be used to convert trim_db to trim_amp. Defaults to None.
        base (int, optional): Base of the logarithm used to convert trim_db to trim_amp. Defaults to 10.

    Returns:
        int: Last point without silence.
       r/   )r   r6   rj   lenr$   max)rJ   r   r   r   r0   r)   r   window_lengthr    	thresholdr(   r   r   r   find_endpointH  s   r   )rJ   r   r   r   r    c                 K   s2   t |d }| ||  } tjj| |||dd S )z6Trim silent parts with a threshold and 0.01 sec marginr~   )top_dbru   r    r   )r   r   effectstrim)rJ   r   r   r   r    r   marginr   r   r   trim_silenceg  s   
r   gffffff?c                 K   s   | t |   | S )zNormalize the volume of an audio signal.

    Args:
        x (np.ndarray): Raw waveform.
        coef (float): Coefficient to rescale the maximum value. Defaults to 0.95.

    Returns:
        np.ndarray: Volume normalized waveform.
    )rM   r   r>   r   r   r   volume_normv  s   
r   g      ;rJ   db_levelr   c                 K   s8   d|d  }t t| |d  t | d  }| | S )Nr#      r   )r$   r   r   r2   )rJ   r   r   rar   r   r   rms_norm  s   $r   )r   c                 K   s4   d|  krdksJ d J dt | |d}|S )zNormalize the volume based on RMS of the signal.

    Args:
        x (np.ndarray): Raw waveform.
        db_level (float): Target dB level in RMS. Defaults to -27.0.

    Returns:
        np.ndarray: RMS normalized waveform.
    ir   z) [!] db_level should be between -99 and 0r   )r   )r(   r   r   rJ   r   r   r   rms_volume_norm  s   $
r   )r   resamplefilenamer   c                 K   s,   |rt j| |d\}}|S t| \}}|S )a  Read a wav file using Librosa and optionally resample, silence trim, volume normalize.

    Resampling slows down loading the file significantly. Therefore it is recommended to resample the file before.

    Args:
        filename (str): Path to the wav file.
        sr (int, optional): Sampling rate for resampling. Defaults to None.
        resample (bool, optional): Resample the audio file when loading. Slows down the I/O time. Defaults to False.

    Returns:
        np.ndarray: Loaded waveform.
    )r   )r   loadsfread)r   r   r   r   r(   rn   r   r   r   load_wav  s
   r   )r   pipe_outpathc              	   K   sv   | dt dt t|   }|tj}|r0t }tjj	||| |
d |j	|  tjj	||| dS )am  Save float waveform to a file using Scipy.

    Args:
        wav (np.ndarray): Waveform with float values in range [-1, 1] to save.
        path (str): Path to a output file.
        sr (int, optional): Sampling rate used for saving to the file. Defaults to None.
        pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe.
      r~   r   N)r   r$   rM   rN   int16r   r;   iowavfilewriteseekbufferr   )rJ   r   r   r   r   wav_norm
wav_bufferr   r   r   save_wav  s   	
r   mulaw_qcc                 K   sZ   d| d }t | t d|t |    t d|  }|d d | d }t |S )Nr   r.         ?g      ?)r$   signr&   rM   floor)rJ   r   r   mur<   r   r   r   mulaw_encode  s   0r   c                 K   s4   d| d }t | | d| t |  d  }|S )z(Recovers waveform from quantized values.r   r.   )r$   r   rM   )rJ   r   r   r   r(   r   r   r   mulaw_decode  s   $r   c                 K   s   t | d ddt jS )Ni   i r   )r$   cliprN   r   )r(   r   r   r   r   encode_16bits  s   r   quantize_bitsc                 K   s   | d d| d  d S )a	  Quantize a waveform to a given number of bits.

    Args:
        x (np.ndarray): Waveform to quantize. Must be normalized into the range `[-1, 1]`.
        quantize_bits (int): Number of quantization bits.

    Returns:
        np.ndarray: Quantized waveform.
    r   r   r.   r   r(   r   r   r   r   r   quantize  s   
r   c                 K   s   d|  d| d  d S )z4Dequantize a waveform from the given number of bits.r   r.   r   r   r   r   r   
dequantize  s   r   )1r   r   typingr   r   numpyr$   r;   	soundfiler   r   r   r   ndarrayr   r!   r*   r-   r   r5   r6   r?   r@   rD   rI   rR   rS   rV   rW   strboolrL   r]   rU   rq   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   <module>   s^   


(( ""$ "





	
>	
!
"" $$	