o
    
jZf                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZmZ d dlZd dlZ	d dl
Z
d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ e eZdd Z dd Z!dd Z"g fdee# fddZ$dd Z%g fde#dee# fddZ&d7ddZ'd8dd Z(d9d"d#Z)d$d% Z*		&			'	(		)		)d:d*d+Z+d,d- Z,		&			'	.	/		)d;d0d1Z-		2	)	d<d3d4Z.d5d6 Z/dS )=    N)glob)DictList)convert_audio)softmax)
functional)HubertManager)CustomHubert)HubertTokenizer)clear_cuda_cacheinference_modec                 C   s   | j |ddS )NF)add_special_tokens)encode)	tokenizertext r   V/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/layers/bark/inference_funcs.py	_tokenize   s   r   c                 C   s
   |  |S N)decode)r   enc_textr   r   r   _detokenize   s   
r   c                 C   s   t dd|  S )Nz\s+ )resubstrip)r   r   r   r   _normalize_whitespace    s   r   extra_voice_dirsc                 C   s   | }i }|D ]@}t |}|D ]6}t j||}t j|rEtt| d||< t|| dkrEtt| dtt| d ||< qq|S )Nz/*.npzr   z/*.wavz/*.mp3)oslistdirpathjoinisdirlistr   len)r   dirsvoicesdsubsr   subjr   r   r   
get_voices$   s   
(r*   c                 C   s,   t | }|d }|d }|d }|||fS )Nsemantic_promptcoarse_promptfine_prompt)npload)npz_file	x_historysemanticcoarsefiner   r   r   load_npz3   s
   

r5   voicec           	   
   C   s   |dkrdS t |}|| }t|dkrtd| d| z|| }W n ty; } ztd| d| |d }~ww t|dkrO|d drOt|d S |d }tj|d d }t	|| |d	 t
| ||S )
Nrandom)NNN   zVoice z has multiple paths: z not found in r   z.npz)audiomodeloutput_path)r*   r$   
ValueErrorKeyErrorendswithr5   r   r    splitextgenerate_voice
load_voice)	r:   r6   r   r&   pathsr    e
audio_pathr;   r   r   r   rA   ;   s$   rA         c              	   C   s@   t t t t | d }dtt| | |  }|| S )N   r8   )r.   sumabsdiffsignintr$   )r9   frame_length
hop_lengthzero_crossingstotal_framesr   r   r   zero_crossing_rateU   s    rQ            i@c                 C   s   t jj| |||d}t|S )N)ysrn_bandsfmin)librosafeaturespectral_contrastr.   mean)
audio_datasample_raterV   rW   rZ   r   r   r   compute_spectral_contrast[   s   
r^      c                 C   sX   t | }t|d }t j||jd d}||k}|t|t|jd   }|S )NrG   r   )rU   n_fftr8   )	rX   stftr.   rI   fft_frequenciesshapeix_aranger[   )r\   r]   max_bass_freqra   power_spectrogramfrequencies	bass_maskbass_energyr   r   r   compute_average_bass_energy`   s   
 rk   c                 C   s:  t | tr t| \} }t| ||jj|jj} | 	d
|j} t  |j| }W d   n1 s5w   Y  tjdd |D dd }|  }t }|j|jjd d t|jjd	 d

|j}tj|jjd |jd}|j| d |jjd}	||	}
|
  }
tj|||ddddf |
d dS )ae  Generate a new voice from a given audio and text prompt.

    Args:
        audio (np.ndarray): The audio to use as a base for the new voice.
        text (str): Transcription of the audio you are clonning.
        model (BarkModel): The BarkModel to use for generating the new voice.
        output_path (str): The path to save the generated voice to.
    r   Nc                 S   s   g | ]}|d  qS )r   r   ).0encodedr   r   r   
<listcomp>}   s    z"generate_voice.<locals>.<listcomp>dimhubert_tokenizer)
model_pathhubert)checkpoint_path)map_location)input_sample_hzrG   )r-   r,   r+   )
isinstancestr
torchaudior/   r   configr]   encodecchannels	unsqueezetodevicetorchno_gradr   catsqueezecpunumpyr   make_sure_tokenizer_installedLOCAL_MODEL_PATHSr	   r
   load_from_checkpointforward	get_tokenr.   savez)r9   r:   r;   rU   encoded_framescodeshubert_managerhubert_modelr   semantic_vectorssemantic_tokensr   r   r   r@   i   s&   


&r@   ffffff?F皙?Tc           &   
   K   s  t | tsJ t| } t|  dksJ tdd |D s"|
durX|dur*|d }|
dur2|
d }t |tjrUt|jdkrUt|dkrU|	 dkrU|
 |jjd ksWJ nd}tt|j| |jj }t|dkrtt|d t| d d}td| d	 |dd }tj|ddt| f|jjd
d}|dur|tj}|dd }tj|ddt| f|jjd
d}n
t|jjgd }tt||t|jjggtjd }|jd dksJ t n ||j}d}t j |dd}d}d}d}t!|D ]9}|r|dur|dddgf }n|}|j"|d||d\}}|ddd|jjf }|	rEt||dd|jjgf f}|dur|j}|# }|$ % #tj&' }t(|ddd }|| }t)t*|}||k}|dd + |dd< d|d< tj, ||| < t|}||#|}|durt-|t	||.d\} }!t/d ||| d k < tj*|| dd}"tj0|"dd}#|	r|#|jjks|dur|"d |kr|1d|   nYtj2||#d fdd}|d|jj3 7 }|dur||kr|1d|   n3||d kr|1d|   n#~~~"~#t	dt4td| | g}$|$|kr=|1|$|  |$}q|5  |$ % ' 6 dd }%W d   n	1 s^w   Y  t|%dkrst|%|jjk suJ t7  |%S )a  Generate semantic tokens from text.

    Args:
        text (str): The text to generate semantic tokens from.
        model (BarkModel): The BarkModel to use for generating the semantic tokens.
        history_prompt (tuple): A tuple of (semantic_history, coarse_history, fine_history) to use as a prompt for the generation.
        temp (float): The temperature to use for the generation.
        top_k (int): The number of top tokens to consider for the generation.
        top_p (float): The cumulative probability to consider for the generation.
        silent (bool): Whether to silence the tqdm progress bar.
        min_eos_p (float): The minimum probability to consider for the end of sentence token.
        max_gen_duration_s (float): The maximum duration in seconds to generate for.
        allow_early_stop (bool): Whether to allow the generation to stop early.
        base (tuple): A tuple of (semantic_history, coarse_history, fine_history) to use as a base for the generation.
        use_kv_caching (bool): Whether to use key-value caching for the generation.
        **kwargs: Additional keyword arguments. They are ignored.

    Returns:
        np.ndarray: The generated semantic tokens.
    r   c                 s       | ]}|d uV  qd S r   r   rl   vr   r   r   	<genexpr>       z)generate_text_semantic.<locals>.<genexpr>Nr8      d   z(warning, text too long, lopping of last %constant)constant_valuesmodei i     )disabletotalro   T)merge_context	use_cachepast_kvFInfrp   num_samples)8rx   ry   r   r$   r   allr.   ndarrayrc   minmaxr{   SEMANTIC_VOCAB_SIZEarrayr   r   TEXT_ENCODING_OFFSETroundloggerwarningpadTEXT_PAD_TOKENastypeint64SEMANTIC_PAD_TOKENr   
from_numpyhstackSEMANTIC_INFER_TOKENr   r   r   tqdmrangesemantic_modeltypedetachr   float32r   argsortcumsumr   copyinftopksizefloatmultinomialupdater   SEMANTIC_RATE_HZrL   closer   r   )&r   r:   history_prompttemptop_ktop_psilent	min_eos_pmax_gen_duration_sallow_early_stopbaseuse_kv_cachingkwargssemantic_historyencoded_textpxn_tot_stepspbar
pbar_statetot_generated_duration_skv_cachenx_inputlogitsrelevant_logitslogits_devicelogits_dtypesorted_indicessorted_logitscumulative_probssorted_indices_to_remover   _probs	item_nextreq_pbar_stateoutr   r   r   generate_text_semantic   s   #
"




&
$=r   c                 C   sb   t | jdks	J |  } |d ur*td| jd D ]}| |d d f  || 7  < q| d}|S )NrG   r8   r   F)r$   rc   r   r   ravel)arroffset_sizer   flat_arrr   r   r   _flatten_codebooks)  s   
r   v  <   c           /      C   s  t | tjr#t| jdkr#t| dkr#|  dkr#|  |jjd ks%J d|  kr0dks3J  J || dks;J |jj	|jj
 |jj }tt|| }tdd |D s^|	dur(|durl|}|d }|d }|	durx|	d }|	d }t |tjrt|jdkrt|dkr| dkr| |jjd krt |tjrt|jd	kr|jd |jjkr|jd
 dkr| dkr| |jjd krt|jd
 t| dt||jj dksJ t||jj|jj }t|t|t|d	  ttt|| g}tt|| }|| d tj}|| d tj}|dd }ntjg tjd}tjg tjd}tttt| | |jj |jj }|dkr[||jj dks]J t|| gtj} |tj}t|}t  t| d |j}t|d |j}tt|| }d}tjt|||dD ]_}|tt||  }|ddtd|| gdf }|ddddf }t|dd|jd
  fd|jj }t|t!|jj"gd |j|dd| df g}d}t|D ] }||krq||jj dk}|
r"|dur"|ddd
gf }n|}|j#||
|d\}}|jjdt| |jj  } |jjd	t| |jj  }!|dd| |!f }"|dur|"j}#|"$ }$|"% & $tj'( }"t)|"ddd
 }%|"|% }&t*tj+j,-|&}'|'|k}(|(dd
 . |(dd< d|(d< tj/ |"|%|( < t|"}"|"|#$|$}"|durt0|"t||"1d
\})}t2d |"|"|)d
 k < tj+j,j-|"| d
d}*tj3|*dd}+|+| 7 }+tj4||+d fdd}tj4||+d fdd}~~"~*~+|d7 }q~q~W d   n	1 sw   Y  |% & ( 5 t|d },~t|,|ks.J |,6d
|jjj7|jj }-td|jjD ]}.|-|.ddf  |.|jj 8  < qBt8  |-S )ab  Generate coarse audio codes from semantic tokens.

    Args:
        x_semantic (np.ndarray): The semantic tokens to generate coarse audio codes from.
        model (BarkModel): The BarkModel to use for generating the coarse audio codes.
        history_prompt (tuple): A tuple of (semantic_history, coarse_history, fine_history) to use as a prompt for the generation.
        temp (float): The temperature to use for the generation.
        top_k (int): The number of top tokens to consider for the generation.
        top_p (float): The cumulative probability to consider for the generation.
        silent (bool): Whether to silence the tqdm progress bar.
        max_coarse_history (int): The maximum number of coarse audio codes to use as history.
        sliding_window_len (int): The length of the sliding window to use for the generation.
        base (tuple): A tuple of (semantic_history, coarse_history, fine_history) to use as a base for the generation.
        use_kv_caching (bool): Whether to use key-value caching for the generation.

    Returns:
        np.ndarray: The generated coarse audio codes.
    r8   r   r   r   r   c                 s   r   r   r   r   r   r   r   r   _  r   z"generate_coarse.<locals>.<genexpr>NrG   ro   dtype)r   r   r   r   )r   r   Fr   rp   r   )9rx   r.   r   r$   rc   r   r   r{   r   COARSE_RATE_HZr   N_COARSE_CODEBOOKSrL   floorr   CODEBOOK_SIZEr   r   r   int32r   r   r   r   r   r   r   ceilr   r   r   r   COARSE_SEMANTIC_PAD_TOKENtensorCOARSE_INFER_TOKENcoarse_modelr   r   r   r   r   r   r   nnr   r   r   r   r   r   r   r   r   r   reshapeTr   )/
x_semanticr:   r   r   r   r   r   max_coarse_historysliding_window_lenr   r   semantic_to_coarse_ratiomax_semantic_historyr1   x_semantic_historyx_coarse_historyn_semantic_hist_providedn_coarse_hist_providedn_stepsx_coarsebase_semantic_idxx_semantic_inx_coarse_inn_window_stepsn_stepr   semantic_idxx_inr   is_major_stepr   r   logit_start_idxlogit_end_idxr   r   r   r   r   r   r   r   r   r   gen_coarse_arrgen_coarse_audio_arrr   r   r   r   generate_coarse3  s  
 

 
"



 B$r        ?c              	      s<  t | tjr8t| jdkr8d| jd   kr|jjd kr8n J | jd dkr8|  dkr8|  |jj	d ks:J t
dd |D sG|dur|durO|d }|durW|d }t |tjrt|jdkr|jd |jjkr|jd dkr| dkr| |jj	d ksJ nd}| jd }t| t|jj| | jd f|jj	 gtj}|dur|tj}t|ddddf tj|g}|ddddf jd }	nd}	d}
|jd dk rd|jd  }
t|tj|jj|
ftjd	|jj	 g}tdtt| jd d|	  d
 gd }t  t|j|j}tjt||dD ]}t|d
 |jd d g}t|	|d
  |jd d
 g}|| }|||d ddf d }t||jjD ]R}|||}|du r|d|dd|jj	f }t|d}n%|dddd|jj	f | }tj|dd t fddt|dD }||d|d|f< ~~qbt||jjD ]}|d|d|f |||d|  |f< q~q)|    ! j}~W d   n	1 sw   Y  |dd|	df }|
dkr|ddd|
 f }|jd | jd ksJ t"  |S )a  Generate full audio codes from coarse audio codes.

    Args:
        x_coarse_gen (np.ndarray): The coarse audio codes to generate full audio codes from.
        model (BarkModel): The BarkModel to use for generating the full audio codes.
        history_prompt (tuple): A tuple of (semantic_history, coarse_history, fine_history) to use as a prompt for the generation.
        temp (float): The temperature to use for the generation.
        silent (bool): Whether to silence the tqdm progress bar.
        base (tuple): A tuple of (semantic_history, coarse_history, fine_history) to use as a base for the generation.

    Returns:
        np.ndarray: The generated full audio codes.
    rG   r8   r   c                 s   r   r   r   r   r   r   r   r     r   z generate_fine.<locals>.<genexpr>Ni rE   r   rF   )r   ro   rp   c                    s   g | ]}t j | d dqS )r8   r   )r   r   )rl   r   r   r   r   rn   B  s    z!generate_fine.<locals>.<listcomp>)#rx   r.   r   r$   rc   r{   N_FINE_CODEBOOKSr   r   r   r   vstackzerosr   r   r   rL   r   r   r   r  r  r   r   r   r   
fine_modelargmaxr   r   r   r   r   r   r   )x_coarse_genr:   r   r   r   r   x_fine_historyn_coarsein_arr	n_historyn_remove_from_endn_loopsr   	start_idxstart_fill_idxrel_start_fill_idx	in_bufferr  r   r   codebook_predsgen_fine_arrr   r!  r   generate_fine  s   
$

. 

r4  c                 C   sX   t | d }||j}|dd}|jj|}|j|}|	 
   }|S )z:Turn quantized audio codes into audio array using encodec.Nr   r8   )r   r   r   r   	transposer|   	quantizerr   decoderr   r   r   r   )fine_tokensr:   r   embr   	audio_arrr   r   r   codec_decodeV  s   r;  )rE   rF   )rR   rS   )r_   )
Nr   NNFr   NTNT)	Nr   NNFr   r   NT)Nr   TN)0loggingr   r   r   typingr   r   rX   r   r.   r   rz   r   encodec.utilsr   scipy.specialr   torch.nnr   r   )TTS.tts.layers.bark.hubert.hubert_managerr   (TTS.tts.layers.bark.hubert.kmeans_hubertr	   $TTS.tts.layers.bark.hubert.tokenizerr
   TTS.tts.layers.bark.load_modelr   r   	getLogger__name__r   r   r   r   ry   r*   r5   rA   rQ   r^   rk   r@   r   r   r  r4  r;  r   r   r   r   <module>   st    



	2
 
 5
r