o
    
j%                     @   s  d Z ddlZddlZddlZddlZddlmZ ddlmZ ddl	m
Z
 ddlmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ej Zd0ddZdd Zdd Ze  		d1ddZ!	d2ddZ"dd Z#e$dkre% Z&e&j'de(ddd e&j'd e(d!dd e&j'd"e(d#dd e&j'd$dd%d&d' e&j'd(dd%d)d' e&j'd*e)dd+d, e&j'd-e*d.dd/ e&+ Z,e
e,j-Z.de.j/_0e#e, dS dS )3z.Extract Mel spectrograms with teacher forcing.    N)
DataLoader)tqdm)load_config)
TTSDatasetload_tts_samples)setup_model)SpeakerManager)TTSTokenizer)AudioProcessor)quantize)count_parametersFc              
   C   s   t t\}}tdi d|dddtd|d| ddd	tjd
tjdtjdtjdtj	ddddd|dtj
r@tjnd dtjrItjnd }tjrYtjrY|tj |  t|tjd|jdd tjdd}|S )Noutputs_per_stepcompute_linear_specFsamples	tokenizerapbatch_group_sizer   min_text_lenmax_text_lenmin_audio_lenmax_audio_lenphoneme_cache_pathprecompute_num_workersuse_noise_augmentverbosespeaker_id_mappingd_vector_mapping)
batch_sizeshuffle
collate_fn	drop_lastsamplernum_workers
pin_memory )r	   init_from_configcr   	meta_datar   r   r   r   r   use_speaker_embeddingspeaker_manager
name_to_iduse_d_vector_file
embeddingsuse_phonemescompute_input_seq_cachecompute_input_seqnum_loader_workerspreprocess_samplesr   r   r   )r   rr   r   _datasetloaderr$   r$   S/home/kuhnn/.local/lib/python3.10/site-packages/TTS/bin/extract_tts_spectrograms.pysetup_loader   sd   	

r7   c                 C   s   t j| }|dd }t jt j|ddd t jt j|ddd t jt j|ddd t jt j|ddd t j|d|}t j|d|}t j|d|d	 }t j|d|d	 } ||||| fS )
N.r   quantT)exist_okmelwav_glwavz.wav)ospathbasenamesplitmakedirsjoin)wav_pathout_pathwav_file	file_name	wavq_pathmel_pathwav_gl_pathr$   r$   r6   set_filename?   s   rK   c              
   C   s   | d }| d }| d }| d }| d }| d }| d }| d }t | }	t | }
trf|jd	d
}|jd	d
}|jd	d
}|jd	d
}|d urR|jd	d
}|d ur\|jd	d
}|d urf|jd	d
}|||||||	|
||f
S )Ntoken_idtoken_id_lengthsr;   mel_lengths	item_idxs	d_vectorsspeaker_idsattnsT)non_blocking)torchmeanfloatuse_cudacuda)data
text_inputtext_lengths	mel_inputrN   item_idxrP   rQ   	attn_maskavg_text_lengthavg_spec_lengthr$   r$   r6   format_dataM   s@   ra   c	                 C   s  | dkr.d }	|d ur|}	n|d ur|}	|j |||||	|dd}
|
d }|   }|S d| v r||d}||||||}
|
d }| dkrvg }|j  }t|jd D ]}|| }|t	|
|jj qWt|  }|S | dkr|   }|S )	Nglow_tts)rP   rQ   )	aux_inputmodel_outputstacotron)rQ   rP   r   	tacotron2)inference_with_MASdetachcpunumpyrY   rangeshapeappendrT   FloatTensorout_linear_to_melTstack)
model_namemodelr   rZ   r[   r\   rN   rQ   rP   	speaker_coutputsmodel_outputrc   postnet_outputs	mel_specsbpostnet_outputr$   r$   r6   	inferencet   s>   
r{   
metada.txtc                 C   s  |   g }tt| t| dD ]\}	}
t|
\
}}}}}}}	}	}	}ttj ||||||||	}t	|j
d D ]^}|| }||}t||\}	}}}}|dkr\t||}t|| || }|| }|d |d d f j}t|| |||g |r||| |rtd| ||}||| q8qttj||ddd}|D ]}
||
d  d|
d d	  d
 qW d    d S 1 sw   Y  d S )N)totalr   zAudio for debug saved at:wzutf-8)encoding|   z.npy
)evalr   	enumeratelenra   r{   r&   rs   lowerrk   rl   load_wavrK   r   npsaverp   rm   save_wavprintinv_melspectrogramopenr>   r?   rC   write)data_loaderrs   r   output_pathquantize_bits
save_audiodebugmetada_nameexport_metadatar3   rY   rZ   r[   r\   rN   rQ   rP   r]   rv   idxwav_file_pathr=   rH   rI   rJ   rD   wavqr;   
mel_lengthfr$   r$   r6   extract_spectrograms   sf   



$"r   c              
   C   s   t di tj}ttj| jtjtjd\}}|| atj	r#t
tdantjr-t
tjdand att}|jt| jdd trB|  t|}td|dd tj dkrXd	n|jj}t||dd
}t|||| j| j| j| jdd d S )N)
eval_spliteval_split_max_sizeeval_split_size)
data_items)d_vectors_file_pathT)r   z
 > Model has {} parameters)flushrb   r   )r   r|   )r   r   r   r   r$   ) r
   r&   audior   datasetsr   r   r   r'   r(   r   r)   r+   d_vector_filer   load_checkpointcheckpoint_pathrW   rX   r   r   formatrs   r   decoderr2   r7   r   r   r   r   r   )argsr   meta_data_trainmeta_data_evalrs   
num_paramsr2   
own_loaderr$   r$   r6   main   s8   

r   __main__z--config_pathz!Path to config file for training.T)typehelprequiredz--checkpoint_pathzModel file to be restored.z--output_pathzPath to save mel specsz--debug
store_truezSave audio files for debug)defaultactionr   z--save_audiozSave audio filesz--quantize_bitsz&Save quantized audio files if non-zero)r   r   r   z--evalzcompute eval.)r   r   r   )F)NN)r   FFr|   )1__doc__argparser>   rj   r   rT   torch.utils.datar   r   
TTS.configr   TTS.tts.datasetsr   r   TTS.tts.modelsr   TTS.tts.utils.speakersr   TTS.tts.utils.text.tokenizerr	   TTS.utils.audior
    TTS.utils.audio.numpy_transformsr   TTS.utils.generic_utilsr   rX   is_availablerW   r7   rK   ra   no_gradr{   r   r   __name__ArgumentParserparseradd_argumentstrintbool
parse_argsr   config_pathr&   r   trim_silencer$   r$   r$   r6   <module>   sP   

''	.
>2
