o
    
j+                     @   s   d dl mZ d dlZd dlZd dlmZ d"ddZd"dd	Z					d#d
ejdej	de
dej	dedej	dej	defddZdd Zdd Zd"ddZd"ddZdd Z							d$ddZ						d%d d!ZdS )&    )DictN)nnFcpuc                 C   s(   |rd}| d u r
d S t j| ||d}|S )Ncuda)dtypedevice)torch	as_tensor)np_arrayr   r   r   tensor r   J/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/utils/synthesis.pynumpy_to_torch   s   r   c                 C   s2   |rd}t j||j| |jd|dd}|S )Nr   srr   r   )r   FloatTensormelspectrogramload_wavsample_rate	unsqueeze)	style_wavapr   r   	style_melr   r   r   compute_style_mel   s   r   modelinputs
speaker_idr   
style_textd_vectorlanguage_idreturnc           
   	   C   sT   t |jdd |j}t| dr| jj}n| j}||||||||dd}	|	S )a!  Run a torch model for inference. It does not support batch inference.

    Args:
        model (nn.Module): The model to run inference.
        inputs (torch.Tensor): Input tensor with character ids.
        speaker_id (int, optional): Input speaker ids for multi-speaker models. Defaults to None.
        style_mel (torch.Tensor, optional): Spectrograms used for voice styling . Defaults to None.
        d_vector (torch.Tensor, optional): d-vector for multi-speaker models    . Defaults to None.

    Returns:
        Dict: model outputs.
          module)	x_lengthsspeaker_ids	d_vectorsr   r   language_ids)	aux_input)r   r   shapetor   hasattrr$   	inference)
r   r   r   r   r   r   r    input_lengths_funcoutputsr   r   r   run_model_torch   s   

r1   c                 C   s   | d | |  S )N)find_endpoint)wavr   r   r   r   trim_silenceC   s   r4   c                 C   s.   |j  dv r|| j}|S || j}|S )N)tacotron)r   lowerinv_spectrogramTinv_melspectrogram)postnet_outputr   CONFIGr3   r   r   r   r7   G   s
   r7   c                 C   s.   |rd}| d urt | } t| |} | S )Nr   )npasarrayr   
from_numpyr+   )aux_idr   r   r   r   r   id_to_torchO   s   
r@   c                 C   sD   |rd}| d ur t | } t| tj} |  d|} | S )Nr   r   )	r<   r=   r   r>   typer   squeezer   r+   )r   r   r   r   r   r   embedding_to_torchX   s   
rC   c           	      C   sL   g }t | D ]\}}|| |j |j }t|||}||d|  q|S )aY  Apply griffin-lim to each sample iterating throught the first dimension.
    Args:
        inputs (Tensor or np.Array): Features to be converted by GL. First dimension is the batch size.
        input_lens (Tensor or np.Array): 1D array of sample lengths.
        CONFIG (Dict): TTS config.
        ap (AudioProcessor): TTS audio processor.
    N)	enumerate
hop_lengthr7   append)	r   
input_lensr;   r   wavsidxspecwav_lenr3   r   r   r   apply_griffin_limc   s   rL   c              	      s   t |  j}|rd}d}|dr)|jr)|dur)t|tr!|}nt|| j|d}|drC|j	rC|durCt|| j|d}|
dd}d} durd fdd	| jj D }t|dks`J d
|d }tj| jj||dtjd}|dur|t||d}|	durt|	|d}	 durt |d t|tst|tj|d}|durtj| jj| dtjd}t|tj|d}|d}t|tj|d}|d}t| |||||	 d}|d }|d j  }|d }d}| }|j dkr|rt!|| j|}|rt"|| j}n|}||||d}|S )a  Synthesize voice for the given text using Griffin-Lim vocoder or just compute output features to be passed to
    the vocoder model.

    Args:
        model (TTS.tts.models):
            The TTS model to synthesize audio with.

        text (str):
            The input text to convert to speech.

        CONFIG (Coqpit):
            Model configuration.

        use_cuda (bool):
            Enable/disable CUDA.

        speaker_id (int):
            Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None.

        style_wav (str | Dict[str, float]):
            Path or tensor to/of a waveform used for computing the style embedding based on GST or Capacitron.
            Defaults to None, meaning that Capacitron models will sample from the prior distribution to
            generate random but realistic prosody.

        style_text (str):
            Transcription of style_wav for Capacitron models. Defaults to None.

        enable_eos_bos_chars (bool):
            enable special chars for end of sentence and start of sentence. Defaults to False.

        do_trim_silence (bool):
            trim silence after synthesis. Defaults to False.

        d_vector (torch.Tensor):
            d-vector for multi-speaker models in share :math:`[1, D]`. Defaults to None.

        language_id (int):
            Language ID passed to the language embedding layer in multi-langual model. Defaults to None.
    r   Ngstr   capacitron_vaer"   r#   c                    s   g | ]
\}}| kr|qS r   r   ).0kvr    r   r   
<listcomp>   s    zsynthesis.<locals>.<listcomp>z$language_id must be a valid languager   )language)r   )r   r    model_outputs
alignments)r3   rV   text_inputsr0   )#next
parametersr   hasrM   
isinstancedictr   r   use_capacitron_vae	transposelanguage_manager
name_to_iditemslenr<   r=   	tokenizertext_to_idsint32r@   rC   r   r   floatlongr   r1   datar   numpyrB   ndimr7   r4   )r   textr;   use_cudar   r   r   use_griffin_limdo_trim_silencer   r    r   r   language_namerT   rW   r0   rU   rV   r3   return_dictr   rR   r   	synthesist   s   5



	rq   c
                 C   s   t |  j}
|rd}
|durt||
d}|durt||
d}|dur)t||
d}t| jj|| jjr6| jjn| jj	d|
d}t
| drI| jj}n| j}||||||}d}| }|jdkrr|	rpt|| j|}|rpt|| j}|S |}|S )a  Synthesize voice for the given text using Griffin-Lim vocoder or just compute output features to be passed to
    the vocoder model.

    Args:
        model (TTS.tts.models):
            The TTS model to synthesize audio with.

        CONFIG (Coqpit):
            Model configuration.

        use_cuda (bool):
            Enable/disable CUDA.

        reference_wav (str):
            Path of reference_wav to be used to voice conversion.

        speaker_id (int):
            Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None.

        d_vector (torch.Tensor):
            d-vector for multi-speaker models in share :math:`[1, D]`. Defaults to None.

        reference_speaker_id (int):
            Reference Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None.

        reference_d_vector (torch.Tensor):
            Reference d-vector for multi-speaker models in share :math:`[1, D]`. Defaults to None.

        enable_eos_bos_chars (bool):
            enable special chars for end of sentence and start of sentence. Defaults to False.

        do_trim_silence (bool):
            trim silence after synthesis. Defaults to False.
    r   Nr   r   r$   r#   )rX   rY   r   r@   rC   r   r   argsencoder_sample_rater   r,   r$   inference_voice_conversionrB   rj   r7   r4   )r   r;   rl   reference_wavr   r   reference_speaker_idreference_d_vectorrn   rm   r   r/   rU   r3   r   r   r   transfer_voice   s:   /


rx   )Fr   )NNNNN)NNNFFNN)NNNNFF)typingr   ri   r<   r   r   r   r   ModuleTensorintstrr1   r4   r7   r@   rC   rL   rq   rx   r   r   r   r   <module>   sb    

	
(

	
 