o
    
jMB                     @   sF  U d Z ddlZddlZddlZddlZddlZddlZddlZddlm	Z	 ddl
mZ ddlmZmZmZmZmZmZ ddlZddlZddlmZmZmZ ddlmZmZmZ dd	lmZ dd
l m!Z!m"Z" ddl#m$Z$ da%ee" e&d< e' Z(e Z)dZ*e+dZ,e-e.Z/e	G dd dZ0e	G dd dZ1e	G dd dZ2dS )z&Phonemization and synthesis for Piper.    N)	dataclass)Path)AnyIterableOptionalSequenceTupleUnion   )PhonemeTypePiperConfigSynthesisConfig)BOSEOSPADphonemes_to_ids)ESPEAK_DATA_DIREspeakPhonemizer)TashkeelDiacritizer_ESPEAK_PHONEMIZERg    @z(\[\[.*?\]\])c                   @   s*   e Zd ZU eed< ee ed< eed< dS )PhonemeAlignmentphonemephoneme_idsnum_samplesN)__name__
__module____qualname__str__annotations__r   int r!   r!   >/home/kuhnn/.local/lib/python3.10/site-packages/piper/voice.pyr   !   s   
 r   c                   @   s   e Zd ZU dZeed< 	 eed< 	 eed< 	 ejed< 	 ee	 ed< 	 ee ed< 	 dZ
eej ed	< 	 dZeee  ed
< 	 dZeej ed< dZee ed< dZeee  ed< edejfddZedefddZdS )
AudioChunkzChunk of raw audio.sample_ratesample_widthsample_channelsaudio_float_arrayphonemesr   Nphoneme_id_samplesphoneme_alignments_audio_int16_array_audio_int16_bytes_phoneme_alignmentsreturnc                 C   s0   | j du rt| jt t ttj| _ | j S )zg
        Get audio as an int16 numpy array.

        :return: Audio data as int16 numpy array.
        N)r+   npclipr'   _MAX_WAV_VALUEastypeint16selfr!   r!   r"   audio_int16_arrayM   s   
zAudioChunk.audio_int16_arrayc                 C   s
   | j  S )zl
        Get audio as 16-bit PCM bytes.

        :return: Audio data as signed 16-bit sample bytes.
        )r6   tobytesr4   r!   r!   r"   audio_int16_bytes[   s   
zAudioChunk.audio_int16_bytes)r   r   r   __doc__r    r   r/   ndarraylistr   r)   r   r*   r   r+   r,   bytesr-   propertyr6   r8   r!   r!   r!   r"   r#   (   s2   
 
r#   c                   @   s  e Zd ZU dZejed< 	 eed< 	 eZ	e
ed< 	 e
 Ze
ed< 	 dZeed< dZee ed	< d
Zee ed< eddedfdeee
f deeee
f  dedeee
f deeee
f  dd fddZdedeee  fddZdee dee fddZ		d$dedee dedee fddZ			d%dede j!dee dededeee"  fdd Z#		d$d!ee dee dedee$j%e&e$j%ee$j% f f fd"d#Z'dS )&
PiperVoicezA voice for Piper.sessionconfigespeak_data_dirdownload_dirTuse_tashkeelNtashkeel_diacritizierg?taskeen_thresholdF
model_pathconfig_pathuse_cudar.   c                 C   s   |du r|  d}t d| t|ddd}t|}W d   n1 s&w   Y  |r:ddd	ifg}t d
 ndg}|du rEt }tt	|t
jt| t
 |dt|t|dS )a  
        Load an ONNX model and config.

        :param model_path: Path to ONNX voice model.
        :param config_path: Path to JSON voice config (defaults to model_path + ".json").
        :param use_cuda: True if CUDA (GPU) should be used instead of CPU.
        :param espeak_data_dir: Path to espeak-ng data dir (defaults to internal data).
        :param download_dir: Path to download resources (defaults to current directory).
        :return: Voice object.
        Nz.jsonzGuessing voice config path: %srzutf-8)encodingCUDAExecutionProvidercudnn_conv_algo_search	HEURISTICz
Using CUDACPUExecutionProvider)sess_options	providers)r@   r?   rA   rB   )_LOGGERdebugopenjsonloadr   cwdr>   r   	from_dictonnxruntimeInferenceSessionr   SessionOptions)rF   rG   rH   rA   rB   config_fileconfig_dictrP   r!   r!   r"   rU   z   s2   
zPiperVoice.loadtextc           
   	   C   s  | j jtjkrttd|gS | j jtjkr9ddlm	} t
| dd}|du r4|| jd }t| d| ||S | j jtjkrItd| j j g }t|}d}t|D ]\}}|d	rd
}|sh|g  |dkr|||d  dr||d d |d |dd   |t|d k r||d  dr|d d qV| j jdkr| jr| jdu rt | _| j|| jd}t1 tdu rt | j!at| j j|}	|r|	r|d |	d  |	dd }	||	 W d   n1 sw   Y  d}qV|r|d s|"  |S )z
        Text to phonemes grouped by sentence.

        :param text: Text to phonemize.
        :return: List of phonemes for each sentence.
        NFDr
   )ChinesePhonemizer_chinese_phonemizerNg2pWzUnexpected phoneme type: Fz[[Tr       ar)rE   )#r@   phoneme_typer   TEXTr;   unicodedata	normalizePINYINphonemize_chineser_   getattrrB   setattr	phonemizeESPEAK
ValueError_PHONEME_BLOCK_PATTERNsplit	enumerate
startswithappendendswithextendstriplenespeak_voicerC   rD   r   rE   _ESPEAK_PHONEMIZER_LOCKr   r   rA   pop)
r5   r]   r_   
phonemizerr(   
text_partsprev_raw_phonemesi	text_parttext_part_phonemesr!   r!   r"   ro      s\   	



"

zPiperVoice.phonemizer(   c                 C   s6   | j jtjkrddlm} ||| j jS t|| j jS )zt
        Phonemes to ids.

        :param phonemes: List of phonemes.
        :return: List of phoneme ids.
        r
   r   )r@   rg   r   rk   rl   r   phoneme_id_map)r5   r(   chinese_phonemes_to_idsr!   r!   r"   r      s   zPiperVoice.phonemes_to_ids
syn_configinclude_alignmentsc                 c   s   |du rt }| |}td|| |D ]}|sq| |}d}| j|||d}t|tr3|\}	}n|}	|jrNt	
t	|	}
|
dk rJt	|	}	n|	|
 }	|jdkrX|	|j }	t	|	ddt	j}	d}|durt|t|kr| jjtg }d}g }d}ttg|tgD ]L}| jj|g }|tkrtt||}n|}|}|D ]}|t|krd	} n||| krd	} n|d
7 }q|r n|t||t||| d q|rd}td t| jjdd
|	||||dV  qdS )a  
        Synthesize one audio chunk per sentence from from text.

        :param text: Text to synthesize.
        :param syn_config: Synthesis configuration.
        :param include_alignments: If True and the model supports it, include phoneme/audio alignments.
        Nztext=%s, phonemes=%s)r   g:0yE>g      ?g      r   FTr
   )r   r   r   zPhoneme alignment failedrd   )r$   r%   r&   r'   r(   r   r)   r*   ) _DEFAULT_SYNTHESIS_CONFIGro   rQ   rR   r   phoneme_ids_to_audio
isinstancetuplenormalize_audior/   maxabs
zeros_likevolumer0   r2   float32rz   r@   r   getr   	itertoolschainr   r   r;   rv   r   sumr#   r$   )r5   r]   r   r   sentence_phonemesr(   r   r)   audio_resultaudiomax_valr*   pad_idsphoneme_id_idxalignment_failedr   expected_idsids_to_checkstart_phoneme_id_idx
phoneme_idr!   r!   r"   
synthesize  s   










zPiperVoice.synthesizewav_fileset_wav_formatc           	      C   s|   g }d}| j |||dD ]+}|r&|r$||j ||j ||j d}||j |r7|j	r7|
|j	 q|r<|S dS )a  
        Synthesize and write WAV audio from text.

        :param text: Text to synthesize.
        :param wav_file: WAV file writer.
        :param syn_config: Synthesis configuration.
        :param set_wav_format: True if the WAV format should be set automatically.
        :param include_alignments: If True and the model supports it, return phoneme/audio alignments.

        :return: Phoneme/audio alignments if include_alignments is True, otherwise None.
        T)r   r   FN)r   setframerater$   setsampwidthr%   setnchannelsr&   writeframesr8   r*   rx   )	r5   r]   r   r   r   r   
alignmentsfirst_chunkaudio_chunkr!   r!   r"   synthesize_wav|  s$   

zPiperVoice.synthesize_wavr   c                 C   sJ  |du rt }|j}|j}|j}|j}|du r| jj}|du r"| jj}|du r*| jj}ttj|tj	dd}tj|j
d gtj	d}	tj|||gtjd}
||	|
d}| jjdkr[d}| jjdkrg|du rgd}|durxtj|gtj	d}||d< | jd|}|d  }|s|S t|dkr|dfS |d  | jj tj	}||fS )a4  
        Synthesize raw audio from phoneme ids.

        :param phoneme_ids: List of phoneme ids.
        :param syn_config: Synthesis configuration.
        :param include_alignments: Return samples per phoneme id if True.
        :return: Audio float numpy array from voice model (unnormalized, in range [-1, 1]).

        If include_alignments is True and the voice model supports it, the return
        value will be a tuple instead with (audio, phoneme_id_samples) where
        phoneme_id_samples contains the number of audio samples per phoneme id.
        N)dtyper   r
   )inputinput_lengthsscalessid)r   
speaker_idlength_scalenoise_scalenoise_w_scaler@   r/   expand_dimsarrayint64shaper   num_speakersr?   runsqueezerz   
hop_lengthr2   )r5   r   r   r   r   r   r   r   phoneme_ids_arrayphoneme_ids_lengthsr   argsr   resultr   r)   r!   r!   r"   r     sT   zPiperVoice.phoneme_ids_to_audio)NF)NTF)(r   r   r   r9   rX   rY   r   r   r   rA   r   rV   rB   rC   boolrD   r   r   rE   floatstaticmethodr	   r   rU   r;   ro   r    r   r   r   r#   r   wave
Wave_writer   r   r/   r:   r   r   r!   r!   r!   r"   r>   e   s   
 


2Q
s

.r>   )3r9   r   rT   loggingre	threadingri   r   dataclassesr   pathlibr   typingr   r   r   r   r   r	   numpyr/   rX   r@   r   r   r   constr   r   r   r   r   phonemize_espeakr   r   tashkeelr   r   r   Lockr|   r   r1   compilerr   	getLoggerr   rQ   r   r#   r>   r!   r!   r!   r"   <module>   s<     

<