o
    ´‹
j<#  ã                   @   sr   d dl mZmZmZmZ d dlmZ d dlmZm	Z	 d dl
mZmZ d dlmZ d dlmZmZ G dd„ dƒZd	S )
é    )ÚCallableÚDictÚListÚUnion)Úcleaners)Ú	GraphemesÚIPAPhonemes)ÚDEF_LANG_TO_PHONEMIZERÚget_phonemizer_by_name)ÚMultiPhonemizer)Úget_import_pathÚimport_classc                	   @   s  e Zd ZdZ						d*dedddedef d	efd
d„Ze	dd„ ƒZ
e
jdd„ ƒZ
dedee fdd„Zdee defdd„Zd+dededee fdd„Zdee defdd„Zdee fdd„Zd,dee defd d!„Zd-d#efd$d%„Zed+d.d(d)„ƒZdS )/ÚTTSTokenizeru  ðŸ¸TTS tokenizer to convert input characters to token IDs and back.

    Token IDs for OOV chars are discarded but those are stored in `self.not_found_characters` for later.

    Args:
        use_phonemes (bool):
            Whether to use phonemes instead of characters. Defaults to False.

        characters (Characters):
            A Characters object to use for character-to-ID and ID-to-character mappings.

        text_cleaner (callable):
            A function to pre-process the text before tokenization and phonemization. Defaults to None.

        phonemizer (Phonemizer):
            A phonemizer object or a dict that maps language codes to phonemizer objects. Defaults to None.

    Example:

        >>> from TTS.tts.utils.text.tokenizer import TTSTokenizer
        >>> tokenizer = TTSTokenizer(use_phonemes=False, characters=Graphemes())
        >>> text = "Hello world!"
        >>> ids = tokenizer.text_to_ids(text)
        >>> text_hat = tokenizer.ids_to_text(ids)
        >>> assert text == text_hat
    FNÚtext_cleanerÚ
charactersÚBaseCharactersÚ
phonemizerÚ
PhonemizerÚ	add_blankc                 C   s.   || _ || _|| _|| _|| _g | _|| _d S ©N)r   Úuse_phonemesr   Úuse_eos_bosr   Únot_found_charactersr   )Úselfr   r   r   r   r   r   © r   úO/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/utils/text/tokenizer.pyÚ__init__&   s   	
zTTSTokenizer.__init__c                 C   s   | j S r   )Ú_characters)r   r   r   r   r   7   s   zTTSTokenizer.charactersc                 C   sL   || _ | jjr| j | jj¡nd | _| jjr!| j | jj¡| _d S d | _d S r   )r   r   ÚpadÚ
char_to_idÚpad_idÚblankÚblank_id)r   Únew_charactersr   r   r   r   ;   s   (ÚtextÚreturnc              	   C   sp   g }|D ]1}z| j  |¡}| |¡ W q ty5   || jvr3| j |¡ t|ƒ tdt|ƒ› dƒ Y qw |S )z.Encodes a string of text as a sequence of IDs.z [!] Character z, not found in the vocabulary. Discarding it.)r   r   ÚappendÚKeyErrorr   ÚprintÚrepr)r   r$   Ú	token_idsÚcharÚidxr   r   r   ÚencodeA   s   
€ûzTTSTokenizer.encoder*   c                 C   s"   d}|D ]
}|| j  |¡7 }q|S )z.Decodes a sequence of IDs to a string of text.Ú )r   Ú
id_to_char)r   r*   r$   Útoken_idr   r   r   ÚdecodeP   s   zTTSTokenizer.decodeÚlanguagec                 C   s\   | j dur
|   |¡}| jr| jj|d|d}|  |¡}| jr$|  |d¡}| jr,|  |¡}|S )aÿ  Converts a string of text to a sequence of token IDs.

        Args:
            text(str):
                The text to convert to token IDs.

            language(str):
                The language code of the text. Defaults to None.

        TODO:
            - Add support for language-specific processing.

        1. Text normalizatin
        2. Phonemization (if use_phonemes is True)
        3. Add blank char between characters
        4. Add BOS and EOS characters
        5. Text to token IDs
        Nr.   )Ú	separatorr2   T)	r   r   r   Ú	phonemizer-   r   Úintersperse_blank_charr   Úpad_with_bos_eos)r   r$   r2   r   r   r   Útext_to_idsW   s   



zTTSTokenizer.text_to_idsÚid_sequencec                 C   s
   |   |¡S )z5Converts a sequence of token IDs to a string of text.)r1   )r   r8   r   r   r   Úids_to_textv   s   
zTTSTokenizer.ids_to_textÚchar_sequencec                 C   s   | j jgt|ƒ | j jg S )z8Pads a sequence with the special BOS and EOS characters.)r   Úbos_idÚlistÚeos_id)r   r:   r   r   r   r6   z   s   zTTSTokenizer.pad_with_bos_eosÚuse_blank_charc                 C   s<   |r| j jn| j j}|gt|ƒd d  }||ddd…< |S )zŸIntersperses the blank character between characters in a sequence.

        Use the ```blank``` character if defined else use the ```pad``` character.
        é   é   N)r   r"   r   Úlen)r   r:   r>   Úchar_to_useÚresultr   r   r   r5   ~   s   z#TTSTokenizer.intersperse_blank_charr   Úlevelc                 C   s¶   d| }t |› d| j› ƒ t |› d| j› ƒ t |› d| j› ƒ | jr4t |› dƒ | j |d ¡ t| jƒdkrWt |› dt| jƒ› d	ƒ | jD ]}t |› d|› ƒ qKd S d S )
Nú	z| > add_blank: z| > use_eos_bos: z| > use_phonemes: z| > phonemizer:r@   r   z| > z not found characters:)r(   r   r   r   r   Ú
print_logsrA   r   )r   rD   Úindentr+   r   r   r   rF   ˆ   s   
ýzTTSTokenizer.print_logsÚconfigÚCoqpitc           
   
   C   s„  d}t | jttfƒrtt| jƒ}|du r>| jr)| jjr)t| jjƒ}| 	| ¡\}}n| j
r5tƒ  	| ¡\}}ntƒ  	| ¡\}}n| 	| ¡\}}t|ƒ|j_d}| j
r´d| v ru| jdkrui }| jD ]}|jdkrl|j||j< q^tdƒ‚t|ƒ}n?d| ji}d| v r‹| jr‹t| jfi |¤Ž}n)ztt| j fi |¤Ž}| ¡ |_W n ty³ }	 z
td| j› dƒ|	‚d}	~	ww t| j
|||| j| jƒ|fS )	a  Init Tokenizer object from config

        Args:
            config (Coqpit): Coqpit model config.
            characters (BaseCharacters): Defines the model character set. If not set, use the default options based on
                the config values. Defaults to None.
        Nr   Úmulti_phonemizerr.   z>Multi phonemizer requires language to be set for each dataset.r2   z!No phonemizer found for language z^.
                            You may need to install a third party library for this language.)Ú
isinstancer   Ústrr<   Úgetattrr   r   Úcharacters_classr   Úinit_from_configr   r   r   r   r   Údatasetsr2   Ú
ValueErrorr   Úphoneme_languager
   r	   Únamer'   r   r   Úenable_eos_bos_chars)
rH   r   r   ÚCharactersClassÚ
new_configr   Úlang_to_phonemizer_nameÚdatasetÚphonemizer_kwargsÚer   r   r   rO   •   sX   




ÿÿÿý€ÿÿüzTTSTokenizer.init_from_config)FNNNFFr   )F)r   )rH   rI   r   r   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r   r   Úboolr   Úpropertyr   ÚsetterrL   r   Úintr-   r1   r7   r9   r6   r5   rF   ÚstaticmethodrO   r   r   r   r   r   
   s<    ùýü
û
ú


r   N)Útypingr   r   r   r   ÚTTS.tts.utils.textr   ÚTTS.tts.utils.text.charactersr   r   ÚTTS.tts.utils.text.phonemizersr	   r
   Ú/TTS.tts.utils.text.phonemizers.multi_phonemizerr   ÚTTS.utils.generic_utilsr   r   r   r   r   r   r   Ú<module>   s    