o
    
j<#                     @   sr   d dl mZmZmZmZ d dlmZ d dlmZm	Z	 d dl
mZmZ d dlmZ d dlmZmZ G dd dZd	S )
    )CallableDictListUnion)cleaners)	GraphemesIPAPhonemes)DEF_LANG_TO_PHONEMIZERget_phonemizer_by_name)MultiPhonemizer)get_import_pathimport_classc                	   @   s  e Zd ZdZ						d*dedddedef d	efd
dZe	dd Z
e
jdd Z
dedee fddZdee defddZd+dededee fddZdee defddZdee fddZd,dee defd d!Zd-d#efd$d%Zed+d.d(d)ZdS )/TTSTokenizeru  🐸TTS tokenizer to convert input characters to token IDs and back.

    Token IDs for OOV chars are discarded but those are stored in `self.not_found_characters` for later.

    Args:
        use_phonemes (bool):
            Whether to use phonemes instead of characters. Defaults to False.

        characters (Characters):
            A Characters object to use for character-to-ID and ID-to-character mappings.

        text_cleaner (callable):
            A function to pre-process the text before tokenization and phonemization. Defaults to None.

        phonemizer (Phonemizer):
            A phonemizer object or a dict that maps language codes to phonemizer objects. Defaults to None.

    Example:

        >>> from TTS.tts.utils.text.tokenizer import TTSTokenizer
        >>> tokenizer = TTSTokenizer(use_phonemes=False, characters=Graphemes())
        >>> text = "Hello world!"
        >>> ids = tokenizer.text_to_ids(text)
        >>> text_hat = tokenizer.ids_to_text(ids)
        >>> assert text == text_hat
    FNtext_cleaner
charactersBaseCharacters
phonemizer
Phonemizer	add_blankc                 C   s.   || _ || _|| _|| _|| _g | _|| _d S N)r   use_phonemesr   use_eos_bosr   not_found_charactersr   )selfr   r   r   r   r   r    r   O/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/utils/text/tokenizer.py__init__&   s   	
zTTSTokenizer.__init__c                 C   s   | j S r   )_characters)r   r   r   r   r   7   s   zTTSTokenizer.charactersc                 C   sL   || _ | jjr| j| jjnd | _| jjr!| j| jj| _d S d | _d S r   )r   r   pad
char_to_idpad_idblankblank_id)r   new_charactersr   r   r   r   ;   s   (textreturnc              	   C   sp   g }|D ]1}z| j |}|| W q ty5   || jvr3| j| t| tdt| d Y qw |S )z.Encodes a string of text as a sequence of IDs.z [!] Character z, not found in the vocabulary. Discarding it.)r   r   appendKeyErrorr   printrepr)r   r$   	token_idscharidxr   r   r   encodeA   s   
zTTSTokenizer.encoder*   c                 C   s"   d}|D ]
}|| j |7 }q|S )z.Decodes a sequence of IDs to a string of text. )r   
id_to_char)r   r*   r$   token_idr   r   r   decodeP   s   zTTSTokenizer.decodelanguagec                 C   s\   | j dur
|  |}| jr| jj|d|d}| |}| jr$| |d}| jr,| |}|S )a  Converts a string of text to a sequence of token IDs.

        Args:
            text(str):
                The text to convert to token IDs.

            language(str):
                The language code of the text. Defaults to None.

        TODO:
            - Add support for language-specific processing.

        1. Text normalizatin
        2. Phonemization (if use_phonemes is True)
        3. Add blank char between characters
        4. Add BOS and EOS characters
        5. Text to token IDs
        Nr.   )	separatorr2   T)	r   r   r   	phonemizer-   r   intersperse_blank_charr   pad_with_bos_eos)r   r$   r2   r   r   r   text_to_idsW   s   



zTTSTokenizer.text_to_idsid_sequencec                 C   s
   |  |S )z5Converts a sequence of token IDs to a string of text.)r1   )r   r8   r   r   r   ids_to_textv   s   
zTTSTokenizer.ids_to_textchar_sequencec                 C   s   | j jgt| | j jg S )z8Pads a sequence with the special BOS and EOS characters.)r   bos_idlisteos_id)r   r:   r   r   r   r6   z   s   zTTSTokenizer.pad_with_bos_eosuse_blank_charc                 C   s<   |r| j jn| j j}|gt|d d  }||ddd< |S )zIntersperses the blank character between characters in a sequence.

        Use the ```blank``` character if defined else use the ```pad``` character.
              N)r   r"   r   len)r   r:   r>   char_to_useresultr   r   r   r5   ~   s   z#TTSTokenizer.intersperse_blank_charr   levelc                 C   s   d| }t | d| j  t | d| j  t | d| j  | jr4t | d | j|d  t| jdkrWt | dt| j d	 | jD ]}t | d|  qKd S d S )
N	z| > add_blank: z| > use_eos_bos: z| > use_phonemes: z| > phonemizer:r@   r   z| > z not found characters:)r(   r   r   r   r   
print_logsrA   r   )r   rD   indentr+   r   r   r   rF      s   
zTTSTokenizer.print_logsconfigCoqpitc           
   
   C   s  d}t | jttfrtt| j}|du r>| jr)| jjr)t| jj}|	| \}}n| j
r5t 	| \}}nt 	| \}}n|	| \}}t||j_d}| j
rd| v ru| jdkrui }| jD ]}|jdkrl|j||j< q^tdt|}n?d| ji}d| v r| jrt| jfi |}n)ztt| j fi |}| |_W n ty }	 z
td| j d|	d}	~	ww t| j
|||| j| j|fS )	a  Init Tokenizer object from config

        Args:
            config (Coqpit): Coqpit model config.
            characters (BaseCharacters): Defines the model character set. If not set, use the default options based on
                the config values. Defaults to None.
        Nr   multi_phonemizerr.   z>Multi phonemizer requires language to be set for each dataset.r2   z!No phonemizer found for language z^.
                            You may need to install a third party library for this language.)
isinstancer   strr<   getattrr   r   characters_classr   init_from_configr   r   r   r   r   datasetsr2   
ValueErrorr   phoneme_languager
   r	   namer'   r   r   enable_eos_bos_chars)
rH   r   r   CharactersClass
new_configr   lang_to_phonemizer_namedatasetphonemizer_kwargser   r   r   rO      sX   




zTTSTokenizer.init_from_config)FNNNFFr   )F)r   )rH   rI   r   r   )__name__
__module____qualname____doc__r   r   r   boolr   propertyr   setterrL   r   intr-   r1   r7   r9   r6   r5   rF   staticmethodrO   r   r   r   r   r   
   s<    




r   N)typingr   r   r   r   TTS.tts.utils.textr   TTS.tts.utils.text.charactersr   r   TTS.tts.utils.text.phonemizersr	   r
   /TTS.tts.utils.text.phonemizers.multi_phonemizerr   TTS.utils.generic_utilsr   r   r   r   r   r   r   <module>   s    