o
    ´‹
jV  ã                   @   s  d dl Z d dlmZ d dlm  mZ d dl mZ d dlmZ d dl	m
Z
 d dlmZ dd„ Zdd	d
„ZG dd„ dejƒZedkrˆedddZee  d dd¡e  ddg¡e  d dd¡e  ddg¡dd ee  d dd¡e  ddg¡e  d dd¡e  ddg¡ddZeejƒ dS dS )é    N)Úeinsum)ÚCheckpointedXTransformerEncoder)ÚTransformer)ÚEncoderc                 C   s   | d uS )N© )Úvalr   r   úO/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/layers/tortoise/clvp.pyÚexists   s   r	   é   c                 C   s<   |   |d d …d d …d f  d¡} | jdd|jddd  S )Ng        r
   ©Údim).N)Úmasked_fillÚsum)ÚtÚmaskr   r   r   r   Úmasked_mean   s    r   c                       sL   e Zd ZdZdddddddddddd	d	d
ddœ‡ fdd„
Zddd„Z‡  ZS )ÚCLVPzù
    CLIP model retrofitted for performing contrastive evaluation between tokenized audio data and the corresponding
    transcribed text.

    Originally from https://github.com/lucidrains/DALLE-pytorch/blob/main/dalle_pytorch/dalle_pytorch.py
    i   é   é   éx   é   é    éú   r   i   F)Údim_textÚ
dim_speechÚ
dim_latentÚnum_text_tokensÚtext_enc_depthÚtext_seq_lenÚ
text_headsÚnum_speech_tokensÚspeech_enc_depthÚspeech_headsÚspeech_seq_lenÚtext_mask_percentageÚvoice_mask_percentageÚwav_token_compressionÚuse_xformersc                   s  t ƒ  ¡  t ||¡| _tj||dd| _t ||¡| _tj||dd| _|rPt	dddt
|||ddddddd	d| _t	dddt
||	|
ddddddd	d| _ntd||||d	| _td|||	|
d	| _t t d
¡¡| _|| _|| _|| _|| _|s‹t ||¡| _t ||¡| _d S d S )NF)Úbiaséÿÿÿÿgš™™™™™¹?é   T)	r   ÚdepthÚheadsÚ
ff_dropoutÚff_multÚattn_dropoutÚuse_rmsnormÚff_gluÚrotary_pos_emb)Úneeds_permuteÚexit_permuteÚmax_seq_lenÚattn_layers)ÚcausalÚseq_lenr   r+   r,   g      ð?)ÚsuperÚ__init__ÚnnÚ	EmbeddingÚtext_embÚLinearÚto_text_latentÚ
speech_embÚto_speech_latentr   r   Útext_transformerÚspeech_transformerr   Ú	ParameterÚtorchÚtensorÚtemperaturer$   r%   r&   ÚxformersÚtext_pos_embÚspeech_pos_emb)Úselfr   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   ©Ú	__class__r   r   r:      sj   
÷ü÷
ü
ÿ
ÿþzCLVP.__init__c                 C   s‚  |j d |j}}| jr!t | ¡ ¡| jk}t | ¡ ¡| jk}nt | ¡ ¡ 	¡ }t | ¡ ¡ 	¡ }|  
|¡}|  |¡}	| js^||  tj|j d |d¡7 }|	|  tj|	j d |d¡7 }	| j||d}
| j|	|d}t|
|dd}t||dd}|  |¡}|  |¡}tdd„ ||fƒ\}}| j ¡ }|s td||ƒ| }|S td	||ƒ| }tj||d}t ||¡t | ¡ |¡ d
 }|S )Nr   r
   )Údevice)r   r   c                 S   s   t j| dddS )Nr*   r)   )Úpr   )ÚFÚ	normalize)r   r   r   r   Ú<lambda>   s    zCLVP.forward.<locals>.<lambda>zn d, n d -> nzi d, j d -> i jr*   )ÚshaperN   ÚtrainingrE   Ú	rand_likeÚfloatr$   r%   Ú	ones_likeÚboolr=   r@   rH   rI   ÚarangerJ   rB   rC   r   r?   rA   ÚmaprG   Úexpr   rP   Úcross_entropyr   )rK   ÚtextÚspeech_tokensÚreturn_lossÚbrN   Ú	text_maskÚ
voice_maskr=   r@   Úenc_textÚ
enc_speechÚtext_latentsÚspeech_latentsÚtempÚsimÚlabelsÚlossr   r   r   Úforwardh   s4   




 zCLVP.forward)F)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r:   rk   Ú__classcell__r   r   rL   r   r      s&    
ïLr   Ú__main__gš™™™™™É?)r$   r%   r   )r*   r   é2   éd   r   )r*   r   ée   éf   T)r_   F)r
   )rE   Útorch.nnr;   Útorch.nn.functionalÚ
functionalrP   r   Ú"TTS.tts.layers.tortoise.arch_utilsr   Ú#TTS.tts.layers.tortoise.transformerr   Ú%TTS.tts.layers.tortoise.xtransformersr   r	   r   ÚModuler   rl   ÚclipÚrandintrF   ÚnonlossÚprintrS   r   r   r   r   Ú<module>   s8    
{ûûð