o
    
jZ                     @   s   d dl Z d dlZd dlZd dlZd dlmZ d dlm  mZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ dd ZG dd	 d	ejZd
d ZG dd dejZdS )    N)
GPT2Config)GPT2InferenceModel)ConditioningEncoder)PerceiverResamplerc                 C   s"   t j| jd | jd |f| jdS )Nr      device)torchzerosshaper   )rangedim r   J/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/layers/xtts/gpt.pynull_position_embeddings   s   "r   c                       s.   e Zd Zd	 fdd	Zdd Zdd Z  ZS )
LearnedPositionEmbeddings{Gz?Fc                    s>   t    tj||| _| jjjjd|d || _	|| _
d S )N        )meanstd)super__init__r	   nn	Embeddingembweightdatanormal_relativeseq_len)selfr   	model_diminitr   	__class__r   r   r      s
   

z"LearnedPositionEmbeddings.__init__c                 C   sV   |j d }| jrt|| j| }| tj||| |jdS | tjd||jdS )Nr   r   r   )	r   r   randomrandintr   r   r	   aranger   )r    xslstartr   r   r   forward   s
   
z!LearnedPositionEmbeddings.forwardc                 C   s   |  tj|g|ddS )Nr   r   )r   r	   tensor	unsqueeze)r    inddevr   r   r   get_fixed_embedding'   s   z-LearnedPositionEmbeddings.get_fixed_embedding)r   F)__name__
__module____qualname__r   r+   r0   __classcell__r   r   r#   r   r      s    	r   c              
   C   s   ddl m}m} |d|| | || | || ||| d}	||	}
|
`tjt|d|
_|
`|dkr6t||ntjt|d}|dkrFt||ntjt|d}|
||ddfS )z7
    GPT-2 implemented by the HuggingFace library.
    r   )r   	GPT2Model   
vocab_sizen_positionsn_ctxn_embdn_layern_headgradient_checkpointing	use_cacher   N)	transformersr   r5   wpe	functoolspartialr   wter   )layersr!   headsmax_mel_seq_lenmax_text_seq_lenmax_prompt_lencheckpointingr   r5   
gpt_configgptmel_pos_embtext_pos_embr   r   r   build_hf_gpt_transformer+   s0   




rQ   c                       s   e Zd Z												
									
d. fdd	Zdd Zd/ddZdd Zdd Z								d0ddZdd Z	d d! Z
d1d"d#Z						d2d$d%Zd&d' Zd(d) Zd*d+ Zd,d- Z  ZS )3GPT  r         x      F   r      r6             Fr   c              
      s  t    || _|| _|| _|| _|| _|| _|| _|| _	|| _
|| _|| _|| _|	| _|| j d | _|dkr:dn|d | j | _|dkrHdn|d | _|| _|
| _td||d| _td| _|| _|| _|| _t| j|| _t| j|| _t|||| j| j| j|\| _ | _!| _"| _#| _$|rtj%t&'dd|d dd	| _(tj%t&'dd|d dd	| _)nd
| _(d
| _)t*|| _+t,|| j| _-t,|| j| _.| jrt/|d|dddddd| _0dS t| j|| _1t2d|| _3dS )z
        Args:

           rA   P   )num_attn_headsg?r   r   T)requires_gradr       @   rT      F)r   depthdim_contextnum_latentsdim_headrH   ff_multuse_flash_attn   N)4r   r   label_smoothingnumber_text_tokensstart_text_tokenstop_text_tokennum_audio_tokensstart_audio_tokenstop_audio_tokenstart_prompt_tokenstop_prompt_tokenrG   rH   r!   max_conditioning_inputsmax_gen_mel_tokensmax_mel_tokensmax_text_tokensmax_prompt_tokenscode_stride_lenr   conditioning_encoderr   	Dropout1dconditioning_dropoutaverage_conditioning_embeddingsuse_perceiver_resampler!perceiver_cond_length_compressionr   text_embeddingmel_embeddingrQ   rN   mel_pos_embeddingtext_pos_embeddingmel_layer_pos_embeddingtext_layer_pos_embedding	Parameterr	   randnmel_solo_embeddingtext_solo_embedding	LayerNorm
final_normLinear	text_headmel_headr   conditioning_perceiverprompt_embeddingr   prompt_pos_embedding)r    rm   rn   rG   r!   rH   rw   rv   rx   rt   ry   rl   ro   rp   rq   train_solo_embeddingsrL   r}   rk   r~   r   r#   r   r   r   Y   s|   

 zGPT.__init__c                 C   sN   t | j | jrt | j nd t | j t | j t | j  dS )N)rz   r   rN   rH   )listrz   
parametersr~   r   rN   r   r   )r    r   r   r   get_grad_norm_parameter_groups   s   z"GPT.get_grad_norm_parameter_groupsTc              
   C   s   | j | j | j d }t| j||| j| j| jddd}t|| j| j	| j
| j| j|d| _| j
| j_|rOdd l}|j| j dtjddd| _| jj | _d S d S )	Nr   FTr7   )kv_cacher   auto)modelmp_sizedtypereplace_methodreplace_with_kernel_inject)rx   rv   rw   r   r!   rG   rH   r   rN   r   r   r   r   gpt_inferencerF   	deepspeedinit_inferencehalfr	   float32	ds_enginemoduleeval)r    r   use_deepspeed
seq_lengthrM   r   r   r   r   init_gpt_for_inference   s@   

	zGPT.init_gpt_for_inferencec                 C   s(   t j|d|d}t j|d|d}||fS )Nr   r   valuer   r   )Fpad)r    inputstart_token
stop_tokeninptarr   r   r   set_inputs_and_targets   s   zGPT.set_inputs_and_targetsc                 C   s>   t t|D ]}|| }||jd k r| j|||df< q|S )a$  
        Given mel tokens that are derived from a padded audio clip and the actual lengths of each batch element in
        that audio clip, reformats the tokens with stop_audio_token in place of the zero padding. This is required
        preformatting to create a working TTS model.
        rA   N)r   lenr   rq   )r    mel_input_tokenscode_lengthsb
actual_endr   r   r   set_mel_padding   s   zGPT.set_mel_paddingNc                 C   s  |d ur!|j d }|d urtj|||gdd}n	tj||gdd}d }|	d urKtj|	|
gdd}|d urKtj|j d |tj|jd}tj||gdd}| j|d||d}|rY|jS |jd d |d f }| 	|}|r|d d d |j d f |d d |j d  d f fS |d d d |j d f }||}|
ddd}|d ur|d d |j d  d f }||}|
ddd}||fS |S )Nr   r@   r   r   r   T)inputs_embedsreturn_dictoutput_attentionsattention_maskr]   )r   r	   catonesboolr   rN   
attentionslast_hidden_stater   permute)r    first_inputs
first_headsecond_inputssecond_headprompt	get_attnsreturn_latentattn_mask_condattn_mask_textattn_mask_meloffsetr   	attn_maskgpt_outencfirst_logitssecond_logitsr   r   r   
get_logits   s@   

6zGPT.get_logitsc                 C   sn   t |jdkr|dn|}g }t|jd D ]}|| |d d |f  qtj|dd}|jdd}|S )N   r   r@   )	r   r   r-   r   appendrz   r	   stackr   )r    speech_conditioning_inputcondsjr   r   r   get_conditioning2  s   
zGPT.get_conditioningc           	      C   s   |}| j rfg }t|jd D ]!}d}t|jd D ]}|||f dkr% n|d7 }q|| qd}|d }|jd |krft|jd D ]}|| |k rOd}qDtd|| | }qD|dd||| f }tj|d| jd	}tj|d
| j	d	}|S )z
        Create a prompt from the mel codes. This is used to condition the model on the mel codes.
        Pad the prompt with start and stop mel tokens.
        r   r   S   r      rA   Nr   r   r   )
trainingr   r   r   r%   r&   r   r   rr   rs   )	r    prompt_codesr   lengthsilengthr   
prompt_lenr*   r   r   r   get_prompts?  s*   
zGPT.get_promptsc                 C   sZ   d}|s&|j dkr|d}| |}| jr$| |ddddd}|S |d}|S )zU
        cond_input: (b, 80, s) or (b, 1, 80, s)
        conds: (b, 1024, s)
        Nrc   r   r   r]   )ndimsqueezerz   r~   r   r   	transposer-   )r    
cond_inputr   r   r   r   r   get_style_emba  s   



zGPT.get_style_embc                 C   sn  | j dkr|du sJ d| }t|| j  d }|dur/| jr*|| j }n|| j }|durSt|	dD ]}| jrI|| | j ||< q:|| | j ||< q:| }||j
d krkt|d||j
d  f}||j
d ksJ d| d|j
d  d||j
d ksJ d	| d
|j
d  dtj|ddd|f d| jd}tj|ddd|f d| jd}| ||d }| || j| j\}}| || j| j\}}d}d}d}|
sptj|j
d |j
d tj|jd}tj|j
d |j
d tj|jd}tj|j
d |j
d tj|jd}|dur1t|D ]\}}|d |d  }d|||df< qn|durHt|D ]\}}d|||df< q:t|D ]\}}d|||d df< qLt|D ]\}}d|||d df< q`| || | }| || | }|du r| |dd}d}| jrd}| j|| j|| j ||	|
|||d
\}}|
r|ddd|f S |	r|S t|D ]\}}d|||d df< qt|D ]\}}d|||d df< q|| jk! |j
d ksJ d| j d|dur||df }||df }d||||f< tj"|| d| j#d}tj"|| d| j#d}|$ |$ |fS )ae  
        Forward pass that uses both text and voice in either text conditioning mode or voice conditioning mode
        (actuated by `text_first`).

        text_inputs: long tensor, (b,t)
        text_lengths: long tensor, (b,)
        mel_inputs:  long tensor, (b,m)
        wav_lengths: long tensor, (b,)
        cond_mels: MEL float tensor, (b, 1, 80,s)
        cond_idxs: cond start and end indexs, (b, 2)

        If return_attentions is specified, only logits are returned.
        If return_latent is specified, loss & logits are not computed or returned. Only the predicted latents are returned.
        r   Nu<    ❗ cond_mels is not None, but max_conditioning_inputs == 0r   rA   u    ❗ max_mel_len (z) > audio_codes.shape[-1] ()u    ❗ max_text_len (z) > text_inputs.shape[-1] (r   r   r   r   r   r]   )r   r   r   r   r   r   u.    ❗ mel_targets does not contain stop token (z) in every row.)ignore_indexrk   )%rt   maxr	   ceilry   longr~   r   r   sizer   r   r   rn   rq   r   r   rm   rp   r   r   r   	enumerater   r   r   r   r   r   r   r   r   r   sumcross_entropyrk   r   )r    text_inputstext_lengthsaudio_codeswav_lengths	cond_mels	cond_idxs	cond_lenscond_latentsreturn_attentionsr   max_text_lenr   idxmax_mel_lentext_targetsmel_targetsr   r   r   rltext_embmel_embsubtext_logits
mel_logits
cond_startcond_end	loss_textloss_melr   r   r   r+   r  s   

""







zGPT.forwardc                 K   s    |  || | j||fi |S )N)compute_embeddingsgenerate)r    r   r   hf_generate_kwargsr   r   r   	inference-  s   zGPT.inferencec                 C   s   t j|d| jd}t j|d| jd}| || | }tj||gdd}| j	| tj
|jd |jd d fdtj|jd}| j|d d df< |S )	Nr   r   r   r   r@   r   )
fill_valuer   r   rA   )r   r   rn   rm   r   r   r	   r   r   store_prefix_embfullr   r   r   rp   )r    r   r   r   
gpt_inputsr   r   r   r  1  s   	zGPT.compute_embeddingsc                 K   s~   |  ||}| jj|f| j| j| j| j|jd  d|}d|v r2|jd d |jd d f |fS |d d |jd d f S )NrA   )bos_token_idpad_token_ideos_token_id
max_lengthreturn_dict_in_generater   )r  r   r  rp   rq   ru   r   	sequences)r    r   r   r  r  genr   r   r   r  G  s    zGPT.generatec                 K   s2   | j j|f| j| j| j| j|jd  dd|S )NrA   T)r  r  r  r  	do_stream)r   generate_streamrp   rq   ru   r   )r    fake_inputsr  r   r   r   get_generatorZ  s   zGPT.get_generator)rS   r   rT   rU   rT   rV   rW   rX   r   rY   r6   rZ   r[   r\   FFFr   Fr6   )TF)NNNFFNNN)F)NNNNFF)r1   r2   r3   r   r   r   r   r   r   r   r   r   r+   r  r  r  r   r4   r   r   r#   r   rR   X   sd    c

#
7
"
 <rR   )rD   mathr%   r	   torch.nnr   torch.nn.functional
functionalr   rB   r   !TTS.tts.layers.xtts.gpt_inferencer   "TTS.tts.layers.xtts.latent_encoderr   %TTS.tts.layers.xtts.perceiver_encoderr   r   Moduler   rQ   rR   r   r   r   r   <module>   s   -