o
    
j?                     @   sB  d dl Z d dlZd dlmZ d dlZd dlmZ d dlm  mZ	 d dlm
Z
 d dlmZmZ dd Zdd Zd"d
dZG dd dejZG dd dejeZG dd deZG dd deZG dd dejZedkredddZedddZed ddZedddZeddgZ eddd d!Z!e!ee eeZ"dS dS )#    N)abstractmethod)autocast)AttentionBlocknormalizationc                 C      | j tjkS N)dtypetorchfloatt r   \/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/layers/tortoise/diffusion_decoder.py	is_latent      r   c                 C   r   r   )r   r	   longr   r   r   r   is_sequence   r   r   '  c                 C   s   |d }t t| t jd|t jd | j| jd}| dddf  |d  }t j	t 
|t |gdd}|d rRt j	|t |ddddf gdd}|S )	aY  
    Create sinusoidal timestep embeddings.

    :param timesteps: a 1-D Tensor of N indices, one per batch element.
                      These may be fractional.
    :param dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :return: an [N x dim] Tensor of positional embeddings.
       r   )startendr   deviceNdim   )r	   expmathlogarangefloat32tor   r
   catcossin
zeros_like)	timestepsr   
max_periodhalffreqsargs	embeddingr   r   r   timestep_embedding   s   
((r-   c                   @   s   e Zd Zedd ZdS )TimestepBlockc                 C   s   dS )zJ
        Apply the module to `x` given `emb` timestep embeddings.
        Nr   )selfxembr   r   r   forward+   s    zTimestepBlock.forwardN)__name__
__module____qualname__r   r2   r   r   r   r   r.   *   s    r.   c                   @   s   e Zd Zdd ZdS )TimestepEmbedSequentialc                 C   s,   | D ]}t |tr|||}q||}q|S r   )
isinstancer.   )r/   r0   r1   layerr   r   r   r2   3   s
   

zTimestepEmbedSequential.forwardN)r3   r4   r5   r2   r   r   r   r   r6   2   s    r6   c                       s0   e Zd Z					d
 fdd	Zdd	 Z  ZS )ResBlockNr      TFc	                    s  t    || _|| _|| _|p|| _|| _dddd| }	|r"dnd}
|r(dnd}tt	|t
 tj|| j|
|d| _tt
 t||rMd| j n| j| _tt	| jt
 tj|dtj| j| j||	d| _| j|krzt | _d S tj|| j|
|d| _d S )Nr   r   r   )r   r:      r:   padding)p)super__init__channelsemb_channelsdropoutout_channelsuse_scale_shift_normnn
Sequentialr   SiLUConv1d	in_layersLinear
emb_layersDropout
out_layersIdentityskip_connection)r/   rA   rB   rC   rD   dimskernel_sizeefficient_configrE   r=   
eff_kerneleff_padding	__class__r   r   r@   =   s<   



zResBlock.__init__c           	      C   s   |  |}| ||j}t|jt|jk r&|d }t|jt|jk s| jrO| jd | jdd  }}tj	|ddd\}}||d|  | }||}n	|| }| |}| 
|| S )N).Nr   r   r   r   )rJ   rL   typer   lenshaperE   rN   r	   chunkrP   )	r/   r0   r1   hemb_outout_normout_restscaleshiftr   r   r   r2   k   s   


zResBlock.forward)Nr   r:   TFr3   r4   r5   r@   r2   __classcell__r   r   rV   r   r9   <   s    .r9   c                       s$   e Zd Z fddZdd Z  ZS )DiffusionLayerc                    s4   t    t||||ddd| _t||dd| _d S )Nr   TrQ   rE   relative_pos_embeddings)r?   r@   r9   resblkr   attn)r/   model_channelsrC   	num_headsrV   r   r   r@   |   s   
zDiffusionLayer.__init__c                 C   s   |  ||}| |S r   )rh   ri   )r/   r0   time_embyr   r   r   r2      s   
zDiffusionLayer.forwardrb   r   r   rV   r   rd   {   s    rd   c                       s`   e Zd Z													d fd
d	Zdd Zdd Zdd Z					dddZ  ZS )DiffusionTts      d         r   F   皙?c                    sX  t    || _| _|| _ | _| _|| _|| _|
| _	t
|ddd| _t
t
t
 t
| _t
|| _t
tddtddtdd| _t| _t
t
j|dddtddtddtddtdd| _t
t
j|ddddt
jd ddddtd ddd	td ddd	td ddd	td ddd	td ddd	| _t
tdd| _tt t t | _t
jd dd
| _ t
j|ddd| _!t
" fddt#|D  fddt#dD  | _$t
tt
 t
j|ddd| _%d S )Nr:   r   Trf   r<   r   )r=   strideF)rg   do_checkpoint)rR   )rR   r=   c                    s   g | ]}t  qS r   )rd   .0_rC   rj   rk   r   r   
<listcomp>   s    z)DiffusionTts.__init__.<locals>.<listcomp>c              	      s   g | ]}t  d ddqS )r   Tre   )r9   rx   )rC   rj   r   r   r|      s    )&r?   r@   in_channelsrj   rD   rC   rk   unconditioned_percentageenable_fp16
layer_droprF   rI   	inp_blockrG   rK   rH   
time_embed	Embeddingcode_embeddingr   code_converterr   	code_normlatent_conditionercontextual_embedder	Parameterr	   randnunconditioned_embeddingr6   rd    conditioning_timestep_integratorintegrating_convmel_head
ModuleListrangelayersout)r/   rj   
num_layersr}   in_latent_channels	in_tokensrD   rC   use_fp16rk   r   r~   rV   r{   r   r@      s   




"



zDiffusionTts.__init__c                 C   s~   t | j t | j t | j t | j  t | j  t | j  t | j t | j  t | j	 d}|S )N)	minicoderr   code_converterstimestep_integratorr   )
listr   
parametersr   r   r   r   r   r   r   )r/   groupsr   r   r   get_grad_norm_parameter_groups  s   z+DiffusionTts.get_grad_norm_parameter_groupsc                 C   sn   t |jdkr|dn|}g }t|jd D ]}|| |d d |f  qtj|dd}|jdd}|S )Nr:   r   r   r   )	rY   rZ   	unsqueezer   appendr   r	   r#   mean)r/   conditioning_inputspeech_conditioning_inputcondsjr   r   r   get_conditioning  s   zDiffusionTts.get_conditioningc                 C   s"  t |r|ddd}tj|ddd\}}t |r| |}n| |ddd}| |}| |d|d  |d }tj	|j
d ddf|jd}| jrv| jdkrvtj|j
d ddf|jd| jk }t|| j|j
d dd|}tj||dd}	|s|	S | |	}
|
|  }
|	|
fS )	Nr   r   r   r   r   r   nearest)sizemode)r   permuter	   r[   r   r   r   r   r   zerosrZ   r   trainingr~   randwherer   repeatFinterpolater   logical_not)r/   aligned_conditioningconditioning_latentexpected_seq_lenreturn_code_pred
cond_scale
cond_shiftcode_embunconditioned_batchesexpanded_code_embmel_predr   r   r   timestep_independent  s.   
" 
z!DiffusionTts.timestep_independentNc              	   C   s.  |dus|dur|dusJ |r|durJ g }|rD| j |jd d|jd }	|t| j t| j   |t| j  n:|durK|}	n-| 	|||jd d\}	}
t
|rn|t| j t| j   n
|t| j  || j  | t|| j}| |	|}	| |}tj||	gdd}| |}t| jD ]L\}}| jr| jdkr|dkr|t| jd krt | jk r|t|  qt|jj| jo|dkd |||}W d   n1 sw   Y  q| }| |}d}|D ]}||   }q||d  }|r||
fS |S )a  
        Apply the model to an input batch.

        :param x: an [N x C x ...] Tensor of inputs.
        :param timesteps: a 1-D batch of timesteps.
        :param aligned_conditioning: an aligned latent or sequence of tokens providing useful data about the sample to be produced.
        :param conditioning_latent: a pre-computed conditioning latent; see get_conditioning().
        :param precomputed_aligned_embeddings: Embeddings returned from self.timestep_independent()
        :param conditioning_free: When set, all conditioning inputs (including tokens and conditioning_input) will not be considered.
        :return: an [N x C x ...] Tensor of outputs.
        Nr   r   r   Tr   )enabled)!r   r   rZ   extendr   r   r   r   r   r   r   r   r   r-   rj   r   r   r	   r#   r   	enumerater   r   r   rY   randomr   r   rX   r   r
   r   r   )r/   r0   r'   r   r   precomputed_aligned_embeddingsconditioning_freer   unused_paramsr   r   rl   ilyrr   extraneous_additionr>   r   r   r   r2   B  s^   "



zDiffusionTts.forward)ro   rp   rq   ro   rr   rs   r   Frt   ru   ru   )NNNFF)	r3   r4   r5   r@   r   r   r   r2   rc   r   r   rV   r   rn      s,    s,rn   __main__r   rq   i  i  ro   i    )r   rq   iX  g333333?g      ?)r   r~   )r   )#r   r   abcr   r	   torch.nnrF   torch.nn.functional
functionalr   r   "TTS.tts.layers.tortoise.arch_utilsr   r   r   r   r-   Moduler.   rG   r6   r9   rd   rn   r3   r   clipaligned_latentrandintaligned_sequencecond
LongTensortsmodelor   r   r   r   <module>   s6    

?  
