o
    
j;                     @   sj   d dl Z d dlmZ d dlmZmZmZmZmZ G dd dej	Z
G dd dej	ZG dd dej	ZdS )	    N)AttentionBlock
DownsampleUpsamplenormalizationzero_modulec                       s6   e Zd Z								d
 fdd	Zdd	 Z  ZS )ResBlockNF      Tc                    sR  t    || _|| _|p|| _|| _|| _|
| _|	dkrdnd}t	t
|t tj|| j|	|d| _|p8|| _|rKt|d|| _t|d|| _n|r\t|d|| _t|d|| _nt  | _| _t	t
| jt tj|dttj| j| j|	|d| _| j|krt | _d S |rtj||| j|	|d| _d S t||| jd| _d S )Nr	      r   paddingF)p)super__init__channelsdropoutout_channelsuse_convuse_scale_shift_normdo_checkpointnn
Sequentialr   SiLUConv1d	in_layersupdownr   h_updx_updr   IdentityDropoutr   
out_layersskip_connection)selfr   r   r   r   r   dimsupdownkernel_sizer   r   	__class__ U/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/layers/tortoise/classifier.pyr      s@   




zResBlock.__init__c                 C   sh   | j r#| jd d | jd }}||}| |}| |}||}n| |}| |}| || S )N)r   r   r   r   r    r!   )r"   xin_restin_convhr)   r)   r*   forward=   s   




zResBlock.forward)NFFr   FFr	   T__name__
__module____qualname__r   r0   __classcell__r)   r)   r'   r*   r      s    5r   c                       s6   e Zd Z								d
 fdd	Zdd	 Z  ZS )AudioMiniEncoder   r      r   r	   c              
      s   t    ttj||ddd| _|}g }|| _t|D ]%}t|D ]}|t	||d|
d q#|t
|d|d |	d |d9 }qtj| | _tt|t t||d| _g }t|D ]}|t||dd	 q`tj| | _|| _d S )
Nr	   r
   r   F)r   r&   Tr   )r   r   factor)r   )r   r   r   r   r   initlayersrangeappendr   r   resr   r   finalr   attndim)r"   spec_dimembedding_dimbase_channelsdepthresnet_blocksattn_blocksnum_attn_headsr   downsample_factorr&   chr>   lrr@   ar'   r)   r*   r   K   s"   

"
zAudioMiniEncoder.__init__c                 C   sH   |  |}| |}| |}| jD ]}||}q|d d d d df S )Nr   )r:   r>   r?   r@   )r"   r,   r/   blkr)   r)   r*   r0   j   s   




zAudioMiniEncoder.forward)r7   r   r   r8   r8   r   r   r	   r1   r)   r)   r'   r*   r6   J   s    r6   c                       s(   e Zd Zd fdd	ZdddZ  ZS )	"AudioMiniEncoderWithClassifierHeadTc                    s<   t    tdi || _t| jj|| _|| _|| _	d S )Nr)   )
r   r   r6   encr   LinearrA   headnum_classesdistribute_zero_label)r"   classesrT   kwargsr'   r)   r*   r   t   s
   

z+AudioMiniEncoderWithClassifierHead.__init__Nc           	      C   s   |  |}| |}|d u r|S | jrBtjj|| jd}|dkd}tj	|tj
d| jd  d}d|d d df< || }|| }n|}tj||}|S )N)rS   r   r+   g?r
   )dtype
fill_valuegɿ)rP   rR   rT   r   
functionalone_hotrS   	unsqueezetorch	full_likefloatcross_entropy)	r"   r,   labelsr/   logits	oh_labelszeros_indiceszero_extra_masslossr)   r)   r*   r0   {   s$   


z*AudioMiniEncoderWithClassifierHead.forward)T)Nr1   r)   r)   r'   r*   rO   s   s    rO   )r\   torch.nnr   "TTS.tts.layers.tortoise.arch_utilsr   r   r   r   r   Moduler   r6   rO   r)   r)   r)   r*   <module>   s    C)