o
    ´‹
j˜  ã                   @   sX   d dl Z d dl mZ d dlmZ G dd„ dejƒZG dd„ dejƒZG dd	„ d	eƒZdS )
é    N)Únn)ÚBaseEncoderc                       s&   e Zd Zd‡ fdd„	Zdd„ Z‡  ZS )ÚSELayeré   c                    sT   t t| ƒ ¡  t d¡| _t t ||| ¡tjddt || |¡t 	¡ ¡| _
d S )Né   T©Úinplace)Úsuperr   Ú__init__r   ÚAdaptiveAvgPool2dÚavg_poolÚ
SequentialÚLinearÚReLUÚSigmoidÚfc)ÚselfÚchannelÚ	reduction©Ú	__class__© úL/home/kuhnn/.local/lib/python3.10/site-packages/TTS/encoder/models/resnet.pyr
   	   s   

üzSELayer.__init__c                 C   s@   |  ¡ \}}}}|  |¡ ||¡}|  |¡ ||dd¡}|| S )Nr   )Úsizer   Úviewr   )r   ÚxÚbÚcÚ_Úyr   r   r   Úforward   s   zSELayer.forward)r   )Ú__name__Ú
__module__Ú__qualname__r
   r    Ú__classcell__r   r   r   r   r      s    
r   c                       s*   e Zd ZdZd‡ fdd„	Zdd„ Z‡  ZS )	ÚSEBasicBlockr   Nr   c                    s~   t t| ƒ ¡  tj||d|ddd| _t |¡| _tj||dddd| _t |¡| _	tj
dd| _t||ƒ| _|| _|| _d S )Né   r   F)Úkernel_sizeÚstrideÚpaddingÚbias)r'   r)   r*   Tr   )r	   r%   r
   r   ÚConv2dÚconv1ÚBatchNorm2dÚbn1Úconv2Úbn2r   Úrelur   ÚseÚ
downsampler(   )r   ÚinplanesÚplanesr(   r3   r   r   r   r   r
      s   
zSEBasicBlock.__init__c                 C   sj   |}|   |¡}|  |¡}|  |¡}|  |¡}|  |¡}|  |¡}| jd ur*|  |¡}||7 }|  |¡}|S ©N)r,   r1   r.   r/   r0   r2   r3   )r   r   ÚresidualÚoutr   r   r   r    (   s   








zSEBasicBlock.forward)r   Nr   )r!   r"   r#   Ú	expansionr
   r    r$   r   r   r   r   r%      s    r%   c                       s^   e Zd ZdZddg d¢g d¢ddddf‡ fd	d
„	Zdd„ Zddd„Zdd„ Zddd„Z‡  Z	S )ÚResNetSpeakerEncoderzÓImplementation of the model H/ASP without batch normalization in speaker embedding. This model was proposed in: https://arxiv.org/abs/2009.14153
    Adapted from: https://github.com/clovaai/voxceleb_trainer
    é@   i   )r&   é   é   r&   )é    r;   é€   é   ÚASPFNc	              
      s¸  t t| ƒ ¡  || _|| _|| _|| _|| _|| _t	j
d|d dddd| _t	jdd| _t	 |d ¡| _|d | _|  t|d |d ¡| _| jt|d |d dd| _| jt|d	 |d	 dd| _| jt|d |d dd| _t	 |¡| _| jr€|  |¡| _nd | _t| jd
 ƒ}	t	 t	j|d |	 dddt	 ¡ t	 d¡t	jd|d |	 ddt	jd	d¡| _ | jdkr½|d |	 }
n| jdkrË|d |	 d	 }
nt!dƒ‚t	 "|
|¡| _#|  $¡  d S )Nr   r   r&   )r'   r(   r)   Tr   )é   rB   )r(   rB   r   r?   )r'   ©ÚdimÚSAPrA   zUndefined encoder)%r	   r:   r
   Úencoder_typeÚ	input_dimÚ	log_inputÚuse_torch_specÚaudio_configÚproj_dimr   r+   r,   r   r1   r-   r.   r4   Úcreate_layerr%   Úlayer1Úlayer2Úlayer3Úlayer4ÚInstanceNorm1dÚinstancenormÚget_torch_mel_spectrogram_classÚ
torch_specÚintr   ÚConv1dÚBatchNorm1dÚSoftmaxÚ	attentionÚ
ValueErrorr   r   Ú_init_layers)r   rG   rK   ÚlayersÚnum_filtersrF   rH   rI   rJ   Úoutmap_sizeÚout_dimr   r   r   r
   A   sD   

û

zResNetSpeakerEncoder.__init__c                 C   s`   |   ¡ D ])}t|tjƒrtjj|jddd qt|tjƒr-tj |jd¡ tj |j	d¡ qd S )NÚfan_outr1   )ÚmodeÚnonlinearityr   r   )
ÚmodulesÚ
isinstancer   r+   ÚinitÚkaiming_normal_Úweightr-   Ú	constant_r*   )r   Úmr   r   r   r[   {   s   €ûz!ResNetSpeakerEncoder._init_layersr   c              	   C   sž   d }|dks| j ||j kr&t tj| j ||j d|ddt ||j ¡¡}g }| || j |||ƒ¡ ||j | _ td|ƒD ]}| || j |ƒ¡ q>tj|Ž S )Nr   F)r'   r(   r*   )r4   r9   r   r   r+   r-   ÚappendÚrange)r   Úblockr5   Úblocksr(   r3   r\   r   r   r   r   rL   ƒ   s   þ
z!ResNetSpeakerEncoder.create_layerc                 G   s    t  tj|Ž ¡}t j |¡ |S r6   )r   Ú	ParameterÚtorchÚFloatTensorre   Úxavier_normal_)r   r   r8   r   r   r   Únew_parameter”   s   z"ResNetSpeakerEncoder.new_parameterc                 C   sZ  |  d¡ | jr|  |¡}| jr|d  ¡ }|  |¡ d¡}|  |¡}|  |¡}|  	|¡}|  
|¡}|  |¡}|  |¡}|  |¡}| | ¡ d d| ¡ d ¡}|  |¡}| jdkrdtj|| dd}n,| jdkrtj|| dd}t tj|d | dd|d  jd	d
¡}t ||fd¡}| | ¡ d d¡}|  |¡}|r«tjjj|ddd}|S )a{  Forward pass of the model.

        Args:
            x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True`
                to compute the spectrogram on-the-fly.
            l2_norm (bool): Whether to L2-normalize the outputs.

        Shapes:
            - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
        r   gíµ ÷Æ°>r   éÿÿÿÿrE   rB   rC   rA   gñhãˆµøä>)Úmin)ÚprD   )Úsqueeze_rI   rT   rH   ÚlogrR   Ú	unsqueezer,   r1   r.   rM   rN   rO   rP   Úreshaper   rY   rF   ro   ÚsumÚsqrtÚclampÚcatr   r   r   Ú
functionalÚ	normalize)r   r   Úl2_normÚwÚmuÚsgr   r   r   r    ™   s4   











,
zResNetSpeakerEncoder.forward)r   )F)
r!   r"   r#   Ú__doc__r
   r[   rL   rr   r    r$   r   r   r   r   r:   ;   s    ÷:
r:   )ro   r   ÚTTS.encoder.models.base_encoderr   ÚModuler   r%   r:   r   r   r   r   Ú<module>   s    !