o
    ´‹
jà  ã                   @   sr   d dl Z d dlm  mZ d dl mZ G dd„ dejƒZG dd„ dejƒZG dd„ dejƒZG d	d
„ d
ejƒZ	dS )é    N)Únnc                       sH   e Zd Zd‡ fdd„	Zdd„ Zdd	„ Zd
d„ Zdd„ Zddd„Z‡  Z	S )ÚGE2ELossç      $@ç      ÀÚsoftmaxc                    sv   t ƒ  ¡  t t |¡¡| _t t |¡¡| _|| _t	dƒ | jdv s%J ‚| jdkr.| j
| _| jdkr9| j| _dS dS )a  
        Implementation of the Generalized End-to-End loss defined in https://arxiv.org/abs/1710.10467 [1]
        Accepts an input of size (N, M, D)
            where N is the number of speakers in the batch,
            M is the number of utterances per speaker,
            and D is the dimensionality of the embedding vector (e.g. d-vector)
        Args:
            - init_w (float): defines the initial value of w in Equation (5) of [1]
            - init_b (float): definies the initial value of b in Equation (5) of [1]
        z* > Initialized Generalized End-to-End loss)r   Úcontrastr   r   N)ÚsuperÚ__init__r   Ú	ParameterÚtorchÚtensorÚwÚbÚloss_methodÚprintÚembed_loss_softmaxÚ
embed_lossÚembed_loss_contrast)ÚselfÚinit_wÚinit_br   ©Ú	__class__© úE/home/kuhnn/.local/lib/python3.10/site-packages/TTS/encoder/losses.pyr	      s   


ÿzGE2ELoss.__init__c           	      C   st   t  ||d|…f |||d d…f f¡}t  |d¡}g }t|ƒD ]\}}||kr/| |¡ q!| |¡ q!t  |¡S )zP
        Calculates the new centroids excluding the reference utterance
        Né   r   )r   ÚcatÚmeanÚ	enumerateÚappendÚstack)	r   ÚdvecsÚ	centroidsÚspkrÚuttÚexclÚnew_centroidsÚiÚcentroidr   r   r   Úcalc_new_centroids$   s   *
zGE2ELoss.calc_new_centroidsc           
      C   s¦   g }t |ƒD ]G\}}g }t |ƒD ]0\}}|  ||||¡}	| t t | d¡ dd¡|	 dd¡¡t |¡tj|	dd  d¡¡ qtj	|dd}| |¡ qt 
|¡S )zE
        Make the cosine similarity matrix with dims (N,M,N)
        r   r   ©Údimçíµ ÷Æ°>)r   r)   r   r   ÚclampÚmmÚ	unsqueezeÚ	transposeÚnormr   r    )
r   r!   r"   Úcos_sim_matrixÚspkr_idxÚspeakerÚcs_rowÚutt_idxÚ	utterancer&   r   r   r   Úcalc_cosine_sim2   s&   
þüúÿ

zGE2ELoss.calc_cosine_simc           
   	   C   sn   |j \}}}g }t|ƒD ]%}g }t|ƒD ]}	| t |||	f d¡|  ¡ qt |¡}| |¡ qt |¡S )zU
        Calculates the loss on each embedding $L(e_{ji})$ by taking softmax
        r   )ÚshapeÚranger   ÚFÚlog_softmaxr   r    )
r   r!   r2   ÚNÚMÚ_ÚLÚjÚL_rowr'   r   r   r   r   K   s   "

zGE2ELoss.embed_loss_softmaxc                 C   sª   |j \}}}g }t|ƒD ]C}g }t|ƒD ]0}	t |||	f ¡}
t |
d|… |
|d d… f¡}| dt |||	|f ¡ t |¡ ¡ qt |¡}| |¡ qt |¡S )zj
        Calculates the loss on each embedding $L(e_{ji})$ by contrast loss with closest centroid
        Nr   g      ð?)r9   r:   r   Úsigmoidr   r   Úmaxr    )r   r!   r2   r=   r>   r?   r@   rA   rB   r'   Úcentroids_sigmoidsÚexcl_centroids_sigmoidsr   r   r   r   Z   s   "*

zGE2ELoss.embed_loss_contrastNc                 C   s^   |  ¡ d dks
J ‚t |d¡}|  ||¡}t | jd¡ | j| | j }|  ||¡}| ¡ S )zv
        Calculates the GE2E loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
        r   é   r,   )Úsizer   r   r8   r-   r   r   r   )r   ÚxÚ_labelr"   r2   r@   r   r   r   Úforwardj   s   zGE2ELoss.forward)r   r   r   ©N)
Ú__name__Ú
__module__Ú__qualname__r	   r)   r8   r   r   rK   Ú__classcell__r   r   r   r   r      s    r   c                       ó,   e Zd ZdZd	‡ fdd„	Zd
dd„Z‡  ZS )ÚAngleProtoLossaÌ  
    Implementation of the Angular Prototypical loss defined in https://arxiv.org/abs/2003.11982
        Accepts an input of size (N, M, D)
            where N is the number of speakers in the batch,
            M is the number of utterances per speaker,
            and D is the dimensionality of the embedding vector
        Args:
            - init_w (float): defines the initial value of w
            - init_b (float): definies the initial value of b
    r   r   c                    sF   t ƒ  ¡  t t |¡¡| _t t |¡¡| _tj ¡ | _	t
dƒ d S )Nz( > Initialized Angular Prototypical loss)r   r	   r   r
   r   r   r   r   ÚCrossEntropyLossÚ	criterionr   )r   r   r   r   r   r   r	   †   s
   
zAngleProtoLoss.__init__Nc           	      C   sÌ   |  ¡ d dks
J ‚t |dd…dd…dd…f d¡}|dd…ddd…f }|  ¡ d }t | d¡ dd|¡| d¡ dd|¡ dd¡¡}t | j	d¡ || j	 | j
 }t |¡ |j¡}|  ||¡}|S )z|
        Calculates the AngleProto loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
        r   rG   Nr   éÿÿÿÿr,   )rH   r   r   r;   Úcosine_similarityr/   Úexpandr0   r-   r   r   ÚarangeÚtoÚdevicerT   )	r   rI   rJ   Ú
out_anchorÚout_positiveÚnum_speakersr2   Úlabelr@   r   r   r   rK      s   "þzAngleProtoLoss.forward©r   r   rL   ©rM   rN   rO   Ú__doc__r	   rK   rP   r   r   r   r   rR   z   s    
rR   c                       s2   e Zd ZdZ‡ fdd„Zd	dd„Zdd„ Z‡  ZS )
ÚSoftmaxLosszÙ
    Implementation of the Softmax loss as defined in https://arxiv.org/abs/2003.11982
        Args:
            - embedding_dim (float): speaker embedding dim
            - n_speakers (float): number of speakers
    c                    s0   t ƒ  ¡  tj ¡ | _t ||¡| _tdƒ d S )NzInitialised Softmax Loss)	r   r	   r   r   rS   rT   ÚLinearÚfcr   )r   Úembedding_dimÚ
n_speakersr   r   r   r	   ®   s   
zSoftmaxLoss.__init__Nc                 C   s8   |  d| ¡ d ¡}|  d¡}|  |¡}|  ||¡}|S )NrU   )ÚreshaperH   rd   rT   )r   rI   r^   r@   r   r   r   rK   ¶   s
   

zSoftmaxLoss.forwardc                 C   s0   |   |¡}tjjj|dd d¡}t |¡}|S )Nr   r*   r   )rd   r   r   Ú
functionalr   ÚsqueezeÚargmax)r   Ú	embeddingrI   ÚactivationsÚclass_idr   r   r   Ú	inferenceÀ   s   

zSoftmaxLoss.inferencerL   )rM   rN   rO   ra   r	   rK   rn   rP   r   r   r   r   rb   ¦   s
    

rb   c                       rQ   )ÚSoftmaxAngleProtoLossaf  
    Implementation of the Softmax AnglePrototypical loss as defined in https://arxiv.org/abs/2009.14153
        Args:
            - embedding_dim (float): speaker embedding dim
            - n_speakers (float): number of speakers
            - init_w (float): defines the initial value of w
            - init_b (float): definies the initial value of b
    r   r   c                    s.   t ƒ  ¡  t||ƒ| _t||ƒ| _tdƒ d S )Nz)Initialised SoftmaxAnglePrototypical Loss)r   r	   rb   r   rR   Ú
angleprotor   )r   re   rf   r   r   r   r   r   r	   Ñ   s   
zSoftmaxAngleProtoLoss.__init__Nc                 C   s   |   |¡}|  ||¡}|| S )zŠ
        Calculates the SoftmaxAnglePrototypical loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
        )rp   r   )r   rI   r^   ÚLpÚLsr   r   r   rK   Ù   s   
zSoftmaxAngleProtoLoss.forwardr_   rL   r`   r   r   r   r   ro   Ç   s    	ro   )
r   Útorch.nn.functionalr   rh   r;   ÚModuler   rR   rb   ro   r   r   r   r   Ú<module>   s    s,!