o
    
j                     @   sr   d dl Z d dlm  mZ d dl mZ G dd dejZG dd dejZG dd dejZG d	d
 d
ejZ	dS )    N)nnc                       s,   e Zd ZdZd fdd	ZdddZ  ZS )GSTzfGlobal Style Token Module for factorizing prosody in speech.

    See https://arxiv.org/pdf/1803.09017Nc                    s*   t    t||| _t||||| _d S N)super__init__ReferenceEncoderencoderStyleTokenLayerstyle_token_layer)selfnum_mel	num_headsnum_style_tokensgst_embedding_dimembedded_speaker_dim	__class__ U/home/kuhnn/.local/lib/python3.10/site-packages/TTS/tts/layers/tacotron/gst_layers.pyr      s   
zGST.__init__c                 C   s2   |  |}|d urtj||gdd}| |}|S )Ndim)r   torchcatr
   )r   inputsspeaker_embeddingenc_outstyle_embedr   r   r   forward   s
   

zGST.forwardr   __name__
__module____qualname____doc__r   r   __classcell__r   r   r   r   r      s    r   c                       s4   e Zd ZdZ fddZdd Zedd Z  ZS )r   zNN module creating a fixed size prosody embedding from a spectrogram.

    inputs: mel spectrograms [batch_size, num_spec_frames, num_mel]
    outputs: [batch_size, embedding_dim]
    c                    s   t    || _dgg d  t d } fddt|D }t|| _tdd  dd  D | _| 	|ddd|}tj
 d | |d d	d
| _d S )N   )    r&   @   r'      r(   c              	      s,   g | ]}t j |  |d   ddddqS )r%   )   r)   )   r*   )r%   r%   )in_channelsout_channelskernel_sizestridepadding)r   Conv2d).0ifiltersr   r   
<listcomp>&   s    z-ReferenceEncoder.__init__.<locals>.<listcomp>c                 S   s   g | ]}t j|d qS ))num_features)r   BatchNorm2d)r1   filter_sizer   r   r   r5   -   s    r)   r*   r   T)
input_sizehidden_sizebatch_first)r   r   r   lenranger   
ModuleListconvsbnscalculate_post_conv_heightGRU
recurrence)r   r   embedding_dim
num_layersr?   post_conv_heightr   r3   r   r   !   s   

zReferenceEncoder.__init__c           	      C   s   | d}||dd| j}t| j| jD ]\}}||}||}t|}q|dd}| d}|	 ||d}| j
  | 
|\}}|dS )Nr   r%   r   r*   )sizeviewr   zipr?   r@   Frelu	transpose
contiguousrC   flatten_parameterssqueeze)	r   r   
batch_sizexconvbnpost_conv_width_outr   r   r   r   4   s   



zReferenceEncoder.forwardc                 C   s*   t |D ]}| | d|  | d } q| S )zAHeight of spec after n convolutions with fixed kernel/stride/pad.r*   r%   )r=   )heightr-   r.   padn_convsrU   r   r   r   rA   J   s   z+ReferenceEncoder.calculate_post_conv_height)	r    r!   r"   r#   r   r   staticmethodrA   r$   r   r   r   r   r      s    r   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )r	   z?NN Module attending to style tokens based on prosody encodings.Nc                    st   t    |d | _|r|  j|7  _|| | _tt|| j| _tj	j
| jddd t| j| j||d| _d S )Nr*   r         ?)meanstd)	query_dimkey_dim	num_unitsr   )r   r   r^   r_   r   	Parameterr   FloatTensorstyle_tokensinitnormal_MultiHeadAttention	attention)r   r   r   r   d_vector_dimr   r   r   r   U   s   


zStyleTokenLayer.__init__c                 C   s@   | d}|d}t| jd|dd}| ||}|S )Nr   r%   r   )rG   	unsqueezer   tanhrc   expandrg   )r   r   rP   prosody_encodingtokensr   r   r   r   r   d   s
   

zStyleTokenLayer.forwardr   r   r   r   r   r   r	   R   s    r	   c                       s(   e Zd ZdZ fddZdd Z  ZS )rf   z
    input:
        query --- [N, T_q, query_dim]
        key --- [N, T_k, key_dim]
    output:
        out --- [N, T_q, num_units]
    c                    sV   t    || _|| _|| _tj||dd| _tj||dd| _tj||dd| _	d S )NF)in_featuresout_featuresbias)
r   r   r`   r   r_   r   LinearW_queryW_keyW_value)r   r^   r_   r`   r   r   r   r   r   x   s   
zMultiHeadAttention.__init__c           	      C   s   |  |}| |}| |}| j| j }tjtj||dddd}tjtj||dddd}tjtj||dddd}t||	dd}|| j
d  }tj|dd}t||}tjtj|dddddd}|S )Nr*   r   r   r)   r[   r%   )rr   rs   rt   r`   r   r   stacksplitmatmulrL   r_   rJ   softmaxr   rO   )	r   querykeyquerieskeysvalues
split_sizescoresrV   r   r   r   r      s   


 zMultiHeadAttention.forwardr   r   r   r   r   rf   o   s    
rf   )
r   torch.nn.functionalr   
functionalrJ   Moduler   r   r	   rf   r   r   r   r   <module>   s    8