o
    
j26                     @   s   d dl mZmZ d dlmZmZmZ d dlZd dl	Z	d dl
mZ d dl	mZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZ d dlmZ d dlmZmZmZm Z  d dl!m"Z" d dl#m$Z$ eG dd deZ%G dd de"Z&dS )    )	dataclassfield)DictListTupleN)Coqpit)nn)weight_norm)remove_parametrizations)
DataLoader)DistributedSampler)get_optimizerget_scheduler)load_fsspec)WaveGradDataset)Conv1dDBlockFiLMUBlock)BaseVocoder)plot_resultsc                   @   s   e Zd ZU dZeed< dZeed< dZeed< dZ	eed< d	Z
eed
< edd dZee ed< edd dZee ed< edd dZee ed< edd dZeee  ed< dS )WavegradArgsP   in_channels   out_channelsFuse_weight_norm    y_conv_channelsi   x_conv_channelsc                   C      g dS )N)   r!          r$   r$   r$   N/home/kuhnn/.local/lib/python3.10/site-packages/TTS/vocoder/models/wavegrad.py<lambda>       zWavegradArgs.<lambda>)default_factorydblock_out_channelsc                   C   r    )N)r#   r#   r"   r!   r!   r$   r$   r$   r$   r%   r&      r'   ublock_out_channelsc                   C   r    )N)   r+   r+      r,   r$   r$   r$   r$   r%   r&      r'   upsample_factorsc                   C   s"   g dg dg dg dg dgS )N)r   r,   r   r,   )r   r,   r+      r$   r$   r$   r$   r%   r&       s   " upsample_dilationsN)__name__
__module____qualname__r   int__annotations__r   r   boolr   r   r   r)   r   r*   r-   r/   r$   r$   r$   r%   r      s   
 r   c                       s  e Zd ZdZdef fddZdd Zdd Ze	 d?d
dZ
dd Zdd Zdd Zdd Z	d@ddZdededeeef fddZdededdded edeeejf fd!d"Ze	 dedejdeeef fd#d$Zdededdded edd	fd%d&Zd?ded'd(fd)d*Zd+d, Zd-d. Zed/d0 Zededefd1d2Z deded3d4d5e!d6e"d7efd8d9Z#d:d; Z$edAd=d>Z%  Z&S )BWavegradu  🐸 🌊 WaveGrad 🌊 model.
    Paper - https://arxiv.org/abs/2009.00713

    Examples:
        Initializing the model.

        >>> from TTS.vocoder.configs import WavegradConfig
        >>> config = WavegradConfig()
        >>> model = Wavegrad(config)

    Paper Abstract:
        This paper introduces WaveGrad, a conditional model for waveform generation which estimates gradients of the
        data density. The model is built on prior work on score matching and diffusion probabilistic models. It starts
        from a Gaussian white noise signal and iteratively refines the signal via a gradient-based sampler conditioned
        on the mel-spectrogram. WaveGrad offers a natural way to trade inference speed for sample quality by adjusting
        the number of refinement steps, and bridges the gap between non-autoregressive and autoregressive models in
        terms of audio quality. We find that it can generate high fidelity audio samples using as few as six iterations.
        Experiments reveal WaveGrad to generate high fidelity audio, outperforming adversarial non-autoregressive
        baselines and matching a strong likelihood-based autoregressive baseline using fewer sequential operations.
        Audio samples are available at this https URL.
    configc                    s  t  | || _|jj| _t|jj| _d | _	d | _
d | _d | _d | _d | _d | _d | _td|jjddd| _tg | _|jj}t|jjt|jjD ]\}}| jt||| |}qNtg | _|jj}t|jjD ]}| jt|| |}qotg | _|jj }t|jj|jj|jj!D ]\}}}| jt"|||| |}qt|jj#|jj ddd| _$t||jj%ddd| _&|jjr| '  d S d S )Nr      r,   )padding   )(super__init__r7   model_paramsr   npprodr-   hop_lennoise_level	num_stepsbetaalpha	alpha_hatc1c2sigmar   r   y_convr   
ModuleListdblockszipr)   reversedappendr   filmr*   r   ublocksr   r/   r   r   x_convr   out_convapply_weight_norm)selfr7   icocdfufud	__class__r$   r%   r<   <   sJ   
zWavegrad.__init__c           	      C   s   g }|  |}|| jd || t| jdd  | jD ]\}}||}|||| q| |}t| jt|D ]\}\}}||||}q;| |}|S )Nr   r   )	rI   rN   rO   rL   rK   rQ   rP   rM   rR   )	rT   xspectrogramnoise_scaleshift_and_scalerO   layer
film_shift
film_scaler$   r$   r%   forwardj   s   


zWavegrad.forwardc                 C   s$   t j|dd d }| | d S )NT)allow_picklerC   )r>   loaditemcompute_noise_level)rT   pathrC   r$   r$   r%   load_noise_schedulez   s   zWavegrad.load_noise_scheduleNc                 C   s   |du rt |jd d| j|jd  }nt |dd}||}| j|}t	t
| jd ddD ]7}| j| || j| | |||| |jd    }|dkrgt |}|| j|d  | 7 }|dd q6|S )z^
        Shapes:
            x: :math:`[B, C , T]`
            y_n: :math:`[B, 1, T]`
        Nr   r   g            ?)torchrandnshaper@   FloatTensor	unsqueezetype_asrA   torangelenrD   rF   rG   rc   repeat
randn_likerH   clamp_)rT   r\   y_nsqrt_alpha_hatnzr$   r$   r%   	inference~   s   "
6
zWavegrad.inferencec                 C   s   | j || _ t|jdkr|d}td| jd |jd g}| j | | j |d  }}|t|jd |||   }|	d}t
|}|| d|d  d |  }|	d|	d|dddf fS )z+Compute noisy audio based on noise scheduler:   r   r   rk   r,         ?N)rA   rr   rt   rn   squeezerl   randintrB   randrp   rv   )rT   y_0sl_al_br^   noisenoisy_audior$   r$   r%   compute_y_n   s   
"

"zWavegrad.compute_y_nc                 C   s   t || _d| }t|}tjdg|d gdd}|d }t|tj| _	t|tj| _
t|tj| _t|tj| _d| j
d  | _d| j
 d| j d  | _d| jdd  d| jdd   | j	dd  d | _dS )z!Compute noise schedule parametersr   rk   r}   r   )axisNrj   )rt   rB   r>   cumprodconcatenaterl   tensorastypefloat32rC   rD   rE   rA   rF   rG   rH   )rT   rC   rD   rE   rA   r$   r$   r%   rg      s   

<zWavegrad.compute_noise_levelc              	   C   s  t | jD ]!\}}t| dkr&zt|d W q ty%   |  Y qw qt | jD ]!\}}t| dkrMzt|d W q, tyL   |  Y q,w q,t | jD ]!\}}t| dkrtzt|d W qS tys   |  Y qSw qSt| j	d t| j
d t| jd d S )Nr   weight)	enumeraterK   rt   
state_dictr
   
ValueErrorremove_weight_normrO   rP   rQ   rR   rI   rT   _r`   r$   r$   r%   r      s6   zWavegrad.remove_weight_normc                 C   s   t | jD ]\}}t| dkr|  qt | jD ]\}}t| dkr+|  qt | jD ]\}}t| dkrA|  q1t| j| _t| j	| _	t| j
| _
d S )Nr   )r   rK   rt   r   rS   rO   rP   r	   rQ   rR   rI   r   r$   r$   r%   rS      s   zWavegrad.apply_weight_normFc                 C   s   t |td|d}| |d  |r?|   | jrJ | jjjr%| 	  t
|d d |d d |d d }| | d S t
|d d |d d |d d }| | d S )	Ncpu)map_locationcachemodeltest_noise_schedulemin_valmax_valrB   train_noise_schedule)r   rl   deviceload_state_dictevaltrainingr7   r=   r   r   r>   linspacerg   )rT   r7   checkpoint_pathr   r   statebetasr$   r$   r%   load_checkpoint   s&   







zWavegrad.load_checkpointbatch	criterionreturnc           
      C   sH   |d }|d }|  |\}}}| |||}|||}	d|id|	ifS )Ninputwaveformmodel_outputloss)r   rc   )
rT   r   r   r\   yr   x_noisyr^   	noise_hatr   r$   r$   r%   
train_step   s   
zWavegrad.train_stepoutputsloggerLoggerassetsstepsc                 C      d S Nr$   rT   r   r   r   r   r   r$   r$   r%   	train_log     zWavegrad.train_logc                 C   s   |  ||S r   )r   )rT   r   r   r$   r$   r%   	eval_step	  s   zWavegrad.eval_stepc                 C   r   r   r$   r   r$   r$   r%   eval_log  r   zWavegrad.eval_logtest_loaderr   c                 C   s   |d }| j d }t|d |d |d }| | |jd}|D ]>}|d }	|	d d d d d f t|  j	}	|d }
|
d d d f }
| 
|	}t||
|d}|d d   }q#|d	|ifS )
Naudio_processorr   r   r   rB   r   r   testz
test/audio)r7   r>   r   rg   datasetload_test_samplesrr   next
parametersr   r|   r   r~   detachr   numpy)rT   r   r   r   apnoise_scheduler   samplessampler\   r   y_predfiguressample_voicer$   r$   r%   r     s   

&
zWavegrad.testc                 C   s   t | jj| jj| jj| S r   )r   r7   	optimizeroptimizer_paramslr)rT   r$   r$   r%   r   &  s   zWavegrad.get_optimizerc                 C   s   t | jj| jj|S r   )r   r7   lr_schedulerlr_scheduler_params)rT   r   r$   r$   r%   r   )  s   zWavegrad.get_schedulerc                   C   s
   t j S r   )rl   r   L1Lossr$   r$   r$   r%   get_criterion,  s   
zWavegrad.get_criterionc                 C   s&   | d | d }}| d}||dS )Nr   r   )r   r   )rp   )r   mr   r$   r$   r%   format_batch0  s   

zWavegrad.format_batchis_evalTr   verbosenum_gpusc                 C   s~   |d }t ||| jj|j| jj| jj| dd|j|d}|dkr$t|nd }	t|| jj	|dkd|	|r6| jj
n| jjdd}
|
S )Nr   TF)r   itemsseq_lenr@   	pad_shortconv_padis_trainingreturn_segmentsuse_noise_augment	use_cacher   r   )
batch_sizeshuffle	drop_lastsamplernum_workers
pin_memory)r   r7   r   
hop_lengthr   r   r   r   r   r   num_eval_loader_workersnum_loader_workers)rT   r7   r   r   r   r   r   r   r   r   loaderr$   r$   r%   get_data_loader7  s2   	zWavegrad.get_data_loaderc                 C   s2   | j d }t|d |d |d }| | d S )Nr   r   r   rB   )r7   r>   r   rg   )rT   trainerr   r   r$   r$   r%   on_epoch_startR  s   
zWavegrad.on_epoch_startWavegradConfigc                 C   s   t | S r   )r6   )r7   r$   r$   r%   init_from_configW  s   zWavegrad.init_from_configr   )FF)r7   r   )'r0   r1   r2   __doc__r   r<   rc   ri   rl   no_gradr|   r   rg   r   rS   r   r   r   r   r3   r>   ndarrayr   r   Moduler   r   r   r   r   staticmethodr   r   r   r5   r   r   r   __classcell__r$   r$   rZ   r%   r6   $   sj    .

"

"r6   )'dataclassesr   r   typingr   r   r   r   r>   rl   coqpitr   r   torch.nn.utils.parametrizationsr	   torch.nn.utils.parametrizer
   torch.utils.datar   torch.utils.data.distributedr   trainer.trainer_utilsr   r   TTS.utils.ior   TTS.vocoder.datasetsr   TTS.vocoder.layers.wavegradr   r   r   r   TTS.vocoder.models.base_vocoderr   TTS.vocoder.utils.generic_utilsr   r   r6   r$   r$   r$   r%   <module>   s&    