o
    iV                     @   s   d dl mZ d dlmZmZ d dlm	Z	m
Z
 g dZe
dd Ze
dd Ze
d	d
 Ze
dd ZeG dd dZe
dd Ze
dd Ze	dd Ze	dd Ze
dd Ze
dd Ze
dd Ze
dd ZdS )     )
_aggregate)_core	_standard)constexpr_functionjit)pack2unpack2packunpackfmaFloat2Tensorc                 C      t jdd| |gt jdddS )Nz'
        add.f32x2 $0, $1, $2;
        =l,l,lT   dtypeis_purer	   ttglinline_asm_elementwiseint64ab r   m/home/kuhnn/.local/lib/python3.10/site-packages/triton/experimental/gluon/language/nvidia/blackwell/float2.py
_add_f32x2      r   c                 C   r   )Nz'
        sub.f32x2 $0, $1, $2;
        r   Tr   r   r   r   r   r   r   
_sub_f32x2   r   r   c                 C   r   )Nz'
        mul.f32x2 $0, $1, $2;
        r   Tr   r   r   r   r   r   r   
_mul_f32x2+   r   r   c                 C   s   t jdd| ||gt jdddS )Nz.
        fma.rn.f32x2 $0, $1, $2, $3;
        z=l,l,l,lTr   r   r   r   r   cr   r   r   
_fma_f32x29   s   r"   c                   @   sd   e Zd ZU ejed< edejfddZedd Z	edd Z
edd	 Zed
ejfddZdS )r   valuec                 C   s
   || _ d S N)r#   )selfr#   r   r   r   __init__K   s   
zFloat2Tensor.__init__c                 C   $   t t|td tt| j|jS Nzrhs must be a Float2Tensor)r   static_assert
isinstancer   r   r#   r%   rhsr   r   r   __add__O      zFloat2Tensor.__add__c                 C   r'   r(   )r   r)   r*   r   r   r#   r+   r   r   r   __sub__T   r.   zFloat2Tensor.__sub__c                 C   r'   r(   )r   r)   r*   r   r   r#   r+   r   r   r   __mul__Y   r.   zFloat2Tensor.__mul__axisc                 C   s   t tj| j|tdS )N)r1   
combine_fn)r   r   reducer#   r   )r%   r1   r   r   r   sum^      zFloat2Tensor.sumN)__name__
__module____qualname__r   tensor__annotations__r   r&   r   r-   r/   r0   	constexprr4   r   r   r   r   r   G   s   
 



r   c                 C   s$   t jdd| |gt jddd}t|S )Nz)
        mov.b64 $0, { $1, $2 };
        z=l,r,rTr   r   )r   r   r   r   )x0x1r#   r   r   r   r   c   s   
r   c                 C   s"   t jdd| jgt jt jgdddS )Nz)
        mov.b64 { $0, $1 }, $2;
        z=r,=r,lTr   r   )r   r   r#   float32)xr   r   r   r   r   s   
r   c                 C   s   dd | D } | | dksJ d| | |  d  < |  |d d ttt| }|t|d  ||d  ||d < |t|d < t| t|fS )Nc                 S      g | ]}|qS r   r   .0dr   r   r   
<listcomp>       z$_get_split_shape.<locals>.<listcomp>   z'not enough elements to pack along axis r   )insertlistrangelenr   tupleshaper1   permuter   r   r   _get_split_shape   s   2rO   c                 C   sV   dd | D } | |  d9  < t tt| }||d t| t| t|fS )Nc                 S   r@   r   r   rA   r   r   r   rD      rE   z#_get_join_shape.<locals>.<listcomp>rF   r   )rH   rI   rJ   rG   r   rK   rL   r   r   r   _get_join_shape   s
   rP   c                 C   s6   t | j|}| j|d  j|d   \}}t||S )Nr   r   )rO   rM   reshaperN   splitr   )r?   r1   spr<   r=   r   r   r   r	      s    
r	   c                 C   s>   | j j}t||}t| \}}t||j|d  j|d  S )Nr   r   )r#   rM   rP   r   r   joinrN   rQ   )r?   r1   rM   rS   r<   r=   r   r   r   r
      s   
 r
   c                 C   s2   t |jt jkd tj| j|t jd}t||S )Nzfill_value must be a float32)r   )r   r)   r   r>   stdlib	full_liker#   r   )r?   
fill_valuefillr   r   r   rV      s   
rV   c                 C   s   t t| j|j|jS r$   )r   r"   r#   r    r   r   r   r      r5   r   N)triton.language.corer   	aggregate"triton.experimental.gluon.languager   r   r   rU   "triton.experimental.gluon._runtimer   r   __all__r   r   r   r"   r   r   r   rO   rP   r	   r
   rV   r   r   r   r   r   <module>   s<    












