o
    ³´„i  ã                   @   sT   d dl mZ d dlmZmZmZmZmZmZ g d¢Z	ed
dd„ƒZ
eddd	„ƒZdS )é    )Úbuiltin)Úasync_copy_global_to_sharedÚasync_copy_shared_to_globalÚ
store_waitÚtensor_descriptorÚtensor_descriptor_typeÚmake_tensor_descriptor)Úasync_gatherÚasync_scatterr   r   r   r   r   r   TNc                 C   s:   |  |¡}|  |¡}|j | j|j|j|j|j|j¡ dS )a,  
    Asynchronously gather elements from global memory to shared memory using TMA.

    Args:
        tensor_desc (tensor_descriptor): The tensor descriptor.
        x_offsets (tensor): 1D tensor of X offsets.
        y_offset (int): Scalar Y offset.
        barrier (shared_memory_descriptor): Barrier that will be signaled when the operation is complete.
        result (tensor_memory_descriptor): Result shared memory, must have NVMMASharedLayout.
        pred (bool): Scalar predicate. Operation is skipped if predicate is False. Defaults to True.
    N)Ú	to_tensorÚbuilderÚcreate_async_tma_gatherÚhandle)Útensor_descÚ	x_offsetsÚy_offsetÚbarrierÚresultÚpredÚ	_semantic© r   új/home/kuhnn/.local/lib/python3.10/site-packages/triton/experimental/gluon/language/nvidia/blackwell/tma.pyr	      s
   

ÿr	   c                 C   s(   |  |¡}|j | j|j|j|j¡ dS )aW  
    Asynchronously scatter elements from shared memory to global memory using TMA.

    Args:
        tensor_desc (tensor_descriptor): The tensor descriptor.
        x_offsets (tensor): 1D tensor of X offsets.
        y_offset (int): Scalar Y offset.
        src (tensor_memory_descriptor): The source data, must be in NVMMASharedLayout.
    N)r   r   Úcreate_async_tma_scatterr   )r   r   r   Úsrcr   r   r   r   r
   *   s   
r
   )TN)N)Ú(triton.experimental.gluon.language._corer   Ú4triton.experimental.gluon.language.nvidia.hopper.tmar   r   r   r   r   r   Ú__all__r	   r
   r   r   r   r   Ú<module>   s     	