
    9j              	       N   d dl Z d dlmZ d dlmZ d dlmZ d dlZd dlm	Z	m
Z
 d dlmZmZmZmZmZmZmZmZmZmZmZmZmZ g dZ edd	      Z G d
 dej6                        Zdej:                  fdej6                  dededefdZ  G d de      Z! G d de      Z"da#d Z$da%d Z&y)    N)
namedtuple)Callable)Any))sparse_semi_structured_from_dense_cutlass'sparse_semi_structured_to_dense_cutlass)fallback_dispatchersemi_sparse_addmmsemi_sparse_clonesemi_sparse_detachsemi_sparse_indicessemi_sparse_linearsemi_sparse_mmsemi_sparse_scaled_mmsemi_sparse_tsemi_sparse_tosemi_sparse_to_copysemi_sparse_valuessemi_sparse_view)SparseSemiStructuredTensor!SparseSemiStructuredTensorCUTLASS$SparseSemiStructuredTensorCUSPARSELTto_sparse_semi_structured_SEMI_STRUCTURED_SPARSE_CONFIGz=sparse_min_rows sparse_min_cols dense_min_rows dense_min_colsc                      e Zd ZU dZdZeed<   eej                  e
f   ed<   dZeed<   dZeed<   dZeed<   eed	<   eeef   ed
<   ej"                  dz  ed<   ej"                  dz  ed<   ej"                  dz  ed<   ej"                  dz  ed<   ej"                  dz  ed<   eed<   eed<   g dZe	 	 	 d'dej(                  dej"                  dz  dej"                  dz  dej"                  dz  dej"                  dz  dej"                  dz  dededefd       ZdefdZdeee   eej(                  eeef   f   fdZedeej(                  eeef   dej"                  fd       Zej8                  j:                  Zedefd       Z ed(d)d       Z!edej"                  ddfd       Z"d  Z#eefdej"                  d!edd fd"       Z$dd#d$ej"                  d%ej"                  dz  dej"                  fd&Z%y)*r   a  
    This class implements semi-structured sparsity as a Tensor subclass.

    Semi-structured sparsity describes a sparsity pattern where n in every 2n elements are sparse,
    depending on the datatype. It is also referred to as 2:4 sparsity or fine-grained
    structured sparsity.

    There are two backends available for semi_structred sparsity, either cuSPARSELt or CUTLASS.
    This class is meant to serve as a base class for both implementations. SparseSemiStructuredCUTLASS
    and SparseSemiStructuredCUSPARSELT both inherit from this class and define three backend-specific items.
    Note that as such, this class cannot be instantiated directly.

    -`_DTYPE_SHAPE_CONSTRAINTS` - A dictionary holding backend specific dense/sparse min shape constraints
    - `def from_dense()` - backend specific compression routines
    - `def _mm()` - backend specific mm op (either torch._cslt_sparse_mm or torch._sparse_semi_structured_(mm|addmm))
    r   _DEFAULT_ALG_ID_DTYPE_SHAPE_CONSTRAINTSF_FORCE_CUTLASS_FUSE_TRANSPOSE_PROTOTYPE_WARNING_SHOWNBACKENDSPARSE_DISPATCHNpackedmetapacked_tmeta_tcompressed_swizzled_bitmaskfuse_transpose_cusparseltalg_id_cusparselt)r"   r#   r$   r%   r&   shaperequires_gradc
                    | j                   sRt        j                  dt        d       d| _         | j	                          t
        j                  j                  |        ||}
n||}
nt        d      t
        j                  j                  | ||
j                  |
j                  |
j                  |	      }||_        ||_        ||_        ||_        ||_        ||_        ||_        |S )a0  
        Create a new instance of the tensor subclass from the compressed sparse representation.

        We have the option to create the subclass with the compressed representations of both X and X', for training.
        For inference, we only need a single representation (either X or X'), while the corresponding other set will be None.

        Depending on the backend selected, certain fields will be set to None. (CUSPARSELT vs CUTLASS)

        Args:
            shape: The shape of the original dense tensor
            packed: The compressed representation of the original dense tensor
            meta: The metadata of the original dense tensor, if it is stored separately
            packed_t: The compressed representation of the transposed original dense tensor
            meta_t: The metadata of the transposed original dense tensor, if it is stored separately
            compressed_swizzled_bitmask: The masks used by the CUTLASS backend to determine which threads should
                                         participate in the computation. Used for pointwise ops.
            fuse_transpose_cusparselt: When running with cuSPARSELt, we have the option to fuse a transposition
                                       with a matmul, which is useful in the case of 2:4 sparse training.
            alg_id_cusparselt: The algorithm id to use when using cuSPARSELT, will have effect on performance

        Returns:
            torch.Tensor: A torch.Tensor wrapper subclass.

        Raises:
            ValueError: If all of the tensor arguments are None.
        zThe PyTorch API of SparseSemiStructuredTensor is in prototype stage and will change in the near future. Please open a Github issue for features requests and see our documentation on the torch.sparse module for further information about the project.   
stacklevelTz3At least one of packed or packed_t must be provided)devicedtypelayoutr*   )r   warningswarnUserWarning_load_dispatch_tabletorch_dynamoallow_in_graph
ValueErrorTensor_make_wrapper_subclassr/   r0   r1   r"   r#   r$   r%   r&   r'   r(   )clsr)   r"   r#   r$   r%   r&   r'   r(   r*   previous_tensortensors               \/media/conek/DATA/Code/OCR/venv/lib/python3.12/site-packages/torch/sparse/semi_structured.py__new__z"SparseSemiStructuredTensor.__new__O   s    N ++MMH
 	 ,0C(
 $$& MM((-$O!&ORSS44"))!''"))' 5 
 "-H*+D(#4     returnc                 |    t        | d      st        d      | j                  j                   d| j                   dS )Nr)   ztensor has no shape attributez(shape=))hasattrAssertionError	__class____name__r)   )selfs    r?   __repr__z#SparseSemiStructuredTensor.__repr__   s:    tW% !@AA..))*'$**Q??rA   c                      t        t         fd j                              } j                   j                   j
                   j                  f}||fS )Nc                      t        |       d uS N)getattr)xrI   s    r?   <lambda>z?SparseSemiStructuredTensor.__tensor_flatten__.<locals>.<lambda>   s    WT1-T9 rA   )listfilter	__slots__r)   r'   r(   r*   )rI   inner_tensorstensor_metas   `  r?   __tensor_flatten__z-SparseSemiStructuredTensor.__tensor_flatten__   sV     94>>J
 JJ**""	
 k))rA   rU   c                     |\  }}}} | ||j                  dd       |j                  dd       |j                  dd       |j                  dd       |j                  dd       |||	      S )Nr"   r#   r$   r%   r&   	r)   r"   r#   r$   r%   r&   r'   r(   r*   )get)	r<   rT   rU   
outer_sizeouter_strider)   r'   r(   r*   s	            r?   __tensor_unflatten__z/SparseSemiStructuredTensor.__tensor_unflatten__   s     NYJ(*;] $$Xt4""640"&&z48 $$Xt4(5(9(9-t) '@/'
 	
rA   c                     |j                   | j                  vr%t        | j                   d|j                   d       | j                  |j                      ||||      S )NzI only supports a specific set of operations, can't perform requested op (rD   )_overloadpacketr!   NotImplementedErrorrH   )r<   functypesargskwargss        r?   __torch_dispatch__z-SparseSemiStructuredTensor.__torch_dispatch__   sh    s':'::%<<. !//3}}oQ@  9s""4#7#78udFSSrA   c                    t        | dd      ~t        j                  j                  j                  t
        t        j                  j                  j                  t        t        j                  j                  j                  t        t        j                  j                  j                  t        t        j                  j                  j                  t        t        j                  j                  j                  t        t        j                  j                  j                  t         t        j                  j                  j"                  t$        t        j                  j                  j&                  t$        t        j                  j                  j(                  t*        t        j                  j                  j,                  t.        t        j                  j                  j0                  t2        t        j                  j                  j4                  t6        t        j                  j                  j8                  t:        t        j                  j                  j<                  t>        i| _         || j@                  jC                  |       yyy)zT
        Loads the op overload sparse dispatch table for the current class.
        r!   N)"rN   r6   opsatenvaluesr   indicesr   is_same_sizer   detach_detachr   tr   viewr   mmr   matmuladdmmr	   linearr   _to_copyr   
_scaled_mmr   cloner
   tor   r!   update)r<   custom_dispatch_tables     r?   r5   z/SparseSemiStructuredTensor._load_dispatch_table   sm   
 3)408		%%'9		&&(;		++-@		&&(;		%%'9		  -		##%5		!!>		%%~		$$&7		%%'9		'')<		))+@		$$&7		!!>#C" %0##**+@A 1% 9rA   original_tensorc           	      \   |j                   st        d|j                   d      |j                         dk7  rt        d|j                          d      |j	                         st        d      |j
                  | j                  vrt        d|j
                   d|  d	      |j                  \  }}| j                  |j
                     j                  }| j                  |j
                     j                  }||k  s||z  s
||k  s||z  rt        d
|j                   d| d| d      y)z_
        Assert that the given tensor is valid for semi-structured sparse compression.
        zError original_tensor.device= z= is not supported! Only CUDA tensors are currently supported.r,   zError original_tensor.dim = z; is not supported! Only 2d tensors are currently supported.zXError original_tensor is not contiguous!Only contiguous tensors are currently supported.zError original_tensor.dtype z is not a supported dtype for !zError original_tensor.shape zS is not supported! Both dimensions must be larger or equal than and a multiple of (z, rD   N)
is_cudaRuntimeErrorr/   dimis_contiguousr0   r   r)   sparse_min_rowssparse_min_cols)r<   ry   mnmin_rowsmin_colss         r?    _validate_device_dim_dtype_shapez;SparseSemiStructuredTensor._validate_device_dim_dtype_shape   sr    &&01G1G0H I= =   A%./B/B/D.E F; ;  ,,.C    (D(DD./D/D.EEcdgchhij 
 $$1//0E0EFVV//0E0EFVVx<1x<1x<1x<./D/D.E FSS[R\\^_g^hhik  <HrA   c                     | j                   d   }t        j                  | t        j                  || j                  | j
                              S )Nr0   r/   )r)   r6   ro   eyer0   r/   )rI   cols     r?   to_densez#SparseSemiStructuredTensor.to_dense  s5    jjnxxeii4::dkkRSSrA   alg_idc                     t         rM   r_   r<   ry   r   s      r?   
from_densez%SparseSemiStructuredTensor.from_dense#  s
     "!rA   )biasBr   c                    t         rM   r   )rI   r   r   rc   s       r?   _mmzSparseSemiStructuredTensor._mm+  s
     "!rA   )Fr   FrM   )rB   N)&rH   
__module____qualname____doc__r   int__annotations__dictr6   r0   r   r   boolr   r   strr   r:   rS   staticmethodSizer@   rJ   tuplerQ   rV   classmethodr\   _C_disabled_torch_function_impl__torch_function__r   rd   r5   r   r   r   r    rA   r?   r   r   *   s   " OS"5;;0N#NOO ND !OT!%*d*L(H,--LL4
,,
llT!!LL4!&!44##WI +0!"#RzzR t#R llT!	R
 ,,%R t#R &+\\D%8R $(R R R Rh@# @
*	tCy%

D#t ;<<	=* 
 5::tS$67
 

 
. ??Tc T T B B2 (u|| (PT ( (TT  &"" " 
&	" " %)	"<<" llT!	" 
"rA   r   Fry   
transposedr   rB   c                     |rt        j                  dt        d       t        j                  rt
        j                  j                  nt
        j                  j                  }|j                  | |      S )a	  
    This function converts a dense tensor into a sparse semi-structured tensor.
    It will return a SparseSemiStructuredTensor, a subclass of torch.Tensor.

    This function will check to ensure the dense tensor has the right dtype, size, dims, and device.
    We currently only support semi-structured sparse tensors for 2d CUDA tensors.
    Additionally, your tensor must be a positive multiple of the minimum sparse block size, given in
    `_DTYPE_TO_SHAPE_CONSTRAINTS` for each dtype (float32, float16, bfloat16, int8).

    Args:
        original_tensor (Tensor): the dense tensor to convert
        transposed (bool, optional): deprecated arg to be removed in another release. Do not use.
        alg_id (int, optional): the algorithm id to use for cuSPARSELt matmul. Defaults to 0.
            Can be obtained via ``torch._cslt_sparse_mm_search``.
    Returns:
        SparseSemiStructuredTensor: A sparse semi-structured tensor created from the given original_tensor
    Raises:
        None
    Example:
        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
        >>> A = torch.Tensor([0, 0, 1, 1]).tile((128, 32)).half().cuda()
        tensor([[0., 0., 1.,  ..., 0., 1., 1.],
                [0., 0., 1.,  ..., 0., 1., 1.],
                [0., 0., 1.,  ..., 0., 1., 1.],
                ...,
                [0., 0., 1.,  ..., 0., 1., 1.],
                [0., 0., 1.,  ..., 0., 1., 1.],
                [0., 0., 1.,  ..., 0., 1., 1.]], device='cuda:0', dtype=torch.float16)
        >>> A_sparse = to_sparse_semi_structured(A)
        SparseSemiStructuredTensor(shape=torch.Size([128, 128]))
        >>> A_sparse.values()
        tensor([[1., 1., 1.,  ..., 1., 1., 1.],
                [1., 1., 1.,  ..., 1., 1., 1.],
                [1., 1., 1.,  ..., 1., 1., 1.],
                ...,
                [1., 1., 1.,  ..., 1., 1., 1.],
                [1., 1., 1.,  ..., 1., 1., 1.],
                [1., 1., 1.,  ..., 1., 1., 1.]], device='cuda:0', dtype=torch.float16),
        >>> A_sparse.indices()
        tensor([[-4370, -4370, -4370,  ..., -4370, -4370, -4370],
                [-4370, -4370, -4370,  ..., -4370, -4370, -4370],
                [-4370, -4370, -4370,  ..., -4370, -4370, -4370],
                ...,
                [-4370, -4370, -4370,  ..., -4370, -4370, -4370],
                [-4370, -4370, -4370,  ..., -4370, -4370, -4370],
                [-4370, -4370, -4370,  ..., -4370, -4370, -4370]], device='cuda:0', dtype=torch.int16))
    zSetting transpose from `to_sparse_semi_structured` is deprecated and will be removed in a future release. `SparseSemiStructuredTensor` only support contiguous input tensors.r,   r-   )r   )
r2   r3   FutureWarningr   r   r6   sparser   r   r   )ry   r   r   SPARSE_SUBCLASSs       r?   r   r   5  sf    h R 	
 &44 	66\\>>  %%of%EErA   c                       e Zd ZdZdZej                   edddd      ej                   edddd      ej                   edddd      ej                   edddd      iZeej                  fd	ej                  d
edd fd       Z fdZe	 dd	ej                  ddfd       Zddddej                  dej                  dz  dedej                  fdZ xZS )r   a  
    This class implements semi-structured sparsity for the CUTLASS backend.


    In this implementation, the specified elements and metadata are stored separately,
    in packed and meta respectively.

    When _FORCE_CUTLASS is set, or when cuSPARSELt is not available, this subclass calls into _sparse_semi_structured_(mm|addmm) and
    sparse_semi_structured_from_dense for conversion to the compressed format.
    cutlass          @         ry   r   rB   c           	          | j                  |       t        |      \  }} | |j                  ||d d d |j                        S )Nr"   r#   r$   r%   r&   r*   )r   r   r)   r*   )r<   ry   r   sparse_tensor_cutlassmeta_tensor_cutlasss        r?   r   z,SparseSemiStructuredTensorCUTLASS.from_dense  sW     	,,_= 6oF	
! !!($(,)77
 	
rA   c                     | j                   | j                  t        d      | j                   j                  dk(  r t	        | j                  | j                         S t
        |          S )Nz meta and packed must not be Noner,   )r#   r"   rF   ndimr   superr   )rI   rG   s    r?   r   z*SparseSemiStructuredTensorCUTLASS.to_dense  sc    99 3 !CDD yy~~"	 4			
 !#	
rA   r   c           	      p    t        j                  ||d      \  }}}}} | |j                  |||||d      S )a~	  
        This function takes in a unpruned dense tensor and runs a (branchless) static sort across a 4x4 tile.

        It greedily picks the largest values in the tile, upholding the 2:4 sparsity constraint across both rows and columns.
        The algorithm used to prune the matrix is implemented in `_sparse_semi_structured_tile`.

        Then it creates the packed and meta tensors for the compressed sparse representation of the pruned dense tensor.
        It also calculates the packed_t and meta_t tensors for the compressed sparse representation of the transposed
        pruned dense tensor.
        Since we cannot transpose the compressed representations, we store both for the fw/bw pass respectively.

        Finally, this function also computes a compressed swizzled bitmask that encodes the sparsity pattern
        This can be used in the backward pass to mask the gradients.

        [9 1 7 4]                       [9 0 7 0]
        [1 2 3 0]                       [0 2 0 0]
        [8 3 5 4] -> prune 4x4 tile  -> [8 0 0 4] -> pack to CUTLASS semi-structured -> packed
        [1 2 6 2]                       [0 0 6 2]                                    -> metadata

                                                  -> pack to transposed CUTLASS      -> packed_t
                                                     semi-structured representation  -> metadata_t

                                                  -> compute swizzled bitmask        -> compressed_swizzled_bitmask


        The equivalent PyTorch code to create the same five outputs from the dense tensor can be found below:
        ```
        from torch.sparse import SparseSemiStructuredTensorCUTLASS
        from torch.sparse._semi_structured_conversions import (
            _sparse_semi_structured_tile,
            _compute_compressed_swizzled_bitmask,
        )

        pruned = _sparse_semi_structured_tile(dense)
        packed_cutlass, meta_cutlass = sparse_semi_structured_from_dense_cutlass(pruned)
        packed_t_cutlass, meta_t_cutlass = sparse_semi_structured_from_dense_cutlass(
            pruned.t().contiguous()
        )
        bitmask = _compute_compressed_swizzled_bitmask(pruned)

        SparseSemiStructuredTensorCUTLASS(
            dense.shape,
            packed_cutlass,
            meta_cutlass,
            packed_t_cutlass,
            meta_t_cutlass,
            bitmask,
        )
        ```
        T	algorithmuse_cutlassFr   )r6   _sparse_semi_structured_tiler)   r<   ry   r   r"   r#   r$   r%   r&   s           r?   prune_dense_static_sortz9SparseSemiStructuredTensorCUTLASS.prune_dense_static_sort  sX    z ..yd
	
' !!(C
 	
rA   NFr   should_transpose_denser   r   r   c          
         t        |t              rt        d      | j                  j                  }| j
                  dk7  s|j
                  dk7  rt        d| d      | j                  | j                  t        d| d      t                | j                  |j                     }t        j                  j                  j                  || j                  | j                  || j                   d   |j"                  |j$                  |      S )NZ`SparseSemiStructuredTensor @ SparseSemiStructuredTensor` is not supported by the hardwarer,   `)` matmul: Broadcasting is not implemented$` matmul: operation is not supportedr   )
isinstancer   r9   rG   rH   r   r_   r"   r#   _ensure_cutlass_mm_registeredr   r0   r6   rf   semi_structured
cutlass_mmr)   dense_min_rowsdense_min_cols)rI   r   r   r   rc   cls_nameconstraintss          r?   r   z%SparseSemiStructuredTensorCUTLASS._mm  s     a34l  >>**99>QVVq[%H:FG  ;;$))"3%H:AB  *+77@K99,,77		

1****&	 	rA    )rH   r   r   r   r    r6   int8r   float16bfloat16float32r   r   r   r   r:   r   r   r   r   r   r   __classcell__)rG   s   @r?   r   r   |  s#   	 G

22sBC5b"aC6r2q!D5b"aC	   1@@

 
 
-	
 
*

 68I
#llI
	%I
 I
^ %)',!<<! llT!	!
 !%! 
!rA   r   c                      e Zd ZdZdZej                   edddd      ej                   edddd      ej                   edddd      ej                   edddd      iZeej                  fdej                  dedd fd	       Ze	 ddej                  dd
fd       Zddddej                  dej                  dz  dedej                  fdZy)r   a  
    The cuSPARSELt backend expects the specified elements and the metadata to be stored in a single tensor:
    packed = [ specified elements of original tensor | metadata ]
    For an original tensor of size (m, k) we expect the first m * k // 2 elements to be the kept elements
    The rest of the tensor is metadata. Since there is only one tensor, we only use the packed and packed_t
    attributes respectively.

    cuSPARSELt also supports transposition fusion, which is necessary for performant 2:4 sparse training, as well
    as specifying alg_id, a config that affects the performance of the matmul depending on matmul sizes.
    
cusparseltr   r   r   ry   r   rB   c                     | j                  |        | |j                  t        j                  |      d d d d t        j
                  ||j                  	      S )NrX   )r   r)   r6   _cslt_compressr   r   r*   r   s      r?   r   z/SparseSemiStructuredTensorCUSPARSELT.from_dense6  sW     	,,_=!''''8(,&@&P&P$)77

 
	
rA   r   c           	          t        j                  ||d      \  }}}}}|j                  |j                  d   d      }|j                  |j                  d   d      } | |j                  |||||d      S )a=  
        This function does the same thing as described in SparseSemiStructuredCUTLASS, but uses the cuSPARSELt metadata
        layout and sparse matmul.

        The only functional difference is that cuSPARSELt stores `metadata` and `packed` together into a single tensor.

        [9 1 7 4]                       [9 0 7 0]
        [1 2 3 0]                       [0 2 0 0]
        [8 3 5 4] -> prune 4x4 tile  -> [8 0 0 4] -> pack to cuSPARSELT semi-structured -> packed
        [1 2 6 2]                       [0 0 6 2]

                                                  -> pack to transposed cuSPARSELt      -> packed_t
                                                     semi-structured representation

                                                  -> compute swizzled bitmask           -> compressed_swizzled_bitmask


        The equivalent PyTorch code to create the same three outputs from the dense tensor can be found below:
        ```
        from torch.sparse import SparseSemiStructuredTensorCUSPARSELT
        from torch.sparse._semi_structured_conversions import (
            _sparse_semi_structured_tile,
            _compute_compressed_swizzled_bitmask,
        )

        pruned = _sparse_semi_structured_tile(dense)
        packed_cusparselt = torch._cslt_compress(pruned)
        packed_t_cusparselt = torch._cslt_compress(pruned.t().contiguous())
        bitmask = _compute_compressed_swizzled_bitmask(pruned)

        SparseSemiStructuredTensorCUSPARSELT(
            dense.shape, packed_cutlass, None, packed_t_cutlass, None, bitmask
        )
        ```
        Fr   r   r      r   )r6   r   rn   r)   r   s           r?   r   z<SparseSemiStructuredTensorCUSPARSELT.prune_dense_static_sortJ  s    Z ..ye
	
' _2215r:==!6!6q!92> !!(C
 	
rA   NFr   r   r   r   c                <   t        |t              rt        d      | j                  dk7  s|j                  dk7  r#t	        d| j
                  j                   d      |j                  | j                  k7  rit	        d| j
                  j                   dt        | j                         dt        |j                         d| j                   d|j                   d	      ||j                  | j                  k7  rit	        d| j
                  j                   dt        | j                         dt        |j                         d
| j                   d|j                   d      | j                  t        j                  k(  r\t	        d| j
                  j                   dt        | j                         dt        |j                         d| j                   d	      | j                  #t	        d| j
                  j                   d      t                | j                  |j                     }t        j                  j                   j#                  || j                  || j                  d   |j$                  |j&                  | j(                  | j*                  |	      S )Nr   r,   r   r   z` matmul: trying to do `A=z @ B=z`, with A.dtype=z and B.dtype=zH. This operation is only supported when A and B have the same data type.z + C`, with A.dtype=B.dtype=z and C.dtype=zK. This operation is only supported when A, B and C have the same data type.z`, with A.dtype=B.dtype=zO. mm is not supported for float8_e4m3fn, please use `torch._scaled_mm` instead.r   r   )r   r   r9   r   r_   rG   rH   r0   r   r)   r6   float8_e4m3fnr"    _ensure_cusparselt_mm_registeredr   rf   r   cusparselt_mmr   r   r'   r(   )rI   r   r   r   rc   r   s         r?   r   z(SparseSemiStructuredTensorCUSPARSELT._mm  sw    a34l  99>QVVq[%DNN++,,UV  77djj %DNN++,,FuTZZGXFYY^_defelel_m^n o  $

|=	 BYY 
 

djj 8%DNN++,,FuTZZGXFYY^_defelel_m^n o((,

|=	 J\\  ::,,,%DNN++,,FuTZZGXFYY^_defelel_m^n o((,

| 4`` 
 ;;%DNN++,,PQ  -.77@K99,,::

1****..&&&
 
rA   r   )rH   r   r   r   r    r6   r   r   r   r   r   r   r   r   r   r:   r   r   r   r   r   r   rA   r?   r   r   "  s   	 G;BBK

22r2rB5b"aC6r2q!D	   1@@

 
 
0	
 
& 68>
#ll>
	%>
 >
H %)',4<<4 llT!	4
 !%4 
4rA   r   c                     t         ryda ddlm}   | dd      dt        j                  d	t        j                  d
t        j                  dt        j                  dz  dt
        dt
        dt
        dt        dt        j                  fd       }|j                  dt        j                  d	t        j                  d
t        j                  dt        j                  dz  dt
        dt
        dt
        dt        dt        j                  fd       }y)zLazily register the cutlass_mm custom op.

    Registration is deferred to avoid importing torch.library at module load
    time, since torch.sparse is imported early during ``import torch``.
    NTr   	custom_opzsemi_structured::cutlass_mmr   mutates_argsdenser"   r#   r   out_featuresr   r   r   rB   c                    | j                   \  }}	| |z  }
|	 |z  }|
dk7  xs |dk7  }| }|r.t        j                  j                  j	                  | d|d|
f      }|r|j                         n|}|t        j                  |||      }nt        j                  ||||      }|r:|r|n|	}|d | j                  dd|      j                  t        j                        S |j                         S )Nr   r   memory_format)r)   r6   nn
functionalpadrm   _sparse_semi_structured_mm_sparse_semi_structured_addmmnarrowru   contiguous_format
contiguous)r   r"   r#   r   r   r   r   r   r   r   to_pad_mto_pad_nneed_paddense_paddedmm_inputresout_colss                    r?   r   z1_ensure_cutlass_mm_registered.<locals>.cutlass_mm  s     {{1B(?B(?q=1HM 88..2251h8:TUL'=<>>#<<2264JC55dFD(SC2qHM\"1h'U%<%<=
 ~~rA   transpose_densec                     |r| j                   d   n| j                   d   }t        j                  ||| j                  | j                        S Nr   r   r   r)   r6   emptyr0   r/   )	r   r"   r#   r   r   r   r   r   r   s	            r?   _cutlass_mm_fakez7_ensure_cutlass_mm_registered.<locals>._cutlass_mm_fake  sB     &55;;q>%++a.{{++<<	
 	
rA   )_cutlass_mm_registeredtorch.libraryr   r6   r:   r   r   register_fake)r   r   r   s      r?   r   r     s%    !',2> ||   ll  llT!	 
       !%  
  ? > 
||

 ll
 llT!	

 
 
 
 
 

 
rA   c                  
   t         ryda ddlm}   | dd      	 ddt        j                  d	t        j                  d
t        j                  dz  dt
        dt
        dt
        dt        dt
        dt        dt        j                  fd       }|j                  dt        j                  d	t        j                  d
t        j                  dz  dt
        dt
        dt
        dt        dt
        dt        dt        j                  fd       }y)z,Lazily register the cusparselt_mm custom op.NTr   r   zsemi_structured::cusparselt_mmr   r   r   r"   r   r   r   r   fuse_transposer   r   rB   c	                    | j                   \  }	}
|	 |z  }|
 |z  }|dk7  xs |dk7  }| }|r.t        j                  j                  j	                  | d|d|f      }|r|j                         n|}t        j                  |||||      }|r|j                         }|r7|r|	n|
}|j                  dd|      j                  t        j                        S |j                         S )Nr   )r   transpose_resultr   r   r   )r)   r6   r   r   r   rm   _cslt_sparse_mmr   ru   r   r   )r   r"   r   r   r   r   r  r   r   r   r   r   r   r   r   r   r   r   s                     r?   r   z7_ensure_cusparselt_mm_registered.<locals>.cusparselt_mm  s     {{1B(?B(?q=1HM 88..2251h8:TUL'=<>>#<##+
 %%'C2qH::aH-33#55 4   ~~rA   c	                     |r| j                   d   n| j                   d   }	t        j                  ||	| j                  | j                        S r   r   )
r   r"   r   r   r   r   r  r   r   r   s
             r?   _cusparselt_mm_fakez=_ensure_cusparselt_mm_registered.<locals>._cusparselt_mm_fake6  sB     &<5;;q>Q{{++<<	
 	
rA   )F)_cusparselt_mm_registeredr  r   r6   r:   r   r   r  )r   r   r	  s      r?   r   r   	  s8    ! $'/bA (-! ||! !  llT!!  	! 
 !  !  !  !  !%!  
!  B! F   
||

 llT!
 	

 
 
 
 
 !%
 

 !
rA   )'r2   collectionsr   collections.abcr   typingr   r6   )torch.sparse._semi_structured_conversionsr   r   !torch.sparse._semi_structured_opsr   r	   r
   r   r   r   r   r   r   r   r   r   r   __all__r   r:   r   r   r   r   r   r   r   r   r   r
  r   r   rA   r?   <module>r     s     " $     " ",$C" H" H"Z ,<<DF\\DFDF DF  	DFNc(B cL]+E ]@  >
B " ?
rA   