
    9j>             
      T   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlmZ d dlmZmZmZ d dlmZ d dlmZmZmZmZ d dlZd dlmZ d dlZd dlZd dlm c m!Z" d dl#m$Z$ d d	l%m&Z&m'Z' d d
l(m)Z)m*Z* d dl+m,Z, d dl-m.Z.m/Z/m0Z0m1Z1m2Z2 d dl3m4Z4m5Z5m6Z6 ddl7m8Z8m9Z9m:Z:m;Z; ddl<m=Z= ddl>m?Z?m@Z@mAZAm Z  ddlBmCZC ddlDmEZEmFZFmGZGmHZH ddlImJZJ ddlKmLZL ddlMmNZN ddlOmPZP ddlQmRZRmSZSmTZTmUZUmVZV ddlWmXZXmYZY ddlZm[Z[m\Z\m]Z]m^Z^m_Z_ ddl`maZa ddlbmcZc ddl mdZdmeZemfZfmgZgmhZhmiZimjZjmkZkmlZlmmZmmnZnmoZompZpmqZq ddlrmsZtmuZumvZvmwZw dd lxmyZy d!d"lzm{Z{ d!d#l|m}Z}m~Z~mZmZmZmZmZmZmZmZmZmZmZmZmZmZ d!d$lmZmZmZmZmZmZmZ d!d%lmZmZmZmZmZ d!d&lmZ er,d d'lmZ d d(lmZ d d)lmZ dd*l@mZ d!d+l|mZ d!d,lmZ  ed-      Z e	jP                  e      ZejV                  jY                  ed.      ZejV                  jY                  ed/      ZejV                  jY                  ed0      Z eC       ZBd1Zd2 Zdjd3Z	 	 	 	 	 	 dkd4Z G d5 d6      Z ed      dld7       Z G d8 d9      Zejn                   G d: d;             Zejn                   G d< d=             Zejn                   G d> d?e             Zejn                   G d@ dAe             Z	 	 	 	 	 	 	 	 dmdBZ G dC dDe      Z e       j|                  ZdndEZdndFZdodGZdndHZdpdIZdqdJZdrdKZdsdLZ G dM dNe      ZdtdOZdudvdQZ G dR dSe      Zej                  eːj                  dP       eːj                  dT        G dU dVe˫      Z G dW dX      Zejn                   G dY dZ             Z G d[ d\      Zejn                   G d] d^             Z G d_ d`eeeeeef   z  f         Zejn                   G da db             Z G dc ddee         Z G de dfeث      Z G dg dhe      ZdwdiZy)x    )annotationsN)abstractmethod)CallableIterableSequence)	lru_cache)AnycastTYPE_CHECKINGTypeVar)
PRECEDENCE)get_interface_for_device)identitypreserve_rng_state)is_integer_dtypetype_to_dtype)
OrderedSet)CeilDivFloorDivModularIndexingTruncToFloat
TruncToInt)get_triton_versionhas_triton_packagehas_triton_stable_tma_api   )free_symbol_is_type
prefix_strsymbol_is_typeSymT)ValueRanges   )configirmetricsutils)AsyncCompile)	code_hashget_pathPyCodeCachewrite_atomic)'set_kernel_post_grad_provenance_tracing)DefaultHandler)triton_heuristics)benchmarker)AutotuneHintDevicePropertiesReductionHintTRITON_MAX_BLOCKTRITON_MAX_RSPLIT)get_max_y_gridnext_power_of_2)BaseSchedulerNode$FusedExternTritonKernelSchedulerNodeFusedSchedulerNode	SchedulerSchedulerNode)get_broadcasted_shape)get_raw_stream_name)cache_on_selfDelayReplaceLineget_bounds_index_exprget_fused_kernel_nameget_kernel_metadatais_welford_reductionPlaceholderprefix_is_reduction	sympy_dotsympy_product
sympy_substriton_typetriton_version_uses_attrs_dictupcast_compute_type)_opsReductionType	StoreModeV)"get_kernel_category_by_source_code   )BlockPatternMatcher)ArgNameBackendFeatureConstexprArgCSECSEVariableDeferredLineIndentedBufferInplacedBufferis_buffer_removedOpOverridesPythonPrinter
RemovedArgSizeArg	TensorArgWorkspaceArgWorkspaceZeroMode)constant_reprIterationRangesIterationRangesEntryIterationRangesRootPartialAccumulate
SIMDKernelSIMDScheduling)	config_ofequal_1_arg_indicesnon_constexpr_signatureshould_unwrap_unspec_argsignature_to_meta)SymbolicCallArg)
ModuleTypeDtypePropagationOpsHandler)ShapeEnv)IRNode)BlockShapeType)SIMDKernelFeatures_T
perf_hintsschedulefusion   c                <    | dv }|rdnd}| dv r| d|  dS | d|  S )N)anymaxminprodtriton_helperstl)r~   r   .2 )reduction_type
use_helpermodules      ^/media/conek/DATA/Code/OCR/venv/lib/python3.12/site-packages/torch/_inductor/codegen/triton.pyget_triton_reduction_functionr      sE    #@@J!+F'>*!,,>*++    c                    t        | t        j                        syt        | t        j                        xs& | j                  xr t        | j                        dk(  S )z "
    Is this expression a Sympy Integer or is it an integer sympy Expr
    containing no free symbols. The latter case can happen with Identity expr.
    Fr   )
isinstancesympyExprInteger
is_integerlenfree_symbolsexprs    r   is_sympy_integer_liker      sI    
 dEJJ'dEMM* 7C 1 12a7r   c                    |j                   r| j                  t              s| S | j                  t        u rt	        | j
                   S dddfd |       S )Nc                L    t        t        | dd      xs t        | dd            S )N
is_BooleanFis_Relational)boolgetattr)nodes    r   is_predicate_exprz;_materialize_trunc_to_float_expr.<locals>.is_predicate_expr   s*    D,.W'$QV2W
 	
r   c                $   | j                  t              s| S | j                  t        u rt        | j                   S  |       s| j
                  r| S t        fd| j                  D              }|| j                  k(  r| S  | j                  | S )Nc              3  v   K   | ]0  }t        |t        j                        r |      s |      n| 2 y wN)r   r   r   ).0argr   rewrite_float_subexprs     r   	<genexpr>zR_materialize_trunc_to_float_expr.<locals>.rewrite_float_subexpr.<locals>.<genexpr>   s@      
  #uzz*3DS3I "#&
   69)hasr   funcr   argsr   tuple)r   new_argsr   r   s     r   r   z?_materialize_trunc_to_float_expr.<locals>.rewrite_float_subexpr   s    xx
#K99
"++T"dooK 
 yy	
 
 tyy Ktyy(##r   )r   zsympy.Basicreturnr   )r   
sympy.Exprr   r   )is_floating_pointr   r   r   r   r   )r   dtyper   r   s     @@r    _materialize_trunc_to_float_exprr      sR     ""$((:*> yyJTYY''

$$ !&&r   c                  @    e Zd ZU dZi Zded<   i Zded<   edd       Zy)	OpDtypeSupportz
    Some Triton ops such as libdevice and tl.math only support float32 and float64.
    This class records which dtypes are supported by specific IR ops.
    z"dict[str, OrderedSet[torch.dtype]]supported_dtypeszdict[str, bool]convert_outputsc                    |j                   }t        t        j                  t        j                  g      | j
                  |<   || j                  |<   y r   )__name__r   torchfloat32float64r   r   )clsr   convert_outputop_names       r   register_upcastzOpDtypeSupport.register_upcast   s=    --(2EMM5==3Q(RW%'5G$r   N)r   zCallable[..., str]r   r   r   None)	r   
__module____qualname____doc__r   __annotations__r   classmethodr   r   r   r   r   r      s1    
 <>8=')O_)6 6r   r   c                 d    t               syddl} t        | j                  j                  d      ryy)zd
    import AttrsDescriptor if the triton version is new enough to have this
    class defined.
     r   NAttrsDescriptorz4from triton.compiler.compiler import AttrsDescriptor)r   triton.compiler.compilerhasattrcompiler)tritons    r   gen_attr_descriptor_importr      s-     # v''):;Er   c                     e Zd ZdZ eej                  ej                  g      Z eej                  ej                  ej                  ge      ZeD  ci c]%  }|t        j                  t        |    ddd      ' c}}}} ZeD  ci c]3  }|t        j                  t        |   j#                          ddd      5 c}}}} Zedd       Zedd       Zedd	       Zy
c c}}}} w c c}}}} w )TritonSymbolszU
    Stores sympy.Symbol instances and constants associated with triton codegen.
    offsetTintegernonnegativeBLOCKr   positivec           
        d}|j                   }|D ]  }t        |t        j                        r?t        j
                  j                  j                  |j                     }|j                  }nht        |t        j                  t        j                  t        j                  t        j                  t        j                  t        j                  f      rd}n| j                   D cg c]  }t        ||      s| }}t#        |      dk(  sJ d|j                          |d   }	t        j
                  j%                         }
dg|
z  }t        j
                  j'                         D cg c]  }t(        |	   |j*                  k(  r| }}t#        |      dk(  sJ d       t-        | j/                  |d               ||d   j0                  <   t3        |      }t5        ||      } |J |S c c}w c c}w )Nr   rQ   Ambiguous type: r   1z# of Match expected to 1)r   r   r    TMPrO   kernelcsevarname_mapnameshapeUNBACKED_INTSIZEPRECOMPUTED_SIZEINDEXFLOATUNBACKED_FLOATblock_typesr   triton_tensor_ndimactive_range_treesr   prefixstrget_block_size
tensor_dimr   r<   )r   r   
expr_shape	expr_varsvarcse_var	var_shapesymtsymbol_matchessymndimr   tree
tree_matchs                 r   get_block_shapezTritonSymbols.get_block_shape
  s    &(
%%	 %	FCc488,((,,22388<#MM	%%II))JJJJ''
 	 &)__"!sD8QD" " >*a/N3CCHH:1NN/$Q'xx224 !" ; ; =!#$++5 
 
 :!+G-GG+25c6H6HTU6W2Xjm../!%L	 /z9EJK%	FN %%%1"s   %G+7G+(G0c                4    | j                   |j                     S r   )block_sizesr   r   r   s     r   r   zTritonSymbols.get_block_sizeA  s    tyy))r   c                4    | j                   |j                     S r   )block_offsetsr   r   s     r   get_block_offsetzTritonSymbols.get_block_offsetE  s      ++r   N)r   r   r   ru   )r   rd   r   zsympy.Symbol)r   r   r   r   r   r    R0_INDEXR1_INDEXreduction_typesXBLOCKYBLOCKZBLOCKr   r   Symbolr   r   upperr   r   r   r   r   )r   r   r   r   s   0000r   r   r      s    !$--!?@Odkk4;;VoVWK    	ellj./v6RVWWM  	   	ell$%%'(.t
 	
K 4 4l * * , ,Q
s   *C(
8C0
r   c                      e Zd ZU ded<   ded<   ded<   ded<   d	ed
<   ded<   ddZddZddZddZddZe	dd       Z
y)IndexingOptionsr   	index_strOrderedSet[str]	mask_varsz
str | None
expand_strr   _has_rindexr   indexzSequence[int | str] | Noneexpand_shapec                ,    t        | j                        S r   )r   r	  selfs    r   has_maskzIndexingOptions.has_maskS  s    DNN##r   c                J    t        | j                  t        j                        S r   )r   r  r    r   r  s    r   has_indirectzIndexingOptions.has_indirectV  s    "4::txx88r   c                    | j                   S r   )r  r  s    r   
has_rindexzIndexingOptions.has_rindexY  s    r   c                :    t        d | j                  D              S )Nc              3  P   K   | ]  }t        |      j                  d          yw)tmpNr   
startswithr   masks     r   r   z.IndexingOptions.has_tmpmask.<locals>.<genexpr>]  s     J43t9''.J   $&r}   r	  r  s    r   has_tmpmaskzIndexingOptions.has_tmpmask\  s    J4>>JJJr   c                :    t        d | j                  D              S )Nc              3  P   K   | ]  }t        |      j                  d          yw)rNr  r  s     r   r   z,IndexingOptions.has_rmask.<locals>.<genexpr>`  s     H3t9'',Hr  r  r  s    r   	has_rmaskzIndexingOptions.has_rmask_  s    HHHHr   c                    | j                   r2dj                  t        t        t        | j                                     S dS )N & r   )r	  joinsortedmapr   r  s    r   mask_strzIndexingOptions.mask_strb  s4     =ANNEJJvc#t~~678	
PV	
r   Nr   r   r   r   )r   r   r   r   r  r  r  r  r#  propertyr)  r   r   r   r  r  J  sT    N,,$9 KI 
 
r   r  c                     e Zd ZU dZded<   ded<   ded<   ded	<   d
ed<   ded<   d
ed<   ded<   dZded<   dZded<   ed'd       Zed'd       Z	ed'd       Z
ed'd       Zedd	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d(d       Z	 	 	 	 	 	 	 	 d)dZd*dZ	 	 	 	 	 	 d+dZd,d Zd-d!Zd-d"Zd-d#Zd-d$Zd-d%Z	 	 	 	 	 	 	 	 	 	 	 	 d.d&Zy)/BlockDescriptorOptionsz
    This is a base class that describes a block descriptor used in Triton kernels.
    It can be used to create either a tensor descriptor (with TensorDescriptorOptions)
    or a block pointer (with BlockPtrOptions).
    BlockParametersparamsr   constant_offset	list[int]orderr  r	  Sequence[sympy.Expr]broadcast_shape
list[bool]broadcasting_dimsfinal_shapeBlockParameters.StrideSorterstride_sorterNzlist[int] | None_boundary_checkFr   can_liftc                .    | j                   j                  S r   )r0  r   r  s    r   r   zBlockDescriptorOptions.shape  s    {{   r   c                .    | j                   j                  S r   )r0  block_shaper  s    r   r?  z"BlockDescriptorOptions.block_shape  s    {{&&&r   c                .    | j                   j                  S r   )r0  stridesr  s    r   rA  zBlockDescriptorOptions.strides      {{"""r   c                .    | j                   j                  S r   )r0  offsetsr  s    r   rD  zBlockDescriptorOptions.offsets  rB  r   )r<  c                  t         j                  j                  d	fd} ||j                        |_         ||j                        |_        |j
                  D 	cg c]  }	j                  |	d       }
}	t        |
      rd|
d<   |j                  |
      }|j                  |t         j                  j                        \  }}|j                  D cg c]  }j                  |d       }}|j
                  }|j                  |      }|D cg c]  }t        j                  |       }}t         j                  j                  r%|d   j                  dk(  sJ |j!                  d       t         j                  j"                  }t         j                  j$                  st'        |j                        t'        t         j                  j(                        |z
  k(  rIt         j                  j*                  j-                         r!|t.        j0                  j2                  g|z  z  }	 t5        j6                  t         j                  j                  |j                        } | |t         j                  j                  jA                  |      |||||||	      }|jC                  ||       |S c c}	w c c}w c c}w # t8        $ r3 t;        t=        t?        t'        |j                                          }Y w xY w)
z2Helper to create a BlockDescriptorOptions instancec                L    | D cg c]  }j                  |       c}S c c}w r   )lookup_precomputed_size)exprsr   sizevarss     r   lookup_sizez2BlockDescriptorOptions.create.<locals>.lookup_size  s"    GLMtH44T:MMMs   !rQ   F)stride_sorter_cls	shape_envr   x)	r0  r1  r3  r	  r8  r5  r7  r:  r<  )rH  zIterable[sympy.Expr]r   list[sympy.Expr])"rO   graphrI  r   rA  r?  statically_known_equalsallremove_dimsmaybe_sort_with_stride_order
_shape_envr   r   r   no_x_dimr   popnum_reduction_dimsinside_reductionr   numelsfeaturesis_reductionr   SOner&   argsort_symAssertionErrorlistreversedrangerG  compute_boundary_check)r   r0  r1  range_treesr	  get_max_blockrL  r<  rJ  dimsingleton_dimsr:  strider7  r5  r   r8  reduction_ndimr3  resultrI  s                       @r   createzBlockDescriptorOptions.create  s    77##	N #6<<0$V^^4 AG@R@R
9<H,,S!4
 
 ~!&N2 ##N3 !' C C/177;M;M !D !
 GMnn
<BH,,VQ7
 
 !,, ##$56 GRRd}33D9RR88q>((C///OOA44))FNN#s188??';n'LL!!..0 EGGKK=>99K	? %%agg&8&8&..IE
 GG,,DD_U#+/'

 	%%m[AC
$
 S$  	?%FNN(;"<=>E	?s$   J-J2J7)8J< <9K87K8c                D    t         j                  |   }t        |||i      S zN
        Replaces instances of {symt}_offset with the new expression.
        r   r   rH   r  r   replacementr   roffsets        r   replace_offsetz%BlockDescriptorOptions.replace_offset  &      --d3$+ 677r   c                |    t         j                  D ](  }| j                  |t        j                  d      |      }* |S Nr   r   r   rs  r   r   r  r   r   s      r   remove_roffsetsz&BlockDescriptorOptions.remove_roffsets  ;    !11 	ED&&tU]]1-=tDD	Er   c           
        t         j                  j                  }|D ci c]7  }t        j                  |j
                      |t        |j
                           9 }}t        t        t         j                  j                  |            }t        t        | j                              D cg c]%  }|j                  | j                  |   t         j"                  j$                        s|r:t        j                  t&        j(                     | j*                  |   j,                  v sb|j/                  | j                  |   | j*                  |         s|j/                  | j                  |   t1        | j*                  |   |            sMt         j                  j2                  r1| j*                  |   t        j                  t&        j4                     k(  s|( c}| _        yc c}w c c}w )z6List of indices to pass to tl.load(boundary_check=...)N)rO   rP  rI  r   r   r   r   r}   r(  r   needs_yz_grid_overflowrc  r   r   rQ  rA  r   r]  Zeror    r  r?  r   statically_known_multiple_ofrH   rV  r   r;  )r  rf  re  rI  tblock_to_maxneeds_overflow_grididxs           r   rd  z-BlockDescriptorOptions.compute_boundary_check   s    77## !/
 %%aff-}Z=O/PP/
 /
 "#ahh&E&E{"ST S_- 
44T\\#5FU ,)55dkkB++C0==> %AA JJsOT-=-=c-B !) E E JJsO&t'7'7'<lK! HH%%((-1J1J4;;1WW-  
/
 
s   <G*D+G#c                6    | j                   J | j                   S r   )r;  r  s    r   boundary_checkz%BlockDescriptorOptions.boundary_check4  s     ##///###r   c                     yNFr   r  s    r   r  z#BlockDescriptorOptions.has_indirect8      r   c                :    t        d | j                  D              S )Nc              3  P   K   | ]  }t        |t        j                           y wr   )r   r   r   )r   r   s     r   r   z4BlockDescriptorOptions.has_rindex.<locals>.<genexpr><  s%      
  m&C&CD
r  )r}   r?  r  s    r   r  z!BlockDescriptorOptions.has_rindex;  s"     
((
 
 	
r   c                "    | j                         S r   )r  r  s    r   r#  z BlockDescriptorOptions.has_rmaskA  s      r   c                     yr  r   r  s    r   r  z"BlockDescriptorOptions.has_tmpmaskD  r  r   c                4    t        | j                               S r   )r   r  r  s    r   r  zBlockDescriptorOptions.has_maskG  s    D'')**r   c                T   | j                   }| j                  }|rJ| j                  j                  | j                         }| j                  j                  | j                        }t	        ||      D 	cg c]#  \  }}	|	rt
        j                  j                  n|% }
}}	t        |||
      }| j                  j                  s4|s2t        |
      t        |      k(  r| j                  j                  |
      }
t        j                  j                  |xr7 t        |
      t        |      k(  xr t        fdt	        |
|      D              }t        | j                        r(|s&d| dt        j                   j#                  |       d}| j                   }| j                  j                  sj|r| j                  j$                  n| j                  j&                  }d| d| d}|r| j                   n$| j                  j                  | j                         }t        |||      }|S c c}	}w )a  
        Generate a broadcast and a reshape for the block descriptor.
        This restores stride-0 dimensions which were removed from the block descriptor.

        Transposes are also applied to the input using self.stride_sorter:
        if for_store is True:
            - First Broadcast the value. Since self.broadcast_shape is stored in
            descending stride order, it must be reverted to the original order
            since the input value does not have dims with descending strides
            - After, transpose the broadcasted value so that dimensions are in
            descending stride order
            - Finally reshape to the block shape
        else (for load):
            - First broadcast the value to self.broadcast_shape (strides are descending)
            - Then transpose the value so that dimensions no longer have descending strides
            - Finally reshape the block to the final kernel tile shape
        c              3  p   K   | ]-  \  }}j                  |d       xs j                  ||       / ywrQ   N)rQ  )r   pre_dimpost_dimrI  s      r   r   zGBlockDescriptorOptions.codegen_broadcast_and_reshape.<locals>.<genexpr>  sH       &GX 00!< G33GXFGs   36tl.broadcast_to(, )	tl.trans()r5  r7  r:  revertzipr   r]  r^  triton_reshapeis_identityr   rO   rP  rI  rR  r}   r   index_to_strsort_idxrevert_sort_idx)r  valueinitial_shaper8  allow_implicit	for_storer5  r7  rg  is_broadcastingpre_broadcast_shapesupports_implicit_broadcast	old_shapepermute_dimsrI  s                 @r   codegen_broadcast_and_reshapez4BlockDescriptorOptions.codegen_broadcast_and_reshapeJ  s   2 .. 22 "00778L8LMO $ 2 2 9 9$:P:P Q
 ),O=N(O
$_ +EGGKK3
 
 um5HI ""..'(C,<< #'"4"4";";<O"P 77##&4 '
#$K(88   *--@+)N  	$ t%%&/J"5'AHH,A,A/,R+SSTU  ((	!!--  ""++''77 
  wba8E  $$''..t/C/CD  ui=w
s   5(H$r   rO  )r0  r/  r1  r   re  list[IterationRangesRoot]r	  r  rf  Callable[[str], int]rL  z"type[BlockParameters.StrideSorter]r<  r   r   r.  r   r   rq  r   r   r    r   r   r   r   r   r   )rf  r  re  r  r   r   )r   r2  r*  )r  r   r  r4  r8  r4  r  r   r  r   r   r   )r   r   r   r   r   r;  r<  r,  r   r?  rA  rD  r   rl  rs  ry  rd  r  r  r  r#  r  r  r  r   r   r   r.  r.  l  s    ))!!%% 0/(,O%, Hd! ! ' ' # # # #  [  [ $	[
 /[ #[ ,[ >[ [ 
 [ [z88-78?C8	8
2
+2
 /2
 
	2
h$
!+aa ,a *	a
 a a 
ar   r.  c                      e Zd ZdddZy)TensorDescriptorOptionsc                ,   t         j                  j                  }| j                  dk7  r| d || j                         dn|d || j                         d || j
                         d || j                         g}ddj                  |       dS )	a  
        Codegen a call to tl.make_tensor_descriptor()

        Args:
            name: variable name for pointer
            roffset: unused, but kept for compatibility with BlockPtrOptions.format()

        Returns:
            "tl.make_tensor_descriptor(...)"
        r    + (r  shape=strides=block_shape=ztl.make_tensor_descriptor(r  )rO   r   r  r1  r   rA  r?  r&  )r  r   rr  fr   s        r   formatzTensorDescriptorOptions.format  s     HH!! ''1, &Qt3345Q7Qtzz]O$q'(1T--./0	
 ,DIIdO+<A>>r   NTr   r   r   r   )r   r   r   r  r   r   r   r  r    s    ?r   r  c                  >    e Zd Z	 	 	 	 	 	 	 	 ddZddZdd	dZd
dZy)BlockPtrOptionsc                D    t         j                  |   }t        |||i      S rn  ro  rp  s        r   rs  zBlockPtrOptions.replace_offset  rt  r   c                |    t         j                  D ](  }| j                  |t        j                  d      |      }* |S rv  rw  rx  s      r   ry  zBlockPtrOptions.remove_roffsets  rz  r   c           	        t         j                  j                  }g | j                  }|s|D cg c]  }| j	                  |       }}| j
                  dk7  r| d || j
                         dn|d || j                         d || j                         d || j                         d || j                         d ||       g}d	d
j                  |       dS c c}w )a  
        Codegen a call to tl.make_block_ptr()

        Args:
            name: variable name for pointer
            roffset: should rn_offset be included in offsets=..., for use with tl.advance()

        Returns:
            "tl.make_block_ptr(...)"
        r   r  r  r  r  r  zorder=zoffsets=ztl.make_block_ptr(r  )rO   r   r  rD  ry  r1  r   rA  r?  r3  r&  )r  r   rr  r  rD  r   r   s          r   r  zBlockPtrOptions.format  s     HH!!!DLL/BIJt++F3JGJ ''1, &Qt3345Q7Qtzz]O$q'(1T--./0Qtzz]O$qzl#
 $DIIdO#4A66 Ks   C"c           	         t         j                  |   }| j                  D cg c]A  }| j                  |||      | j                  |t        j
                  j                  |      z
  C }}|S c c}w )av  
        Codegen string to pass to tl.advance(name, ...).

        Advance is the difference between offsets in each loop iteration.
        To compute it, we replace rN_offset with multiples of RN_BLOCK.
        Since we expect rN_offset to vary in range(0, rN_numel, RN_BLOCK), the first
        iteration has rN_offset=0, while the second has rN_offset=RN_BLOCK.
        )r   r   rD  rs  r   r]  r}  )r  r   rblockr   advances        r   advance_roffsetzBlockPtrOptions.advance_roffset  st     **40 ,,

  ##FFD9%%feggllDAB
 
 
s   AA,Nr  r  r  r  )r   r    r   r   )r   r   r   rs  ry  r  r  r   r   r   r  r    s6    88-78?C8	8
7:r   r  c                r   t        |t              rt        |t              sJ |D cg c]!  }t        j                  j	                  |      # }}|D cg c]!  }t        j                  j	                  |      # }}||k(  r| S |D cg c]
  }|dk7  s	| c}|k7  rd|  ddj                  |       dS d}g }|D ]G  }	|t        |      k  r|	||   k(  r|j                  d       |dz  }0|	dk(  sJ |j                  d	       I |t        |      k(  sJ |  d
dj                  |       dS c c}w c c}w c c}w )z<Workaround https://github.com/triton-lang/triton/issues/2836r   ztl.reshape(, [r  z])r   :rQ   r   [])r   ra  rO   r   r  r&  r   append)
r  r  	new_shaper   old_shape_strnew_shape_strsr  expandsizes
             r   r  r    sE    i&:i+FFF?HIeQXX**51IMI?HIeQXX**51IMI% -aAH->UG3tyy'?&@CC
CF "]##c0B(BMM#1HC3;;MM&!" #m$$$$WAdii'(**% JI .s   &D*&D/
D4D4c                     e Zd Zd!dZd!dZd!dZd!dZd!dZd!dZd!dZ	d!dZ
d!d	Zd!d
Zd!dZd!dZd!dZd!dZd!dZd"dZd!dZd!dZd!dZd!dZd!dZd!dZd!dZd!dZd!dZd!dZd!dZd!dZd!dZd!dZ d!dZ!y )#TritonPrinterc                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS )NrQ   libdevice.trunc(r   ).to(r  r   r   _printrO   r   index_dtyper  r   s     r   _print_TruncToIntzTritonPrinter._print_TruncToInt,  O    499~""" t{{499Q<89qxx?S?S>TTUV	
r   c                    t        |j                        dk(  sJ | j                  |j                  d         }d| d| dS )NrQ   r   z(libdevice.trunc(z) + tl.zeros_like())r   r   r  )r  r   r  s      r   _print_TruncToFloatz!TritonPrinter._print_TruncToFloat3  sF    499~"""DIIaL) #5');E7"EEr   c                    |j                   rt        t        |            }|S t        j                         rt
        j                  j                  r| }|S d| d}|S )Ntl.full([], , tl.float64))r   r   intr#   	is_fbcoder   versionhip)r  r   rets      r   _print_FloatzTritonPrinter._print_Float<  s\    ?? c$i.C
 
	 EMM$5$5FC 
 !m4C
r   c                    t        |j                        dk(  sJ | j                  |j                  d   t        d   dz
        }| dS )NrQ   r   Atom      ?z.to(tl.float64))r   r   parenthesizer   )r  r   r  s      r   _print_ToFloatzTritonPrinter._print_ToFloatH  sI    499~"""diilJv,>,DEO$$r   c                    |j                   \  }}|j                  r3|j                  r'| j                  |j                   dt        d   dz
        S | j	                  |      }| j	                  |      }d| d| dS )N % r  r  z!triton_helpers.remainder_integer(r  r  )r   is_nonnegative	stringifyr   r  r  r   quotdivquot_sdiv_ss         r   _print_PythonModzTritonPrinter._print_PythonModN  sr    II	c3#5#5>>$))UJv4F4LMMT"C 26("UG1EEr   c                   |j                   sJ |j                  \  }}|j                  r3|j                  r'| j                  |j                  dt        d   dz
        S | j                  |      }| j                  |      }d| d| dS )N // r  r  z!triton_helpers.div_floor_integer(z,  r  )r   r   r  r  r   r  r  s         r   _print_FloorDivzTritonPrinter._print_FloorDivX  s~    II	c3#5#5>>$))VZ5G#5MNNT"C 26(#eWAFFr   c                P    | j                  |j                  dt        d   dz
        S )N / r  r  )r  r   r   r  s     r   _print_IntTrueDivzTritonPrinter._print_IntTrueDive  s#    ~~dii
60BS0HIIr   c                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS NrQ   libdevice.floor(r   r  r  r  r  s     r   _print_floorzTritonPrinter._print_floorj  r  r   c                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS r  r  r  s     r   _print_FloorToIntzTritonPrinter._print_FloorToIntq  r  r   c                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS NrQ   libdevice.ceil(r   r  r  r  r  s     r   _print_ceilingzTritonPrinter._print_ceilingx  K    499~""" TYYq\!: ;5AUAU@VVWXXr   c                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS r   r  r  s     r   _print_CeilToIntzTritonPrinter._print_CeilToInt}  r  r   c                ,    d| j                  |       dS )Nztl.sqrt_rn(().to(tl.float32)))r  r  s     r   _helper_sqrtzTritonPrinter._helper_sqrt  s    dkk$/00ABBr   c                    | j                  |j                  d         }| j                  |j                  d         }d| d| dS )Nr   rQ   zlibdevice.pow((z).to(tl.float64), (z).to(tl.float64)))r  r   )r  r   baseexps       r   _print_FloatPowzTritonPrinter._print_FloatPow  sG    {{499Q<(kk$))A,' !&9#>OPPr   c                D   |j                   d   j                  rdt        |j                   d          d}n"d| j                  |j                   d          d}|j                   d   }|j                  rdt        |       d}nd| j                  |       d}d| d| d	S )
Nr   r  r  (z).to(tl.float64)rQ   libdevice.pow(r  r  )r   
is_Integerfloatr  )r  r   r
  exp_valr  s        r   _print_PowByNaturalz!TritonPrinter._print_PowByNatural  s    99Q<""!%		!"5!6mDD t{{499Q<011ABD))A, w 0>C dkk'*++;<C  vRuA..r   c                    | j                  |j                  d         }| j                  |j                  d         }| j                  |j                  d         }d| d| d| dS )Nr   rQ   r"   	tl.where(r  r  )doprintr   )r  r   cpqs        r   _print_WherezTritonPrinter._print_Where  s_    LL1&LL1&LL1&1#Rs"QCq))r   c                   t        |j                        dk(  r| j                  |j                  d         S t        |j                        dz  }t        |      }| j                   ||j                  d|        }| j                   ||j                  |d        }t	        d ||fD              \  }}|dv sJ d| d       d	| d
| d| d| d| d
| d| d| dS )zI
        Helper for max/min code generation.
        cmp: > or <
        rQ   r   r"   Nc              3  (   K   | ]
  }d | d  yw)r  r  Nr   r   rN  s     r   r   z6TritonPrinter._print_min_max_helper.<locals>.<genexpr>  s     .!q1X.s   )><zUnexpected comparator: ''r  z * ( z= z) + r  )r   r   r  typer   )r  r   cmpmidr   abs          r   _print_min_max_helperz#TritonPrinter._print_min_max_helper  s    
 tyy>Q;;tyy|,,$))n!4jKKTYYt_-.KKTYYst_-. .1v..1j C$<SE"CC 1#T!AcU"QCtA3d1#Qse1QCrBBr   c                &    | j                  |d      S )Nr  r'  r  s     r   
_print_MinzTritonPrinter._print_Min      ))$44r   c                &    | j                  |d      S )Nr  r)  r  s     r   
_print_MaxzTritonPrinter._print_Max  r+  r   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrQ   tl_math.abs(r   r  r  r  s     r   
_print_AbszTritonPrinter._print_Abs  s9    499~"""dkk$))A,78::r   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrQ   zlibdevice.cos((r   r  r  r  s     r   _print_OpaqueUnaryFn_cosz&TritonPrinter._print_OpaqueUnaryFn_cos  :    499~""" TYYq\!: ;;LMMr   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrQ   zlibdevice.cosh((r   r  r  r  s     r   _print_OpaqueUnaryFn_coshz'TritonPrinter._print_OpaqueUnaryFn_cosh  :    499~"""!$++diil";!<<MNNr   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrQ   zlibdevice.acos((r   r  r  r  s     r   _print_OpaqueUnaryFn_acosz'TritonPrinter._print_OpaqueUnaryFn_acos  r6  r   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrQ   zlibdevice.sin((r   r  r  r  s     r   _print_OpaqueUnaryFn_sinz&TritonPrinter._print_OpaqueUnaryFn_sin  r3  r   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrQ   zlibdevice.sinh((r   r  r  r  s     r   _print_OpaqueUnaryFn_sinhz'TritonPrinter._print_OpaqueUnaryFn_sinh  r6  r   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrQ   zlibdevice.asin((r   r  r  r  s     r   _print_OpaqueUnaryFn_asinz'TritonPrinter._print_OpaqueUnaryFn_asin  r6  r   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrQ   zlibdevice.tan((r   r  r  r  s     r   _print_OpaqueUnaryFn_tanz&TritonPrinter._print_OpaqueUnaryFn_tan  r3  r   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrQ   zlibdevice.tanh((r   r  r  r  s     r   _print_OpaqueUnaryFn_tanhz'TritonPrinter._print_OpaqueUnaryFn_tanh  r6  r   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrQ   zlibdevice.atan((r   r  r  r  s     r   _print_OpaqueUnaryFn_atanz'TritonPrinter._print_OpaqueUnaryFn_atan  r6  r   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrQ   zlibdevice.log2((r   r  r  r  s     r   _print_OpaqueUnaryFn_log2z'TritonPrinter._print_OpaqueUnaryFn_log2  r6  r   c                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS )NrQ   zlibdevice.llrint(r   r  r  r  r  s     r   _print_RoundToIntzTritonPrinter._print_RoundToInt  sO    499~"""  DIIaL 9:%@T@T?UUVW	
r   c                    t        |j                        dk(  sJ |j                  \  }}|j                  r|dk  sJ t        d| d      | j	                  |t
        d         }d| d| d|  S )	Nr"   r   zOFor integer inputs, only non-negative ndigits are currently supported, but got r   Mulzlibdevice.nearbyint(1e * z) * 1e)r   r   r   
ValueErrorr  r   )r  r   numberndigits
number_strs        r   _print_RoundDecimalz!TritonPrinter._print_RoundDecimal  s    499~"""))Q;;abiajjkl  &&vz%/@A
'yJ<vwhZPPr   N)r   r   r   r   )r   r   r#  r   r   r   )"r   r   r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r'  r*  r-  r0  r2  r5  r8  r:  r<  r>  r@  rB  rD  rF  rH  rP  r   r   r   r  r  +  s    
F
%F	GJ


Y
Y
CQ/"*C,55;
N
O
O
N
O
O
N
O
O
O

Qr   r  c                *    t        t        |             S )zCConvert torch.dtype to triton type and upcast [b]float16 to float32)rI   rK   r   s    r   triton_compute_typerS    s    *5122r   c                ^    | t         j                  k(  rt         j                  } t        |       S )z@Convert torch.dtype to triton type, with fix for storing tl.bool)r   r   int8rI   rR  s    r   triton_store_typerV    s"    



ur   c                    t        |       r+| j                  r| j                  dk  rt        j                  S t        |       S )z0Implicit upcasts used for Triton reduction types   )r   	is_signeditemsizer   int32rK   rR  s    r   upcast_acc_dtyper\    s0    5??u~~7J{{u%%r   c                *    t        t        |             S )z:Convert torch.dtype to triton type, with reduction upcasts)rS  r\  rR  s    r   triton_acc_typer^  %  s    /677r   c                <    | j                   dk  xr | j                  S )Nr"   )rZ  r   rR  s    r   low_precision_fpr`  *  s    >>Q:5#:#::r   c                    t        | t              sy| j                  }t        |t        j                        rt	        |      S dS r  )r   rW   r   r   r`  )r   r   s     r   low_precision_fp_varrb  .  s6    c;'IIE&0&DE"O%Or   c                    t        | t              r| j                  S t        | t        j                  j
                        rt        t        |             S y r   )r   rW   r   r   _prims_commonNumberr   r"  r   s    r   triton_arg_dtyperg  6  s>    #{#yy#u**112T#Y''r   c                    t         j                  j                   xr> t        | t              xr, | j
                  t        j                  t        j                  fv S r   )	r#   r   codegen_upcast_to_fp32r   rW   r   r   float16bfloat16rf  s    r   needs_upcast_to_float32rl  >  sD    MM000 	9sK(	9II%--88r   c                  <     e Zd Z	 d	 	 	 	 	 	 	 	 	 d fdZd Z xZS )TritonCSEVariablec                n    t         |   ||||       t               | _        |J d       |J d       y )Nr   z!TritonCSEVariable must have dtypez!TritonCSEVariable must have shape)super__init__r   r	  )r  r   boundsr   r   	__class__s        r   rr  zTritonCSEVariable.__init__G  sH     	vuE:*4, E"EE  E"EE r   c                F   |D ]  }t        |t              r&| j                  j                  |j                         9t        |t        j
                        sTt        j                  D ]6  }t        ||      s| j                  j                  t        |    dg          y )Nr  )
r   rn  r	  updater   r  r   r   r   r   )r  r   r   kwargsr   r   s         r   update_on_argsz TritonCSEVariable.update_on_argsT  s     
	C#01%%cmm4C. *55 D%c40--*T2B1C4/H.IJ
	r   r   )
r   r   rs  zValueRanges[Any]r   torch.dtyper   ru   r   r   )r   r   r   rr  rx  __classcell__rt  s   @r   rn  rn  F  sH     !%FF !F 	F
 F 
Fr   rn  c                     ddl m}   |        S )Nr   rq   )!torch._inductor.dtype_propagationrr   rq   s    r   get_dtype_handlerr~  b  s    L%''r   Tc                      ddd fd}|S )z
    Codegen helper to upcast arguments to float32, depending on the config and dtype.
    This decorates tl.math/libdevice codegen functions.
    c                ,    t        |       rdnd}|  | S )N.to(tl.float32)r   rl  )r   upcast_strings     r   maybe_upcast_argz.maybe_upcast_float32.<locals>.maybe_upcast_argn  s     -DS-I)r}o&&r   c                F     t         j                          d fd}|S )Nc                    | D cg c]
  } |       }}|j                         D ci c]  \  }}| |       }}} |i |}xr4 t        d t        j                  | |j	                               D              }|sd n# t        t               j                        | i |}	|	t        j                  d fv}
|
r|	dt        |	       dnd}| | S c c}w c c}}w )Nc              3  2   K   | ]  }t        |        y wr   r  r   r   s     r   r   zKmaybe_upcast_float32.<locals>.decorator.<locals>.wrapped.<locals>.<genexpr>}  s      6 (,6   .to(r  r   )itemsr}   	itertoolschainvaluesr   r~  r   r   r   rI   )r   rw  r   upcast_argskeyvalupcast_kwargsrk  any_needs_upcastresult_dtypeneeds_downcastdowncast_stringr   r   r  s               r   wrappedz8maybe_upcast_float32.<locals>.decorator.<locals>.wrappedv  s   <@AS+C0AKAHNWHCS"23"77WMW ;8-8F-  # 6$??4A6 3 ( @W.0$--@$Q&Q 
 *%--1FFN "l&> {<013 
 Xo.//) BWs
   CCr+  )r   r   )r   r  r   r  s   ` r   	decoratorz'maybe_upcast_float32.<locals>.decoratorr  s    &&t^<	00 r   r+  )r   Callable[..., Any]r   r  r   )r   r  r  s   ` @r   maybe_upcast_float32r  h  s    '< r   c                  ,   e Zd ZdZ ej
                  ej                        Ze	 	 dS	 	 	 dTd       Z	edUd       Z
ed        Zed        ZedVd       Ze e       d	               Zed
        Zed        Zed        Ze e       d               Ze e       d               Ze e       d               Ze e       d               Zed        Zed        Zed        Zed        Zed        Zedej<                  ddddd       Ze e       d               Z e e       d               Z!ed        Z"ed        Z#e e       d               Z$e e       d               Z%e e       d               Z&e e       d                Z'e e       d!               Z(e e       d"               Z)e e       d#               Z*e e       d$               Z+e e       d%               Z,e e       d&               Z-e e       d'               Z.e e       d(               Z/e e       d)               Z0e e       d*               Z1e e       d+               Z2e e       d,               Z3e e       d-               Zed.        Z4e e       d/               Z5ed0        Z6ed1        Z7ed2        Z8ed3        Z9ed4        Z:ed5        Z;ed6        Z<ed7        Z=ed8        Z>ed9        Z?ed:        Z@ed;        ZAed<        ZBed=        ZCed>        ZDe e       d?               ZEe e       d@               ZFe e       dA               ZGe e       dB               ZHe e       dC               ZIedD        ZJe e       dE               ZKedF        ZLe e       dG               ZMe edHI      dJ               ZNe edHI      dK               ZOe e       dL               ZPe e       dM               ZQedN        ZRedO        ZSe e       dP               ZTedQ        ZUe e       dR               ZVy)WTritonOverrideszEMap element-wise ops to Triton e.g., ops.to_dtype(x,...) -> x.to(...)NTc                Z   	 	 	 	 	 	 dd}|>t         |||      t        j                  j                        t        j                  _        |t        j
                  k(  rd|  dS |t        j                  k(  r||j                  s||  dS |rt        |      }nt        |      }|  d| dS )Nc                   | |k(  ryt         j                  t         j                  f}| |v r||v r| |k7  rJ d       | t         j                  k(  s|t         j                  k(  ry| t         j                  k(  s|t         j                  k(  ryy)Nr   zCConversions between float8_e5m2 and float8_e4m3fn is not supported!rX  r"   )r   float8_e4m3fnfloat8_e5m2)	src_dtype	dst_dtype
fp8_dtypess      r   _get_min_elements_per_threadz>TritonOverrides.to_dtype.<locals>._get_min_elements_per_thread  s     I% ##!!J Z'+*U U	U 
 E---e>O>O1OE///9@S@S3Sr   r  z != 0)z.to(tl.int16).to(tl.uint8)r  r  )r  ry  r  ry  r   r  )
r~   rO   r   min_elem_per_threadr   r   uint8r   rS  rV  )rN  r   r  use_compute_typesr  	out_dtypes         r   to_dtypezTritonOverrides.to_dtype  s    	"	/:		6   ,/,Y>,,,AHH(
 EJJqc= ekk!!i&A&AYEV S233+E2I)%0ID1%%r   c                    |j                   |j                   k(  sJ | j                  |k7  r|  dt        |       d} |  dt        |       d}t        |      |k7  r| dt        t        |             d}|S )Nr  r  , bitcast=True))rZ  r   rI   rK   )rN  r   r  outs       r   to_dtype_bitcastz TritonOverrides.to_dtype_bitcast  s    !!U^^333 77i#T+i013A4E*+?;u%.Ek*=e*DEFaHC
r   c           	     T   t         j                  j                  |      }t         ||             }t	        |      }| dk(  r5t        j                  d|       dk  r|dk(  r	d| d| dS |dk(  r	d| d| dS | dk  r#|j                  sd	|d
d   }d| d| d| d| d	S d| d| d| dS )Nr   g      ?z
tl.float32tl.full(z, 0x80000000, tl.uint32).to(r  z
tl.float64z$, 0x8000000000000000, tl.uint64).to(ztl.rX  r  r  r  )r   rd  dtype_to_typerc   rS  mathcopysignrY  )r  r   r   type_
triton_valrI   triton_signed_types          r   _shaped_constantz TritonOverrides._shaped_constant  s    ##11%8"5<0
)%0 A:$--U3a7l*!%(D[MQ`aa,!%(L[MYhii
 19U__#&{12&7!8eWBzl"5G4Hk]Z[\\eWBzl"[MCCr   c                *    | j                  ||g       S )Nrp  )r  )r   r  r   s      r   constantzTritonOverrides.constant  s    ##E5#;;r   c                    t        |t        j                  j                        r| j	                  ||      S t        |      |k(  r| S d| dt        |       dS )Nr  r  r  )r   r   rd  re  r  rg  rI   )r   r   r   s      r   _cast_libdevice_argz#TritonOverrides._cast_libdevice_arg  sX    c5..556<<U++C E)UO3%u[/022r   c                    d|  dS )Nr/  r  r   rN  s    r   abszTritonOverrides.abs	       aS""r   c                   t        | dd       }t        |dd       }|t        j                  k(  r7|t        j                  k(  r$t        j                  j
                  r
d|  d| d}n	d|  d| d}t        |       st        |      rMt               j                  | |      }|t        j                  t        j                  fv r| dt        |       d}|S )Nr   triton.language.div_rn(r  r  r  r  r  )r   r   r   r#   eager_numericsdivision_roundingrb  r~  truedivrj  rI   rN  yx_dtypey_dtyper  r  s         r   r  zTritonOverrides.truediv  s    !Wd+!Wd+ u}}$5==(%%77 ,A3b15CaSA3a.C"&:1&=)+33Aq9IU]]EMM::T+i"8!9;
r   c                r   t        | dd      }t        |dd      }|t        j                  k(  r|t        j                  k(  r
d|  d| d}n	d|  d| d}t        |       st        |      rMt	               j                  | |      }|t        j                  t        j                  fv r| dt        |       d}|S )	z
        Division with round-to-nearest rounding mode.
        Always uses triton.language.div_rn for float32 inputs to match eager CUDA behavior.
        r   Nr  r  r  r  r  r  )r   r   r   rb  r~  r  rj  rI   r  s         r   div_rnzTritonOverrides.div_rn)  s     !Wd+!Wd+emm#5==(@+A3b15C aSA3a.C"&:1&=)+33Aq9IU]]EMM::T+i"8!9;
r   c                    d|  d| d}t        |       st        |      rMt               j                  | |      }|t        j                  t        j
                  fv r| dt        |       d}|S )Nr  r  r  r  )rb  r~  modr   rj  r   rI   )rN  r  r  r  s       r   r  zTritonOverrides.mod?  sl    !Cs!n"&:1&=)+//15IU]]EMM::T+i"8!9;
r   c                :    t         j                  rd|  dS d|  dS )z
        When use_fast_math, use the ftz (flushing to zero) variant
        of exponent computation.

        Check https://github.com/triton-lang/triton/issues/5735 for
        more details.
        ztl_math.exp(r  zlibdevice.exp()r#   use_fast_mathr  s    r   r  zTritonOverrides.expH  s+     !!A&&#A3a((r   c                    d|  dS )Nzlibdevice.exp2(r  r   r  s    r   exp2zTritonOverrides.exp2X       !1%%r   c                    d|  dS )Nzlibdevice.expm1(r  r   r  s    r   expm1zTritonOverrides.expm1]       "!A&&r   c                    d|  dS )Nztl.sqrt_rn(r  r   r  s    r   sqrtzTritonOverrides.sqrtb  s     QCq!!r   c                   t         j                  j                  }|dk(  ry|dk(  r	d|  d|  dS |dk(  r|  dS |8t        j                  t        j
                  d	t        j                        |       S t        d
|      )Ncompile_errorzcompile error!runtime_errorz"triton_helpers.device_assert_then(z == 0, "injected assert fail", r  accuracyz + 1r   z:unrecognized config triton.inject_relu_bug_TESTING_ONLY = )	r#   r   inject_relu_bug_TESTING_ONLYopsmaximumr  r   r[  r`  )rN  bugs     r   reluzTritonOverrides.reluh  s    mm88/!#O# 8s:YZ[Y\\]^^JS:[;;s||Au{{;Q?? LSGT r   c                Z    t         j                  j                  r	d|  d| dS d|  d| dS )Nztl.minimum(r  , tl.PropagateNan.ALL)ztriton_helpers.minimum(r  r   r  r  r%  r&  s     r   minimumzTritonOverrides.minimumz  <     == 2aS(>??,QCr!A66r   c                Z    t         j                  j                  r	d|  d| dS d|  d| dS )Nztl.maximum(r  r  ztriton_helpers.maximum(r  r  r  s     r   r  zTritonOverrides.maximum  r  r   c                    d|  d| d| dS )Nr  r  r  r   )r%  r&  r  s      r   wherezTritonOverrides.where  s     1#Rs"QCq))r   c           	       	
 t         j                  j                  sJ | |}}d }d } ||      r ||       }  ||      r ||      }	 	 	 	 	 	 d	
fd}t        t         j                  j	                               dk\  sJ d       t        t        j                  t        j                           	t        t        j                  t        j                           
t        t        j                  t        j                           t        t        j                  t        j                           t         j                  j                  j                  t         j                  j                   || t!        | j"                        
g      | j$                  
f      } t         j                  j                  j                  t         j                  j                   ||t!        |j"                        	g      |j$                  	f      }t&        j(                  j*                  j,                  j.                  dk(  rd}nd}d	|  d
| d| dS )a  
        Triton code generation for lowering ops.dot to tl.dot.

        The logic is as follows:

        1. Downcasting for performance
           If the data was previously upcasted to fp32, we downcast back to the
           original dtype (e.g., fp16 or bf16) for better performance. While
           surrounding operations may run in fp32, matmul itself is executed at the
           original precision to optimize throughput.

        2. Handling non-constant reduction masks
           If the reduction mask is not constant and there was any operation between
           tl.load and tl.dot, we zero out regions outside the mask using
           tl.where(r0_mask, val, 0).
           This ensures that values outside the mask do not contribute to the dot
           product, preventing incorrect results.

        3. Shape alignment for tl.dot
           We massage shapes to match the tl.dot requirement of (Y, R) x (R, X).
           Current codegen eagerly broadcasts tl.arange to create unique axes. We
           reshape, transpose, or broadcast to align with the (Y, R) x (R, X) shape.
           We avoid using 3D dot ((Z, Y, R) x (Z, R, X)) because 3D tl.dot has
           poor performance. During batched matmul (bmm), we keep ZBLOCK=1 and call
           the 2D dot kernel instead.
        c                x   t        t        t        | j                              syt        j
                  j                  d   }|j                  sJ t        j
                  j                  |      ryt        j
                  j                  j                  j                         D ]  \  }}|| k(  sd|v sd|v s y y)NFrK  ztl.loadz	other=0.0T)r}   r(  rE   r	  rO   r   re  r\  _has_constant_maskr   _cacher  )r   reduction_rangekvs       r   is_where_neededz,TritonOverrides.dot.<locals>.is_where_needed  s    s.>?hh2226O"//// xx**?;
 ++113 !18	Q;!3C ! r   c                   t         j                  j                  d| j                        }t        j
                  j                  D cg c]  }|j                  r|j                   d }}t        |      dk(  sJ d       t        j                  |d   | |      }t        j
                  j                  j                  t        j
                  j                  || j                  | j                        S c c}w )Ndotr  rQ   z'don't tile reduction when native matmulr   r   r   )r$   	Reductiondefault_valuer   rO   r   re  r\  r   r   TritonKernelOverridesr  r   generatecomputer   )r   defaultr   reduction_mask	where_vars        r   
where_condz'TritonOverrides.dot.<locals>.where_cond  s    ll00		BG HH00$$ ;;-t$N  ~&!+V-VV+-33N14EsGTI88<<((  )399CII )  s   "C3c                   	|v r|D cg c]  }|	k(  rdn| }}|gk(  rE|vsJ d       ddg}|v r|d<   |v r|d<   t        | ||      } |gk7  }|rd|  d d d} | S |gk(  rK|vsJ d	       ddg}|v r|d<   |v r|d<   t        | ||      } d
|  d} |gk7  }|rd|  d d d} | S t        c c}w )a  
            Generate a reshape, transpose, and broadcast for the tl.dot.
            tl.dot requires specific shape requirement : (Y,R) x (R,X)
            but the current triton codegen eagerly broadcast the tl.arange so
            it needs to be reshaped to meet the requirement.

            This is done by three steps.
            1. remove the empty dimension (dim with size 1) and make it 2d with tl.reshape
            2. permute the dimension if needed (e.g., (X,R) -> (R,X)) with tl.trans
            3. broadcast if needed with broadcast_to.
                - This shows up when matmul operand is broadcasted with torch.expand/repeat.
                - e.g., torch.rand((16,)).expand(16,16) @ B

            e.g., (Y,1,R), (Y,R) -> tl.reshape(var, (Y,R))
            e.g., (1,X,R), (R,X) -> tl.trans(tl.reshape(var, (X,R)))
            e.g., (1,X,1), (R,X) -> tl.broadcast_to(tl.trans(tl.reshape(var, (X,1))), (R,X))

            TODO : eventually we want to remove this function when lazy broadcasting arrives
            r   z&left tl.dot operand cannot depend on xr   rQ   r  z, (r  r  z'right tl.dot operand cannot depend on yr  r  )r  NotImplementedError)
r  r  r8  rg  shape_2dbroadcast_neededRBLOCKr   r  r  s
         r   #reshape_transpose_broadcast_for_dotz@TritonOverrides.dot.<locals>.reshape_transpose_broadcast_for_dot  s|   6 &JW X3v3!> X Xvv..]2 <2  :]*"(HQK]*"(HQK 'umXF $,/?#? #.ugS6("ME4 L1  00]2 =2  :]*"(HQK]*"(HQK 'umXF $E7!, $,/?#? #.ugS6("ME L *)W !Ys   C r   ztl.dot can only do mm and bmmr  tf32ieeeztl.dot(r  z, input_precision=""))r  r4  r8  r4  r   r   )rO   r   is_native_matmulr   dense_size_listr   r   r   r    r   r  r  r   r   r  r  ra  r   r   r   backendscudamatmulfp32_precision)r%  r&  orig_aorig_br  r  r  input_precisionr   r   r  r  s           @@@@r   r  zTritonOverrides.dot  s   : xx((((A	*	6 6"1A6"1AI	/I	 .I	 	I	 I	V 188++-.!3T5TT3]..t{{;<]..t{{;<]..t{{;<]..t}}=>HHLL!!HH/4=66BRS''6"	 " 
 HHLL!!HH/4=66BRS''6"	 " 
 >>%%44>$O$O2aS 3O3DBGGr   rQ   )constraintsr   is_purepackinput_dtypesc           
        t        |      |#dj                  dg|D cg c]  }d c}z         }|j                  d      D cg c]  }|j                          }	}|	D cg c]  }|j	                  d      r| }
}g }t        t        ||
d t        |                   D ]o  \  }\  }}|dk(  rH|Ft        |t              r6|j                  ||   k7  r$|j                  | dt        ||          d       V|j                  t        |             q t        j                  j                  r7t!        |       j#                  d	d
      t!        |      j#                  d	d
      nd|  dd| dfd}dk  r |dj                  |            S |d   }t$        j&                  j(                  }t$        j&                  j*                  }|j-                  ||j.                        }dj                  fd|D              }|j1                  | d ||              |j1                  | d| d| d d       |S c c}w c c}w c c}w )Nr  z=rr"  ,=hr  r  \z\\r   c                .    d d d|  d d d dS )Nztl.inline_asm_elementwise(r  r  z	], dtype=z
, is_pure=z, pack=r  r   )r   asm_literalasm_triton_typeconstraints_literalr  r  s    r   asm_callz8TritonOverrides.inline_asm_elementwise.<locals>.asm_call{  s>    ,[M<O;P Q6?"3:gYgdVSTVr   rQ   r   r  c              3  0   K   | ]  }d | d d  yw)ztriton_helpers.inline_asm_pack(r  r  Nr   )r   inpr  s     r   r   z9TritonOverrides.inline_asm_elementwise.<locals>.<genexpr>  s&       
AD-cU"TF!< 
s    = z$ = triton_helpers.inline_asm_unpack()rI   r&  splitstripr  	enumerater  r   r   rW   r   r  r   r   r  r  reprreplacerO   r   r  r   newvarr   	writeline)asmr  r   r  r  r  inputs_r  constraint_partsinput_constraintscast_inputsir  r  r  first_inputr  r   rk  packed_argsr  r  r  s      ``                @@@r   inline_asm_elementwisez&TritonOverrides.inline_asm_elementwiseK  sN    &e,))TF6-Bac-B$BCK
 0;/@/@/EF!AGGIFF(8R1S@QQRR$S1B=S[1Q%RS 		-KAxQS ,sK0IIa0""cU${<?/K.LA#NO""3s8,		- == s)++D&9K"&{"3";";D&"IcU!*K$%k]!"4	 	 19DIIk233Qi((""hhll%{/@/@Aii  
HS 
 
 	VHC(='>?@h:6("[MQSTXSYYZ[	
 m .C
 GRs   	I
	I
&I=Ic                    d|  dS )Nztl_math.cos(r  r   r  s    r   coszTritonOverrides.cos  r  r   c                    d|  dS )Nztl_math.sin(r  r   r  s    r   sinzTritonOverrides.sin  r  r   c                    t        d      )Nz/ops.index_expr not implemented outside a kernelr  )r   r   r   s      r   
index_exprzTritonOverrides.index_expr  s    !"STTr   c                    t        d      )Nz+ops.masked not implemented outside a kernelr5  )r  bodyothers      r   maskedzTritonOverrides.masked  s    !"OPPr   c                    d|  dS )Nzlibdevice.lgamma(r  r   r  s    r   lgammazTritonOverrides.lgamma       #1#Q''r   c                    d|  dS )Nzlibdevice.erf(r  r   r  s    r   erfzTritonOverrides.erf        s!$$r   c                    d|  dS )Nzlibdevice.cosh(r  r   r  s    r   coshzTritonOverrides.cosh       !1%%r   c                    d|  dS )Nzlibdevice.sinh(r  r   r  s    r   sinhzTritonOverrides.sinh  rC  r   c                    d|  dS )Nzlibdevice.acos(r  r   r  s    r   acoszTritonOverrides.acos  rC  r   c                    d|  dS )Nzlibdevice.acosh(r  r   r  s    r   acoshzTritonOverrides.acosh       "!A&&r   c                    d|  dS )Nzlibdevice.asin(r  r   r  s    r   asinzTritonOverrides.asin  rC  r   c                    d|  dS )Nzlibdevice.asinh(r  r   r  s    r   asinhzTritonOverrides.asinh  rJ  r   c                    d|  d| dS )Nzlibdevice.atan2(r  r  r   rN  r  s     r   atan2zTritonOverrides.atan2       "!Bqc++r   c                    d|  dS )Nzlibdevice.atan(r  r   r  s    r   atanzTritonOverrides.atan  rC  r   c                    d|  dS )Nzlibdevice.atanh(r  r   r  s    r   atanhzTritonOverrides.atanh  rJ  r   c                    d|  d| dS )Nzlibdevice.copysign(r  r  r   rP  s     r   r  zTritonOverrides.copysign  s     %QCr!A..r   c                    d|  dS )Nzlibdevice.erfc(r  r   r  s    r   erfczTritonOverrides.erfc  r  r   c                    d|  dS )Nzlibdevice.erfinv(r  r   r  s    r   erfinvzTritonOverrides.erfinv  r=  r   c                    d|  d| dS )Nzlibdevice.hypot(r  r  r   rP  s     r   hypotzTritonOverrides.hypot  rR  r   c                    d|  dS )Nzlibdevice.log10(r  r   r  s    r   log10zTritonOverrides.log10  r  r   c                    d|  dS )Nzlibdevice.log2(r  r   r  s    r   log2zTritonOverrides.log2  r  r   c                    d|  d| dS )Nzlibdevice.ldexp(r  z.to(tl.int32))r   )rN  ns     r   ldexpzTritonOverrides.ldexp  s     "!Bqc88r   c                    d|  d| dS )Nzlibdevice.nextafter(r  r  r   rP  s     r   	nextafterzTritonOverrides.nextafter  s     &aS1#Q//r   c                    |  d| S Nr%  r   r  s     r   logical_andzTritonOverrides.logical_and       Cs|r   c                    |  dS )Nz == 0r   r%  s    r   logical_notzTritonOverrides.logical_not  s    E{r   c                    |  d| S Nz | r   r  s     r   
logical_orzTritonOverrides.logical_or  rj  r   c                    d|  d| dS )Nr   ^ r  r   r  s     r   logical_xorzTritonOverrides.logical_xor!  s     1#S1~r   c                    |  d| S rh  r   r  s     r   bitwise_andzTritonOverrides.bitwise_and&      Cs|r   c                    d|  S )N~r   rl  s    r   bitwise_notzTritonOverrides.bitwise_not*  s    1#wr   c                    |  d| S ro  r   r  s     r   
bitwise_orzTritonOverrides.bitwise_or.  rv  r   c                    |  d| S )Nrr  r   r  s     r   bitwise_xorzTritonOverrides.bitwise_xor2  rv  r   c                    |  d| S )Nz << r   r  s     r   bitwise_left_shiftz"TritonOverrides.bitwise_left_shift6      D}r   c                    |  d| S )Nz >> r   r  s     r   bitwise_right_shiftz#TritonOverrides.bitwise_right_shift:  r  r   c                     d| d}d|  d| dS )Nr  ).to(tl.uint32)ztl.rand(r  r  r   seedr   s     r   randzTritonOverrides.rand>  s%    VHO,$r&++r   c                n    d| d}d| d| d}d| d| d| d}d| d| d}d	|  d
| d| d
| d| dS )Nr  r  z((z)*(r  z)//(z//4))z)%(z!triton_helpers.rand_eager_kernel(r  +z, VEC=r  r   )	r  base_offsetthreads_per_roundtidvectid_u32denomr"  	tid_truncs	            r   
rand_eagerzTritonOverrides.rand_eagerC  s     cU/*SE./r2	eWCuE2	UG2.	24&;-q2i[X^_b^ccdeer   c                     d| d}d|  d| dS )Nr  r  z	tl.randn(r  r  r   r  s     r   randnzTritonOverrides.randnM  s%    VHO,4&6(!,,r   c           	     ,    d| d}d|  d| d| d| d	S )Nr  r  ztriton_helpers.randint64(r  r  r   )r  r   lowhighs       r   	randint64zTritonOverrides.randint64R  s1    VHO,*4&6("SED6KKr   c                    t        d      )Nz.ops.load_seed not implemented outside a kernelr5  )r   r   s     r   	load_seedzTritonOverrides.load_seedW  s    !"RSSr   c                N    t         j                  j                  rd|  dS d|  dS )Nz	tl.rsqrt(r  zlibdevice.rsqrt(r  r  s    r   rsqrtzTritonOverrides.rsqrt[  s/     ==qc##%aS**r   c                    d|  dS )Nzlibdevice.log1p(r  r   r  s    r   log1pzTritonOverrides.log1pd  r  r   c                    d|  dS )Nzlibdevice.tan(r  r   r  s    r   tanzTritonOverrides.tani  r@  r   c                R   t         j                  j                  j                  j	                  |       }|rt        |d      r|j                  }nd }t        j                  rBt        j                  j                  r(t               dkD  r|t        j                  k7  r|d|  dS d|  dS )Nr   )r      zlibdevice.fast_tanhf(r  zlibdevice.tanh()rO   r   r   r   getr   r   r#   r  r   r  r  r   r   )rN  r   r   s      r   tanhzTritonOverrides.tanho  s     ((,,**..q1ww0MMEE  !!"$v-&! +1#Q//$QCq))r   c                    d|  dS )Nztl.sigmoid(r  r   r  s    r   sigmoidzTritonOverrides.sigmoid  s     QCq!!r   c                    d|  d|  d|  dS )Nz(libdevice.signbit(z) != 0) if (z).dtype is tl.float32 else z < 0r   r  s    r   signbitzTritonOverrides.signbit  s#    
 "!L3NqcQUV	
r   c                    d|  d| dS )Nzlibdevice.fmod(r  r  r   r  s     r   fmodzTritonOverrides.fmod  s     !2aS**r   c                   t               j                  ||      }|mt        |      rb| j                  ||      }t	        |t
        j                  j                        r | j                  |t
        j                        n| }d| d| dS t        |      xs t        |      }|}|t
        j                  t
        j                  fvr-t        |      s|rt
        j                  nt
        j                  }| j                  ||      }| j                  ||      }	d| d|	 d}
|6||k7  r1t        |      r|r|
 dt        |       d}
|
S |
 dt        |       d}
|
S )Nztriton_helpers.pow_integer(r  r  r  r  )r~  powr   r  r   r   rd  re  r  int64rl  r   r   r`  rI   )r   r%  r&  r  r
  exponentr  	pow_dtypecast_acast_brk  s              r   r  zTritonOverrides.pow  sg   (*..q!4#(8(F**1l;D a!4!4!;!;< Q,s 
 1b
!DD215S9PQR9S 	U]]EMM:: $L15E ]]  ((I6((I6!&F815#	(A-# &xtK,E+FaHF  #84L(A'B!Dr   c                    d|  dS )Nztl_math.log(r  r   r  s    r   logzTritonOverrides.log  r  r   F)r   c                    d|  dS )Nzlibdevice.isinf().to(tl.int1)r   r  s    r   isinfzTritonOverrides.isinf       "!M22r   c                    d|  dS )Nzlibdevice.isnan(r  r   r  s    r   isnanzTritonOverrides.isnan  r  r   c                    d|  dS )Nzlibdevice.nearbyint(r  r   r  s    r   roundzTritonOverrides.round  s     &aS**r   c                    d|  dS )Nr  r  r   r  s    r   floorzTritonOverrides.floor  rJ  r   c                   t        j                  dt        j                        }t        j                  dt        j                        }t        j                  ||      }t        j
                  |||      }t        j                  ||      }t        j
                  |t        j                  ||       |       } t        j
                  |t        j                  ||      |      }t        j                  | |      }t        j
                  |t        j                  |       |       } t        j                  | |      }t        j
                  |t        j                  |      |      }t        j
                  |||      S Nr   rQ   )
r  r  r   r[  eqr  ltsubry  truncdiv)r%  r&  zerooneb_zerob_nega_negr  s           r   floordivzTritonOverrides.floordiv  s     ||Au{{+ll1ekk*
 4IIfc1%q$IIeSWWT1-q1IIeSWWT1-q1q$IIeS__Q/3||Aq!yy 5t<yyt,,r   c                f   t        j                  dt        j                        }t        j                  t        j
                  ||       t        j                        }t        j                  t        j
                  | |      t        j                        }t        j                  ||      }| d|  dS )Nr   r  .dtype))r  r  r   r[  r  r  rU  r  )rN  zleftrightr  s        r   signzTritonOverrides.sign  sw     LLEKK(||SVVAq\EJJ7cffQlUZZ8ggdE"d1#W%%r   c                    d|  dS )Nr  r  r   r  s    r   trunczTritonOverrides.trunc  rJ  r   c                    |  d| S )Nr  r   r  s     r   r  zTritonOverrides.truncdiv  s    
 D}r   c                    d|  dS )Nr  r  r   r  s    r   ceilzTritonOverrides.ceil  rC  r   NT)r   ry  r  torch.dtype | None)r   ry  r  ry  r   ry  r   r   )Wr   r   r   r   r  ra  e_LOG_2_Estaticmethodr  r  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r   r/  r1  r3  r6  r:  r<  r?  rB  rE  rG  rI  rL  rN  rQ  rT  rV  r  rY  r[  r]  r_  rd  rf  ri  rm  rp  rs  ru  ry  r{  r}  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r   r   r  r    s   Otyy H )-	:&:& &:& :&x    D D, < < 3 3 #  #  ,  *   )  ) &  & '  ' "  "  " 7 7 7 7 * * xH xHt  mmC CJ #  # #  # U U Q Q (  ( %  % &  & &  & &  & '  ' &  & '  ' ,  , &  & '  ' /  / &  & (  ( ,  , '  ' &  & 9 9 0  0                     , , f f - - L L T T +  + '  ' %  % *  *& "  " 
 
 +  +    D #  # /3 0 3 /3 0 3 +  + '  ' - -6 & & '  '  
 &  &r   r  r   c                       e Zd ZdZ fdZeej                  d               Zed        Z	ed        Z
ed        Zed        Zed        Ze	 	 	 	 	 	 	 	 	 	 d
d	       Z xZS )r  a   Map element-wise ops to Triton within a TritonKernel

    Unlike TritonOverrides, these assume the code is going to be inserted into
    the body of the main triton kernel and so it may use indexing and mask
    variables which are assumed to already be defined in the current scope.
    c                D    t        |   |i | | j                          y r   )rq  rr  _setup_libdevice_routing)r  r   rw  rt  s      r   rr  zTritonKernelOverrides.__init__   s#    $)&) 	%%'r   c                   ddl m t        j                  j                  j
                  D ]  }t        | |      sJ t        | |      }fd}|dk(  rDt        d      sJ t        j                  |||      }||_
        t        | |t        |             kd }t        j                  |||      }||_
        t        | |t        |              y)z<Set up routing to libdevice implementations for fp64 inputs.r   )OpDecompositionsc                    | j                   t        j                  k7  r ||       S  t        |      |       j                  S r   )r   r   r   r   r  )rN  _original_impl_fn_namer  s      r   decomposition_routerzLTritonKernelOverrides._setup_libdevice_routing.<locals>.decomposition_router2  s9    77emm+)!,,>7#3X>qAGGGr   r  )r  r  c                ^    | j                   t        j                  k(  r	d| d|  dS  ||       S )Nz
libdevice.r  r  )r   r   r   )rN  r  r  s      r   dtype_routerzDTritonKernelOverrides._setup_libdevice_routing.<locals>.dtype_routerA  s2    77emm+'z1#Q77)!,,r   N)torch._inductor.codegen.commonr  r   	_inductorr&   op_requires_libdevice_fp64r   r   	functoolspartialr   setattrr  )r   fn_nameoriginal_implr  fnr  r  s         @r   r  z.TritonKernelOverrides._setup_libdevice_routing'  s    
 	D,,GG 	4G3(((#C1MH )#/;;;&&(QX &Wl2&67- ""]WB "BKC,r"23;	4r   c                r    t         j                  j                         }dg|z  }| j                  |||      S )NrQ   rp  )rO   r   r   r  )r   r  r   r   r   s        r   r  zTritonKernelOverrides.constantM  s9    
 xx**,d
##E5#>>r   c                   t        ||      }t        j                  j                  |dd       }t	        |t
              sJ |j                  r|j                  }nt        j                  |j                        }t        j                  j                         }|t        j                  t        j                  fvr|n|}t        j                  j                   }	 dt        j                  _        t        j                  j"                  j%                  t        j                  j&                  |j(                  t+        |      ||      }|t        j                  _        |t        j                  t        j                  fvrit        j                  j"                  j%                  t        j                  j&                  | j-                  ||      t/        |      |j0                        }n|}|j2                  D ]l  }t5        |t6        j8                        st        j:                  |t        j                  j"                  j<                  |j>                     j@                        }n ||k7  r_t        j                  j"                  j%                  t        j                  j&                  | j-                  ||      ||j0                        }|jB                  |_!        |S # |t        j                  _        w xY w)NF	block_ptrtma_compatibility_checkerrs  r   r   r  )"r   rO   r   indexingr   r  r  r   r   r  get_index_dtype_as_torch_dtyper   r[  r  r#   test_configsruntime_triton_dtype_assertr   r  r  r  r@   r  rK   r   r   r   r    r   promote_typesr   r   r   r	  )	r   r   r   r  r   r  origr   	index_vars	            r   r6  z TritonKernelOverrides.index_exprV  s=   /e<88$$ET % 
 (O444   ))E!11(..AE hh==?u{{EKK&@@k "">>
	C>CF;((,,''  "",T2 ( C ?CF;ekk22((,,''  S%()%0ii	 ( C  E!.. 	!)TXX6!//qxx||77	GMME #hhll++HH$$LLk2%))	 ,  !**
= ?CF;s   A.K K1c           
        | zt         j                  j                  `t        j                  j
                  j                  t        j                  j                  |  dt         j                  | j                        } |j                  j                  d      }|sJ d       d}|D ]>  }|j                  D ]-  }|j                  dk7  st        |j                  d         s+d	} > @ |rd n|}t        j                  j                  | |
      5 } |       }	d d d        |r	j                   j"                  rt        |      }t        j                  j
                  j                  t        j                  j                  d|	 dt%        |       d|	 dt'        j(                  |      |	j*                  |	j                        }t-        j.                  |	|      }
n	}
|
j0                  j3                         |
S # 1 sw Y   xY w)N.to(tl.int1)r  output)opz)graph for body does not contain an outputFloadrQ   T)r  r  z.shape, r  r  r  )r   r  r  rO   r   r   r  r  r   r   rP  
find_nodesr   targetrm   
mask_loadsrs  is_boolrc   r!   wrapr   r  r  r	  discard)r  r8  r9  nodes
need_wherer   r   r  new_maskrk  r  s              r   r:  zTritonKernelOverrides.masked  s    1 1 =88<<((  &%jjjj	 ) D 

%%%2AAAu
  	Dyy ::'+CCHHQK+P!%J	 #XX  U 3 	xVF	 }}$$UHHLL))  6((=+?*@6('R"''.llll * E ))Hfe4CCh'
)	 	s   G88Hc                    t         j                  j                  j                  |       }d| dt         j                  j                  j	                  d|       dS )Ntl.load( + load_seed_offsetr  )rO   r   r   inputseed_offset)r   r   r   s      r   r  zTritonKernelOverrides.load_seed  sI    hhmm!!$'se3qxx}}889KVTUUVW	
r   c                0   d|  d}t         j                  j                  j                  |      x}r|S t         j                  j                  j	                  | j
                  | j                        }t         j                  j                  j	                  t        j                  | j                        }t         j                  j                  j                  | d| d|  d       t         j                  j                  j                  |||f       ||fS )Nzfrexp(r  r  r  z = triton_helpers.frexp()rO   r   r   try_getr$  r   r   r   r[  r  r%  put)rN  	cache_keycse_valmantissar  s        r   frexpzTritonKernelOverrides.frexp  s     QCqM	hhll**95575N88<<&&QWWAGG&D88<<&&U[[&H	""j8*$<QCqA	
 	
Xx$89(##r   c                    t         r   r5  )r   r   r  
extra_metas       r   partial_accumulatez(TritonKernelOverrides.partial_accumulate  s
     "!r   )
r   r   r   r   r  rW   r  dict[str, Any]r   r   )r   r   r   r   rr  r   r  cacher  r  r6  r  r:  r  r  r  rz  r{  s   @r   r  r    s    ( __"4  "4H ? ? < <| , ,\ 
 
 $ $ """ " #	"
 
" "r   r  c                  H    e Zd ZU dZded<   ded<   ddZdddd	Zd
 Zd Zy)HelperFunctionsz#An ordered set of helper functions.zdict[str, str]_templates_seen	list[str]finalized_helpersc                     i | _         g | _        y r   )r  r  r  s    r   rr  zHelperFunctions.__init__  s    !!#r   _triton_helper_fn	base_namec                   | j                   j                  |      }||S | t        | j                         }|| j                   |<   | j                  j	                  |j                  |             |S )a9  This accepts a function definition with the function name
        left as a format specifier e.g.

            @triton.jit
            def {name}(arg0, arg1):
                return arg0 + arg1

        We add the templated code to the function set and return the name
        assigned to that function.

        )r   )r  r  r   r  r  r  )r  template_coder#  existing_namer   s        r   addzHelperFunctions.add  sw     ,,00?$  S!7!789:.2]+%%m&:&:&:&EFr   c                ,    t        | j                        S r   )iterr  r  s    r   __iter__zHelperFunctions.__iter__	  s    D**++r   c                     | j                   |   S r   )r  )r  r  s     r   __getitem__zHelperFunctions.__getitem__	  s    %%c**r   Nr   r   )r%  r   r   r   )	r   r   r   r   r   rr  r'  r*  r,  r   r   r   r  r    s+    -##  $ 4G ,,+r   r  c                     e Zd ZU dZ ej
                  e      Zded<    ej
                  e      Z	ded<    ej
                  e      Z
ded<    ej
                  e      Zded<   ej                   G d d	             Zej                   G d
 de             Zej                   G d de             ZddZ	 	 	 	 	 	 ddZddZy)r/  zM
    Class representing ND block dimensions, for block pointer analysis.
    )default_factoryrO  r   r?  rA  rD  c                      e Zd ZU ded<   ded<    ej
                  d      Zded<   d Zed        Z	e
e	 	 	 	 	 	 dd	              Zd
 Zd Zy)r9  r2  original_stridesr  F)initr  c                   t        | j                        dkD  sJ t        | j                        t        | j                        k(  sJ t        t	        t        | j                                    }| j                  |k(  | _        t        | j                        D ci c]  \  }}||
 }}}t	        t        |            D cg c]  }||   	 c}| _        y c c}}w c c}w rv  )r   r1  r  ra  rc  _is_identityr!  r  )r  identity_sort_idxr,  r  sorted_dims_by_strides_maps        r   __post_init__z*BlockParameters.StrideSorter.__post_init__	  s    t,,-111t}}%T-B-B)CCCC $U3t/D/D+E%F G $1B BD <ET]];S)T41a!Q$)T&)T s#=>?$ +1-$D  *U$s   CCc                    | j                   S r   )r4  r  s    r   r  z(BlockParameters.StrideSorter.is_identity+	  s    $$$r   c                     y)zBCreate a `StrideSorter` that can be used to sort block parameters.Nr   r   r1  rM  s      r   rl  z#BlockParameters.StrideSorter.create/	  s    r   c                b    | j                   s| j                  D cg c]  }||   	 c}S |S c c}w r   )r  r  r  attrr,  s      r   sortz!BlockParameters.StrideSorter.sort6	  s.    ##)-7AQ77K 8   ,c                b    | j                   s| j                  D cg c]  }||   	 c}S |S c c}w r   )r  r  r<  s      r   r  z#BlockParameters.StrideSorter.revert;	  s0    ##)-)=)=>AQ>>K ?r?  Nr1  zlist[int | sympy.Expr]rM  rs   r   r9  )r   r   r   r   dataclassesfieldr  r7  r,  r  r   r   rl  r>  r  r   r   r   StrideSorterzBlockParameters.StrideSorter	  s    ##%6[%6%6E%BB	 
	% 
	% 
		U#9	UFN	U)	U 
 
	U
	
	r   rD  c                  <     e Zd Z fdZe	 	 	 	 	 	 dd       Z xZS )$BlockParameters.IdentityStrideSorterc                "    t         |           y r   rq  r7  r  rt  s    r   r7  z2BlockParameters.IdentityStrideSorter.__post_init__B	      G!#r   c           
     L     | |t        t        t        |                        S )Nr1  r  )ra  rc  r   r:  s      r   rl  z+BlockParameters.IdentityStrideSorter.createE	  s'     !1eC(8$9:; r   rA  )r   r   r   r7  r   rl  rz  r{  s   @r   IdentityStrideSorterrF  @	  s4    	$ 
	#9	FN	)	 
	r   rM  c                  @     e Zd ZdZ fdZe	 	 	 	 	 	 dd       Z xZS )+BlockParameters.TensorDecriptorStrideSorterzT
        Sorts BlockParameters dimensions with strides in descending order.
        c                "    t         |           y r   rH  rI  s    r   r7  z9BlockParameters.TensorDecriptorStrideSorter.__post_init__T	  rJ  r   c                    t        t        t        |                  }	 t        j                  ||d      } | ||      S # t
        $ r |}Y w xY w)a  
            If the strides are not all known constants or if the strides are already
            sorted in descending order, return identity sort.

            For example if block_shape @ strides is [ZBLOCK, XBLOCK, YBLOCK] @ [8, 1, 16]
            The indices to sort the strides in descending order will be [2, 0, 1].
            The indices to revert back to the original order will be [1, 2, 0].
            T)reverserL  )ra  rc  r   r&   r_  r`  )r   r1  rM  identity_sortr  s        r   rl  z2BlockParameters.TensorDecriptorStrideSorter.createW	  sa     !s+;'<!=>M	) !,,Y8HRVW
 !1! 	 " )()s   A AArA  )r   r   r   r   r7  r   rl  rz  r{  s   @r   TensorDecriptorStrideSorterrO  N	  s9    		$ 
	#9	FN	)	 
	r   rT  c                    t        |       }t        d | |fD              \  }} |di |D ci c]  }|||   ||   z    c}S c c}w )z0
        Concatenates block parameters.
        c              3  F   K   | ]  }t        j                  |        y wr   )rB  asdictr  s     r   r   z*BlockParameters.__add__.<locals>.<genexpr>y	  s     Bq[''*Bs   !r   )r"  r   )r  r9  r   r%  r&  r  s         r   __add__zBlockParameters.__add__t	  sR     4jBT5MBB19a8sc1S6AcF?*8998s   Ac                    |j                  | j                  |      }t        di t        j                  |       j                         D ci c]  \  }}||j                  |       c}}}||fS c c}}w )z
        Sort `BlockParameter` with stride_sorter_cls. Returns block parameters
        as well as a `StrideSorter` which contains information on how the sort
        can be reverted.
        )rM  r   )rl  rA  r/  rB  rW  r  r>  )r  rL  rM  r:  r  r  r0  s          r   rT  z,BlockParameters.maybe_sort_with_stride_order|	  s~     *000S  
 !, 2 24 8 > > @C ]'',,
 }$$s   A1c                    fd}t        di t        j                  |       j                         D ci c]  \  }}| ||       c}}S c c}}w )zA
        Remove dimensions where removable_dims is True.
        c                R    t        |       D cg c]	  \  }}|s| c}}S c c}}w r   )r  )ititemis_removableremovable_dimss      r   filter_dimsz0BlockParameters.remove_dims.<locals>.filter_dims	  s3     +.b.*A&D,#   s   #r   )r/  rB  rW  r  )r  r_  r`  r  r  s    `   r   rS  zBlockParameters.remove_dims	  sO    
	  
5@5G5G5M5S5S5UVcsK$$V
 	
Vs   AN)r9  r/  r   r/  )rL  ztype[StrideSorter]rM  rs   r   z4tuple[BlockParameters, BlockParameters.StrideSorter])r_  r6  r   r/  )r   r   r   r   rB  rC  ra  r   r   r?  rA  rD  	dataclassrD  rM  rT  rX  rT  rS  r   r   r   r/  r/  	  s     0k//EEE$5K$5$5d$KK!K 1 1 1$ GGG 1 1 1$ GGG& & &P |   #l # #J:%!3%@H%	=%"
r   r/  c                  *    e Zd ZdZd ZddZd Zd Zy)"CooperativeReductionWorkspaceCachez
    The scratch space used for cooperative reductions can be reused
    after two reduction loops.  This keeps track of what can be reused.
    c                    || _         g | _        g | _        t        j                  t        j
                        | _        d| _        d| _        y rv  )	r   current_loop
prior_loopcollectionsdefaultdictdequeready_for_reuse
loop_countstore_count)r  r   s     r   rr  z+CooperativeReductionWorkspaceCache.__init__	  s@    	*66{7H7HIr   c                    | j                   j                  |      }|r|j                         S | j                  j	                  |d      \  }}}| j
                  j                  |||f       ||fS r  )rj  r  popleftr   	workspacere  r  )r  nbytescachedws_namer(  	ws_offsets         r   allocatez+CooperativeReductionWorkspaceCache.allocate	  sk    %%))&1>>## $		 3 3FE BI  &'9!=>##r   c                    | j                   D ]&  \  }}}| j                  |   j                  ||f       ( | j                  | _         g | _        | xj                  dz  c_        y NrQ   )rf  rj  r  re  rk  )r  rp  rr  rs  s       r   on_loop_endz.CooperativeReductionWorkspaceCache.on_loop_end	  s_    *.// 	F&FGY  (//)0DE	F++1r   c                H    | j                   }| xj                   dz  c_         |S rv  )rl  )r  priors     r   increment_store_countz8CooperativeReductionWorkspaceCache.increment_store_count	  s#      Ar   N)rp  r   )r   r   r   r   rr  rt  rw  rz  r   r   r   rc  rc  	  s    
$r   rc  c                  $    e Zd ZU ded<   d Zd Zy)FixedTritonConfigzdict[str, int]r#   c                     | j                   |   S r   r#   r  r]  s     r   r,  zFixedTritonConfig.__getitem__	  s    {{4  r   c                    || j                   v S r   r~  r  s     r   __contains__zFixedTritonConfig.__contains__	  s    t{{""r   N)r   r   r   r   r,  r  r   r   r   r|  r|  	  s    !#r   r|  c                      e Zd ZdZddZy)	TritonCSEz
    Subclasses CSE to apply the current load mask to the cache key to avoid CSEing
    variables across separate masked blocks.
    c                Z    t         j                  j                  x}r||j                  fS |S r   )rO   r   
_load_maskr   )r  r  r  s      r   augment_keyzTritonCSE.augment_key	  s,    88&&&4&tyy))r   N)r  r   r   zstr | tuple[str, str])r   r   r   r   r  r   r   r   r  r  	  s    
r   r  c                  d    e Zd ZU dZded<   ded<   ded<   ded<   d	 Z	 	 dd
Z	 	 	 	 ddZddZy)TMACompatibilityCheckerzO
    Checks if the TMA API can be used for load / store triton operations.
    TritonKernelr   ry  r   r   r  forcec                    d| _         y )Nz2Cannot use TMA descriptor for load / store since: )failed_debug_prefixr  s    r   r7  z%TMACompatibilityChecker.__post_init__	  s
    #W r   c                B   | j                   ryt        j                  j                         j                  dk(  r4t
        j                  j                         d   dk\  rt        j                  s+t        j                  j                         j                  dk(  r$t        j                  j                  r
t               s!t        j                  d| j                         y| j                   r7| j"                  j$                  r!t        j                  d| j                         yy)	NTr  r   	   xpuz}%s Requires triton>=3.4.0, a CUDA device with cc>=9.0 and `use_tensor_descriptor` and `assume_aligned_inputs` options enabledFz/%s stores with `no_x_dim` cannot load 16 bytes.)r  rO   rP  get_current_device_or_throwr"  r   r  get_device_capabilityr#   assume_aligned_inputsr   use_tensor_descriptorr   r  debugr  r  r   rV  r  s    r   can_use_tmaz#TMACompatibilityChecker.can_use_tma	  s     :: GG779>>&H

88:1=B4477668==F33)+ II[ ((  >>dkk22IIA(( r   c           
     
   | j                   rA|j                  D cg c]+  }t        j                  j                  j                  |      - }}n|j                  }t        j                  j                  j                  |d   t        j                  d            s"t        j                  d| j                  |       y| j                  j                  }|dd D ]  }t        j                  j                  j                  t        ||z  dt        j                  d            t        j                  d            rbt        j                  d| j                  ||        y |j                  d   }t        j                  j                  j                  |t        j                  d            r,t        j                  d	| j                  |j                         yd}d}|j                   D ])  }	t"        j$                  D ]  }
t'        |	|
      s|	}|
} ) + |r|sJ | d
t"        j$                          | j(                  j*                  r;| j,                  s.|t"        j.                  v rt0        |   }d}| j(                  j2                  D ],  }|j4                  s|j6                  |k(  s |j8                  } n |-t        j                  d| j                  ||j                         y| j(                  j;                  |      }|j=                  ||i      |z  }t        j                  j                  j?                  |t        j                  d            s-t        j                  d| j                  |j                  |       yy	 	 d	 	 	 	 	 	 	 dd}||z  dz
  }|jA                  tB        |      jA                  t        |      }tE        tG        t        jH                  ||d                  }|| j(                  jK                  t0        |         kD  r#t        j                  d| j                  ||       y| j(                  jM                  |      }	| j(                  jN                  rW|| j(                  jN                  |	   kD  rt        j                  d| j                  |	| j(                  jN                  |	   |       ytQ        || j(                  jR                  jU                  |	d            | j(                  jR                  |	<   yc c}w # tV        $ r. t        j                  d| j                  |j                         Y yw xY w)z
        Check if the block parameters are valid for TMA.
        If force, we allow relying on symbolic hints equivalent
        to what we check for Triton templates.
        rK  rQ   z=%s TMA API requires innermost stride to be 1. Strides are: %sFN   r   zU%s TMA API requires outer strides to be 16 byte aligned. Dtype bytes: %d, strides: %sz>%s innermost block shape cannot load 16 bytes. Block shape: %sz, expr must contain a single block type from zN%s could not find reduction range tree for innermost prefix %s Block shape: %szj%s persistent reduction innermost block shape cannot load 16 bytes. Block shape: %s, persistent RBLOCK: %dc                    | |z  }|r||z  }|S r   r   )rN  r  r  r  s       r   indexing_div_repzQTMACompatibilityChecker.are_block_parameters_compatible.<locals>.indexing_div_rep
  s    
 a%C!AgJr   zC%s the minimum block size to satisfy expression %s is too large: %dzT%s For block %s, fixed config block size %d is smaller than the minimum required: %dz?%s innermost block shape cannot load 16 bytes. Block params: %sTr   )rN  r   r  r   r  zsympy.Expr | Noner   r   ),r  rA  rO   rP  rI  !replace_backed_symbols_with_hintsrQ  r   r   r  r  r  r   rZ  r   r?  r   r   r   r   r   persistent_reductionr  r   r   re  r\  r   numel_get_persistent_RBLOCKsubsstatically_known_geqr#  r   r6   r  nsolve	max_blockr  fixed_configr~   tma_min_block_sizesr  rL  )r  block_paramsstrA  element_sizeri  innermost_block_shapeinnermost_block_typeinnermost_block_symtblock_type_str
block_symtinnermost_tree_prefix
tree_numelr  persistent_rblockinnermost_block_bytesr  
solve_exprsolve_expr_simplifiedmin_block_sizes                       r   are_block_parameters_compatiblez7TMACompatibilityChecker.are_block_parameters_compatible
  s    :: '..   BB2FG 
 #**G ww77U]]STEUVIIO((
 zz**crl 	F77##;; 5q%--:KLa  		k,, 	 	" !- 8 8 < 7733!5==#3
 IIP((((
 ##3@@ 	N+77 
!.*=+9(+5(		 $(< 	
$%%QR_RkRkQlm	
< KK,,NN$(E(EE %//C$D!J[[,, >>ahh2G&G!"J !
 		d,,) ,,	  $ B B: N%**,@BS+TU " 77##88%u}}R'8 		 A,, ,,%	 T IB
 ,0!! )  	 3\ABF
(2(:(:.)'/+;< & "110" "DKK$9$934%  II]00-&	 !!%!9!9:N!O;;++%(@(@(PP		< 44* KK44^D*  % GJ&77;;NANGDKK33NC kZ  		U,, ,,
 s&   0TB/T 3BT ;AT 4T?>T?c                    | j                   S )aH  
        Can you lift the make_tensor_descriptor
        call to the top of the kernel? This requires
        being certain that all of the shape, stride,
        and block_shape information is handled in arguments
        or top level definitions.

        Right now we assume this is always possible if you force TMA.
        )r  r  s    r   r<  z TMACompatibilityChecker.can_lift
  s     zzr   Nr*  )r  r/  r   r   )	r   r   r   r   r   r7  r  r  r<  r   r   r   r  r  	  sS     OKX&	&P% 
B
r   r  c                  ^    e Zd ZU dZeZded<   eZded<   dZ	e
ZeZded<   eZd	ed
<   dZded<   	 	 	 	 	 d`	 	 	 	 	 	 	 	 	 da fdZedbd       Zedbd       ZdcdZdbdZd Zd Zd Zd ZdbdZd Zeddd       Zdddddd	 	 	 	 	 dedZ	 df	 	 	 	 	 	 	 dgdZ dfdZ!	 	 	 	 	 	 	 	 dhdZ"d  Z#d!Z$d"Z%ed#        Z&dd$d%Z'd& Z(did'Z)	 	 	 	 djd(Z*dkd)Z+	 dl	 	 	 	 	 	 	 	 	 dmd*Z,dnd+Z-d, Z.dod-Z/	 	 dp	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dqd.Z0ddd/Z1drd0Z2	 	 	 	 	 	 dsd1Z3	 	 	 	 	 	 	 	 	 	 dtd2Z4	 	 dud3Z5dud4Z6d5 Z7d6 Z8d7 Z9d8 Z:	 	 dvd9Z;	 	 	 	 	 	 dwd:Z<	 	 	 	 	 	 dxd;Z=	 	 	 	 	 	 	 	 dyd<Z>	 	 	 	 	 	 	 	 	 	 dzd=Z?did>Z@d? ZAd{d@ZBd|dAZCdB ZDdC ZEeF eGd      dddD              ZHeFdE        ZIeFdF        ZJdldddGZKedH        ZLedI        ZMdJ ZNd}dKZOdL ZP	 d~	 	 	 	 	 ddMZQdndNZRddOZSddPZTddQZU	 	 	 	 	 	 ddRZVddSZWddTZXddUZYddVZZdbdWZ[ddXZ\e]d{dY       Z^ddZZ_dd[Z`e]dd\       Zadd]Zbdd^Zc	 	 	 	 	 	 dd_Zd xZeS )r  zdA class to represent a triton kernel and helpers to generate
    triton kernel programmatically
    r  helper_functionszCallable[[sympy.Expr], str]kexprTztype[BlockPtrOptions]block_ptr_options_clsztype[TensorDescriptorOptions]tensor_descriptor_options_clsNzbool | None3transpose_discontiguous_tensor_descriptors_overrideFc                   || _         || _        || _        t        |   |fi | t        | j                  | j                        | _        i | _	        t               | _        t               | _        t               | _        t        t                  | _        || _        t%        j&                         | _        t+        t,        t,        f          | _        t1               | _        t5        j6                  t*              | _        t+        t,        t:        f          | _        || _        t5        j@                         | _!        d| _"        d| _#        t        tH                  | _%        d | _&        | jN                  r| jQ                  | jR                         | jT                  r| jW                          | jY                          | jT                  r| j[                          d| _.        g | _/        y )Nr   F)0optimize_maskr  is_combo_kernelrq  rr  r  newvar_prefixsuffixr   prologue_cacherY   prologuepost_loop_combinepost_loop_storer   r	   outside_loop_varsr  r  countblock_ptr_iddictr   block_ptr_to_bufferr  r  rg  rh  pointer_advancementsr  r  hint_overrideCounter_load_counts_pdl_load_index_pdl_has_waitr0   autotune_hintstriton_metarY  codegen_reduction_numelsr8  cooperative_reductioninit_cooperative_reductioncodegen_range_treeinit_cooperative_reduction_maskhas_load_with_contiguous_rdimstores_with_contiguous_rdim)	r  tilingr  r  r  r  r  rw  rt  s	           r   rr  zTritonKernel.__init__
  s    $1(%4*6*T//=.0(6(81?1A/=/?!+C!2#6 %OO-#'S>#3  / 1##D) 	! $(S>#3 *6A6I6I6K " )6826  ))$))4%%++-!%%002-2*68(r   c                   t         j                  st         j                  j                  sy| j                  }|D cg c]  }t        |t        j                        r|  }}t        |      dk(  ry	 t        j                  j                  j                  | ||      }t        d |D              S c c}w # t        $ r Y yw xY w)NFr   c              3  &   K   | ]	  }|d k(    ywr  r   )r   ri  s     r   r   z4TritonKernel._has_stride1_on_rdim.<locals>.<genexpr>>  s     966Q;9   )r#   deterministicr  force_filter_reduction_configsr   r   r   r   r   rO   rP  rI  stride_varsZeroDivisionErrorr}   )r  support_varsr   reduce_varsr  s        r   _has_stride1_on_rdimz!TritonKernel._has_stride1_on_rdim$  s       F$7$7$V$V)) $
c=#@#@A 
 
 {q 	''**66uk<XK 9[999!
 ! 		s   #B.1+B3 3	B?>B?c                <    t        d | j                  D               S )Nc              3  2   K   | ]  }t        |        y wr   )r[   )r   r   s     r   r   z>TritonKernel.has_store_with_contiguous_rdim.<locals>.<genexpr>B  s      
(,d#
r  )rR  r  r  s    r   has_store_with_contiguous_rdimz+TritonKernel.has_store_with_contiguous_rdim@  s'     
040P0P
 
 
 	
r   c                    t        |      S r   )rI   )r  r   s     r   dtype_to_strzTritonKernel.dtype_to_strF  s    5!!r   c                p    | j                   xr) t        j                  j                  | j                        S r   )rY  rO   choices should_use_cooperative_reductionr[  r  s    r   r  z-TritonKernel.should_use_cooperative_reductionI  s-    $$ 
)S)SMM*
 	
r   c                     j                   sJ  j                  D ]$  }|j                  |xj                  dz  c_        &  j                  d   } j                  rt        | j                  d         } j                  j                  |       _        t         j                         _
         j                  j                  d       t         fd j                  D              r j                  j                  d       yy)z/One time setup code for cooperative reductions.NrQ   rN  r   a              RSPLIT_NEXT_POWER_OF_2: tl.constexpr = triton_helpers.constexpr_next_power_of_2(RSPLIT)
            RSPLIT_IS_POWER_OF_2: tl.constexpr = RSPLIT == RSPLIT_NEXT_POWER_OF_2
            HAS_RSPLIT: tl.constexpr = RSPLIT > 1
            rsplit_id = tl.program_id(0)
            num_rblocks = (rnumel + RBLOCK - 1) // RBLOCK
            rsplit_chunk = (num_rblocks + RSPLIT - 1) // RSPLIT * RBLOCK
            rsplit_start = rsplit_chunk * rsplit_id
            rsplit_end = rsplit_chunk * (rsplit_id + 1)
            c              3  Z   K   | ]"  }|j                   rj                  |        $ y wr   )r\  r  r   r   r  s     r   r   z:TritonKernel.init_cooperative_reduction.<locals>.<genexpr>j  s0      
   ''--
s   (+z>rsplit_end = tl.where(rsplit_end < rnumel, rsplit_end, rnumel))r  re  grid_dimrZ  r  r   r   
semaphoressemaphores_namerc  %cooperative_reduction_workspace_cacher8  splicer}   r%  )r  r   	sem_counts   `  r   r  z'TritonKernel.init_cooperative_reductionN  s    )))) $$ 	#D}}("	# KK$		4+<+<X+FGI#yy33I>5WII6
2 					
  
((
 

 IIP
r   c                   d}| j                   s| d}| j                  j                  d|        | j                         r| j                  j	                  d       y | j                   rJ | j                  j                  d       y )Nz$tl.arange(0, RSPLIT_NEXT_POWER_OF_2)z	[None, :]zrsplit_arange = z                if RSPLIT_IS_POWER_OF_2:
                    rsplit_mask: tl.constexpr = None
                else:
                    rsplit_mask = rsplit_arange < RSPLIT
                zSrsplit_mask = xmask if RSPLIT_IS_POWER_OF_2 else ((rsplit_arange < RSPLIT) & xmask))rV  r8  r%  _has_constant_xmaskr  )r  rsplit_aranges     r   r  z,TritonKernel.init_cooperative_reduction_masks  s{    >}},oY7M		.}o>?##%II }}$$IIer   c                2   | j                   D ]q  }|j                  s| j                  || j                         ,| j                  s9| j                  j                  |j                   d| j                  |              s | j                  rt        d | j                   D              rS| j                  ddd      }| j                  |      }| j                  j                  d| j                  |              y | j                  | j                         y y )Nzbase = c              3  4   K   | ]  }|j                     y wr   )is_loopr   r   s     r   r   z2TritonKernel.codegen_range_tree.<locals>.<genexpr>  s     =D4<<=s   r
  Tr   zrbase = )re  r  iteration_ranges_codegen_headerr8  rY  r%  r   iteration_ranges_ranges_coder}   _get_reduction_symbols_flatten_reduction_indicesr  r  codegen_reduction_indices)r  r   rn_basesrbases       r   r  zTritonKernel.codegen_range_tree  s    $$ 		D<<44T499E&& 		##{{m74+L+LT+R*ST		   =D,<,<==66Dd 7  77A		  8D,=,=e,D+E!FG ..tyy9 !r   c                     y)z
        Indicate whether we need provide numel as arguments for the generated
        kernel calls in the benchmark.

        Should be true for pointwise/reduction kernels but false for triton
        matmul kernels.
        Tr   r  s    r   need_numel_argszTritonKernel.need_numel_args  s     r   c                    | j                   xr4 t        j                  j                  | j                  | j
                        S r   )rY  rO   r  should_use_persistent_reductionr[  r  r  s    r   r  z,TritonKernel.should_use_persistent_reduction  s5    $$ 
)R)RMM455*
 	
r   c                    | j                   xrG t        | j                        | j                  dz   k(  xr  | j                  xr | j                  d   dk(  S )NrQ   r   )r  r   rZ  rX  r  r  s    r   want_no_x_dimzTritonKernel.want_no_x_dim  sY    %% 1DKK D$;$;a$??1!!1 !!(+q0		
r   c                     y)Nztl.device_assertr   r  s    r   assert_functionzTritonKernel.assert_function  s    !r   )
copy_shapedense_indexingoverride_maskr  r  c          
        !"#$  j                        j                  }d}t               !t        |t	        j
                  d            D ]y  }	t        |	t        j                        sJ |xs t        |	t        j                        }|rAt        |	t        j                        r? j                  j                  |	j                      }
!j#                  |
j$                         t        |	t        j&                  t        j(                  t        j*                  t        j,                  t        j.                  t        j0                  f      rt        j2                  D cg c]  }t        |	|      r	t4        |    }}t7        |      dk(  r	 t7        |      dk(  sJ d|	j                           !j9                  |d    d       | t:        j<                  j>                  xs |xs  j@                  duxr dk7  }d	}d}t               } jC                         D ]@  }|jE                  |jF                        rd	}nd}|j9                  |jH                   d       B |r& jJ                  rt:        j<                  jL                  srjO                         rz|sx j@                  slt7        !|z
        dk(  r[ jQ                        sJ|rH jR                  d
k(  r9	 	 	 	 	 	 dd"	 	 	 	 	 	 d fd$	 	 	 	 	 	 d"$fd#d!# fd} |       }||S d}d} jU                        } fd}tW              rst7         jY                               dk(  r |       \  }}nLt[        dgt7         jY                               z        }t]        dgt7         jY                               z        }d| d| d} j^                  s j`                  rt         fd jb                  D              !n
t               ! j@                  r!j9                   j@                         te        |!|||      S |r|sˉ jf                  r jh                  r!jk                         } j@                  r|j9                   j@                         t        g d      }|jm                  |      sq|jo                  |      }|jq                         }t        |tr              sJ |ju                  |       |j#                  |j$                         |jm                  |      sqdgt7         jY                               z  }|D ]y  }t        |tZ              sJ  jC                         D ]R  }|jw                  |jH                        s|jx                  }t        |tz              sJ  jY                         |   ||<   T { ddj}                  t        tZ        |            z   dz   }t]        |      }d| d| d}n/ |       \  }}d| d| d}|!n|sr |       \  }}d| d| d}|!||s|r |       \  } }nd}|rt        |g      ! j@                  r!j9                   j@                          j                  !       te        |!|||      S c c}w ) zO
        Compute the index and mask to pass to tl.load() or tl.store()
        Fr   r  r   rQ   r   r  NTtl.int32c                    t        j                  | |j                               }|yt        |j                  gt
        j                  |      g|gt
        j                  |      g      S )z
                Matches expressions of the form:
                    idx = s * xindex

                This implies stride (s,), and shape (XBLOCK,).
                Nr   r?  rA  rD  )rR   match_affine_block_exprsymbolr/  r  r   r   r   )r  
range_treeri  s      r   match_affine_blockz1TritonKernel.indexing.<locals>.match_affine_block  sl     -DD:,,. >&%++,!.!=!=j!I J#H*;;JGH	 r   c                   |j                         }t        j                  dt        j                  t        j
                  |g            \  }}t        dt        |j                        | j                  t        ||            | j                  t        |||            z         }t        j                  j                  j                  |       } j                  |j                         }t#        j$                  | |||      }|y|\  }}	}
t#        j&                  |      }t        j                  j                  j)                  |j*                        t-        fd|D              ryt.        j1                  |      }t3        |j5                  |d               gt7        |dd |dd       D cg c]C  \  }}t        j8                  t3        |j5                  |            j5                  |            E c}}z   }|
D cg c]#  }t;        ||t.        j=                  |      i      % }}t?        |D cg c]  }j5                  |       c}||	|	      S c c}}w c c}w c c}w )
a  
                Matches higher-dimensional blocks coming from FloorDiv and ModularIndexing.

                Example expression to match:
                   sN * ((rindex//(d1 * ... * d(N-1))))
                       + s1 * ModularIndexing(rindex, 1, d1)
                       + ...
                       + s(N-1) * ModularIndexing(rindex, d1 * ... * d(N-2), d(N-1))

                This iterates over a block of shape (dN, ..., d1) and stride
                (sN, ..., s1). (d1,...,d(N-1)) and (s1,...,sN) are
                wildcards that we match.

                Note that dN does not appear in the expression, but we solve for it
                using range tree numels and the other dims.
                zdenom modulo)excluder   r"   Nc              3  l   K   | ]+  }j                  |       xr j                  |        - y wr   )r~  statically_known_power_of_2)r   r  r  rI  s     r   r   zETritonKernel.indexing.<locals>.match_mod_div_block.<locals>.<genexpr>u  sH        !==eYOO H$@@GGHs   14r   rQ   r  ) r  r   symbolsr  r  Wildr~   r   r  r  r   r   rO   rP  rI  remove_precomputed_replacementsr  rR   match_mod_div_block_exprget_slice_numelsr  r   r}   r   r   r   rG  r  MinrH   r   r/  )r  r  r  r  modulonum_dimsr  match_resultdimsrA  block_index_exprsslice_numelslinear_block_sizerg  r?  r   r   dr  rI  r  s                     @@r   match_mod_div_blockz2TritonKernel.indexing.<locals>.match_mod_div_block(  sh   & '--/	 !&"!))%**ykJ!v
  
(()HY$>?++oi&OPQ	& 77++ @@G @@AQAQR2KK9eX   ' !	%2CCDI 77++ NN:+<+<=	  ". 
   %2$@$@$L!) 88aI1 '*,qr*:DH&E #s II-x/O/OPU/V !88=	1( !2	3  y-*H*H*TU3 3 'HLM18;;A>M +#)	 %3 Ns   7AI	(I%<I*c                ~    fD ]5  }t        j                  | |j                               } |||      }|3|c S  y)ze
                Match a block indexing subexpression involving a single range tree.
                N)rR   factor_index_exprr  )r   r  
match_funcfactored_index_exprmatchr  r)  s        r   match_block_subexprz2TritonKernel.indexing.<locals>.match_block_subexpr  s\     ''# 	%J +>*O*Oj//1+' '':JGE($	% r   c            	        t        j                  j                         D  ci c]  \  } }| |j                   c}}       }j	                         }|D cg c]&  }t        j                  ||j                               ( }}t        d |D              }t               }t        ||      D ]@  \  }}t        |j                  |j                              dkD  r y  ||      }	|	 y ||	z  }B |t        |      z
  }
j                         t         j"                  j$                  rj&                  nj(                  }t         j"                  j$                  rd}t        j*                  }nt-        t.              j1                         }j2                  	 j2                  }nt         j"                  j4                  }t7        d      r|d uz  }|rt        j8                  nt        j*                  }|j;                  ||
|j<                  ||      }|t>        k(  r,t-        t.              jA                  |jB                        sy |S c c}} w c c}w )Nc              3  <   K   | ]  }|j                           y wr   )r  r  s     r   r   zBTritonKernel.indexing.<locals>.match_block_expr.<locals>.<genexpr>  s     *QT4;;=*Q   rQ   Ftemplate_out_shape)r0  r1  re  r	  rf  r<  rL  )"rH   range_tree_nodesr  r   r   rR   get_subexpr_involving_symbolr  r   r/  r  r   intersectionr   sumfilter_masksr#   r   use_block_ptrr  r  rM  r
   r  r<  r  )transpose_discontiguous_tensor_descriptorr   rT  rl  r  r  r  r0  )r  r  index_relative_to_xyr_indexre  r   index_subexprsrange_symbolsr  subexprr0  r   options_classr<  rL  transpose_contiguousoptionsr
  r  r	  r/  r  r  s                   r   match_block_exprz/TritonKernel.indexing.<locals>.match_block_expr  sl   .8$2G2G2M2M2OP$!QAqvvIP/+ #557 !,	"  (DD3T[[]" " !+*Q[*Q Q.0%(n%E 
+MD' =55g6J6JKLqP# 1$?F~# F*L
+ 5s>7JJ !!), }}22 ..;;  ==..$H(7(L(L%04/1J1-  9AACH PP#$ !TT -
 #MMSS - t%9:,
$0FF, 0 (CC,AA & (..'$* +'"&..%&7 /  !$;;04/1J1- 5TT  ${ Q"s   I+Ic                      r7t         t              r  dd fS ddj                  d  D              z   dz    fS j                         t	        j                               fS )Nz.shaper  r  c              3  2   K   | ]  }t        |        y wr   r   )r   r  s     r   r   zATritonKernel.indexing.<locals>._get_expand_str.<locals>.<genexpr>$  s     *Fa3q6*Fr  r  )r   r   r&  dense_size_strr   r  )r
  r  s   r   _get_expand_strz.TritonKernel.indexing.<locals>._get_expand_str  sh    j#.(\0$66*F:*F!FFLjXX**,eD4H4H4J.KKKr   r  r  z, tl.int32)c              3  v   K   | ]0  }|j                   s"j                  |      s|j                   d  2 ywr  N)r\  r  r   r  s     r   r   z(TritonKernel.indexing.<locals>.<genexpr><  s:      ',,T5L5LT5R {{m4('r   )r  )xmaskymaskzmaskr0_maskr   r  r  r  r  r  r   )r  r   r  rf   r   BlockParameters | None)r   r   r  rf   r   rN  )r   zBlockDescriptorOptions | None)Aprepare_indexingr   r   r'  operator
attrgetterr   r   r  r   r   r   r    r   r   r   r   rv  r	  r   r   r   r   r   r   r   r   r   r'  r#   r   r  r  r   r6  var_listr   allow_block_ptrr9  r  is_indirect_indexingr  r  r   r  r   r   r  r  re  r  rY  r  copyissubset
differencerW  rn  r  r  r   r  r&  r(  r8  )%r  r  r
  r  r  r  r  
index_varsr  r   r   r   prefix_matches
need_dense
have_densehave_loop_varsdense_mask_varsr   rB  rA  r
  r  r  rG  
mask_shapexyzr	tmp_masksr  expand_listr  rg  expand_shape_strr(  r	  r  r/  r)  s%   ```   `                          @@@@r   r  zTritonKernel.indexing  sk    %%e,''

%/\	*(*=*=f*EF !	:Cc5<<000# ~]22(J TXX.((..sxx8  !2!23%%II))JJJJ''
 
 !. 9 9"%c40 t$" "
 ~&!+>*a/N3CCHH:1NN/!2 3489C!	:H MM(( ++d* qj	 	 
+5<++- 	6D&&t}}5!%"
4;;-t 45	6 t338S8S-1==? "OOI/0A5--e4  J.!/B',x!x/Bx'xt / ((_ _D '(G"
'+%%e,		L !' S!5!5!78A=+:+<(
L !s4+?+?+A'B!BC
$aS3t/C/C/E+F%FG":,b;GI  D$8$8& ' $ 0 0' 	 'L	doo.")  j$$)>)>. '^^-
??NN4??3!"HI$--d3 * 5 5d ;I#--/C%c+<===&&s+%%cmm4 %--d3  #ec$*>*>*@&AA& KD%dC000 $ 7 7 9 K??4;;7"&//C#-c3#77#7/3/C/C/Ec/JK,	KK !388C[,A#BBSH
$[1.ykJ<qI	+:+<(
L.ykJ<qI	+	J-<->*l*9+R8H7IKI'IZ"1"3<!"M?3I??MM$//*)$%
 	
a"s   &Z
c                   |j                         }t        |t              r|r&|r$|dk(  sJ d}n|sd}n|r|dk(  sJ d|d}nd|}| j                  r)| j                  d   j
                  r|j                         s|j                  r|j                  r!|| j                  v r| j                  |   }||fS |j                  |d      }| j                  j                  |      }|rt        |      |fS t        | j                        }	t        |t              rd|	 }nd	|	 }| j                  j!                  |t"        j$                  g 
      }
| j                  j'                  ||
       t)        || d|       }|j                  r+| j*                  j-                  |       || j                  |<   n| j.                  j-                  |       t        |t              rn|| j0                  |<   t2        j4                  D ]L  }|j7                  |      }t9        d |D              r'| j:                  |   }||vsJ d| d| d       |||<   N ||fS |j                  |      }||fS )a  Generate a block pointer or tensor descriptor for Triton kernel operations.

        This method creates either a block pointer (for regular Triton operations) or
        a tensor descriptor (for TMA operations) based on the indexing type. It handles
        caching and reuse of descriptors for performance optimization.

        Args:
            name: The name of the buffer/tensor being accessed
            var: The variable name for the pointer
            indexing: Block pointer options or tensor descriptor options containing
                     indexing information and boundary check settings
            other: Additional parameters string (e.g., padding options)

        Returns:
            A tuple containing:
            - block_descriptor: The generated block pointer or tensor descriptor variable name
            - other: Modified additional parameters string with boundary check options
        , other=0.0r   , boundary_check=z, padding_option='zero'rK  F)rr  r  tma_descriptorr  r  c              3     K   | ]A  }t         j                  j                  j                  |t	        j
                  d              C ywr   N)rO   rP  rI  rQ  r   r   )r   r   s     r   r   z1TritonKernel.codegen_block_ptr.<locals>.<genexpr>  s=       !' GG,,DD &a(8s   AA	z#duplicate advancement for pointer 'z' at type 'r   )r  r   r  rY  re  r  r  r<  r  r  r   r  r   nextr  r  namedvarr   uint64r  rX   r  r%  r8  r  r   r   r  rR  r  )r  r   r   r  r9  checkblock_descriptorblock_ptr_line	block_varblock_descriptor_id	named_var	line_bodyr   advance_offsetsadvancementss                  r   codegen_block_ptrzTritonKernel.codegen_block_ptr  s   2 '')h 78 ------+E94KL+E95 !!  $,,##%  SD,?,?%? $(#6#6s#; h  &&e "*e!D HH,,^<	 y>500&*4+<+<&=#h8)23F2G'H$)78K7L'M$ HH--$ELL . 	 ^Y7(2B1C3~FV/WX	$$MM++I6/?D'',II''	2h8 BFD,,-=> !. = = I*2*B*B4*H   +:	  %'+'@'@'F/|C ABRASS^_c^ddefC :I%56!I&  &&  (s3&&r   c                ,   d| d|j                    d}t        t        |j                   |j                              D ]B  \  }\  }}t        j
                  j                  j                  ||      s4d|j                  |<   D |j                  ||j                   |j                  dd      }t        j
                  j                  |      }	|| j                  j                  v rU| j                  j                  |   }
t        |
t              s,t        j
                  j                  |
j                   d         }	| dt#        |	       d}t        |t$              rd	| d| | dS | d
t        j&                  j)                  |j*                         d| dS )Nr  r  r  FTr  r  r   r  	tl.store(z.store()r8  r!  r  r5  rO   rP  rI  rQ  r7  r  r?  	get_dtyper   inplace_buffersr   r^   other_namesrV  r  r   r  rD  )r  r   r  r  r  r9  r  rg  broadcast_dimstore_dtypebufs              r   codegen_block_ptr_store_linez)TritonKernel.codegen_block_ptr_store_line  s    #5'H,@,@+AC *3$$h&>&>?*
 	8%C%#} ww77]K27**3/		8 66      7 
 gg''-499,,,))++D1Cc:.gg//0BC'/<=Q?h0ykE75';;GAHH$9$9(:J:J$K#LBugUVWWr   c                   |s|sy t        |t        j                        sJ | j                  |dd       }t        |t              sJ |j
                  }|j                         r|j                  nd }|rt        | j                  |            nd }| j                  ||rdnd ||      }	| j                  |      }
| j                  j                  |
|	dt        j                         y )NFr  0)
assignmentr   )r   r   r   r  r  r  r  r)  texprrename_indexingindirect_assertget_load_bufferr   r  r   r[  )r  r   r  lowerr  r  r  r)  size_strlinebuffers              r   check_boundszTritonKernel.check_bounds,  s     $

+++==RV=W(O444&&	(0(9(9(;8$$8=5--d344 ##esx
 %%h/&$5Lr   c                    |j                         s|j                         r| j                  S | j                  r5| j                  d   j
                  r|j                         s| j                  S | j                  S )NrK  )	r  r  r  rY  re  r  r  r8  loads)r  r  s     r   r  zTritonKernel.get_load_bufferF  sb      "h&:&:&<<<!!  $,,'') 99::r   ztl.extra.cuda.gdc_wait()z%tl.extra.cuda.gdc_launch_dependents()c                    t         j                  j                  j                  j                  syt        t        j                  t         j                  j                  j                        ryt         j                  j                  ryt        j                  j                         j                  dk(  xr$ t         j                  j!                         d   dk\  S )NFr  r   r  )r   r  r#   r   
enable_pdlr   rO   r   select_algorithmTritonTemplateKernelr  r  rP  r  r"  r  r  r   r   r   _enable_pdl_codegenz TritonKernel._enable_pdl_codegenX  s    %%,,77ahh @ @ U UV==GG//166&@ ;

00215:	
r   consider_readsc                  | j                         sy t        j                  j                  t        j                  j
                  $t        j                  j
                  j                  nd fd|sJ d u xs t        fd|D              }|sy | j                  r%| j                  d   j                  r| j                  }|j                  | j                         y )Nc                     J j                   j                  }r*t        j                  |j                   j                        }t         fd|D              S )Nc              3     K   | ]7  }j                   j                  |j                  |j                        k(   9 y wr   )mutation_renamesr  r   )r   wcurrent_nodedeps     r   r   zOTritonKernel._handle_pdl_before_access.<locals>.matching_dep.<locals>.<genexpr>u  s9       |4488HHs   =A )read_writeswritesr  r  readsr}   )r  	prev_depsr  r  	prev_nodes   ` r   matching_depz<TritonKernel._handle_pdl_before_access.<locals>.matching_depp  sY    (((!--44I%OOIy7L7L7R7RS	 "  r   c              3  .   K   | ]  } |        y wr   r   )r   r(  r  s     r   r   z9TritonKernel._handle_pdl_before_access.<locals>.<genexpr>{  s     ,S\!_,Ss   rK  )r  rO   r   r  rP  	schedulerprevious_noder}   rY  re  r  r8  r%  GDC_WAIT)r  wait_bufferr  dependencies	need_waitr  r  r  s     `  @@@r   _handle_pdl_before_accessz&TritonKernel._handle_pdl_before_accessf  s     '')xx,,/0ww/@/@/LAGG++RV 		 |%S,Sl,S)S	  T%5%5b%9%A%A))Kdmm,r   c                   | j                         sy |j                  dkD  ry | j                  r%| j                  d   j                  r| j
                  }|j                  | j                         |j                  | j                         y )NrQ   rK  )	r  	use_countrY  re  r  r  r%  r  
GDC_LAUNCH)r  launch_buffer
result_vars      r   _handle_pdl_after_loadz#TritonKernel._handle_pdl_after_load  so    '')!#  T%5%5b%9%A%A 22M 	.0r   c                $   g }d}d }|j                   D ]t  }t        |      t        u r| j                  |v r|r%d}t        |      t        u r,| j                  |v r||j                  |       t        |      }|j                  |       v ||_         y )NFT)_linesr"  r   r  r  rW  r   r  )r  code	new_lineshas_waitprevious_launchls         r   _filter_pdlzTritonKernel._filter_pdl  s    	 
	 AAw#~$--1"4#HAw#~$//Q"6".MM/2"%i.Q
	   r   c                P    | j                   j                  t        |||             y r   )saved_partial_accumulater  rg   )r  r   r   r  r  s        r   r  zTritonKernel.partial_accumulate  s$     	%%,,dNC8	
r   c           
        | j                   j                        }| j                  xx   dz  cc<   t        }| j	                  |      |}t
        j                  j                        }| j                  |d| j                  | |dd            }t        |t              r"| j                  |j                        rd| _        |j                         |j!                         }t#        d | j%                  |      j'                         D              }	| j)                  |      rd}
nX|	sd}
nS| j*                  rE| j,                  d   j.                  r,fd	}   d
}
t1        j2                  t4        d|      }nd}
|sr8|j7                         r(| j8                  rdt;        | j8                         }nd}nd}	 d}t<        j>                  j@                  r&| jB                  jE                         }|v r|   dkD  }	 | j)                  |       xr | j*                   xr | xr |	}d}|rd}d}d}tG              re|}|tH        jJ                  tH        jL                  fv r=t<        j>                  jN                  rtH        jP                  }n|dtS        |       dz  }d}nt        |tT        tV        f      r| jY                  |||      \  }}t        |tT              rd| | |
 | d}n/| dt
        jZ                  j]                  |j^                         d}|ja                  ||jb                  |jd                  dd      }|jd                  }ntg        |      rd| d| d}|jh                  }d}n^d| d|jj                   d|jl                   |
 | | d
}|jn                  r|jn                  }ntp        js                  |j                        }|tH        jJ                  tH        jL                  fv r/t<        j>                  jN                  r|dz  }tH        jP                  }|tH        jt                  k(  r/tH        jv                  jx                  |dz  }tH        jt                  }| j{                  |      }| j}                  |       | j~                  j                  | ||      ||      }| j                  ||       |j                  dkD  rxx   dz  cc<   t        |t              sJ |j                  |_D        |rd| d| d}| j~                  j                  ||||jn                        }|j                  r|j                  rd}n|tH        jt                  k(  rd }nd!}| j8                  rt;        | j8                        n|}d"|jl                   d| d| d}| j~                  j                  ||||j                        }| j*                  r|j                         ss| j                  j                  |       |S )#zc
        Load from the memory location 'name', offset by some indexing expression 'index'.
        rQ   TFr  r  r  c              3  &   K   | ]	  }|d k(    ywr  r   )r   r,  s     r   r   z$TritonKernel.load.<locals>.<genexpr>  s      
AF
r  z, eviction_policy='evict_last'rK  c                          kD  rsryy)N
evict_lastevict_firstr   )expected_countr  indirect_indexingload_countsr   s   r   decide_laterz'TritonKernel.load.<locals>.decide_later  s    t$~5"3'$r   z, eviction_policy='<EP>'z<EP>r   z, other=rd  z, cache_modifier='.cg'Nr  r  r   r
  z.load(rw  r  r  ), r  r  r  r  r  z0.0Truer  r  )Jr   r  r  r   rT  rO   rP  ry  r  tma_compatibility_checker_clsr   r  r  r  r  r  r  r}   get_strides_of_loadr  is_broadcastedrY  re  r  r  r  r?   r  _load_otherrc   r#   r   skip_l1_cacher[  buffer_read_countsrm   r   rj  rk  ri  r   rI   r  r  ru  r   r  rD  r  r?  r8  r   r
  r  r)  r  r   r   r   r  r  r  r  r   r  r  r  rn  r	  r   r   r#  r  r'  )r  r   r  r   	make_lineoriginal_indexr   r  r  is_coalescedepr  r9  has_read_depsr  r  cachemodappend_broadcastr   r  rm  load_bufferr  r  	other_valr  r  r  r  s    `                       @@@@r   r  zTritonKernel.load  s    iiood#''DQ=E	 55e<!!$'==&*&H&H	 'I ' ! 	
 h0T5N5NNN6
 26D.((*
**,  
 44^DKKM
 
 ~.1B1B""t'7'7';'C'C% % ).N+B!))*:FLQIB:8+<+<+>"=1A1A#B"CD%E	 ==&&!%!A!A!C )) 24 81 <	 ##N33 )))!! 	 	 /H $#D)D 77==77!MMEd;u#5"6a88DE (_6M$NO*.*@*@#x+' % h8%&6%7wrd8*AND./vahh6K6KHL\L\6]5^^_`D==((((#'# >  !,,&~6!#d>*:"=#+#6#6 !#d8+=+=*>c(BSBSATUWTXY^X_`h_iijk (($11E)99(..IE %--88MM88))

"u}}'8'8'@ &

**84&&{D9XX&&4U ' 

 	##K<!#"*&7888'11
%j\4D3EQGD**Th6K6K + J !!** Dejj(!DD7;7G7GM$"2"23T  #8#4#4"5R
|2i[PQR!XX..U*:J:J / 
 $$X-?-?-A*""&&z2r   c           	        | j                   j                  |      }|}t        j                  j	                  |      }d}||dk(  r|dk(  }	| j                  | |d|	      }| j                  |d|du |      }
t        |
t              r6| j                  |
j                        r| j                  j                  |       || j                   j                  v }| j                  |      }|r'|r%| j                  j!                  t#        |d             t        |
t$        t&        f      r-| j)                  |||
      \  }}| j+                  ||
|||      }nV||
j,                  }t/        |      r~| j0                  r|d| j3                          dz  }nZ|j4                  Nt7        d	 |j4                  D              s2d
j9                  t;        t<        |j4                              }|d| dz  }d| d| d| d
|
j>                   d	}n|dk(  rd| _         |
j,                  }t/        |      rZ|j4                  Nt7        d |j4                  D              s2d
j9                  t;        t<        |j4                              }|d| dz  }d| d| d| d
|
j>                   d	}ntC        d|       tE        jF                         }| jH                  s7| jJ                  r+|jM                  | jO                  || j                               | jQ                  | j                  |d       | j                  j!                  t#        ||             | jH                  s| jR                  jU                  |       |jW                          y)zn
        store the 'value' to the memory location 'name', offset by some indexing expression 'index'.
        NtmaTr  )r  r  r  ztl.debug_barrier().broadcast_to(r  c              3  8   K   | ]  }t        |      d k(    ywr   NrE  r  s     r   r   z%TritonKernel.store.<locals>.<genexpr>  s      9&'CFcM9   r  rx  r  r  
atomic_addc              3  8   K   | ]  }t        |      d k(    ywr  rE  r  s     r   r   z%TritonKernel.store.<locals>.<genexpr>       ?aCFcM?r  ztl.atomic_add(z, sem='relaxed')zstore mode=r  ),r   r  rO   rP  ry  r  r  r   r  r  r  r  r  rz  r  storesr%  rX   r  r  ru  r  r  r   r  rF  r   rR  r&  r(  r   r)  atomic_add_foundr  
contextlib	ExitStackrY  r  enter_contextguard_cooperative_storer  r  r'  close)r  r   r  r  moder   r  r   r  r  r  
is_inplacer  rm  r9  r  indexing_strvalue_shape
exit_stacks                      r   storezTritonKernel.storei  sL    iit$!!$'$(!<45=EME(,(J(J	 )K )% ==dl&?	 ! 
 h0T5N5NNN6
 ,,33D9 TYY666
,,^<.KK!!,t5I"JKh2I JK&*&<&<T3&Q#e44h 0%D \ $--L$U+'' nT5H5H5J4K1$MML[[,S 9+0;;9 6 #'))CU[[,A"BK n[M$CCLse4~Sr(BSBSATTUVD\!$(D!#--L%e,KK+?5;;??"iiC(=>.Q ??#C5\N#eWBxGXGXFYYijD%D6&:;;))+
$$)C)C$$T%A%A$%TU&&t{{D&Nl467$$""&&u-r   c                Z    | j                   j                  d| dt        |       d       y )Nztl.device_assert(r  r  )r  r%  r"  )r  condmsgs      r   device_assert_asyncz TritonKernel.device_assert_async  s(    !24&49+QGHr   c                    | j                   j                         }|j                  t        |d| d             |j	                         S )z
        For cooperative reductions only one thread block should write out the result.
        We rotate which thread block does each write for better parallelism
        zif rsplit_id == (z % RSPLIT):)r  rz  r%  rX   indent)r  r   r  r  s       r   r  z$TritonKernel.guard_cooperative_store  sC    
 88NNPd.?uK,PQR}}r   c                t    d }|D ]0  }|t        |d      s||j                  }"||j                  z  }2 |S )Nr	  )r   r	  )r  	variablesmaskselems       r   _combine_maskszTritonKernel._combine_masks  sK     	3D|t[)= NNE!DNN2E	3 r   c                V   | j                   j                  t        j                         | j                  j                  |d         }| j                  |d         }	| j                  |d         }
| j                  |d         }|r| j                  j                  |d         nd}|r| j                  |d         nd}|t        j                  k(  rd}n!|t        j                  k(  rd}nt        d       | j                  | j                  |d   g|r|d   gng   | j                  j                  | j                  d	| d
| d
|	 d
|
 d
| d
| d
| d
| d
| d
| d
| d||j                        }| j!                  | j                  |       | j#                  |||      }||_        |S )z3
        See [Note: Inductor bucketize op]
        r   rQ   r"   r   r   r  ztl.int64z5Bucketize only supports indexing with int32 and int64z'triton_helpers.bucketize_binary_search(r  z, )r  )r  r'  r0   ONE_ELEMENT_PER_THREADr   r  r  r   r[  r  r  r  r  r   r  r   r  r  r	  )r  r  
boundariesboundary_indicesindexing_dtyper  sortersorter_indicesboundaries_ptrboundary_sizeboundaries_underlying_numelboundary_stride
sorter_ptrsorter_stridetriton_dtyperk  r  s                    r   	bucketizezTritonKernel.bucketize  s   $ 	 C CDA7))*Q-8&*&7&7
1&F#++JqM:39TYY__VAY/v
8>))&)4FU[[(%Lu{{*%L%G  	'&&LL*Q-	
:@F1I;b	
 ""LL5fXRbr2M1NbQ`Paac nBgRl"]O2  !,, # 
 	##DLL&9##F,<nM r   c                    | j                         }|dk(  rd| dS | j                  }dg||z
  z  dg|z  z   }| ddj                  |       dS 	NrQ   z!triton_helpers.promote_to_tensor(r  r  r   r  r  r  r   rX  r&  )r  r  ndimsnreducesizess        r   reduction_resizezTritonKernel.reduction_resize  sj    '')A:6ugQ??)))VHw,>>$))E*+1--r   c                    | j                         }|dk(  rd| d|fS | j                  }dg||z
  z  dg|z  z   }|g |d ||z
   dg|z  nd }| ddj                  |       d|fS r  r  )r  r  r   r  r	  r
  r  s          r   reduction_resize_and_shapez'TritonKernel.reduction_resize_and_shape#  s    '')A:6ugQ?FF)))VHw,>>=B=N9e'uw(9A3=9TX 	 $))E*+1-y88r   c                   | j                   dk(  r|S | j                         | j                   z
  }| j                         }|d| dgz   }| j                  j	                  |t        t        |      ||      |t        |            S )zC
        Reshape to RBLOCK, collapsing all reduction dims.
        rQ   Nr   r  )rX  r   r  r   r  r  r   r   )r  r  r  r   target_ndimr  target_shapes          r   reduction_collapse_dimsz$TritonKernel.reduction_collapse_dims/  s     ""a'L--/$2I2II,,.$\k2hZ?xx  3u:}lC%	 ! 
 	
r   c                8   :;<=>?@AB dEd?dF?fd}t        j                  ?fd|      }}|rJt        j                  ||      } ?      rt        j                  n ?      rt        j                  n j
                  sJ t        d  j                  D              } j                  |       t        |      } j                  r|j                   j                          j                  d   j                  d   }	 j                  rY j                         }
t        |
      dk\  sJ |
D cg c]  }d|v sd	|v s| }}d
dj!                  |       d;t#        |      An) j%                         ;t#         j                               A j'                  ; Afd|      }d}dv rt)        |t"              r|\  }} j+                          j,                  z
  <	 	 	 	 	 	 dG< fd=	 	 	 	 	 	 	 	 dH=fd}<> fd}|f}| j.                  j0                  v r j.                  j0                  |   S t3              }t5              }t7         j                               }d|<<    j.                  j9                  |t#        |            }t        d |D              |_        dj!                  |      ::fdB j<                  rst>        j@                  jC                        }d @dI @Bfd}dk(  rd}nwt)        |t"              r&tE        ||      D cg c]  \  }} |||       }}}nAdk(  r3 j.                  jG                   jH                  ||jJ                        }n	 |||      }dv rt)        |tL              sJ tN        jP                  jS                         }|r"dtU        |       d jW                  |       d }nCtU         j.                  jG                   jH                  d!|	 d"| d#||jX                              }d$d%d   > | jH                  |||       ||_%        nd&k(  r8 jZ                  r j]                  ||B|      }nm j_                  |      }nYd'k(  rMt)        |t`              sJ |\  }}}t#         fd( jc                   jH                  |||<      D              }ndk(  r je                  |      }nt)        |tL              sJ  = jH                  ||jJ                        \  }} }! j.                  jG                   jH                  || |!      }n j.                  jg                  d)| |t#         j                                     }"t>        j@                  ji                        } j'                  tj        |      }t)        |t"              sdk(  r j                         }
t        |
      dk\  sJ |
D cg c]  }d|v sd	|v s| }}t#        |      |"_,        d
dj!                  |       d; jl                  jo                  |" d*; d| d| d        n5 jl                  jo                  |" d* j%                          d| d| d        dv rd)| d+} jp                  js                         }# jl                  jo                  | d* j%                          dt        jt                  |#      jv                   d jW                  |#       d        d$d%d   >|!dtU        |       d jW                  |#       d n|	 d,}$ jH                  jy                  d-|" d.| d/> d0|" d| d| d|$ d1|" d2 B|" d3|"       d4| d2 B| d3|       d4        | jz                  ||"|       n4t}              r j]                  ||B|      }ndk(  r3d)| d5}%d)| d6}& jl                  jo                  |% d* j%                          d7| d         jl                  jo                  |& d8 j%                          d| d         jH                  jy                  d9|% d.|& d:|% d|& d| dt~        j                   d;        jH                  jy                  d9|% d2 B|% d3|%       d9|& d2 B|& d3|&       d9	       |}' j.                  j9                  |'jX                        }( j                   jz                  |'|(|%|&<      }nt?        j                        }) |)|"|      }*dk(  r! jH                  jo                  |" d2|*        n' jH                  jo                  |" d2 B|*|"              t        j                  k(  rD j.                  jG                   jz                  |" d<t        j                  |"jX                        }" | jz                  ||"d        jZ                  rt>        j@                  ji                        }t        j                         }+ jz                   j                  fD ]2  },|,jo                  d=       |+j                  |,j                                4 dv r jz                  jo                  | d> j                  | d?               j                  | d@|      }- jp                  js                         }# j                  ||#t        jt                  |#      jv                        }. | j                  ||-|.       nFt}              rd&k(  sJ |\  }/}0}1 j                  |/t5              |d         }2 j                  |0t5              |dA         }3 j                  |1t5              |dB         }4 j                   j                  |/|0|1|2|3|4<	       ndk(  rw|\  }'}(t)        |t`              sJ  j                  |'t5              |d         }5 j                  |(t5              |dA         }6 j                   j                  |'|(|5|6<       n1 j                  |t5              |      }7 | j                  ||7d       |+j                          | j.                  j0                  |<   t)        |t"              r|n|f}8 j                  j                  |8       t        dC |8D              sJ |rD|8D ]?  }9|9jJ                  |k7  s jz                  jo                  |9 d2|9 dDt        |       d        A |S c c}w c c}}w c c}w )JzS
        codegen reduction of value to Triton according the reduction_type
        c                H    | d uxr | j                   xr | j                  dk  S )NrX  )r   rZ  )r(  s    r   should_upcastz-TritonKernel.reduction.<locals>.should_upcastN  s$    D=KQ%8%8KQZZ!^Kr   c                t     | j                         r$t        j                  | t        j                        S | S r   )r   r  r  r   r   )r  r  s    r   maybe_upcastz,TritonKernel.reduction.<locals>.maybe_upcastQ  s4     !- UEMM2 r   c                (     | j                         S r   rR  )r  r  s    r   <lambda>z(TritonKernel.reduction.<locals>.<lambda>\  s    mAGG.D r   c              3  :   K   | ]  }|j                    d   ywrI  r   r  s     r   r   z)TritonKernel.reduction.<locals>.<genexpr>e       MDdkk]$/M   rK  r   r   XYr  r  r  c                x    j                   j                  j                  d|  d d| j                        S )Nr  r  r  r  )r   r  r  r   )r  rF  r  r  s    r   r  z(TritonKernel.reduction.<locals>.<lambda>  s?    dhh''"1#R'7q9gg!	 (  r   N)argminargmaxc                   t        	      }
j                  | |      }	dk(  rYt        
j                               dk(  }|j                  J |r| d}dg|j                  d}n>| d}g |j                  d}n(
j                  | d| d d|j                        \  }}|| d	
j                  |       d}n|j                  }|||fS )
zK
            Helper to generate a reduction call, e.g. tl.sum.
            r  rX  z[None,:,:,None]rQ   z
[:,:,None]r  r  r  r  )r   r  r   r  r   r  r  r   )r  r  result_typetriton_reduction_fnis_bmmrk  r   rg  r   r   r  s          r   final_reductionz/TritonKernel.reduction.<locals>.final_reduction  s    #@"O00FE&T11349{{... %wo6F00a0E %wj1F-ekk-1-E $ ? ?*+1UG2cU!<ekk! &"84(9(9+(F'GqI#kk;--r   c                N     | ||      \  }}}| j                  | d|        y)zU
            Generate a reduction and assign it to an existing variable.
            r  N)r  )r  r  r  r#  r(  r&  s        r   final_reduction_definez6TritonKernel.reduction.<locals>.final_reduction_define  s0     *&%EKE1aMMZLE734r   c                    j                  | |      }j                  | |      }| j                  d| d| d d| d| d d| dj                  | d       d	       y )
N                z_val, z_idx = triton_helpers.z_with_index(r  )
                r  _idx
                )r  r  r  )r  r  r  r  rg  r   root_opr  s       r   final_argreducez/TritonKernel.reduction.<locals>.final_argreduce  s    00FE00FEMMF:,.DWI\Z_Y``bchbiiklokp qC 5 5D6I JK Lr   r   r  c              3  >   K   | ]  }t        |d          r|  ywrh  )rE   r  s     r   r   z)TritonKernel.reduction.<locals>.<genexpr>  s!      *
(;CF(CC*
s   r%  c                :    s| S t         j                  | |      S r   )r  r  )tvalfvalr  s     r   r  z*TritonKernel.reduction.<locals>.where_cond  s     (..tT4@@r   c                d   |j                   s|t        j                  k(  r| S ||k(  s| dk(  r| S | t        j                  |      j                  k(  rt        j                  |      j                  S | t        j                  |      j
                  k(  rt        j                  |      j
                  S | S )z7update reduction constant mask value to match dst_dtyper   )r   r   r   iinfor~   r   )r  r  r  s      r   update_constant_dtypez5TritonKernel.reduction.<locals>.update_constant_dtype  s    
 ..)uzz2I#O	)X]#Ou{{95999 ;;y1555Y!7!;!;; ;;y1555#Or   c                     || j                         }j                  t        |      }j                  j	                  j
                   | |      | j                   | j                        S )Nr  )r   _map_tuple_or_scalarrc   r   r  r  r   )r  r  default_strr  r  r6  r  s      r   _mask_valuez+TritonKernel.reduction.<locals>._mask_value  sb    /EKKP"77wOxx((LLuk2++++	 )  r   online_softmax_reducer  rR  )r!  r   r  r  r  r  zindex, z.shape)r~   r   welford_reducewelford_combinec              3  v   K   | ]0  \  }}j                   j                  j                  ||        2 yw)r  N)r   r  r  )r   r  r   r   r  s      r   r   z)TritonKernel.reduction.<locals>.<genexpr>4  s9      #$u HH%%dllEe%T#r   r(   = tl.full(_indexr  r*  _next, z_next = triton_helpers.z%imum_with_index(
                    #
                )
                r  _nextr-  _max_sumz, float('-inf'),  = tl.zeros(z
                    zG_next = triton_helpers.online_softmax_combine(
                        z+
                    )
                    z.to(tl.int8)zif HAS_RSPLIT:z_bval = _val_bvalrQ   r"   c              3  <   K   | ]  }t        |t                y wr   )r   rn  r  s     r   r   z)TritonKernel.reduction.<locals>.<genexpr>  s     J:a!23Jr2  r  )r(  r  r   r   )r  rW   r   rW   )r  rW   r#  r  r   z.tuple[str, torch.dtype | None, BlockShapeType])r  rW   r  rW   r#  r  r   r   r   rW   )Rpytreetree_anytree_mapr   r   rY  r   re  r8  r'  r  r  r   r  r  r   r&  r   rF  r8  r   r   rX  r   reduction_cacher^  r\  ra  r$  r	  r  r$   r  r  r  r  r  r   rW   rO   r   r  r   r  r   r  r<  welford_reduce_fallbackr   _welford prepare_softmax_twopass_fallbackrj  default_accumulatorrc   r8  r%  r[  select_index_dtyper5  r~   r  r  rC   r#   r  %online_softmax_reduce_final_reductionget_reduction_combine_fnr   rU  r  r  r  r  r  r  *codegen_cooperative_reduction_peer_combinewelford_reduce_final_reductionr  r  rv  rR  rS  )Cr  r   r  r   r  r  	do_upcastoriginal_dtyper  reduction_range_prefixdense_sizesr  xy_sizes_onlylogical_indexr(  r/  r  acc_typetorch_acc_typeresult_shaper  r  r:  masked_valuer  r(  accumulator_dtypeaccumulator_indexmeanm2weight_result_dtype_shapeaccumulatorr  r  accumulator_maxaccumulator_sum
result_max
result_sum
combine_fnupdatedr  r~  peer_valpeer_idxresult_mean	result_m2result_weight	peer_meanpeer_m2peer_weightpeer_maxpeer_sumpeersresult_tuplerk  r  rF  rg  r&  r.  r  r6  r  r  sC   ````                                                      @@@@@@@@@r   	reductionzTritonKernel.reductionC  s7   	L		 OO$DeL	OOL%8E)6y)AyI%25%9EMMuE$$$$MD<L<LMM% u??LL)!%!1!1"!5!<!<Q!?   ..0K{#q(((.9XdSD[CSWKTXMX =!9 :!<N.K!002N 4 4 67K )) 
 11%'',$}%%'$*A*AA	.	. ,	. <		. 	.B	5#	5 	5 ,		5
 	5	 6	00088++I66"9-))4D0023S((// l(; * 

  * *
 *
  

 zz% 	A
 $$ll00KG$$	 	 !88  $E5)>A%>QRdaAq 1RR5(
  $xx00uEKK0X*5':!55!,<<<$%HH$K$K$M! *+C,>+?uTEVEVWhEiDjjk(l%(+)) LL./E.Fgl^[bc"3"."4"4	 * )% &+e<^LLL*l<M $5
 #33--!%!4!4"NE:xQV"J "&!=!=eU!KJ#44!,999%1"r6" #(,dBU)# 
  #:: "BB5%P
!,<<<*9LL,0B0B+' "XX..LL'v / 
 ((++J< $D0023 , K
 ll66~yQG//wGGgu-!U*"&"6"6"8K{+q000)4%!%tsd{%M % ).m(<K%'(=)A(B!%DNII''&-{>2B"WIRPXzYZ[ II''&-{43F3F3H2IG9TVW_V``ab !55&'
|6$:!"mm>>@		##()T5H5H5J4K2{{;/334Bt7H7H7U6VVWY &+e<^L %0 M*+51B1B;1O0PPQR2359 
 ##W%6$77Nwi X M$5#6br) MS{m5,A;!O P Q"#3z5F4Gu2MO`'a&b c  **JEV &n5!00z8U
  #::$%j\"6$%j\"6 		##&'{43F3F3H2IIZ[cZddef 		##&'|D4G4G4I3J"XJVWX ##$%W_,= >()O+<BugRH\H\G] ^ ##$%S6Gu4M)_(` a$%S6Gu4M)_(` a (
!XX__5
@P@P_Q
!GG**##
  88S
$[%8!U*LL**k]#gY+GHLL**&-s:g{+K*LM 

* #'(("3"3..&-|4#jj)//	 #4 #K '**JT %%ll66~yQG#--/J..0D0DE 7./((67
 !55&&00!l(4+@+@J<tAT+U*VW  JJ!l%()W #mm>>@JJU[[-E-I-I   4 4j(HU%n5%)99998B5Y KK$Y/AJ	
 II$Y/AJ
 #MM!$Y/AJ
 33((!
  #::)3&
J!'8444JJ 0 ;WQZ  JJ 0 ;WQZ ::(( GG 0 ;W 't';';ZPTU.8  +%/
E%Bz%%l3J\JJJJ & <<>1**44!(#fXT2En2U1VVWX Q Yd  SR%s   8tt't.t<tc                   | j                  |||      }| j                  |||      }t        d      D cg c]'  }t        | j                  j	                  |            ) c}\  }}|j                  d| d| d| d| d| dt        j                   d| d| j                  |        d| d| j                  |        d       ||fS c c}w )Nr"   rR  
            r  9 = triton_helpers.online_softmax_reduce(
                )
            r  )	r  rc  r   r   r$  r  r#   r  r  )	r  r  rk  rl  rg  r   r(  rm  rn  s	            r   _online_softmax_reducez#TritonKernel._online_softmax_reduce!  s     66vPUV66vPUVMRSTX!V#dhhooEo&B"C!V
JL:, ' !O#4Bse2f>R>R=S TLD11ZLBC DLD11ZLBC D		
 :%% "Ws   ,Cc           
          fd|||fD        \  }}}d| d| d| d d	}fd}|||fD 	cg c]/  }	 j                   j                   ||	j                              1 }
}	j                  dj	                  |
D cg c]  }t        |       c}       d|        t         fd|
D              S c c}	w c c}w )	z;
        Helper to codegen triton_helpers.welford.
        c              3  D   K   | ]  }j                  |        y wr   )r  )r   r  r  r   r  s     r   r   z(TritonKernel._welford.<locals>.<genexpr>6  s(      
 ((>
s    ztriton_helpers.welford(r  r  c                2    t        | d | dz   d  z         S r  )r   )r   rg  s    r   reduced_shapez,TritonKernel._welford.<locals>.reduced_shape<  s$    qcAgi(8899r   r  r  c              3  V   K   | ]   }j                  ||j                         " y wr   )r  r   )r   r  r  s     r   r   z(TritonKernel._welford.<locals>.<genexpr>E  s*      
 ++E5;;?
s   &))r   r$  r   r%  r&  r   r   )r  r  rd  re  rf  rg  r   welfordr  r  welford_resultsr"  s   ``   ``     r   rP  zTritonKernel._welford2  s    
F+
b& ,D6B4r&C5J	:
 F+
 HHOO%}U[[/IOJ
 
 	DII&G!s1v&GHIWIVW 
(
 
 	

 'Hs   4B:B?c                   | j                         | j                  z
  }t        | dt        | j	                               |t        j                               }t        | dt        | j	                               |t        j                               }	t        | dt        | j	                               |t        j                               }
| j                  j                  | d| j                          d| d       | j                  j                  |	 d| j                          d| d       | j                  j                  |
 d| j                          d| d       |dk(  r>|\  }}}| j                  j                  d	| d
|	 d
|
 d| d|	 d|
 d| d| d| d       n8|dk(  sJ | j                  j                  d	| d
|	 d
|
 d| d| d|	 d|
 d       | j                  j                  d| d || d|       d|	 d ||	 d|	       d|
 d ||
 d|
       d       |}| j                  | j                  |dd||	|
||	      S )z%Helper to codegen a welford reduction_meanr   r   rs  _m2_weightrF  r  r  r=  r*  rA  z<_next = triton_helpers.welford_combine(
                    z,
                    rB  r<  z;_next = triton_helpers.welford_reduce(
                    z1, roffset == 0
                )
                z            r  rC  r  N)r   rX  rn  r   r  r!   unknownr8  r%  rF  r  r  rW  r  )r  r  r   r  r  r^  r   rg  rj  accumulator_m2accumulator_weightrd  re  rf  rs  s                  r   r<  zTritonKernel.welford_reduceJ  s     %%'$*A*AA'l% ,,./&&(	
 +l#,,./&&(	
 /l'",,./&&(	
 			m<(;(;(='>b
!L	
 			l4+>+>+@*AH:QO	
 			!",t/B/B/D.ERzQRS	
 ..$D"fLLW^$4G<N;O P MN#326H5I JF"RD6( + "%5555LLW^$4G<N;O PG2k]"^,<B?Q>R S 	MZ;-u(={KL MC
n-=U+C^ TU V J2D1EU/KM_$`#a b	
 !22""

 
	
r   c
           
        t        | j                  ||||||	            }
|||g}t        t        ||
            D ]E  \  }\  }\  }}|"| j                  j                  |	|      }|||<   |j                  | d|        G t        |      S )z0Helper to codegen call to triton_helpers.welfordr  r  )ra  rP  r!  r  r   r$  r  r   )r  r  rs  rt  ru  rd  re  rf  rg  r   r  result_exprsr,  result_exprr  r   s                   r   rW  z+TritonKernel.welford_reduce_final_reduction  s     dmmFD"fc5IJ#Y>09#lF:S0T 	6,A,^eU""hhooEoG"-QMM[MUG45		6 \""r   c                   | j                  |||      }| j                  |||      }	|j                  d| d| d| d|	 d| dt        j                   d| d| j	                  |        d| d| j	                  |        d       ||fS )Nr  r  r  r  r  )r  r  r#   r  r  )
r  r  rm  rn  ry  rz  rg  r   rk  rl  s
             r   rT  z2TritonKernel.online_softmax_reduce_final_reduction  s     66vxO66vxOL:, ' !O#4Bse2f>R>R=S TLD11ZLBC DLD11ZLBC D		
 :%%r   c                D    | j                   r| j                   d   S t        S )NRSPLIT)r  r4   r  s    r   
max_rsplitzTritonKernel.max_rsplit  s"    $$X..  r   c                   | j                   d   }| j                         sdnd}||j                  z  | j                         z  }| j                  j                  |      \  }}| j                  j                  d| d| d| j                  |       dt        |       d| d	| d
| dd       | j                  | dddg|t        j                               }	| j                  j                  |	 d| dt        |       d       |	S )a	  
        Generate code to save a [XBLOCK, RSPLIT] temporary workspace, where each thread block writes a different
        column.  After the barrier, every thread block loads the completed value so that it can compute the final
        value independently.
        rN  zxindex < xnumelNr-  z_ws = (r  z).to(tl.pointer_type(z))
                tl.store(z%_ws + (xindex * RSPLIT + rsplit_id), r  r  Tr   _peersr   r  r  z = tl.load(z_ws + (xindex * RSPLIT + rsplit_arange), rsplit_mask, eviction_policy='evict_first', other=triton_helpers.if_mask(rsplit_mask, r  )rZ  r  rZ  r  r  rt  r  r  r  rI   create_cse_varr!   r  r  r%  rc   )
r  r  r   default_valxnumelr  rp  rr  rs  r{  s
             r   rV  z7TritonKernel.codegen_cooperative_reduction_peer_combine  sS    S!(,(@(@(B %..(4??+<<!GGPPQWX%%GG9C0A0A)0L/MMbcnotcubv w$%J:,VXY]X^ _  	& 	
 ##l&!X&&&(	 $ 
 	&&g[ -eers~e  eA  ACD	
 r   c                H   | j                   sJ d| _         t        j                  j                  |      }| j	                  |d| j                  | |dd            }d| _         | j                  j                  |      }t        j                         }| j                  r+|j                  | j                  || j                               | j                  | j                  |       t        |t         t"        f      rY| j                  j%                  t'        || j)                  |||j+                  |      |d|j-                                            nt        |t.              sJ |j0                  }t3        |      rZ|j4                  Nt7        d |j4                  D              s2dj9                  t;        t<        |j4                              }	|d|	 d	z  }| j                  j%                  t'        |d
| d| d| d|j>                   d		             |jA                          y )NFT)r   r   r  r  r  re  c              3  8   K   | ]  }t        |      d k(    ywr  rE  r  s     r   r   z/TritonKernel.store_reduction.<locals>.<genexpr>  r  r  r  r  r  rx  r  r  )!rY  rO   rP  ry  r  r  r   r  r  r  r  r  r  r  r  r   r  r  r%  rX   r  r  r  r  r  r   r   rR  r&  r(  r   r)  r  )
r  r   r  r  r   r  r   r  r  r  s
             r   store_reductionzTritonKernel.store_reduction  s    $$$$ %!!$'==&*&H&H	 'I ' ! 	
 !%iit$))+
%%$$,,T43G3GH 	&&t';';SAh2I JK  **55  ,+H,C,C,E+HI	 h888#--L%e,KK+?5;;??"iiC(=>.Q ??  **uDc%8CTCTBUUVW 	r   c                N   t               j                  d       t               t        d      D cg c]*  t	        fdt        t        ||            D              , }}dj                  d t        j                  j                  |      D              }j                  d| d       t               dd	d
lm} d	dlm}  |        |        G fddt               }	j#                         5  t%        j&                   |	             5   || }
dj                  d |
D              }
j                  d|
        d d d        d d d        | j(                  j+                  j-                               S c c}w # 1 sw Y   AxY w# 1 sw Y   ExY w)Nz@triton.jitr"   c              3  r   K   | ].  \  }\  }}j                  d  d| ||j                         0 yw)r   r(  r  N)rj  r   )r   rc  r  r   r   r,  s       r   r   z,TritonKernel._lift_helper.<locals>.<genexpr>)  s@      %A~u s1#Qqc]%u{{Ks   47r  c              3  2   K   | ]  }t        |        y wr   rE  r  s     r   r   z,TritonKernel._lift_helper.<locals>.<genexpr>/  s     Rc!fRr  zdef {name}():r!  r   rq   )ShapePropagationOpsHandlerc                  4    e Zd Z	 	 	 	 	 	 	 	 d fdZy)+TritonKernel._lift_helper.<locals>.CSEProxyc                    	d| z  	 t        |      |i |} t        |      |i |}j                   t        
|      |i |||      S )Nr(  r  )r   r  )r  r   r   rw  output_dtypeoutput_shaper   dtype_handlerhelperhelper_name	overridesshape_handlers         r   _defaultz4TritonKernel._lift_helper.<locals>.CSEProxy._defaultA  s     4&z) w!   # " #
 w!   # " #
 ||,GIt,d=f=&&	 $  r   N)r   r   r   ztuple[Any, ...]rw  r  r   r	   )r   r   r   r  )r   r  r  r  r  r  s   r   CSEProxyr  @  s-    '6@N r   r  c              3  2   K   | ]  }t        |        y wr   rE  )r   r  s     r   r   z,TritonKernel._lift_helper.<locals>.<genexpr>Z  s     BFBr  return r"  )rY   r%  rV   rc  r   r!  r  r&  r  r  from_iterabler  r}  rr   !torch._inductor.shape_propagationr  r-   r  rO   set_ops_handlerr  r'  getvalue)r  r  r  dtypesr,  r   	signaturerr   r  r  outputsr   r  r  r  r  r  s       `      @@@@@@r   _lift_helperzTritonKernel._lift_helper  sy   
  !'e 1X

 	  )23vv3F)G 
 
 IIRioo.K.KD.QRR	=267#%	 *PP2424	 	~ 	0 ]]_ 	2a//
; 	2$iGiiB'BBGwwi01	2 	2
 $$(():k(RRk
`	2 	2 	2 	2s)   /F
F2FFF	FF$c                B     j                   sJ  j                  rJ d       t        d  j                  D              } j	                  |       t        |      } j                  rJ d       g }g }t        d |D              }t        j                   j                  j                   j                        } j                  |||      } j                          j                  z
  }	t!        ||      D ]U  \  }
} j                  j                   j                  |
 dt#        |       d||
j$                        } j                  j                   j                  d| d	 j'                          d|t         j)                                     }
|j+                  |
       t-        |      } j.                  rǉ j)                         }d
|d<    j                  j1                  ||      }dd	j3                  |       d}|j4                  rdnd} j6                  j9                  | d| d	| d	| d       |j+                  |       X d  fd} |d |       d|	 d	| d|||      } j.                  sd }|D cg c]*  } |d| dt;        |j<                         ||            , }} |t        |      t        |            } |t        |      |      }t!        ||      D cg c]+  \  }} |d| d	| d|j<                  |j$                        - }}}t!        |||      D ]*  \  }}} j                  j9                  | d| d	| d       , n|}|D ]$  }t?        |t@              sJ t        |      |_!        & t        |      S c c}w c c}}w )z:
        Perform an associative scan on 'values'.
        TODOc              3  :   K   | ]  }|j                    d   ywrI  r  r  s     r   r   z$TritonKernel.scan.<locals>.<genexpr>l  r  r  z(ops.scan not supported inside ops.maskedc              3  2   K   | ]  }t        |        y wr   rK   r   r   s     r   r   z$TritonKernel.scan.<locals>.<genexpr>t       Fe*51Fr  r  r  r  r  r  r   rK  r  r  zfloat('nan')z-1r?  c                2    dj                  d | D              S )Nr!  c              3  &   K   | ]	  }| d   ywr  Nr   r   r  s     r   r   z1TritonKernel.scan.<locals>.csv.<locals>.<genexpr>       <EugQK<r  r&  r  s    r   csvzTritonKernel.scan.<locals>.csv      88<V<<<r   c                N   t        |      }t        |      D cg c]  }|  d| d|  }}t        fd|D              r)|D cg c]  }j                  j	                  |       c}S t        ||      D 	cg c],  \  }}	j                  j                  ||	j                        . }
}}	j                  j                   |
       d|         t        |
|      D ]*  \  }}|r||_
        j                  j                  ||       , t        |
      S c c}w c c}w c c}	}w )Nr  c              3  T   K   | ]  }j                   j                  |       ! y wr   r   containsr   r  r  s     r   r   z:TritonKernel.scan.<locals>.cse_multiple.<locals>.<genexpr>        LI488$$Y/L   %(r  r  r   rc  rR  r   r  r  r$  r   r  r%  r	  r  r   )r  r  r  r  rc  r,  
cache_keysr  r   r  result_varsr  r  r  s               r   cse_multiplez'TritonKernel.scan.<locals>.cse_multiple  s   FA;@8DaTF"QCr%1DJDLLLAKLIY/LL '*&&&9"UE e5;;?K  LL""{#$Cv. *-[*)E 4%
I+0J(Y
34 %% EL   D"D91D!ztl.associative_scan((r  c                T    | j                   y t        | j                         }d|d<   |S )Nr   rK  )r   ra  )r   r   s     r   _partial_scan_shapez.TritonKernel.scan.<locals>._partial_scan_shape  s*    99$ OE #E"I Lr   ztriton_helpers.select_one((z1), rbase == (RBLOCK - 1), dim=-1, keep_dims=True)ztl.where(roffset > 0, z = tl.where(roffset > 0, )"rY  r  r   re  r8  r'  r  r   r  r  r   r  r  r  r   rX  r  rS  r   rF  r  r  r^  r  r$  r&  r   r8  r%  rK   r   r   rn  r	  ) r  r  ro  r  r  broadcasted_valuesaccumulatorscse_computecombine_helper_fnrg  r  r   value_dtyper^  reduced_sizerj  reduced_size_strr  r  partial_scan_varsr  partial_scan_varpartial_reduce_vars	accs_nextfull_scan_vars	full_scanpartial_scanr  acc_nextpartial_reducer  r  s    `                              @r   scanzTritonKernel.scan_  s    $$$$--5v5-MD<L<LMM% u??N$NN"FvFF''(9(94<<H --j&&I%%'$*A*AA/ 	1LE5((++'1%89;kk	 , K HH%%";-r$2E2E2G1HJD0023	 & E %%e,&u-H,,#335#&R "hhooEoN%&tyy'>&?q#A ,1,C,C.		##"m;/?.@7)2hZWXY ##K09	1<	=	&$ )#C(:$;#<CuBGXFYYZ[	
 ((! ):# % 12B1CCtu-.>.D.DE-.>?# # #5#6>Q8RSI'l(;=NON 03>CT/U ,I| ,YKr,qI&,,&,,K  :=<)<: 5+~ &&"m#<XJbHXXYZ ,K% 	5Jj*;<<<#-e#4J 	5 [!!?#s   /N20Nc                ,     j                   sJ  j                  rJ d       t        d  j                  D              } j	                  |       t        |      } j                  rJ d        j                  sJ d       t        j                   j                  j                   j                        } j                          j                  z
  }t        d |D              }t!        |      t!        |      k(  sJ t#        |      D 	cg c]?  \  }}	 |d|	 d j%                          d||   t         j'                               	      A }
}}	d
  fd} j                  d   j(                  sJ  j+                   j                  d         rdnd}t!        |      dk(  r'd|
d    d|
d    d| d| d| d| d} |||
||      }nt-        d      t/        ||      D ]  \  }}||_        |j2                  |_         t        |      S c c}	}w )Nr  c              3  :   K   | ]  }|j                    d   ywrI  r  r  s     r   r   z$TritonKernel.sort.<locals>.<genexpr>  r  r  z(ops.sort not supported inside ops.maskedz3ops.sort is only supported in persistent reductionsc              3  2   K   | ]  }t        |        y wr   r  r  s     r   r   z$TritonKernel.sort.<locals>.<genexpr>  r  r  r  r  r  r  c                2    dj                  d | D              S )Nr!  c              3  &   K   | ]	  }| d   ywr  r   r  s     r   r   z1TritonKernel.sort.<locals>.csv.<locals>.<genexpr>   r  r  r  r  s    r   r  zTritonKernel.sort.<locals>.csv  r  r   c                N   t        |      }t        |      D cg c]  }|  d| d|  }}t        fd|D              r)|D cg c]  }j                  j	                  |       c}S t        ||      D 	cg c],  \  }}	j                  j                  ||	j                        . }
}}	j                  j                   |
       d|         t        |
|      D ]*  \  }}|r||_
        j                  j                  ||       , t        |
      S c c}w c c}w c c}	}w )Nr  c              3  T   K   | ]  }j                   j                  |       ! y wr   r  r  s     r   r   z:TritonKernel.sort.<locals>.cse_multiple.<locals>.<genexpr>  r  r  r  r  r  )r  r  r  r  rc  r,  r  r  r   r  r  r  r  r  s               r   r  z'TritonKernel.sort.<locals>.cse_multiple  s   &'A;@8DaTF"QCr%1DJDLLLAKLIY/LL %(0B$C E5 e5;;?K  LL""{#$Cv. *-[*)E 4%
I+0J(Y
34 %% ELr  rK  r   rnumelr"   ztriton_helpers.sort_with_index(r   rQ   z	, stable=z, descending=zUnhandled sort)rY  r  r   re  r8  r'  r  r  r  r  r   r  r  r   rX  r   r   r!  rF  r  r\  r  r`  r  r	  rs  )r  r  r  stable
descendingr  r  rg  r,  r  r  r  r  r  r  r  	input_varr  s   `                @r   r>  zTritonKernel.sort  s@    $$$$--5v5-MD<L<LMM% u??N$NN"(( 	
A	
(  ''(9(94<<H%%'$*A*AAFvFF6{c&k))) &f-
 5 "5'D,?,?,A+B!DQiD0023
 
	=	&$ #00002243C3CB3GHhv;!12DQ2G1HK]^_K`Ja b82cU)F8=AO  't-?OK !122%(f%= 	1!J	#(J  ) 0 0J	1 [!!]
s   AHc                    | j                   sy|j                  | j                          | j                   j                          | j                  j                          y)z
        Generate the output from prologue. This should be
        extracted from the subgraph, which is why this is
        partitioned from codegen_body.
        N)r  r  clearr  )r  r  s     r   codegen_prologuezTritonKernel.codegen_prologue&  s@     }}DMM"!!#r   c                f   | j                   s=| j                  s1| j                  s%| j                  s| j                  s| j
                  sy| j                  D cg c]  }|j                  s| }}| j                  r| j                  sJ d       i }t        | j                        D ]  \  }}|j                  }t        j                  j                  |t         j"                        }| j%                  t&        |      }d| }| j(                  j+                  | d| d       | j,                  j/                  |t         j"                  d      ||<    | j(                  j+                  d       | j(                  j+                  d	       | j(                  j1                  d
      5  | j3                         sZ| j                  d   }	|	j4                  dk(  sJ |	j4                  }
| j(                  j+                  |
 d|	j6                   d|
 d       | j(                  j9                  | j                          | j(                  j;                  dg       | j(                  j9                  | j                         | j(                  j9                  | j                         | j(                  j9                  | j                         | j(                  j9                  | j
                         t        | j                        D ]  \  }}|j<                  }d| }t        j>                  |j                  t         j"                        }tA        |j                        }| j,                  jC                  | j(                  | d| d|jD                  d      }ddl#}|jH                  jJ                  jM                  | d| j(                        5   |||   |      }ddd       | j(                  j+                  | d         	 ddd       tO        tQ        | j                              D ]$  }| j(                  j+                  d| d| d       & n/| jR                  rtQ        |      dkD  rt        |      D ]  \  }}| j(                  j1                  |      5  |j4                  }| jT                  rdnd}| jT                  rdn| d}t         jV                  jX                  rt[               dkD  rd}nd}| j(                  j+                  d | d!| d"| d"|j]                          d#| d$       ddd       | j(                  j1                  |d
z         5  | j_                  || j(                         ddd        | j(                  j1                  tQ        |            5  | ja                  | j(                         | j(                  j9                  | j                          | j(                  j9                  | j                         | j(                  j9                  | j                         | j(                  j9                  | j                         ddd       tc        g t        |            D ]u  \  }}| j(                  j1                  |d
z         5  | jd                  |jf                     ji                         D ]  \  }}|tQ        |      d
z
  k  ry||d
z      }| jd                  |jf                     }||v rT||   }tj        jm                  |      }to        |jp                  |      }ts        ||      D cg c]  \  }}|||z  z
   }}}| j(                  j+                  tu        | jv                  |   | d%| d"tx        jz                  j}                  |       d&              	 ddd       | j,                  j                  | j                         |j                          x n| j(                  j9                  | j                          | j(                  j9                  | j                         | j(                  j9                  | j                         | j(                  j9                  | j                         | j(                  j9                  | j                         | jT                  rb| j                  s| j
                  rJ| j                   d'}| j(                  j9                  d(| d)d*+       | j                  j                          | j                  s%| j(                  j9                  | j
                         | j                   j                          | j                  j                          | j                  j                          | j                  j                          | j                  j                          | j
                  j                          yc c}w # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY wc c}}w # 1 sw Y   xY w),a  
        Concat output code from index_code, loads, compute, stores,
        suffix into self.body.

        For pointwise kernels, this is called just once at the end.

        For reduction kernels, this generates a loop over the reduction
        axis.
        Nz1Mix order reduction requires persistent reductionaccumz = tl.full([R0_BLOCK], z, tl.float32)[None, :])r   R0_BLOCKr  z/split_size = min(RSPLIT_SIZE, xnumel - xoffset)z@for _ in tl.range(0, split_size, XBLOCK, num_stages=NUM_STAGES):rQ   )r   r   rN  mask =  < r  zxindex += XBLOCKr  z, 0))r  r  r  z&tl.store(ws_ptr + (tl.program_id(0) + z3 * tl.num_programs(0)) * r0_numel + r0_index, accumz
, r0_mask)rsplit_startr  
rsplit_end)r   r"   z, num_stages = 2r   zfor zoffset in tl.range(r  r   r  z = tl.advance(r  z + tl.program_id(1)zR
                if HAS_RSPLIT:
                    triton_helpers.x_grid_barrier(r+  Tr  )Findexing_coder  r  r  r  r  re  r  mix_order_reductionr  r!  r  r   r$   r  rR  r   r  r8  rc   r8  r%  r   rj  r  r  r   r   r  
writelinesr  rU  r   r  r   unittestmockpatchobjectrc  r   rY  r  r  r  r   r  r  r  rb  r  r   r  r   r   r   r  r  rX   r  rO   r   r  
invalidater  cache_clearr  r  rw  r  ) r  r   
loop_treesaccumname2varr  partial_accumr   r  r   entryrN  r   ro  triton_reduction_functionnewvalr  rp  levelr   
loop_startloop_end
num_stagesr  advancement	prev_treeprev_advancementsprev_advancement
prev_blockprev_num_itercurprevsem_ptrs                                    r   codegen_bodyzTritonKernel.codegen_body3  s    zz{{||%%##'+'7'7Ht4<<dH
H##,, C, M&/0M0M&N 
"]!.!=!=,,::>5;;W33M7Kse}		##f3G9<RS '+hh&7&73D '8 'd#
 II QRIIR !!!+ )?//1 ,,Q/E <<3...AII''1#WUZZLA3e(LM		  !3!34		$$*
 		  ,		  .		  -		  !5!56 +4D4Q4Q*R ?&C'--C"3%=D!#!<!<%44ekk"J 1N%441- "XX..		45Qse4@!ii+	 / F $!,,33D)TYYO ",)$/"#
 II''4&G9(=>-?')?V S!>!>?@ 		##<SEAtuxty  zD  E
 ""s:':(4 JtYY%%U%3 ![[F373M3MSVJ(,(B(B6(RWHX 
 }}((-?-AF-J%7
%'
II''vh&9*RzQSTZT`T`TbScchishttvw YY%%UQY%7 J88tyyIJ J!J( !!Z!9 ...tyy9		  !3!34		  ,		  .		  -.  ((@)J*?(@A  #tYY%%UQY%7 262K2K		3eg.	; !3z?Q#66(2519(=I040I0I )1-
  ),==3DY3O 0-:-I-I)-T
07	0T 69FV5W/"(1T %($*>$>/" /"
 		++( $ 8 8 C#,+^I;bI^I^_jIkHllm n)< ##D$:$:;  "A #D IIT//0IITZZ(IIT\\*IIT[[)		//0%%""d&:&:--..ABGII33:) <    66BBD''IIT112  "

$$&""${ Ir I)? )?f J J. .4/"# sw   g*g&Hg,g*g,Bg9hB0hB#h&4h Ah&g)$g,,g69h	h	h h&&h0	c                   g }| j                         r3g }| j                  d|g        |D ]  }t        |t              r|j	                  t        |             /t        |t              rZt        j                  j                  j                  |j                  | j                        }|j	                  t        |             t        |t        j                        rQt        j                  j                  j                  || j                        }|j	                  t        |             t        dt!        |              |S )Nr   r  z!Unsupported numel argument type: )r  add_numel_to_call_argsr   r  r  r   ro   rO   rP  rI  optimization_hint_with_override
inner_exprr  r   r   rL  r"  )r  r   
numel_argsr   hints        r   kernel_benchmark_extra_argsz(TritonKernel.kernel_benchmark_extra_args  s   !+-J''J;! Vc3'KKC)_577++KK&*&8&8 L D KKD	*UZZ077++KK&*&8&8 L D KKD	*$'Hc%TUU!V" r   c                   t               }| j                  j                         \  }}}}|j                  g d       |j	                         5  t        j                         }g }t        ||      D ]  \  }	}
dt        |       }t        j                  j                  |	      }|rt        j                  j                  j                  |j                         | j                        }t        j                  j                  j                  |j!                         | j                        }|j#                  | d| d| d|j%                          d|j'                          d
       n|	t        j                  j(                  v rt        j                  j(                  |	   }t        j                  j                  j                  |j+                         | j                        }t        j                  j                  j                  |j-                         | j                        }|j#                  | d| d| d|j.                   d|j0                   d
       nt3        |
t4              rft        j                  j                  j7                  |
j8                  | j                        }d	|
j:                  v rd
}|j#                  | d|        nt3        |
t<              rt        j                  j?                         }t        j                  j                  j7                  |
j                  | j                        }|j#                  | d| d| d|
j0                   d       ntA        d|	       |jC                  |        |jE                  | jG                                |j#                  ddjI                  |       d       ddd       |j                  g d       t        j                  j?                         }|jJ                  }|j	                         5  |j#                  dt        j                  jL                  jO                  |       d       |j	                         5  |j#                  t        j                  jL                  jQ                  |             tS        |      }|j#                  | d| d       |j#                  tU        tV        jX                         d| d       ddd       ddd       |j                  g d       |j	                         5  |j#                  dt        j                  jL                  jO                  |       d       |j	                         5  |j#                  t        j                  jL                  jQ                  |             |j#                  dtU        tV        jX                         d       ddd       ddd       |j                  g d       |j	                         5  |j#                  d       |j#                  d       |j#                  d       |j#                  dt        j                  j?                         jZ                   d       |j#                  d|        |j#                  d       |j#                  d        ddd       |S # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   |S xY w)!a  
        Generates Python code for benchmarking this Triton kernel.
        - Creates example inputs (random tensors, constants, sizes).
        - Runs the kernel on the current GPU/stream.
        - Prints runtime (ms) and throughput (GB/s) using `num_gb`.
        Args:
            num_gb (float): The number of gigabytes to use for throughput calculation.
        Returns:
            IndentedBuffer: A buffer containing the generated Python benchmark code.
        )r   r   zdef get_args():arg_r  z = rand_strided(r  z
, device='z	', dtype=r  r  r   r  z = torch.zeros(z*Don't find the buffer or const tensor for r  r  N)
r(  zdef call(args):zwith r  z = get_raw_stream(z.run(*args, stream=)r(  r(  z def benchmark_all_configs(args):z.benchmark_all_configs(*args))r(  r(  zif __name__ == '__main__':z<from torch._inductor.runtime.benchmarking import benchmarkerr   zargs = get_args()z7ms = benchmarker.benchmark(lambda: call(args), device='z
', rep=40)z	num_gb = zgb_per_s = num_gb / (ms / 1e3)z<print(f"{ms:.3f}ms    {num_gb:.3f}GB    {gb_per_s:.2f}GB/s")).rY   r   python_argdefsr  r  r  r  r  ri  rO   rP  try_get_bufferrI   optimization_hints_with_overrideget_sizer  
get_strider%  
get_devicery  	constantsr  ri  devicer   r   r_   r!  r   r   ra   r  KeyErrorr  extendr%  r&  r  
device_opsdevice_guard
set_devicer=   r   rD   KERNEL_NAMEr"  )r  num_gbrk  _argdefs	call_argsr  r(  name_cnt	var_namesarg_namearg_sigvar_namer~  r  ri  const_tensorsymval_hintr0  r  current_devicer  stream_names                         r   codegen_kernel_benchmarkz%TritonKernel.codegen_kernel_benchmark  s:     !,0II,D,D,F))Y56]]_ :	@ (HI%(I%> 5+!'!$x.!12gg,,X677++LL&*&8&8 M D WW--NN(&*&8&8 O F $$#*$4TF"VHJs~~O_N``ijmjwjwjyizz{| !2!22#$77#4#4X#>L77++LL$))+&*&8&8 M D WW--NN$++-&*&8&8 O F $$#*$4TF"VHJ|ObObNcclmymm  mA  AB  C  1"#''"2"2"R"R&*&8&8 #S #K %4&'$$z[M%BC6WW@@BFGG,,LLT5G5G M E $$#*OE7*VHIV]VcVcUddef #DXJO    *k5+l T==?@wtyy';&<A>?u:	@x 	9:<<>$$]]_ 
	uQWW%7%7%D%DU%K$LANO   GG&&11%8 2%8  K=0B5'!KL  ;22344G}TUV
	 	JK]]_ 	uQWW%7%7%D%DU%K$LANO   GG&&11%8   c+"9"9:;;XY		 	DE]]_ 	N R 01I!''JmJmJoJtJtIuu  A y12=>N	  Q:	@ :	@B 
	 
	  	 		  sf   N[$A[1B	[:[-A[5:A%[([5B%\[[	[[%([2	-[55[?\c                    t        j                  dj                  t        j                  j
                  j                  d                  S )Nzl
            from torch._dynamo.testing import rand_strided
            {}
            import torch
        get_raw_stream)textwrapdedentr  rO   rP  r3  import_get_raw_stream_asr  s    r   imports_for_benchmark_kernelz)TritonKernel.imports_for_benchmark_kernelw  s:     F177%%>>?OPQ
 	
r   c                    | j                   ry| j                  ry| j                  r| j                  sJ y| j                  ryy)Nr  r  r  r}  	pointwise)r  r  r  rY  r  s    r   _get_heuristiczTritonKernel._get_heuristic  sD    !''*&&(((()""r   c                T   t               }|j                  d       	 dd l}|j                  d       t	               x}r|j                  |       |j                  d       t        j                  j                  r|j                  d       |j                         S # t        $ r Y tw xY w)NzP
            import triton
            import triton.language as tl
            r   zU
               import triton.language.extra.tlx as tlx  # noqa: F401
               a'  
            from torch._inductor.runtime import triton_helpers, triton_heuristics
            from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
            from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
            z
                import triton.profiler as proton
                import triton.profiler.language as pl
                pl.enable_semantic('triton')
                )
rY   r  triton.language.extra.tlxImportErrorr   r%  r#   r   proton_profilingr  )r   importsr   	attr_descs       r   gen_common_triton_importsz&TritonKernel.gen_common_triton_imports  s     !"	
		,NN 34494i(	
 ==))NN !!+  		s   B 	B'&B'c                x    t         j                   | j                         t         j                  j                  dS )N)enable_fp_fusion
launch_pdldisable_ftz)r#   emulate_precision_castsr  r  rW  r  s    r   triton_meta_commonzTritonKernel.triton_meta_common  s5     %+$B$B B113!00<<
 	
r   c                ~   t         j                  j                  j                         t        j
                  t        j                  t        j                  j                  t        j                  t        j                  t        j                  t        j                  t        j                  t        j                  j                  t        j                  j                  t        j                  j                   t        j"                  t        j$                  j&                  t        j                  j(                  d}t        j*                  rt        j,                         |d<   t         j.                  j0                  d|d<   t	        j2                         rd|d<   t        j4                  rLt        j4                  |d<   t        j6                  |d<   t        j8                  |d<   t        j:                  |d	<   t        j<                  r9t        j<                  |d
<   t        j>                  |d<   t        j@                  |d<   |S )N)backend_hashassert_indirect_indexingautotune_local_cacheautotune_pointwiseautotune_remote_cacheforce_disable_cachesdynamic_scale_rblockmax_autotunemax_autotune_pointwisemin_split_scan_rblockspill_thresholdstore_cubinr  r  &mix_order_reduction_allow_multi_stages$are_deterministic_algorithms_enabledTis_hipr  profile_bandwidthprofile_bandwidth_regexprofile_bandwidth_output/profile_bandwidth_with_do_bench_using_profilingcoordinate_descent_tuning coordinate_descent_search_radius'coordinate_descent_check_all_directions)!r   r&   _tritontriton_hash_with_backendr#   r\  r]  r   r^  r_  r`  ra  rb  rc  rd  re  rf  r  r  r  rg  *write_are_deterministic_algorithms_enabledrh  r  r  r  rj  rk  rl  rm  rn  ro  rp  )r   inductor_metas     r   inductor_meta_commonz!TritonKernel.inductor_meta_common  s    "KK//HHJ(.(G(G$*$?$?"(--"B"B%+%A%A$*$?$?$*$?$?"//&,&C&C%+]]%H%H%}}<<!==44#11.4.A.A.`.`6<mm6j6j
$ <<::< @A ==(&*M(#)-M+&##171I1IM-.7=7U7UM348>8W8WM45FF KL ++00 56 77 <= >> CD r   c                  -. t               }i }| j                  j                         D ]_  \  }}t        |      r| j                  st
        j                  j                  j                  |      }t        t        |            }|||<   a ||j                  | j                                t
        j                  j                         j                  }|dk(  r|j                  d       n|j                  d       t        j                   r|j                  | j#                                | j$                  j'                         \  -}	.}	t)        .      D ]  \  }
}t+        |t,              st/        t0        j2                  |j4                        }|t
        j                  j                  j6                  v sbt-        |j8                  t
        j                  j                  j6                  |         .|
<    t;               }| j<                  D ]  }|| j$                  j>                  v r(|jA                  | j$                  j>                  |          || j$                  jB                  v rj|t
        j                  jD                  vrN|| jD                  vr@|jA                  t/        tF        | j$                  jB                  |         jH                         || j$                  jJ                  v s| j$                  jJ                  |   }t+        |tL              rJ |jA                  |        tO        -.      D ]O  \  }}t+        |tP              s|jR                  tT        jV                  k(  s5|jA                  |j8                         Q tY        |      }| j[                         D ]Z  }t-        |j\                   d|j^                        }.ja                  |       -ja                  tc        |j8                               \ -.fd}| jd                  D ]K  }|jf                  r| jh                  r|jj                  ) ||j\                  jm                          d       M | jn                  r |d       | jp                  r |d	        |d
       ts        .| jt                  -      }|tw        jx                  t
        j                  j                               i tz        j|                  j                  j~                  j                  xr0 dt        | j                        v xs dt        | j                        v d| j                         }| jn                  rd|d<   t
        j                  j                  xs t
        j                  j                  }| j                         j                  t        | j                        t        t        j                        ||| j                  | j                  | j                  | j                  | j                  d
| j                         }| jp                  r| j                  |d	<   t        j                  st        j                  j                  r| j                  xs | j                  |d<   t
        j                  j                  jg                         xr | jh                   }| j                  }t        | j                        dk(  }|r|r| j                  j                  | j                        }|j                  j                  j                  d   }|j                  }|'d|v r#d|v r|d   t        |d   d      z  }|t        k\  }n,| j                  j                  |      t        j                  k(  }|j                  j                  j                  } |j                  j                  j                  }!t
        j                  j                  j                  |       t        t
        j                  j                  j                  |!      d      z  }"|"dk\  r|rt
        j                  j                  j                  | j                  j                  d      rHt
        j                  j                  j                  | j                  j                  d      r
|dk  rd|d<   | j                  r| j                  |d<   | j                  r| j                  |d<   | jn                  r| jh                  |d<   d}#t        j                   st        j                  r| j                         dz  }#|#|#|d <   t        j                   r| j                         }$|$|$|d!<   t        .      D ]  }%d|d"   .|%   j8                  <    || _r        || _s        | j                  | j                         | j                          | j                  | j                         tz        j                  j                  | j                  rt        .d#$      g|d%<   nt        .      g|d%<   | j                  D ]$  }&|j                  d&       |j                  |&       & | j                  r2d'| j                          d(| j                  j                  d)|d*|d+	}'n| j                  rF| j                  j                  | j                        }(d'| j                          d,|d-|( d)|d*|d+}'nYd&})t        |      dk(  rt        t        .            d.k(  rd/})nd0})d'| j                          d,|d1|) d2|d*|d3| j                   d+}'|j                  |'       |xs t        t        j                         }*|j                  d4|* d5d1j                  d6 -D               d7       |j                         5  t        j~                  j                  r|j                  d8|* d9       | j	                  |       | j$                  j                         D ]  \  }+},|j                  |+ d:|,         |j                  | j                         t        j~                  j                  r|j                  d;|* d9       ddd       t        j                   r!|j                  | j                  |#             |j                         S # 1 sw Y   KxY w)<z
        Convert the TritonKernel from Inductor SIMD IR to triton code, including inductor triton heuristics, imports,
        metadata, and benchmarking infra.
        Ncpuz"triton_helpers.set_driver_to_cpu()z"triton_helpers.set_driver_to_gpu()r  c                    t               rj                  t        |              j                  t        | d             y )NT)is_constexpr)rJ   r  rU   rS   )r<  argdefsr  s    r   add_constexpr_argz6TritonKernel.codegen_kernel.<locals>.add_constexpr_argF  s/    -/  h!78NN78$?@r   r   r  RSPLIT_SIZE
NUM_STAGES)
size_dtyperz  ztl.dot)r  r0  r/  native_matmulTlaunch_cooperative_grid)
	grid_typer  kernel_namemutated_arg_namesoptimize_memrV  r  num_load	num_storenum_reduction"has_loadstore_with_contiguous_rdimr"   r   rN  r0_rQ   g?i   i   
   add_persistent_rblocktiling_scoresr  r  g    eAkernel_num_gbkernel_flopr/  r   )pointer_range_overrideconfigsr   z$
                @triton_heuristics.z(
                    config=zI,
                    filename=__file__,
                    triton_meta=z$,
                    inductor_meta=z;
                )
                @triton.jit
            z!(
                    size_hints=z%,
                    reduction_hint=rX  ztile_hint=TileHint.SQUARE,ztile_hint=TileHint.DEFAULT,r  zH
                    filename=__file__,
                    triton_meta=z*,
                    min_elem_per_thread=zdef r  c              3  <   K   | ]  }|j                           y wr   )	full_namer  s     r   r   z.TritonKernel.codegen_kernel.<locals>.<genexpr>  s     *JQ1;;=*Jr2  r  zpl.enter_scope("r  r  zpl.exit_scope(")rY   rZ  r  rE   rY  rO   rP  rI  optimization_hintr6   r  r  rS  r  r"  r#   benchmark_kernelrI  r   r)  r!  r   r_   r
   r   r  r   inv_precomputed_replacementsr   r   	mutationsinput_buffersr'  rz  removed_buffersrZ   
inner_nameoutput_buffersr^   r  ra   	zero_moderb   ZERO_ON_CALLr'  r   r   r  r  rS   re  r\  r  r   r  r  r  rn   r  r1   rl  r   r  r   r  r   r8  r  rY  is_inferenceis_backward_get_grid_typer   setr  rD   DESCRIPTIVE_NAMErV  r  r  r  r  ru  rsplit_sizer  r  r  r  r  r   r[  r  r   r  memory_stats
persistentmemoryrg  count_per_threadr~   INNER_REDUCTION_RATIO_THRESHOLDget_reduction_hintr2   INNERloopedbytesstatically_known_leqreduction_numelstatically_known_gtr  rj  estimate_kernel_num_bytesestimate_flopsrk   r  rt  r  r  r  r  r  rj   r  r%  r  rL  rl   r  r6  r&  r  rP  codegen_static_numelsaliasesrC  r  )/r  r   r  
size_hintsr   r  
numel_hint	size_hintdevice_typer(  r,  r   r  mutated_argsmutationmutation_argargnamer   sizeargr{  triton_meta_signaturer  r  rt  
looped_redr  	two_d_redr  	dim_statsmem_ops_per_threadr_coalesce_ratiocontiguous_red
looped_mempersistent_memsaved_bytes_ratior7  flopsarg_numr  heuristics_linereduction_hint	tile_hintr  oldnewrz  r  s/                                                @@r   codegen_kernelzTritonKernel.codegen_kernel  s4    
![[..0 	+MFE"6*43H3H));;EBJ'J8I!*Jv	+ <KK6689''==?DDKe#@A@A&&D==?@#'99#;#;#= Iq	* 	FAs#w' ellCHH5QWW--JJJ#*!''"2"2"O"OPV"W$IaL	 )3 	/H499222  !8!8!BCDII555AGG$;$;;D$8$88  )B)B8)LMXX 499333#yy77A%lJ???  .	/6  3 	/LGS3-MM%6%C%CC  .	/ l+++- 	2DU3TZZ@GW%NN77<<01	2	A $$ 	=D  T%>%>&!2!2!4 5U;<	= %%h'##m,l+ 1$"2"2G!
 /&--agg.Q.Q.ST&&--;; RTYY/P8s4<<?P3P	'
 %%'	'
 %% 6:K12
 ww++Bqww/B/B ,,.77!$"5"56{;;<!-( $ 5 5!//
 '')
 ##+/+;+;M-(6#6#6#U#U22 766 >? XX&&335Wd>W>W:W
**$)	)==55dkkBL$//66::1=I!*!;!; )=(]* $1#7#mC>PRS:T#T !15T!T MM44]C$**+ 
 &,,3399J)44;;AAN ! 0 0 B B: NQT  22>BR ! "S("GG$$99MM115 GG$$88MM114 '",9=56-1-?-?M/*##373K3KM/0%%484M4MM01""f&>&>335;F!17o.""'')E /4m, +95 	BG@AK$Yw%7%<%<=	B '*dii(#
 ==(T-B-B&/	RT&U%VK	"&/	&:%;K	"++ 	 FNN2KK	  #$$($7$7$9#: ; --447 8!!, 0##0"3 4O ""!]]==d>P>PQN#$$($7$7$9#: ;  *~ .$$2#3 4!!, 0##0"3 4	O I:!#/	:;q@ <I =I#$$($7$7$9#: ;  *~R	{ ;!!, 0##0"3 4))-)A)A(B C	O 	O$:c+"9"9:;-q*J'*J!J K2N	
 [[] 	B}}--!1+bAB&&t, II--/ 1S#c#/01KK		"}}--R@A	B ""KK55f=>}}	B 	Bs   :Cssc                   t         j                  j                  j                  |       } t	        | t
        j                  t        f      rt        |       }t        |      }|S d}t         j                  j                  j                  | |      sC|dkD  rt        d|        |dz  }t         j                  j                  j                  | |      sC|S )NrQ   i @  z!Failed to find static RBLOCK for r"   )rO   rP  rI  simplifyr   r   r   r  r6   r  rL  )r  r  s     r   r  z#TritonKernel._get_persistent_RBLOCK(  s    !!**62fu}}c23f+C!#&C 
 Cgg&&;;FCH?$'H%QRRq gg&&;;FCH
 Jr   c                N    	 t         j                  |        y# t        $ r Y yw xY w)NTF)r  r  rL  )r  s    r   has_persistent_RBLOCKz"TritonKernel.has_persistent_RBLOCK9  s*    	//7 		s    	$$c                   d
d}| j                   D ]e  }|j                  r| j                  rdt        j                  j
                  j                  |j                        } ||      r)|j                  |j                   dt        |              |j                  r| j                  r| j                  r1| j                  | j                  |j                              }d| d}n3| j                  |j                        }| j                   rt#        |d      }|j                  |j                  j%                          d|        |j                  dk(  sG| j&                  sU|j                  d       h y	)a  
        We get a small speedup from hard coding numels if they are static.

        This code stomps on the passed-in values by writing an constant to the top of the kernel.

        In a kernel like:
        def KERNEL_NAME(in_ptr0, in_ptr1, out_ptr2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):

        We would add
        xnumel = 4096
        r0_numel = 768

        After the signature, before the kernel code, if we decided to make these static. As its hardcoded, it becomes
        a better signal to triton on how to unroll and do some static indexing. So, it's not so much that downstream
        knows that its a static numel, as that you just plop a constant into the kernel.
        c                B    t        | t        j                  t        f      S r   )r   r   r   r  r   s    r   is_static_integerz=TritonKernel.codegen_static_numels.<locals>.is_static_integerS  s    dU]]C$899r   znumel = z*triton_helpers.constexpr_next_power_of_2((z + RSPLIT - 1) // RSPLIT)r  zBLOCK: tl.constexpr = rN  zXBLOCK: tl.constexpr = 1N)r   r   r   r   )re  r\  rY  rO   rP  rI  r  r  r%  r   r  r  r  r  r  r  r  r~   r  rV  )r  r  r  r   simplified_tree_numelr  r  s          r   r  z"TritonKernel.codegen_static_numelsA  s,   $	: $$ 	;D$$(=(=()(8(8(A(A$**(M%$%:;NNdkk](3?T;U:V#WX  T%>%>-- JJt';';DJJ'GHEFugMfgC55djjAC,,!#rl$++"3"3"5!66LSERS{{c!dmm9:'	;r   c                J   t        | j                  D cg c]  }t        |j                          c}      }| j                  r|dk(  sJ t
        j                  S | j                  r|dk(  sJ t
        j                  S |dk(  rt
        j                  S |dk(  rIt        t        | j                  | j                              rt
        j                  S t
        j                  S |dk(  r,| j                  rt
        j                   S t
        j"                  S t%        d|       c c}w )NrQ   r"   r   z"Unsupported number of dimensions: )r7  re  r  r\  r  r.   MixOrderReductionGridr  CooperativeReductionGridGrid1Dr}   r(  r|  Grid2DWithYZOverflowGrid2Dr  BatchMatmulGrid3DGrid3DrL  )r  r   rc  s      r   r  zTritonKernel._get_grid_typek  s    8H8HI***+IJ##6M6$:::''6M6$===!V$+++!V3t22D4D4DEF(===$+++!V$$(:::$+++=aSABB# Js   D c                   | j                   D ]  }t        |j                  t        j                  t        j
                  f      r|j                  }n*t        j                  j                  j                  ||      }|j                  r| j                  s|j                  |       |j                  t        |              y r   )re  r   r  r   r   r  rO   rP  wrapper_codegenerate_numel_exprr\  rY  r  r"  )r  r   r9  	arg_typesr   r   s         r   r   z#TritonKernel.add_numel_to_call_args  s    $$ 	-D$**u}}ell&CDzzww++??dK$$(=(=  &  d,	-r   c                   t         j                  j                  }|j                          | j                  j                         \  }}}}| j                  |||       | j                  j                  D ]  }|j                  |        |j                  ||d|| j                  | j                         |r| j                          y y )NT)r   r  r  rt  )rO   rP  r  write_triton_header_oncer   r)  r   workspace_argsgenerate_workspace_allocationgenerate_kernel_callr  rt  deallocate_workspaces)	r  r   r   deallocate_wswrapperr(  r9  r  wss	            r   call_kernelzTritonKernel.call_kernel  s     ''&&((*%)YY%=%=%?"9a##D)Y?))** 	6B11"5	6 	$$((,, 	% 	
 &&( r   c                   t         j                  j                  }| j                  j	                         \  }}}}t        ||      D ]w  \  }}t        |t              st         j                  j                  r|j                  d| d| d       Jd| d}|j                  |       d| d}|j                  |       y y )Nz:AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_check_inf_and_nan("z", z));zassert not z.isnan().any().item()z.isinf().any().item())
rO   rP  r  r   r)  r  r   r`   cpp_wrapperr%  )r  r  r(  r9  arg_signaturesr   arg_signaturer  s           r   codegen_nan_checkzTritonKernel.codegen_nan_check  s    ''&&*.))*B*B*D'9na"%i"@ 
	,C-377&&%%TUXTYY\]`\aade )-BCD%%d+(-BCD%%d+
	,r   c                    t        |i |S r   )rn  )r  r   rw  s      r   r  zTritonKernel.create_cse_var  s     $1&11r   c                F   |j                    d| j                  | j                  |j                               }|j                  j
                  s| j                  r+|j                  dk(  r| j                  j                  |       y | j                  j                  |       y )Nr  rN  )r   r  r  r   rootr  r  r   r  r%  r8  )r  r  r  s      r   codegen_iteration_ranges_entryz+TritonKernel.codegen_iteration_ranges_entry  sy    **SD,@,@,L!M NO ::$":":u||s?R((. II%r   c                |   |j                   J | j                  |j                         }| j                  r$|j                   dk(  r| j                         dk(  rd}| j                  }|dk7  rd| dnd}| j
                  r| j                  r|j                  r| d}d|j                  j                          d	| | S )
Nr   rX  r   r  r  r  z + rsplit_startztl.arange(0, zBLOCK))
r   indexing_size_strr  r   r  r  r  r\  r   r  )r  r  r  r  r  s        r   r  z)TritonKernel.iteration_ranges_ranges_code  s    +++%%e&6&67 !!  A%'')Q.D&&*5*C4}A&&&))""x/Fu||1134F4&IIr   c                ^    | j                   }| j                         }dg|z  }d| d| d| dS )NrQ   r  r  r  )r  r   )r  r  r  r  r   r  s         r   iteration_ranges_scalar_codez)TritonKernel.iteration_ranges_scalar_code  sC     &&&&(sTz$r%;-q99r   c                   |j                   J d|j                    d}| j                  |      r#d| d|j                   dz    d|j                    d}| j                  r,| j                         dk(  rd	dd
d}d||j                       d}|j                  j                  ||      }| j                  dk7  r| d| j                   dS |S )Nztl.program_id(r  r  z + tl.program_id(rQ   z) * tl.num_programs(r  rX  r"   r   )r   rQ   r"   r  r  )r  r|  r  r   	pid_cacher  r  )r  r  r  reversed_pid_mappids        r   iteration_ranges_get_pidz%TritonKernel.iteration_ranges_get_pid  s    ~~)))u~~.a0 &&u- cU+ENNQ,>+??STYTbTbSccefC
   T%<%<%>!%C#$q1"#3ENN#C"DAFCoo!!#s+z)U$t//022
r   c                   | j                   rt        j                  ry|j                  dk(  xr[ |j                   xrL | j
                   xr= t        j                  j                  j                  |j                  t                      S )NFrQ   )r  r#   !combo_kernel_per_subkernel_blocksr  has_zdimr  rO   rP  rI  r  r  r5   )r  r  s     r   r|  z#TritonKernel.needs_yz_grid_overflow  sw     F$L$LNNa YNN"Y...Y GG$$99%++~GWXX		
r   c                    | j                   r | j                   |j                          d   S t        |j                            S )Nr   )r  r  r3   )r  r   s     r   r  zTritonKernel.max_block  s;    $$'7u%=>>//r   c                   | j                   r5t        j                  j                  j	                  |j
                  d      ry| j                  sy| j                  rW|j                  j                          d| j                  v r.| j                  |j                  j                          d   dk(  rZy| j                  sMt        j                  j                  j                  |j
                  d      r|j                  r| j                  sy|j                  r~| j                  rr| j                  |j
                        }t        j                  j                         }t!        j"                  |      j$                  xs d}t'        |t(              r?||k  r:y|j                  dk(  r| j*                  rd}n| j-                  |j                        }|j                  r| j.                  r|| j1                         z  }t        j                  j                  j3                  |j
                  |      r[|j4                  dk7  xsJ |j6                  xs< t        j                  j                  j9                  |j
                  t;                     S y)Nr  Fr   rQ   T    rN  )r  rO   rP  rI  statically_known_ltr  r  r  r   r  r  rQ  r\  r  r  r  r1   rl  	warp_sizer   r  rV  r  r  r  r~  r  r   r  r5   )r  r   r  r0  r  s        r   r  zTritonKernel._has_constant_mask  s      ww33DJJC!!DKK$5$5$7#8!>$BSBS!S  DKK$5$5$7#8!>?1D%%ww77

AF))d.G.G !:!:33DJJ?I WW88:F(//7AAGRI)S)i).C[[CDMMIt{{3I!;!;!DOO$55I 7788YO" W==W77##88^EUV r   c                d    | j                   d   }|j                  dk(  sJ | j                  |      S )Nr   rN  )re  r   r  )r  xtrees     r   r  z TritonKernel._has_constant_xmask?  s5      #||s"""&&u--r   c                    | j                   D ]2  }| j                  |      s|j                  |j                   d       4 |j                  d       y )Nr  r   )re  r  r  r   )r  r	  r   s      r   r8  zTritonKernel.filter_masksD  sN    $$ 	8D&&t,!!T[[M"67	8
 	&!r   c                    t        t        j                        d | j                   D cg c]  }t        |    c}S c c}w r   )ra  r   r   rX  r   )r  r   s     r   get_reduction_prefixesz#TritonKernel.get_reduction_prefixesL  sB     ]::;<Ud>U>UV
 t
 	
 
s   <c                   | j                   D cg c]  }|j                  s| }}dj                  t        d |D                    }|j	                  d| j                  |              | j                   D cg c]+  }|j                  rt        j                  |j                     - }}t        |      }|j	                  d| j                  |              yc c}w c c}w )z^
        Generates code that flattens ND reduction numels, block sizes, etc. into 1D.
        rK  c              3  :   K   | ]  }|j                    d   yw)r  Nr  r  s     r   r   z8TritonKernel.codegen_reduction_numels.<locals>.<genexpr>Y  s     "UTdkk]%#8"Ur  z	rnumel = zRBLOCK: tl.constexpr = N)
re  r\  r&  r'  r  r  r   r   r   rG   )r  r  r   reduction_treesr  	rn_blocksr  s          r   r  z%TritonKernel.codegen_reduction_numelsS  s    
 -1,<,<RD@Q@Q4RRF"U_"UUV	$**V"4!567
 ((
   %%dii0
	 

 y)/

60B/CDE S

s   CC90Cc                |    | j                         }|D cg c]  }t        j                  | | fi | c}S c c}w )zK
        Helper to initialize symbols like rn_numel, rn_base, etc.
        )r
  r   r  )r  r  rw  rn_prefixesr   s        r   r  z#TritonKernel._get_reduction_symbolse  s=     113JUVxx0;F;VVVs   !9c                    | j                         }| j                  ddd      }t        t        |      dz
        D cg c]  }t	        ||dz   d        c}t        j                  d      gz   S c c}w )z
        Compute coefficients to convert ND reduction indices to linear indices.
        For example:
          rindex = r0_index * r1_numel * ... * rn_numel + ... + rn_index.
        r  Tr   rQ   N)r
  r  rc  r   rG   r   r   )r  r  	rn_numelsr  s       r   _get_reduction_index_coeffsz(TritonKernel._get_reduction_index_coeffsl  s{     113//PT/U	;@[AQTUAU;V
47M)C!GI./
]]1 	 
s   A0c                :    | j                         }t        ||      S )zK
        Compute linear reduction indices from N dimensional ones.
        )r  rF   )r  
multi_indscoeffss      r   r  z'TritonKernel._flatten_reduction_indicesy  s     113,,r   c                $   | j                  ddd      }| j                  ddd      }| j                  |      }|j                  d| j                  |              | j                  |      }|j                  d| j                  |              y)zX
        Generates code that converts ND reduction indices into linear indices.
        r   Tr   r  z
roffset = z	rindex = N)r  r  r  r  )r  r  
rn_offsetsrn_indsrr  rindexs         r   r  z&TritonKernel.codegen_reduction_indices  s    
 00d 1 

 --gtQU-V 11*=
4#4#4W#=">?@009	$"3"3F";!<=>r   c                   |j                   }|j                  r%|j                  |j                   d| d| d       n|j                  D|j                  |j                   d| j                  |              |j                  | d       n|j                  | d| j                  |       }n| j                  || d      }| j                  s|j                          dnd}|j                  | d| j                  |       d	| |j                   d| g       | j                  |      r!|j                  | j                  |             y |d
k(  r| j                  s%|j                  | d|j                   d| d       y y )Nr  z	offset + r
  z
offset = 0r   r   r|  z	offset = rK  rN  r  r  r  )r   r  r%  r   r  r  r   r  r  r  r  r  r  create_constant_mask)r  r  r  rN  r  
block_sizes         r   r  z,TritonKernel.iteration_ranges_codegen_header  s    LL==NNejj\QCy4@A^^#NNejj\T-N-Nu-U,VWXNNaS
+,+Id&G&G&N%OP881#VM ,0+C+C1779+U#  OOc4#@#@#G"HJ<Xzzl#dV, ""5)NN444U;<s(t77NNaS

|3qc?@  8r   )r   TNNF)
r  dict[str, sympy.Expr]r  zFixedTritonConfig | Noner  z
int | Noner  r   r   r   r*  r  r+  )r  r   r
  zstr | tuple[str] | Noner  zTMACompatibilityChecker | None)r   )r   r   r   r   r  z)BlockPtrOptions | TensorDescriptorOptionsr   ztuple[str, str])r   r   r  r   r  r   r  r   )r  rY   )r   r   r  r  )r   r   r  r   r   
r   r   r  r   r  rW   r  rN   r   r   r-  )r  CSEVariable | NoneNN)r  rW   r  z.tuple[str, sympy.Expr, sympy.Expr, sympy.Expr]r  rW   r  ry  r  r   r  ztuple[str, sympy.Expr] | Noner  r   r   rW   )r   ztuple[str, BlockShapeType])r  rW   r   ry  r   rW   )
r   ry  r  ry  r   rM   r  %CSEVariable | tuple[CSEVariable, ...]r   r"  )r   ry  rJ  )r   r   r  r   r  rW   )r  tuple[CSEVariable, ...]r  tuple[torch.dtype, ...]r   r   )r  r$  ro  zUCallable[[tuple[CSEVariable, ...], tuple[CSEVariable, ...]], tuple[CSEVariable, ...]]r  r#  r   r#  )
r  r$  r  r#  r  r   r  r   r   r#  )r   r  )r7  zfloat | Noner   rY   )r   z type[triton_heuristics.GridExpr]r  )r   r   r   zIRNode | Noner  r   )r   rn  )r  re   )r  rf   r   r   )r  rf   r  r	   r   r   )r  rf   r   r   )r   r   r   r  )r   rf   r   r   )r	  r  r   r   )r  rY   r   r   )r  r   r   zlist[sympy.Symbol]r  )r  rO  r   r   )r  rf   r  rY   r   r   )fr   r   r   r   r  r  r   r  r  rS  r  r  r  r  r  r  r  rr  r  r  r,  r  r  r  r  r  r  r  r  r  r	  r  ru  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r}  r  rP  r<  rW  rT  r  rV  r  r  r  r>  r  r  r%  rC  rI  rL  r   r   rS  rY  ru  r  r  r  r  r  r   r  r  r  r  r  r  r  r|  r  r  r  r8  r>   r
  r  r  r  r  r  r  rz  r{  s   @r   r  r  
  sl    &I%%).E&.O$;!3B0B "#@  HL7K
 15$( %39%39
 /39 "39 39 
39j : :6 
 

"

#J*:0


 " " /3DHb
b
 ,	b
 $Bb
R f'f' f' <	f' 
f'P#XJMM M 	M
 M4 *H8J
 
 :?-<1  "

:H
{| SW\\ *\3>\FO\	\|I
& 15-199 C9 &	9
 $9 9 .9 +9 
9v.
9
(
1<
	
(\\ \ &	\
 5\ 
/\|&DO&"
0F
P#0&!
	B>> > 	>@>S1>S;R>S	>S@"'"
" (" 
!"BD"'D" (D" 	D"
 D" 
!D"L$q%f0wr

 t_%"  %"N 
 
 - -^yv	     (;TC(
- LP)),)DH).,2	&J*:(:14:	:.

0
9v.
" 
 
F$W 
 
-? A(A0>A	Ar   r  c                  f     e Zd ZdZ	 	 	 	 	 	 	 	 d fdZddZ	 d	 	 	 	 	 	 	 	 	 d	 fdZd
dZ xZS )FusedUserDefinedTritonKernelz
    When fusing a user-defined triton kernel with epilogues, we use this class to generate the modified triton kernel source
    c           	     ^   t         |   ||ddd d d       || _        t        | j                  j                  j
                  t        j                        sJ | j                  j                  j
                  | _        | j                  j                         sJ t        | j                  j                  j                        dk(  sJ t        j                  | j                  j                  j                  d   j                        | _        | j                   j#                  dd      | _        y )Nr   TF)r[  r  r  r  r  r  rQ   r(  r   )rq  rr  scheduler_noder   kernel_noder   r$   UserDefinedTritonKernelir_nodecan_fuse_epiloguer   kernel_storesr  astunparsestore_value_nodeoriginal_stored_exprr#  )r  r  r[  r(  rt  s       r   rr  z%FusedUserDefinedTritonKernel.__init__  s    	 !! 	 	
 -++00"2L2L
 	
 
 483F3F3R3R3W3W||--/// 4<<--445:::$'KKLL&&--a0AA%
! %)$=$=$E$EdB$O!r   c                   t        | j                  j                        dk(  sJ || j                  j                  d   j                         k(  r| j                  }| j                  |      }t        j                  j                  |      }|j                  r|j                  }nt        j                  |j                        }| j                  j                  | j                  |||      }|S t!        d| d      )NrQ   r   r  z!Epilogue attempted to load from 'zE'. Inductor indexing variables are not defined in user kernel scope. )r   r+  mutable_argsget_namer1  r  rO   rP  ry  r  r   r   r  r   r  r  r`  )r  r   r  loaded_exprr  r   r   r  s           r   r  z!FusedUserDefinedTritonKernel.load  s    4<<,,-2224<<,,Q/88:: 33K}}U+HGG%%d+E$$ --%55hnnE**

KuE + J  !3D6 :U U r   c                   t        | j                  j                  j                  t        j
                        sJ || j                  j                  j                  j                         k(  r|| _        y t        | %  ||||       y r   )
r   r(  fused_epiloguer   r$   ComputedBufferr4  new_store_cse_varrq  r  )r  r   r  r  r  rt  s        r   r  z"FusedUserDefinedTritonKernel.store  sl     $--<<AA2CTCTUUU4&&55::CCEE &+D"GM$ud3r   c                $   | 5  | j                  | j                  j                  j                               }| j                  j                  j	                  |       d d d        t        j                  | j                  j                        }ddl	m
}m}  ||      }t        |j                        dk(  sJ t        j                  | j                   j"                        }	 	 	 	 	 	 dd} ||j                  d   j$                  dd|       t        j&                  |      }|j)                         }	 ||      }|j                  d   j$                  j*                  dz
  }
d|j                  d   j$                  j,                  z  }| j.                  j1                         D cg c]  }||z   	 }}| j2                  j1                         D cg c]  }||z   	 }}|	d |
 |z   |z   |	|
d  z   }dj5                  |      S # 1 sw Y   xY wc c}w c c}w )	Nr   )identify_triton_storesidentify_triton_stores_from_astrQ   c                    t        | j                        |kD  r|| j                  |<   | j                  D ]  }|j                  |k(  s||_         y r   )r   r   keywordsr   r  )	call_noder<  positional_indexnew_argkeywords        r   _replace_argz:FusedUserDefinedTritonKernel.codegen.<locals>._replace_arg
  sO     9>>"%553:	/0 %-- ,;;(*$+GM,r   r  r!  r(  )r?  zast.Callr<  r   r@  r  )split_and_set_rangesr(  r7  
get_rangescodegenrU  deepcopyr+  
kernel_ast*torch._higher_order_ops.triton_kernel_wrapr;  r<  r   r  r.  Namer9  r   
store_noder/  
splitlineslineno
col_offsetr  get_lines_refr  r&  )r  rX  new_astr;  r<  r-  new_store_value_noderC  src_with_store_replaced	src_linesstore_line_indexindentationsr  
load_linescompute_linesnew_src_liness                   r   rF  z$FusedUserDefinedTritonKernel.codegen  s    	C22##22==?J ..66zB		C -- 7 78	
 8@=''(A---"xx(>(>(C(CD	,	,	, "	, 	  #..<P	
 #&++g"6 ,668	 //FG(//2==DDqH]11!4??JJJ04

0H0H0JK1lQ&K
K37<<3M3M3OPa)PP ''( ()*+ 	 yy''w	C 	Cd LPs   AG;!HH;H)r  r  r[  rv   r(  r8   r   r   )r   r   r  r   r   rn  r   r  r+  )	r   r   r   r   rr  r  r  rF  rz  r{  s   @r   r&  r&    sv    P%P %P =	P
 
P:2 SW	4	4 *	43>	4FO	4		4<(r   r&  c            
      l    e Zd ZU dZeZded<    eej                  ej                  ej                  ej                  ej                  ej                  ej                  ej                   g      Zd fdZedd       ZddZd Zd Zddd	Z	 d	 	 	 dd
Z	 	 	 	 	 	 	 	 ddZ	 	 	 	 	 	 	 	 ddZd Z xZS )TritonSchedulingz5Scheduling backend for Triton kernel code generation.z	type[Any]kernel_typec                    t         |   |       |t        |d      sy |j                  D ]$  }t	        |t
        t        f      st        |_        & y )Nr  )	rq  rr  r   r  r   r;   r9   debug_triton_codedebug_device_str)r  r  r   rt  s      r   rr  zTritonScheduling.__init__F  sM    #GIw$?OO 	:D$0B CD(9%	:r   c                    t         j                  j                  st         j                  j                  r't	        g | j
                  t        j                        S | j
                  S r   )r#   r   cooperative_reductionsforce_cooperative_reductionsr   backend_featuresrT   REDUCE_TO_SINGLE_ELEMENT)r   r0  s     r   get_backend_featuresz%TritonScheduling.get_backend_featuresN  sR     MM00}}99P#&&P(O(OP  ###r   c                  
 t         j                  j                  }t        ||      \  }}|r|j	                  |       t
        j                  rtddlm}m	
 t        
fd|D              sX|D cg c]  }t        ||      r|j                           }}|j	                  |j                   ddj                  |              |rt        ||      }	|j!                  ||	       y y c c}w )Nr   )r7   ForeachKernelSchedulerNodec              3  6   K   | ]  }t        |        y wr   )r   )r   rc  rf  s     r   r   z3TritonScheduling.codegen_comment.<locals>.<genexpr>e  s      >?
189s   z Fused node name list: r  )rO   rP  r  rB   make_commentr#   debug_fusiontorch._inductor.schedulerr7   rf  r}   r   r4  commentr&  r,   write_provenance_debug_handle)r  node_scheduler  r  origins_detailed_originsr7   rc  
node_namesdebug_handlerf  s             @r   codegen_commentz TritonScheduling.codegen_commentY  s    ''&&%8%P""  )
  CP  +!!%67 JJL
 
 $$''>tyy?T>UV BL 11+|L s   .#C$c	                   |j                  ||||||      ryt               }	t        j                         rt        j	                  ||       |	j                  d|d       |	j                  |d       t        j                  j                         }
|	j                  d|
j                   d       d| } |||      \  }}|d	|z   d	z   |z   z  }|j                  ||	j                         |       y)
zDEmit kernel to wrapper, with support for external template handlers.Nzasync_compile.triton(z, '''Tr  z''', device_str='z')z# kernel path: r(  )emit_kernel_overriderY   async_compileuse_process_poolr   r%  r  rO   rP  r  r"  define_kernelr  )r  r  r   src_coder  	subs_namerm  kernel_pathrB   compile_wrapperrA  metadata_commentrn  detailed_originss                 r   _emit_kernel_to_wrapperz(TritonScheduling._emit_kernel_to_wrapperz  s     &&
 (*))+   H5!!$9)e"LMxt4<<>!!$5n6I6I5J""MN,[M:$7w$O!!D7NT14DDDk?+C+C+EGWXr   c           
        t         j                  j                  }||j                  v r|j                  |   }|S t        j
                  j                  r$t        |t        j
                  j                        nd}|r t         j                  j                  ||      }t        |      d d }dj                  d|||j                         g      }t        j                  j                  rt        j                  j                   d| }||j                  |<   t        j
                  j                  r|nd}|j!                  t#        t$        j&                        |      }|j!                  t#        t$        j(                        |      }|j!                  dd      }t+        t-        |j/                               d      \  }	}
}| j1                  |||||||t2               t5        j6                  d	      rt5        j8                  |||       |S )
Nr   r   r(  r   triton_z#pragma CMT#pykernel_metadata)rO   rP  r  src_to_kernelr#   r   descriptive_namesrA   r  customize_fused_kernel_namerP   r&  next_kernel_suffixaot_inductormodel_name_for_generated_filesunique_kernel_namesr#  r   rD   r  r6  r)   r(   r   r~  rB   r%   is_metric_table_enabledlog_kernel_metadata)r  rx  rm  r   r  r  
fused_namekernel_categoryry  	_basenamer(  rz  s               r   rw  zTritonScheduling.define_kernel  s   ''&&w,,,!//9Kf _ ==22 &mV]]5T5TU 
 YYBB:xX
@J2ANO((?J8R8R8TUK ""AA "(!4!4!S!S TTUVaUbc /:G!!(+'-}}'H'HiI
  ''K,H,H(I;WH''K,C,C(DiPH  ''s;H(08>>;K1Ld(S%Iq+((#	 ../@A++KhOr   c                    | j                  |d      }t        j                  |      }| j                  ||t	        d |D                    S )z
        Benchmark fused list of nodes and return the execution time
        in milliseconds on randomly generated inputs.
        T)r  c              3  <   K   | ]  }|j                           y wr   r4  r   rc  s     r   r   z9TritonScheduling.benchmark_fused_nodes.<locals>.<genexpr>  s     :WA1::<:Wr2  )rp  )generate_kernel_code_from_nodesr*   r  benchmark_codegened_moduler   )r  r  n_spills_thresholdrx  r  s        r   benchmark_fused_nodesz&TritonScheduling.benchmark_fused_nodes  sV    
 77PT7Ux(..#
:WQV:W0W / 
 	
r   c           	     8  
 t        t        j                  j                        }t	               5  |j                  t        j                  j                               5  dfdfd}fd}||nt        dg      }t        j                  d|j                          |        j                  fcddd       cddd       S j                         
j                  j                  	   j                  
 d          j(                  }t+        |      d
k(  sJ |d   j,                  |kD  rt'        d	      nyt        j                  j                         }	t/        j0                  
fd|	      t+        j2                        dkD  r't/        j0                  
fdt5        |	            z
  t        j                  d|        |        j                  fcddd       cddd       S # t        $ rn}t         j"                  j$                  r t        j                  d||       t'        d	       |        j                  fcY d}~cddd       cddd       S d}~ww xY w# 1 sw Y   nxY wddd       y# 1 sw Y   yxY w)z$Benchmark an already compiled moduleNc                 ~     j                   J t        j                  j                   j                         d   dz   S Nr   z.kernel_perf__file__ospathsplitextr  s   r   cache_file_pathzDTritonScheduling.benchmark_codegened_module.<locals>.cache_file_path  s6    ||///ww''5a8>IIr   c                 >            } t        | t                     y r   r+   r   )r  r  mss    r   store_cachez@TritonScheduling.benchmark_codegened_module.<locals>.store_cache  s    &(T3r7+r   c                             } t         j                  j                  |       r.t        |       5 }t	        |j                               cd d d        S y # 1 sw Y   y xY wr   )r  r  existsopenr  readr  fdr  s     r   
load_cachez?TritonScheduling.benchmark_codegened_module.<locals>.load_cache  sM    &(77>>$'d 0r$RWWY/0 00s   AA r  %kernel src code for %s written to: %sr   z*Exception (%s) in compiling fused nodes %sinfrQ   c                 4      j                     d         S rv  
clone_argsr   callwrapped_jit_functions   r   r  z=TritonScheduling.benchmark_codegened_module.<locals>.<lambda>+       D!@!5!@!@$!G!JK r   r0  c                 "     j                     S r   r  r   r  s   r   r  z=TritonScheduling.benchmark_codegened_module.<locals>.<lambda>4  s     ? 4 ? ? F r   z+The fused kernel for %s took %.3f ms to run)r   rO   rP  r  r   r0  r  r   r  r  r  get_argsr  r  r  	Exceptionr#   r   .disallow_failing_autotune_kernels_TESTING_ONLYr  	launchersr   n_spillsr/   	benchmarkr  r   )r  r  r  rp  device_interfacer  r  r  r  r0  r   r  r  r  r  s    `        @@@@@r   r  z+TritonScheduling.benchmark_codegened_module  sf    4AGG4G4GH R	$##AGG$G$G$IJR	$ BJ, )4
*i[:Q  II7
 B~3<<'?R	$ R	$ R	$B <<>D88D#&;; (4)44d;A>? -66Iy>Q&&& |$$'995\<<> !**K! +==>Bk33F"6{ B
 II=
 Ms||#eR	$ R	$ R	$N  
(==OO		@
 5\3<<''cR	$ R	$ R	$N
(OR	$ R	$ R	$ R	$ R	$sh   .J"AI;<	J(I;8HCI;.	J	I8
AI3I8I; 	J3I88I;;J	 JJc                   |j                  d      }|xr  t        d |j                         D              }| j                  }|rddlm} |}|rd|d<   |j                  d      r
d|d	<   d|d<   t        j                  |j                        s|j                  d	      rJ d|d	<   t        j                  j                  ||||      } ||i |}| j                  |||      S )
Nr  c              3  <   K   | ]  }|j                           y wr   )is_split_scan)r   r   s     r   r   z9TritonScheduling.create_kernel_choices.<locals>.<genexpr>G  s      (
%)D (
r2  rQ   )TritonSplitScanKernelFoverride_cooperative_reductionr>  Toverride_persistent_reduction)contains_opr}   scheduler_nodesr[  triton_split_scanr  r  r  r  r  rO   r  triton_kernel_kwargsadd_multi_kernel_choices)	r  kernel_featureskernel_argskernel_kwargsis_scanr  r[  r  r   s	            r   create_kernel_choicesz&TritonScheduling.create_kernel_choices@  s     "--f5 
C (
-<-L-L-N(
 %
 +/*:*:@/K>CM:; &&v.=AM9:>CM:;11/2Q2QR$(()HIII=BM9:		66+}
 k;];,,V[-PPr   c           	        |g}t         j                  j                  s|S |j                  xr |j	                  d       }|j
                  xr |j	                  d       }|r%|j                   | j                  |i |ddi       |r|j                  j                  }t        j                  j                  j                  |d      r[|j                   | j                  |i |ddix}       |r2|j                  r&|j                   | j                  |i |ddd       t        |      dkD  r.|dd  D ]  }	|j                  |	_         |j!                  d        |S )	Nr  r  Fi   )r  r  rQ   c                    | j                   S r   )r  )r  s    r   r  z;TritonScheduling.add_multi_kernel_choices.<locals>.<lambda>  s    q'='= r   r  )r#   r   multi_kernelr  r  r  r  r[  r[  r  rO   rP  rI  r  r   must_keep_buffersr>  )
r  r   r  r  kernelsoptional_persistentoptional_cooperativer  r9  kernel2s
             r   r  z)TritonScheduling.add_multi_kernel_choicese  s    (.h}}))N$99 
-BSBS+C
 ?
  &;;  
MDUDU,E
 A
 NN    # 38  __44Fww44VUC-T--$' 8= E '5+E+ENN((((+ <A:?	 w<!"12; E,2,D,D)E LL=L>r   c                   fdfd}fd}dg }}d}t         j                  j                  }t        |      t         j                  _        t         j                  j                  }	t        |	      t         j                  _        t
        j                  dkD  }
t
        j                  dkD  }| j                  |d|
|d      }|D ]:  \  }}}|D cg c]  }|j                          }}|D cg c]  }|D ]  }|j                           }}}t        |      dk(  r'||d      \  }}||z  }|dz  }|j                  |       |J |j                  t        t        j                         d	      }t#        j$                  |      t&        j)                  d
|j*                          |       \  '|z  }|z  }|j                  j*                         j-                         j.                  j0                    j2                   d          j4                  }t        |      dk(  sJ |d   j6                  dkD  rt9        d      xnUt         j                  j;                         }t=        j>                  fd|      t=        j>                  fd|      t&        j)                  dt        d |D                      |        |z  }|z  }|j                  j*                         = |t         j                  _        |	t         j                  _        |||fS c c}w c c}}w )a[  
        Benchmark combo kernel partitions and return total execution time.

        Generates kernel code for each partition and benchmarks them.
        Single-node partitions use benchmark_fused_nodes(), while multi-node
        partitions use the combo kernel benchmarking path.

        Returns (total_ms, total_clone_ms, file_list).
        c                 ~     j                   J t        j                  j                   j                         d   dz   S r  r  r  s   r   r  z@TritonScheduling.benchmark_combo_kernel.<locals>.cache_file_path  s6    <<+++77##CLL1!4~EEr   c                             } t         j                  j                  |       rCt        |       5 }t	        d |j                         j                         D              cd d d        S y# 1 sw Y   yxY w)Nc              3  2   K   | ]  }t        |        y wr   )r  )r   r  s     r   r   zNTritonScheduling.benchmark_combo_kernel.<locals>.load_cache.<locals>.<genexpr>  s      Eaq Er  r!  )r  r  r  r  r   r  r  r  s     r   r  z;TritonScheduling.benchmark_combo_kernel.<locals>.load_cache  s^    "$Dww~~d#$Z F2  E2779??3D EEF FFs   .A,,A5c                 \            } t        | t              dz   t              z          y )Nr!  r  )r  r  r  ms_clones    r   r  z<TritonScheduling.benchmark_combo_kernel.<locals>.store_cache  s&    "$Ds2w}s8}<=r   r   g        T)subkernel_nodescustom_part_algorithmenable_autotunemixed_sizesonly_gen_src_coderQ   r  r  r  c                 4      j                     d         S rv  r  r  s   r   r  z9TritonScheduling.benchmark_combo_kernel.<locals>.<lambda>  r  r   r  c                 (     j                     d   S rv  r  r  s   r   r  z9TritonScheduling.benchmark_combo_kernel.<locals>.<lambda>  s    ;0;;TB1E r   zDThe fused kernel for %s took %.3f ms to run, %.3f ms to clone inputsc              3  <   K   | ]  }|j                           y wr   r  r  s     r   r   z:TritonScheduling.benchmark_combo_kernel.<locals>.<genexpr>   s     <A1::<<r2  ) rO   rP  r  r   inplaced_to_remover#   combo_kernels_autotunecombo_kernel_allow_mixed_sizesgenerate_combo_kernel_code	get_nodesr4  r   r  r#  r   rD   r6  r*   r  r  r  r  r  r  r  r  r  r  r  r  r/   r  ) r  	node_listnode_benchmark_resultsr  r  total_ms	file_listtotal_clone_msremoved_buffers_originplaced_to_remove_origr  r  kernel_code_listrx  r   
node_groupr   fused_node_listsr  rc  namesnode_msr  r  r0  r   r  r  r  r  r  r  s                             @@@@@@@r   benchmark_combo_kernelz'TritonScheduling.benchmark_combo_kernel  s   	F	 	>  ) # ww66",-A"B"#''"<"<%/0G%H" 77!;;;a?::%"&+#" ; 
 -= ?	+(Hfj=GHT 0HH/?OeOAQZZ\O\OEO:!# 6z!} EG#!#  &'''''K,C,C(DiPH""8,CII7
 &<LB~B(*  .<<>D88D#&;;  0%00$7:;,66Iy>Q&&&|$$q( %e,X<<> !**K! '00E!
 IIV<<<	 MNHh&NS\\*?	+@ #7%<"22C  IOs   L:7L?)r  zScheduler | Noner   r   )r0  ztorch.devicer   )r{   )r   tuple[float, str])r{   N)rp  zOrderedSet[str] | Noner   r  )r  rv   r  	list[Any]r  r  r   list[TritonKernel])r   r  r  r  r  r  r   r  )r   r   r   r   r  r[  r   r   rT   FOREACH	BUCKETIZEINPLACE_BUFFERSMASKED_SCATTER_WITH_INDEXSCANSORTTRITON_TEMPLATESTUPLE_REDUCTIONrb  rr  r   rd  rr  r~  rw  r  r  r  r  r  rz  r{  s   @r   rZ  rZ  5  s   ?)K)!""$$**44++**		
: $ $MB(YT6p	
 OSW$5KW$	W$r#Q+#Q #Q &	#Q
 
#QJ33 3 &	3
 
3jp3r   rZ  c                   g }| j                         }|t        |t        j                        sJ |r0|j                  $|j                  | j                          d       |S ddlm} ddl	m
} | j                         }|J | j                  j                  |      }t        |t        ||f      sJ dt        |              t         j"                  j%                  |      5  t&        j(                  }|j+                  | j-                               j/                         }|t&        _        d d d        |j                  | j                          d       |j                  t1        j2                  d             |S # 1 sw Y   RxY w)Nz" Unfinalized multi template bufferr   )CUDACombinedScheduling)XPUCombinedSchedulingz]Scheduling backend should be SIMD or CUDACombined when generating debug Triton strings, got: z Triton code:z    )get_template_noder   r$   MultiTemplateBuffermake_kernel_renderr  r4  0torch._inductor.codegen.cuda_combined_schedulingr	  3torch._inductor.codegen.xpu.xpu_combined_schedulingr	  r.  r  get_backendri   r"  rO   rP  set_current_devicer%   generated_kernel_countr  r  r   rF  r  )	r   linesmulti_templater	  r	  r0  backendold_generated_kernel_counttriton_codes	            r   r]  r]    st   E++-N!Z@V@V%WWW.;;C((JKL< L9	
	
 "!!!..,,V4n&<>ST
 	
 llpqxlykz{	
 
 WW''/ 	H *1)G)G&!AA eg  .HG*	H 	(67X__[&9:L	H 	Hs   #A	E==F)r   r  )r   r   r   ry  r   r   r+  )r  r   r  r4  r  r4  r   r   r  )r   ry  r   ry  )r   ry  r   r   )r   zCSEVariable | Anyr   r   )r   r	   r   r  )r   r	   r   r   )r   rr   r  )r   r   r   zCallable[[_T], _T])r   r7   r   r  )
__future__r   r.  rg  r  rU  rB  r  r  loggingr  rP  r  rF  abcr   collections.abcr   r   r   r   typingr	   r
   r   r   r   sympy.printing.precedencer   r   torch._loggingtorch.utils._pytreer&   _pytreerK  torch._dynamo.device_interfacer   torch._dynamo.utilsr   r   torch._prims_commonr   r   torch.utils._ordered_setr   torch.utils._sympy.functionsr   r   r   r   r   torch.utils._tritonr   r   r   utils._sympy.symbolr   r   r   r    utils._sympy.value_rangesr!   r   r#   r$   r%   ru  r'   	codecacher(   r)   r*   r+   r  r,   ops_handlerr-   runtimer.   runtime.benchmarkingr/   runtime.hintsr0   r1   r2   r3   r4   runtime.runtime_utilsr5   r6   r  r7   r8   r9   r:   r;   shape_propagationr<   stream_utilsr=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   virtualizedrL   r  rM   rN   rO   wrapper_benchmarkrP   block_analysisrR   commonrS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   simdrc   rd   re   rf   rg   rh   ri   triton_utilsrj   rk   rl   rm   rn   r  ro   typesrp   r}  rr   %torch.fx.experimental.symbolic_shapesrs   rt   ru   simd_kernel_featuresrv   rw   	getLoggerr   r  _logginggetArtifactLoggerperf_hint_logschedule_log
fusion_logr  r   r   r   r   r   r   ra  r  r.  r  r  r  r  r  r  rS  rV  r\  r^  r`  rb  rg  rl  rn  r~  r  r  r   r  _initialize_pointwise_overridesr  r  r/  rc  r|  r   r   r  r  r  r&  rZ  r]  r   r   r   <module>r<	     s   " 
          	   8 8  4 4  0   $ $ C < ? /   Y X 4 ) ) ( F F ; ( ' .  D  6 .     C B B /    $    %  L>&8	Bg!00<H~~//*E^^--hA
 #$ ,	&'
&'(&'&'R6 6  4 $Q, Q,h 
 
 
B ~ ~ ~B
 ?4 ? ?8 =, = =@++/+<P++>aQM aQH 	3
&8
;P 8((V|&k |&B   22D 9  / / 9J"O J"Z$+ $+N N
 N
 N
b! !H # # #
%sU38_'<<= 
 A A AHN7A:/0 N7AbnB(< B(JU3~ U3p#r   