
    9j*                      d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlmZ d dlmZmZmZmZ d dlmZ d dlZd dlZd dlZd dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dl m!Z! d dl"m#Z#m$Z$m%Z% d dl&m'Z'm(Z(m)Z)m*Z* ddl+m,Z, ddl-m.Z.m/Z/m0Z0 ddl1m2Z2 ddl3m4Z4m5Z5 ddl6m7Z7m8Z8m9Z9 erd dl:m;Z; ddl/m<Z< ddl=m>Z> ddl?m@Z@ ddlAmBZB ddlCmDZDmEZEmFZF ddl0mGZGmHZHmIZI ddlJmKZKmLZLmMZMmNZNmOZOmPZPmQZQmRZRmSZSmTZT ddlUmVZVmWZWmXZX ddlYmZZZ dd l[m\Z\m]Z]m^Z^m_Z_ dd!l`maZambZb dd"lcmdZdmeZemfZfmgZgmhZh erd d#l:miZimjZjmkZk d d$lmlZl  ej                  en      Zoej                  j                  end%      Zrej                  j                  end&      Zsej                  j                  end'      Zt e_       j                  Zv e!g d(      ZwdAdBd)Zxej                   G d* d+             Zz G d, d-ez      Z{ G d. d/ez      Z|dCd0Z} ed1e\e\2      Z~ej                   G d3 d4             Z G d5 d6e      Z G d7 d8e^e~   ee~         Z G d9 d:eH      Z ej                  d;<       G d= d>             Z G d? d@e      Zy)D    )annotationsN)Counter)AnyGeneric
NamedTupleTYPE_CHECKING)TypeVar)metrics)MultiTemplateBuffer)analyze_memory_coalescing)free_unbacked_symbols)immutable_dict)
OrderedSet)FloorDivIdentityModularIndexing)free_symbol_is_type
prefix_strsymbol_is_typeSymT   )counters   )configir	scheduler)prologue_preserves_zero_mask)	code_hashPyCodeCache)	MemoryDepStarDepWeakDep)CallableIRNode)!indexing_dtype_strength_reduction)CoordescTuner)DeviceProperties)
green_textlast_power_of_2yellow_text)BaseSchedulerNodeBaseScheduling	WhyNoFuse)
cache_property_on_selfexpr_fits_within_32bitget_dtype_sizeIndentedBufferPlaceholderprefix_is_reductionsympy_index_symbolsympy_product
sympy_subsunique)ops
OpsWrapperV   )BlockPatternMatcher)CSEVariableindex_prevent_reorderingKernelPythonPrinter)MultiKernelSizeHintMultiKernel)DisableReductionEnableReductionNodeScheduleEntryNodeScheduleMarkerSIMDKernelFeatures)IterableIteratorSequence)CoalesceVarAnalysis
perf_hintsschedulefusion)zyxr0_r1_c                j    t         j                  j                  j                  j                  }||S | S N)torch	_inductorr   triton	max_tiles)defaultrZ   s     \/media/conek/DATA/Code/OCR/venv/lib/python3.12/site-packages/torch/_inductor/codegen/simd.pyget_max_tilesr]   ^   s-    &&--77I!-9:7:    c                       e Zd ZdZej
                  j                  ej
                  j                  d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZee	dd              Z
d	dZee	d
d              Z xZS )IterationRangesa  
    Each range tree represents multiple sets of iteration indexing
    in a single tiled dimension in the output kernel.

    If you have two loops ranges one (4, 3, 2) and another (4, 6),
    then the range tree will be:
            4 (i0)
        3 (i1)  6 (i3)
        2 (i2)
    Where i0 is shared between both loops, but then the split into
    different indexing vars.  All loop ranges must iterate over
    the same number of elements.
    )divisorlengthc                   t         
|           || _        || _        || _        || _        || _        || _        || _        || _	        |	| _
        y rV   )super__init__namevar_list
var_rangesnumelprefixra   rb   kernelroot)selfrf   rg   rh   ri   rj   rk   ra   rb   rl   	__class__s             r\   re   zIterationRanges.__init__s   sO     		 $
	r^   c                ,    t        | j                        S rV   )r4   rj   rm   s    r\   is_reductionzIterationRanges.is_reduction   s     #4;;//r^   c                ,    t        | j                        S rV   )r5   rf   rp   s    r\   symbolzIterationRanges.symbol   s    !$)),,r^   c                z    t        j                         D ci c]  \  }}||
 }}}|| j                     S c c}}w rV   )r   itemsrj   )rm   symtrj   prefix_to_symts       r\   rv   zIterationRanges.symt   s>     <F;K;K;MN<4&$,NNdkk** Os   7)rf   strrg   list[sympy.Symbol]rh   dict[sympy.Symbol, sympy.Expr]ri   
sympy.Exprrj   rx   rk   
SIMDKernelrl   IterationRangesRootreturnNoner~   boolr~   zsympy.Symbol)r~   r   )__name__
__module____qualname____doc__sympySOnere   propertyr/   rq   rs   rv   __classcell__rn   s   @r\   r`   r`   c   s    . ww{{ % 3	
    " 
0 0  0- +  +r^   r`   c                       e Zd ZdZ	 d
	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZddZddZddZddZ	 	 	 	 ddZ	ddZ
	 	 	 	 dd	Z xZS )r}   z
    Root of a iteration range tree that represents a single
    tiled dimension in the output kernel. It contains multiple
    sets of iteration represented with IterationRangesEntry.
    c          	         |i }t         |   |g i ||||        || _        i | _        || _        |r| j
                  r|	J || _        || _        |	| _        |
| _	        y )N)rf   rg   rh   ri   rj   rk   rl   )
rd   re   indexnodes	pid_cacherq   is_loop
tensor_dimgrid_dimhas_zdim)rm   rf   ri   rj   r   rk   r   r   r   r   r   rn   s              r\   re   zIterationRangesRoot.__init__   s     I 	 	
 
=?
 *3
 t00X5EFF$  r^   c                <    d| j                   d| j                   dS )NzIterationRangesRoot(, z, ...))rf   ri   rp   s    r\   __repr__zIterationRangesRoot.__repr__   s    %dii]"TZZLGGr^   c                b    | j                   j                         D ]  }|j                           y rV   )r   valuescache_clear)rm   nodes     r\   r   zIterationRangesRoot.cache_clear   s*    JJ%%' 	D	r^   c                2    t        | j                   d      S )Nr   )r5   rj   rp   s    r\   	index_symzIterationRangesRoot.index_sym   s    !T[[M"788r^   c                   t         j                  j                  j                  ||z  | j                        rt        | j                         |      }nt        | j                         ||      }|| j                  vrt        | j                   t        t         j                  j                         ||||       }|t         j                  j                  |j                         <   | j                   j#                  |j                                || j$                  |j                         <   || j                  |<   | j                  |   S )zF
        Lookup a given RangeTreeEntry, creating it if needed
        )r;   graphsizevarsstatically_known_equalsri   r   r   r   r   IterationRangesEntryrj   nextrk   iter_vars_countrange_tree_nodesrs   rg   appendrh   )rm   ra   rb   exprr   s        r\   lookupzIterationRangesRoot.lookup   s     7733Gf4DdjjQDNN,g6D"4>>#3WfEDtzz!';;-QXX%=%= >?@D 8<AHH%%dkkm4MM  /-3DOODKKM*#DJJtzz$r^   c                    t         j                  j                  }g }t        |      D ](  }|j	                  | j                  ||             ||z  }* g t        |      S rV   )r   r   r   reversedr   r   )rm   lengthsra   itervarsrb   s        r\   construct_entriesz%IterationRangesRoot.construct_entries   s]     ''++w' 	'FOODKK89&G	' %(#$$r^   c                f    | j                  |      D cg c]  }|j                          c}S c c}w rV   )r   rs   )rm   r   es      r\   	constructzIterationRangesRoot.construct   s'    $($:$:7$CDq
DDDs   .c           
     \  	
 dd|j                   D cg c]+  }t        j                  j                  j	                  |      - }}|D cg c]!  }|s|j
                  | j
                  k(  s |# }}|j                  fd       t        j                  j                  g 	g 
	
fd}|D ]v  }t        j                  j                  j                  |j                        s8 || j                  t        |j                                     |j                   ||       x t        j                  j                  j                  | j                         s, || j                  t        | j                                      g t#        	      g t#        
      fS c c}w c c}w )z,Figure out vars from this tree used in indexc                    t         j                  j                  j                  | j                        }t         j                  j                  j                  | j
                        dk(  }|| fS )a:  
            Gets the key for sorting nodes. When two nodes have the
            same divisor, the node with length as 1 should be handled
            first so the current divisor is not changed after multiplied
            node.length. Returns `not length_is_one_hint` for ascending
            sort.
            r<   )r;   r   r   optimization_hintra   rb   )rR   divisor_hintlength_is_one_hints      r\   get_sort_keyz8IterationRangesRoot.vars_and_sizes.<locals>.get_sort_key   sV     77++==aiiHL!"!1!1!C!CAHH!MQR!R &8"899r^   c                     |       S rV    )rR   r   s    r\   <lambda>z4IterationRangesRoot.vars_and_sizes.<locals>.<lambda>  s    a r^   keyc                    j                  | j                                j                  | j                         | j                  z  y rV   )r   rs   rb   )r   ra   
index_varssizess    r\   addz/IterationRangesRoot.vars_and_sizes.<locals>.add  s5    dkkm,LL%+Gr^   )rR   r   r~   ztuple[int, bool])free_symbolsr;   rk   r   getrj   sortr   r   r   r   r   r   ra   r   r   ri   r   )rm   r   sr   nr   r   ra   r   r   r   s          @@@@r\   vars_and_sizesz"IterationRangesRoot.vars_and_sizes   sK   

	: <A;M;MNa**..q1NN!CqQ188t{{+BCC

0
1''++
	,  	D77##;;DLL'RDKK$,,)HIJ,,I	 ww77

GLGXdjj'%BCD&*%&(:(5/(:::/ OCs   0F$F)F)/F)rV   )rf   rx   ri   r{   rj   rx   r   intrk   r|   r   dict[str, str] | Noner   r   r   
int | Noner   r   r   r   r~   r   r~   rx   r~   r   r   )ra   r{   rb   r{   r~   r   )r   list[sympy.Expr]r~   zlist[IterationRangesEntry])r   r   r~   ry   )r   r{   r~   z+tuple[list[sympy.Symbol], list[sympy.Expr]])r   r   r   r   re   r   r   r   r   r   r   r   r   r   s   @r\   r}   r}      s     ,0)!)! )! 	)!
 )! )! ))! )! )! )! )! 
)!VH9 .%'%	#%E(;(;	4(;r^   r}   c                  p     e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 d	 fdZd
dZddZddZd
dZddZddZ	ddZ
 xZS )r   c                $   t         |   ||j                  |z  |j                  |j                  |j
                  |||j                  |j                  	       || _         t        j                  d       | j                        | _        || _        y )N)	rf   ri   rg   rh   rj   ra   rb   rk   rl   )rd   re   ri   rg   rh   rj   rk   rl   parent	functools	lru_cache_codegencodegenr   )rm   rf   ra   rb   r   r   rn   s         r\   re   zIterationRangesEntry.__init__&  s~     	,,'__((==== 	 
	
 0y**40?	r^   c                    d| j                    d| j                   d| j                   d| j                   d| j                   dS )NzIterationRangesEntry(r   ))rf   ra   rb   r   rh   rp   s    r\   r   zIterationRangesEntry.__repr__=  sH    &tyykDLL>DKK=PRSWS\S\R]]_`d`o`o_ppqrrr^   c                L    fd| _         d | j                   _        | _        y )Nc                      S rV   r   )rf   s   r\   r   z/IterationRangesEntry.set_name.<locals>.<lambda>A  s    t r^   c                      y rV   r   r   r^   r\   r   z/IterationRangesEntry.set_name.<locals>.<lambda>B      r^   )r   r   rf   )rm   rf   s    `r\   set_namezIterationRangesEntry.set_name@  s    ##/ 	r^   c                8    | j                   j                          y rV   )r   r   rp   s    r\   r   z IterationRangesEntry.cache_clearE  s      "r^   c                X    t         j                  j                  |        | j                  S rV   )r;   rk   codegen_iteration_ranges_entryrf   rp   s    r\   r   zIterationRangesEntry._codegenH  s    	//5yyr^   c                   g }t        | j                  t        j                        r|S t        | j                  t        t
        f      sJ t        | j                               | j                  j                  dd  D ]l  }t        |t        j                  t        j                  f      r.|j                  }t        |      dkD  sIt        d |D              s\|j                  |       n |S )Nr<   r   c              3  P   K   | ]  }t        |t        j                           y wrV   )r   r   SIZE.0r   s     r\   	<genexpr>z8IterationRangesEntry.precomputed_args.<locals>.<genexpr>U  s       ,56N1dii0,   $&)
isinstancer   r   Symbolr   r   typeargsIntegerr   lenallr   )rm   precomputed_argsargsymbolss       r\   r   z%IterationRangesEntry.precomputed_argsL  s    -/dii.##$))h%@AR4		?RA99>>!"% 	1CcEMM5<<#@A**w<!# ,:A, ) %++C0	1  r^   c                ,    t        | j                        S rV   )hashrf   rp   s    r\   __hash__zIterationRangesEntry.__hash__[  s    DIIr^   c                X    t        |t              sJ | j                  |j                  k(  S rV   )r   r   rf   )rm   others     r\   __eq__zIterationRangesEntry.__eq__^  s&    %!5666yyEJJ&&r^   )rf   rx   ra   r{   rb   r{   r   r{   r   r`   r~   r   r   )rf   rx   r~   r   r   )r~   r   r~   r   )r   objectr~   r   )r   r   r   re   r   r   r   r   r   r   r   r   r   s   @r\   r   r   %  sf      	
    
.s
# 'r^   r   c                    | t        d      k(  ry| t        d      k(  ryt        j                  |       ryt        |       S )Ninfzfloat("inf")z-infzfloat("-inf")zfloat("nan"))floatmathisnanrepr)values    r\   constant_reprr   c  s9    e	%-		E	;r^   CSEVariableType)boundr[   c                  ,    e Zd ZU ded<   ded<   ded<   y)PartialAccumulaterx   buffer_namereduction_typer   r   N)r   r   r   __annotations__r   r^   r\   r  r  p  s    Jr^   r  c                  N    e Zd ZU dZded<   ded<   ded<   ded<   d	ed
<   ded<   y)NodeInfozF
    Pre-computed node information for combo kernel partitioning.
    listnode_scheduledicttilingr   ri   rnumelrH   featuresr   is_persistent_reductionN)r   r   r   r   r  r   r^   r\   r  r  w  s*     LJK  !!r^   r  c                  V    e Zd ZU dZeZded<   ded<   dZded<   ded	<   	 	 	 	 	 d>	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d? fd
Z	 	 d@dZ	dAdZ
dBdZd ZeedCd              ZdDdZdEdZed@d       ZdFdZ	 	 	 	 	 	 	 	 	 	 	 	 dGdZdHdZdIdZdJdZdFdZdFdZdKdZdCdZdBdZdLdZd@dZd@dZdMd Z 	 	 	 	 	 	 dNd!Z!	 	 	 	 	 	 dNd"Z"dOd#Z#dPd$Z$e%	 	 	 	 	 	 dQd%       Z&e'e(jR                  jT                  f	 	 	 	 	 	 	 dRd&       Z+e'e(jR                  jT                  f	 	 	 	 	 	 	 dSd'       Z,	 	 	 	 dTd(Z-e'	 	 	 	 	 	 dUd)       Z.dVd*Z/dVd+Z0dWd,Z1	 	 	 	 dMd-Z2dXd.Z3dYd/Z4dZd0Z5d1 Z6	 d[	 	 	 	 	 	 	 d\d2Z7e8jr                  d]d3       Z:d^d4Z;e%d5        Z<d_d6Z=d7 Z>d8 Z?d9 Z@d: ZAd; ZBd< ZCd`d=ZD xZES )ar|   zo
    Common base class for Triton/Halide codegen which both use flattened indexing rather than loop nests.
    zCallable[[sympy.Expr], str]sexprkexprFr   allow_block_ptrrx   kernel_namec                ~    |i }t                    | _        |j                          _        t                _        t                _        |j                         D 	ci c]/  \  }}	|t        j                  j                  j                  |	      1 c}	} _        g  _        i  _        t!        j"                          _        |j'                          _        ||n j+                          _        | _        | _        ||n j3                          _        | _         j9                          _        d  _        t!        j"                          _        d _         tB        jD                  jF                  r j                  jH                  D ]h  }
tK        |
tL        jN                        stK        |
jP                  tR        jT                        sC|
jP                  jW                         dk(  sad _          n tX        jZ                  d fd       }| _.         j_                  |       d _0        g  _1        y c c}	}w )NFdotTc                    t         j                  j                  j                  | j	                               } j
                  D ]  }j                  | |      }  j                  |       S rV   )r;   r   r   simplify_with_rangesrh   range_treescombine_contiguous_dimscombine_modular_indexing_pairs)r   treerm   s     r\   simplify_indexingz.SIMDKernel.__init__.<locals>.simplify_indexing  sb    GG$$99%ARSE(( B44UDAB 66u==r^   r   )r   r{   )2rd   re   r  get_mutations	mutationsr2   bodyindexing_coderu   r;   r   r   simplifynumelsr  r   	itertoolscountr   rq   inside_reduction should_use_cooperative_reductioncooperative_reductiontiling_scoresr  should_use_persistent_reductionpersistent_reductionmix_order_reductionwant_no_x_dimno_x_dimr   store_output_ctris_native_matmulr   rY   native_matmulr	  r   r   SchedulerNoder   r   ComputedBufferget_reduction_typer   cacher  initialize_range_treersplit_sizesaved_partial_accumulate)rm   r  r  r   override_persistent_reductionoverride_cooperative_reductionr(  r+  rj   valr   r  rn   s   `           r\   re   zSIMDKernel.__init__  s    I !//1"$	+-FLlln
7BvsFAGG$$--c22
 79JL(0 ( 5 5 7 .9 +668 	"
 <I-3 -8 *557 	!
 *= **,%) ) 1 %==&&33 tY%<%<="499b.?.?@		446%?,0D) 
	> 
	> "3""9-AC%a
s   "4H9c                    t         )u-  Generate template source code with fused prologues and epilogues.

        Subclasses override this to implement custom code generation.
        The default implementation raises NotImplementedError — the actual
        standard path lives in ``TritonTemplateKernel.codegen_template_body``.
        NotImplementedError)rm   
schedulingtemplate_nodeepilogue_nodesprologue_nodesbuf_name_to_prologue_groupprologue_preserves_zero_mask_fnrenders           r\   codegen_template_bodyz SIMDKernel.codegen_template_body  s
      "!r^   c                    g S )a  Return epilogue nodes that were not fused into the kernel.

        These nodes need separate codegen (via ``call_kernel``) and must
        be excluded from ``mark_run`` in ``_codegen_single_template``.

        The standard path fuses all epilogues, so this returns ``[]``.
        ``ExternalTritonTemplateKernel`` overrides this for epilogues that
        don't read exactly one template output and cannot be fused.
        r   rp   s    r\   get_unfused_epiloguesz SIMDKernel.get_unfused_epilogues  s	     	r^   c                    d| dS )Nz<STORE_OUTPUT_>r   )rm   is     r\   _get_store_output_subgraph_namez*SIMDKernel._get_store_output_subgraph_name  s    s!$$r^   c                n    t        | j                        }t        j                  |dz
  d      | _        |S )Nr<   )startstep)r   r.  r#  r$  )rm   totals     r\   get_store_output_countz!SIMDKernel.get_store_output_count  s.    T**+ )eaia Hr^   c                :    t        d | j                  D              S )Nc              3  2   K   | ]  }t        |        y wrV   )r4   )r   rj   s     r\   r   z0SIMDKernel.num_reduction_dims.<locals>.<genexpr>  s     I6&v.I   )sumr"  rp   s    r\   num_reduction_dimszSIMDKernel.num_reduction_dims  s     IT[[IIIr^   c                    t         rV   r<  )rm   dtypes     r\   dtype_to_strzSIMDKernel.dtype_to_str      !!r^   c                6    | j                   j                         S rV   )r  select_index_dtyperp   s    r\   get_index_dtype_as_torch_dtypez)SIMDKernel.get_index_dtype_as_torch_dtype   s    }}//11r^   c                @    | j                  | j                               S rV   )rX  r\  rp   s    r\   index_dtypezSIMDKernel.index_dtype  s      !D!D!FGGr^   c                     yNFr   rp   s    r\   r,  zSIMDKernel.want_no_x_dim      r^   c                   t        fdt        D              }| xs | }d	d}g d}	t        t        |	            }
ddg}|r|}n
|r|
}n|
|z   } |||      } ||	t              }g }t	        |      D ]s  \  }}t        |      }|j                  |      }|j                  |      }||n|}|j                  t        | d|   ||| ||xr | j                   ||dv 
             u |S )
Nc              3  ,   K   | ]  }|v s|  y wrV   r   )r   rj   r"  s     r\   r   z3SIMDKernel.construct_range_trees.<locals>.<genexpr>  s      %
61AF%
   	c                `    t        fd| D              D ci c]  \  }}||
 c}}S c c}}w )Nc              3  ,   K   | ]  }|v s|  y wrV   r   )r   r:  masks     r\   r   zOSIMDKernel.construct_range_trees.<locals>.filtered_index_map.<locals>.<genexpr>  s     2U3PT32Urd  )	enumerate)seqrg  idxr:  s    `  r\   filtered_index_mapz<SIMDKernel.construct_range_trees.<locals>.filtered_index_map  s3    )22U#2U)U%S#S  s   *)rR   rQ   rP   rS   rT   r   rP   )r   r   r   r   r   )r~   zdict[Any, int])
r   all_prefixesr  r   rh  r4   r   r   r}   r*  )rm   r   r%  rq   r"  r-  active_prefixesno_r_dimrk  	grid_dimspointwise_tensor_dimsreduction_dimstensor_dimstensor_dim_mapgrid_dim_mapr  rJ  rj   r   r   r   s       `                r\   construct_range_treesz SIMDKernel.construct_range_trees
  s3    % %
!-%
 
 (';|+;	
 $	 $Xi%8 9(K/K/.@K ,KI))\B"?3 	IAv.v6L'++F3J#''/H!)AxE#he$6N'(J1J1J-J)% F]	& r^   c                    | j                  || j                  | j                  j                         | j                  | j
                        }| j                  j                  |       y rV   )ru  r%  r  rq   r"  r-  r  extend)rm   r   r  s      r\   r5  z SIMDKernel.initialize_range_treeA  sR    00!!MM&&(KKMM
 	,r^   c                     y)zr
        Hook called right before codegen with every index that will be
        used in the fused kernel.
        Nr   )rm   indicess     r\   finalize_indexingzSIMDKernel.finalize_indexingK  r   r^   c                v    | j                   }d| _         	 | j                  |||      || _         S # || _         w xY wr`  )r%  store)rm   rf   r   r   priors        r\   store_reductionzSIMDKernel.store_reductionQ  s;    %% %	*::dE51$)D!ED!s   / 	8c                     yr`  r   rp   s    r\   r&  z+SIMDKernel.should_use_cooperative_reductionY  ra  r^   c                     yr`  r   rp   s    r\   r)  z*SIMDKernel.should_use_persistent_reduction\  ra  r^   c                t    t        t        j                  j                  d | j                  D                    S )Nc              3  P   K   | ]  }|j                   j                            y wrV   )rh   ru   r   r  s     r\   r   z(SIMDKernel.var_ranges.<locals>.<genexpr>a  s"      *,0%%'*r   )r
  r#  chainfrom_iterabler  rp   s    r\   rh   zSIMDKernel.var_ranges_  s4    OO)) *484D4D* 
 	
r^   c                :    t        d | j                  D              S )Nc              3  J   K   | ]  }t        |j                  d u        y wrV   )r   r   r  s     r\   r   z0SIMDKernel.triton_tensor_ndim.<locals>.<genexpr>g  s     Q3td23Qs   !#)rT  r  rp   s    r\   triton_tensor_ndimzSIMDKernel.triton_tensor_ndimf  s    Q@P@PQQQr^   c                ^    dg| j                         z  }d||<   ddj                  |       dS )Nr   :[r   ])r  join)rm   rJ  r   s      r\   indexing_size_strzSIMDKernel.indexing_size_stri  s9    42244a499U#$A&&r^   c                    dg| j                         z  }| j                  D ]R  }|j                  |j                  r| j                  s)|j
                  j                          d||j                  <   T |S )N1BLOCK)r  r  r   rq   r%  rj   upper)rm   r   r  s      r\   dense_size_listzSIMDKernel.dense_size_listn  sv    //11$$ 	GD&$$(=(=,0KK,=,=,?+@)Fdoo&	G r^   c                    |j                   }|j                  | j                         }| d| dS dg| j                         z  }d||j                  <   dj	                  |      }| d|j                          d| d}|S )	Nzmask = tl.full(z, True, tl.int1)r   r  r   zmask = tl.full([zBLOCK], True, tl.int1)[r  )rj   r   dense_size_strr  r  r  )rm   entryrR   sizestrr   suffixouts          r\   create_constant_maskzSIMDKernel.create_constant_maskx  s    LL#))+GSy0@AA42244"%e5!#AGGI;.EfXQO
r^   c                L    | j                         }ddj                  |       dS )Nr  r   r  )r  r  rm   r   s     r\   r  zSIMDKernel.dense_size_str  s)    $$&499U#$A&&r^   c                   t        |t              s|S |j                  d   }| j                  j	                  |      x}|S t        |||j                  i      }t        j                  j                  j                  |      }t        ||j                  j                         |j                  j                  t        j                  j                   |j                  j"                        j%                         i      S Nr   )r   r   r   r   r   r7   r   r;   r   r   r  rl   r   r   r   r   r   ri   rs   )rm   r   rR   	tree_node	new_indexs        r\   r  z)SIMDKernel.combine_modular_indexing_pairs  s    %1LJJqM..22155I>Luq)..&9:	GG$$CCIN	((*INN,A,AGGKK!5!5-&(
 	
r^   c                    t         j                  j                  j                  |      x}r!|\  }}t	        | j                  ||      |      S | j                  ||      S rV   )r;   r   r   expand_floor_divr   _combine_contiguous_dims)rm   r   r  
expand_resr  denominators         r\   r  z"SIMDKernel.combine_contiguous_dims  s[     ))::5AA:A%/"I{D99)TJKXX00==r^   c                   t        |t        j                  t        j                  f      r|S |j	                  |      \  }}t        |      dk  r|S t        j                  j                  j                  ||t        |g||            \  }}}||k(  r|S |j                  |      }t        |t        t        | ||                        }	|	S )zI
        More aggressive simplification to merge contiguous dims
        r<   )r   r   r   r   r   r   r;   r   r   _simplify_loopsr?   r   r7   r
  zip)
rm   r   r  r   r   	new_sizesreindex_prunenew_index_varsr  s
             r\   r  z#SIMDKernel._combine_contiguous_dims  s     eemmU\\:;L //6
Eu:?L%&WW%5%5%E%E7US&
"	7F L	2ud3z7>;R+S&TU	r^   c                      j                   d   j                  xs  j                  t        j                   fd       } |       S )Nc               3     K    j                   j                         s j                  rJ d  y r j                          d _        	 d  r j                          d _        y # d _        w xY ww)NFT)r  rq   r%  codegen_body)rm   should_flushs   r\   ctxz)SIMDKernel.disable_reduction.<locals>.ctx  sl     ==--/0000 !!#$)D!-%%'(,%%s   AA5A) !A5)	A22A5)r  r   r'  
contextlibcontextmanager)rm   r  r  s   ` @r\   disable_reductionzSIMDKernel.disable_reduction  sE    ''+33Qt7Q7Q		"	"	- 
#	-$ ur^   c                    t        |      t        | j                        k(  sJ t        || j                        D cg c]  \  }}|j                  |       c}}S c c}}w rV   )r   r  r  r   )rm   r   rb   rangess       r\   
set_rangeszSIMDKernel.set_ranges  s]    7|s4#3#34444 #&gt/?/?"@
 V$
 	
 
s   Ac                T   t        d |D              r| D cg c]  }g  c}g fS t        j                  j                  | D cg c]  }g  c}| D cg c]  }j	                  |       c}t        j                         dfd}	 	 	 	 	 	 dd}g }d}|D ]U  }	g }
|	D ]9  }j                  |d      r|
j                  d        )|t              k  r>j                  |   d      r)|dz  }|t              k  rj                  |   d      r)t              dk(  xr d   dk(  }|d	z   t              k  rj                  ||   |dz      z        r|rj                  ||   |dz      z        st        ||   |dz      z        |   }|dz      }t        |||z        }|
j                   |||g |||       ||dz   |       ||d	z   |      g             J|dz   t              k  rj                  ||         sj                  t        ||         d      rfj                  ||         st        ||         |   }t        ||         }|
j                   ||g |||       ||dz   |      g             |t              k\  rt        |d      |
j                  t        j                   |||                   < |j                  |
       X t        d
 D              sJ d d|        |fS c c}w c c}w c c}w )Nc              3  8   K   | ]  }t        |      d k(    ywr   Nr   )r   rb   s     r\   r   z5SIMDKernel._split_iteration_ranges.<locals>.<genexpr>  s     6Fs6{a6s   c                    j                  |      }j                  |    |      st        |    |      t        |    |      | <   |    j	                  |       t              S rV   )r!  statically_known_multiple_of	CantSplitr   r   r   )rJ  r   
new_ranges	remainingsv	var_counts     r\   	add_rangez5SIMDKernel._split_iteration_ranges.<locals>.add_range  sf    ;;t$D229Q<F	!d33#IaL$7IaLqM  &	?"r^   c                P     t              t               dz   k(  sJ d fd}|S )z
            Builds the nested expression:
              ((...((s1*v[i1] + v[i2]) * s2 + v[i3]) ... ) * sk + v[i(k+1)])
            r<   c                \    | d      }t        dd        D ]  \  }}||z  | |   z   } |S )Nr   r<   )r  )	flat_varsr   r   rj  idxsr   s       r\   getterzISIMDKernel._split_iteration_ranges.<locals>.make_combined.<locals>.getter  sH     a)!%ab2 5FAst8in4D5r^   )r  r   r~   r{   r  )r   r  r  s   `` r\   make_combinedz9SIMDKernel._split_iteration_ranges.<locals>.make_combined  s+     t9E
Q... Mr^   r   r<   c                6    t         j                  j                  S rV   )r   r   Zero)_s    r\   r   z4SIMDKernel._split_iteration_ranges.<locals>.<lambda>  s    EGGLL r^      r  r   c              3  t   K   | ]0  }t         j                  j                  j                  |      d k(   2 yw)r<   N)r;   r   r   guarding_hint_or_throwr   s     r\   r   z5SIMDKernel._split_iteration_ranges.<locals>.<genexpr>f  s/      
@AAGG33A6!;
s   68zfailed to set ranges  )rJ  r   r   r{   r~   r   )r   r   r  z	list[int]r~   z(Callable[[list[sympy.Expr]], sympy.Expr])r   r;   r   r   r!  r#  r$  r   r   r   statically_known_gtr  r  r   operator
itemgetter)groupsr   groupr  gr  r  return_getters_groupscurrent_grouplength_groupreturn_getterssizeis_bmm_then_pwsize1size2size3r  r  r  r  s                   @@@@r\   _split_iteration_rangesz"SIMDKernel._split_iteration_ranges  s    6g66$*+5B+R//WW:@-AQb-A
-34R[[^4	OO%		# 	#	#	+4	5	" !## d	9LN$ a--dA6"))*@A#c)n49S9Sm,:
 "Q&M $c)n49S9Sm,:$ "%Y1!4!K2!9K!A%I6..i6=STCT9UU ' ::i6=STCT9UU ( %m4yQRAR7SS 
 &m4E%ma&78E$T55=9E"))%"EN )- ? )-!*;U C )-!*;U C
 #Q&Y7**4=1IJ **8D)M:R+SUVW ::i6 (i.FGG%m4E$T9]+CDE"))%"G )- ? )-!*;U C	 %I6'a00")) ++ImT,JK}aD "((8Id	9L  
EN
 
 	9"9+Qwi8	9 
 000[ , .B4s   	L	L L%c                   t         j                  j                  }t        |d         dk(  r\|j	                  |t
        j                  j                        s2|j	                  t        |      t        |d         |z        r|d   |gfS |S )z1Fill in the reduction numel of lengths if missingr<   r   )	r;   r   r   r   r   r   r   r   r6   )clsr  r   reduction_numelr   s        r\   prepare_split_iteration_lengthsz*SIMDKernel.prepare_split_iteration_lengthsl  s{     77##wqz?a00%''++N00f%gaj)O;
 AJ 122r^   c                n    | j                  |||      }	 | j                  ||       y# t        $ r Y yw xY wNTF)r  r  r  )r  r  r   r  s       r\   is_compatiblezSIMDKernel.is_compatible  sB     55fgW	''8 		s   ( 	44c                >   | j                   D ci c]  }|j                  |j                   }}| j                  s0|D ]+  }t	        |      st
        j                  j                  ||<   - g |j                         }| j                  ||| j                        S c c}w )a5  
        Split and set iteration ranges for the kernel based on the provided lengths.

        This method maps the kernel's tiling structure to the node's iteration space,
        handling both pointwise and reduction dimensions appropriately.

        Args:
            lengths: A sequence of sequences of symbolic expressions representing
                    the sizes of different dimensions for each node.

        Returns:
            A list of lists of symbolic expressions representing the mapped
            iteration variables for each dimension.
        )r  rj   ri   r%  r4   r   r   r   r   map_kernel_groups_to_node_sizesr  )rm   r   rtr  rj   r  s         r\   split_and_set_rangeszSIMDKernel.split_and_set_ranges  s    $ 150@0@A""))RXX%AA $$  1&v.%*WW[[F6N1
 $6==?# 33FGT__UU Bs   Bc           
     F   t        |      t        |      k(  r!t        d t        ||      D              r || S | j                  ||      \  }}g t        j
                  j                   ||       }|D cg c]  }|D cg c]
  } ||       c} c}}S c c}w c c}}w )a  
        We may want to fuse `for i0 in s0*s1` into a tiled kernel with groups (s0, s1).

        To do this we need to split up the iteration space of i0 into something like:
            for i1 in s0:
              for i2 in s1:
                i0 = i1*s1 + i2
                ....

        This function matches and resplits lengths to the groups of
        this kernel to enable tiled + non-tiled fusions.
        c              3     K   | ]?  \  }}t         j                  j                  j                  t	        |      |z
        d k(   A ywr  r;   r   r   r!  r6   )r   rR   r  s      r\   r   z=SIMDKernel.map_kernel_groups_to_node_sizes.<locals>.<genexpr>  s@      /
1 GG%%mA&6&:;q@/
s   AA)r   r   r  r  r#  r  r  )	r  r  r   r  r  r  r   fnsfns	            r\   r  z*SIMDKernel.map_kernel_groups_to_node_sizes  s    & w<3v;&3 /
GV,/
 ,
 w'',/,G,GPW,X)
)LY__22:z3JKL8MN,"H,NN,Ns   7	B BBBc                6    t        |t        j                        S rV   )r   r   TMPrm   r   s     r\   is_indirect_indexingzSIMDKernel.is_indirect_indexing  s    "5$((33r^   c                   | j                  |      rydgt        | j                        z  }|j                  D ]g  }|| j                  vr| j                  |   }t        |j                  t              sJ ||j                  j                  xx   |j                  z  cc<   i t        j                  j                  j                  t        fdt        || j                  j!                               D              S )NFr<   c              3  F   K   | ]  \  }} |       |      k7    y wrV   r   )r   	idx_range
iter_ranger!  s      r\   r   z,SIMDKernel.is_broadcasted.<locals>.<genexpr>  s,      
%	: Y8J#77
   !)r  r   r"  r   r   r   r   r}   r   rb   r;   r   r   r!  anyr  r   )rm   r   index_numelsrs   r  r!  s        @r\   is_broadcastedzSIMDKernel.is_broadcasted  s    $$U+sS--(( 	=FT222))&1Eell,?@@@++,<,	= 77##,, 
),\4;;;M;M;O)P
 
 	
r^   c                    t        |t              r)ddj                  t        | j                  |             dS | j                  | j                  |            S )a  
        Convert an index expr to a string that can be used in output code.
        e.g. a sympy expression "s2" may actually appear as "ks1" in the generated kernel.

        Index expressions often need to be passed in as arguments to the triton kernel.
        Rename_indexing and codegen_indexing keep track of the needed indices and add
        new parameters to the function signature.
        r  r   r  )r   r  r  mapindex_to_strr  rename_indexingr  s     r\   r   zSIMDKernel.index_to_str  sN     eT"tyyT%6%6!>?@BBzz$..u566r^   c                n   | j                  |      }t        |t        j                  j                  j
                        }t        |j                  t        j                              s(t        |j                  t        j                              r3|j                  t        j                  j                  j
                        }t        |j                  t        j                              r|j                  t        j                        D ]g  }|j                  }t        |      dkD  st        d |D              s1|t        j                  j                  j                  |      i}t        ||      }i | j                  |      }t        |t               s|n|j"                  d   }| j%                  |      S )Nr   c              3  p   K   | ].  }t        |t        j                  t        j                  f       0 y wrV   )r   r   r   PRECOMPUTED_SIZEr   s     r\   r   z.SIMDKernel.prepare_indexing.<locals>.<genexpr>	  s.      , #1tyy$2G2G&HI,s   46)r  r7   r;   r   r   precomputed_replacementsr   atomsr   floorceilingsubsr   r   lookup_precomputed_sizer   r   r   codegen_indexing)rm   r   ar   replacements
simp_indexs         r\   prepare_indexingzSIMDKernel.prepare_indexing  sG    &&u-5!''"2"2"K"KLu{{5;;'(CEMM0J,KJJqww//HHIE u{{5==)*[[/ 	< ..w<!# ,$, ) %&qww'7'7'O'OPQ'R#SL&ul;E	< ++E2
 )X>JJOOTUDV 	 $$Z00r^   c                r    | j                   D cg c]  }|j                  r| j                  s| c}S c c}w rV   )r  rq   r%  )rm   ts     r\   active_range_treeszSIMDKernel.active_range_trees  s3    ''
q~~AVAVA
 	
 
s   44c                4   t         j                  j                  j                  || j	                               }t        |j                  t              D ]  }|| j                  v si }| j                  |   j                         D ].  }t         j                  j                  j                  |      ||<   0 t        |      dkD  r5t        | j                  |   j                  |      | j                  |   _        | j                  |   j                           |S )Nr   r   )r;   r   r   r  rh   sortedr   rx   r   r   r
  r   r7   r   r   )rm   r   symr  pss        r\   r  zSIMDKernel.codegen_indexing   s    ww44T4??;LM$++5 	5Cd+++  "//4EEG TB'(ww'7'7'O'OPR'SL$T|$q(6@--c277$7D))#.3 %%c*224	5 r^   c                    t        d      )NzNYI: codegen_nan_checkr<  rp   s    r\   codegen_nan_checkzSIMDKernel.codegen_nan_check1  s    !":;;r^   c                    t         j                  j                  }t        | j                  j
                        D ]  }|j                  |        y rV   )r;   r   wrapper_coder   r   workspace_argsgenerate_workspace_deallocation)rm   wrapperwss      r\   deallocate_workspacesz SIMDKernel.deallocate_workspaces4  s=    ''&&499334 	8B33B7	8r^   c                    t        d      )NzNYI: call_kernelr<  )rm   rf   r   deallocate_wss       r\   call_kernelzSIMDKernel.call_kernel9  s     ""455r^   c              #     K   | j                   }| j                  }|rt        j                  ||      }t	        j
                  |      }|| _         || _        	 | || _         || _        y# || _         || _        w xY ww)z:Context manager to add an additional mask to tl.load/storeN)
_load_mask_load_otherr9   logical_andr:   _unwrap)rm   rg  r   r}  	prior_vals        r\   
mask_loadszSIMDKernel.mask_loads>  sy      $$	??4/D!!$' 	)J#DO(D $DO(Ds   AA=A* A=*A::A=c                (   | j                   j                         D ci c]  \  }}||j                   }}}t        ||      }i }| j                  D ]7  }t        |j                        }t        ||di      t        ||di      z
  ||<   9 |S c c}}w )a\  
        This gets the stride of the index for each of the tiling variables
        (technically, it does it at index 0)

        For example, if
        xindex = x0 + 512*x1 + 1024*r0
        x0 = (xindex//512)
        x1 = (xindex % 512)
        r0 = rindex // 1024

        this function would return
        {xindex: 512, rindex: 1024}
        r<   r   )r   ru   r   r7   r  r5   rf   )	rm   r   kvindex_to_tile_indexesindex_in_tile_varsstrides
range_treer   s	            r\   get_strides_of_loadzSIMDKernel.get_strides_of_loadP  s     8<7L7L7R7R7T Utq!AFF U U'/DE** 	J":??3A#$6A?*"QFC GAJ	
  !Vs   Bc                \    t        |t              rt        t        | |            S  | |      S rV   )r   tupler  )r  r   s     r\   _map_tuple_or_scalarzSIMDKernel._map_tuple_or_scalarh  s'    eU#R((%yr^   c                    t        j                  | j                  j                        D cg c]  }|j	                          }}t        t        d |            S c c}w rV   )rG   
only_nodesr  r	  estimate_flopsrT  filter)rm   r   flopss      r\   r7  zSIMDKernel.estimate_flopsn  sX     +55dmm6Q6QR
 !
 
 6$&''	
s   Ac           	        g }t        t        | j                  j                  j	                                     }| j                  j                         \  }}}}| j                  j                         }t        j                  j                  j                  t        | j                  j	                                     }t        |      D ]2  \  }}||vr|j                  d       t        j                  j!                  |      }	t        j                  j                  j                  |	      }
|
|kD  rwt#        t$                  }d}||   D ]M  }t'        |t(        t*        f      r|j-                  d|        |dz  }3|j-                  |j.                         O t        |      |z  }n|
}t        j                  j1                  |      }t3        |      }|j                  ||z  dt5        ||k        z   z         5 t7        |      S )a+  
        Try the best to estimate the total size (in bytes) of the
        kernel's inputs and outputs, which is used for estimating the memory
        throughput of this kernel. This information is used for checking how
        far we are from the peak memory bandwidth. It's important that
        we want to avoid overestimating the sizes of the inputs and outputs,
        because it can wrongfully give us a very large memory traffic value,
        which may be even larger than the theoretical bandwidth and thus
        become very misleading. This is particularly problematic for cases
        where we slice some inputs. In those cases, we should only count
        the size of the "slices" instead of the original inputs, because
        only the slices contribute to the real memory traffic.
        r   no_index_dep_r<   )r   r8   r   inplace_buffersr   python_argdefsr  buf_accessesr;   r   r   r   r6   r"  rh  r   	get_numelr   r   r   r!   r"   r   r   	get_dtyper1   r   rT  )rm   nbytesninplace_argsr  	call_argsr>  	out_numelrJ  r   	arg_numelbuf_sizery  no_index_dep_countdepri   rW  
dtype_sizes                    r\   estimate_kernel_num_bytesz$SIMDKernel.estimate_kernel_num_bytesu  s    F499#<#<#C#C#EFG!YY5579a}}113 GG$$66$++,,./
	  	* 	MFAs ,&a ))#.Iww''99)DH)# %S/+%&"', /C!#'9:m4F3G$HI*a/*CII./ Gy0 GG%%c*E'.JMM%*,CM8I4J0JKL;	M< 6{r^   c           	     &   t        | j                  j                        dk(  rEt        | j                  j                        dk(  r#t        | j                  j                        dk(  ry| j                  j                         \  }}}}d}|D ]F  }t        j                  j                  |      }|s&|j                         }	t        |	j                        dk(  sOt        |	j                  D 
cg c]
  }
|
dk(  s	|
 c}
      dk(  r|t        j                  |	j                        }||}||k7  st        d| dd| d	| z         }t        j!                  |       |D cg c]m  }t        j                  j                  |      rJt        j                  t        j                  j#                  |      j                         j                        ndo }}|D cg c]Z  }t        j                  j                  |      r7t        j                  j#                  |      j                         j                  nd\ }}|D cg c]@  }|t        j                  j$                  v rd
n|t        j                  j&                  v rdndB }}|D 
cg c]  }
|
j(                   }}
t        d| d| d| d| d| dz         }t        j!                  |        y t+        d| d      }t        j!                  |       yc c}
w c c}w c c}w c c}w c c}
w )zr
        Print message if the kernel have mixed layout inputs.
        Only care about 4D tensor for now.
        r<   r   Nr  r   zExpected stride order z, but found stride orderr  z for kernel 
GraphInputIntermediateBufferz  param names z
  buf names z
  strides z	
  sizes z
  sources 
z%All the inputs for the triton kernel z have uniform layout)r   r   input_buffersoutput_buffersr<  r=  r;   r   try_get_buffer
get_layoutr  r   get_stride_orderstrider+   logwarning
get_buffergraph_inputsname_to_bufferrf   r)   )rm   r  argdefsrC  
_signaturer  uniform_stride_orderarg_namebuflayoutrR   stride_ordermsgrf   stride_order_list	size_listsource_listargdef_namess                     r\   warn_mix_layoutzSIMDKernel.warn_mix_layout  s    		''(A-DII,,-2DII--.!3
 ,0II,D,D,F)J#! 0	H''((2C^^%F6;;1$6;;9a!q&9:a?!226==A'/+7()\9%01E0FF^_l^<}EFC KK$ %.) ! 7711$7 ++GG..t4??AHH "	")% ) %.	! ! 7711$7 **40;;=BB!"!I ! %.# !	  177#7#77 %  177#9#99 2!	"#K # 5<#<qAFF#<L#<%(nYK|\m[no&ykk]"MNC KK$a0	b 3K=@TU
 	C[ :)!# $=s'   -
K:
8K:
"A2K?AL?AL	
Lc                   t        j                  ||d|      }d| _        t        j                  | j                  j
                  |      }t        j                  ||      }d| _        t        j                  ||      }t        j                  ||      }t        j                  ||d|      }t        j                  |||f      S )NrT  FT)r9   	reductionr%  
index_exprr  r  truedivsubmulr:   r'  )	rm   rW  r   sum_r  meandxdx2m2s	            r\   welford_reduce_fallbackz"SIMDKernel.welford_reduce_fallback  s    }}UE5%8 % = =uE{{4( $WWUD!ggb"o]]5%4!!4V"455r^   c                    t        j                  ||d|      }t        j                  ||      }t        j                  |      }t        j                  ||d|      }t	        j
                  ||f      S )NmaxrT  )r9   rh  rk  expr:   r'  )rm   rW  r   vmaxrk  ru  vsums          r\    prepare_softmax_twopass_fallbackz+SIMDKernel.prepare_softmax_twopass_fallback  s\    }}UE5%8ggeT"ggcl}}UE5#6!!4,//r^   c                    t         rV   r<  rp   s    r\   codegen_kernelzSIMDKernel.codegen_kernel  rY  r^   c                     y rV   r   rp   s    r\   r  zSIMDKernel.codegen_body      r^   c                     y rV   r   )rm   r  s     r\   r   z)SIMDKernel.codegen_iteration_ranges_entry  r|  r^   )NNNNF)r  dict[str, sympy.Expr]r  rH   r   r   r8  bool | Noner9  r  r(  dict[str, sympy.Expr] | Noner+  r   r~   r   r   )r~   	list[Any])rJ  r   r~   rx   r   )rW  torch.dtyper~   rx   )r~   r  r   )r   r   r%  r   rq   r   r"  r~  r-  r   r~   list[IterationRangesRoot])r   zdict[str, str]r~   r   )ry  Sequence[sympy.Expr]r~   r   )rf   rx   r   r{   r   r>   r~   r   )r~   rz   )r~   z	list[str])r   r{   r~   r{   )r   r{   r  r}   r~   r{   )r~   z'contextlib.AbstractContextManager[None])r   r{   r~   ry   )r  Iterable[sympy.Expr]r   Sequence[Sequence[sympy.Expr]]r~   zStuple[list[list[sympy.Expr]], list[list[Callable[[list[sympy.Expr]], sympy.Expr]]]])r  r  r   r  r  r{   r~   r  )r  r  r   r  r  r{   r~   r   )r   r  r~   list[list[sympy.Expr]])r  r  r   r  r~   r  )r   r{   r~   r   )r   r{   r~   rx   )r~   r  )r   r{   r~   r{   r   )NT)rf   rx   r   zIRNode | Noner!  r   r~   r   )rg  zstr | OpsWrapperr   int | floatr~   zIterator[str])r   r{   r~   rz   )r~   r   )r  r   )Fr   r   r   r   pexprr  r  r  re   rE  rG  rK  rP  r   r/   rU  rX  r\  r^  r,  ru  r5  rz  r~  r&  r)  rh   r  r  r  r  r  r  r  r  r  r  staticmethodr  classmethodr   r   r   r  r  r  r  r  r  r   r  r  r  r  r  r"  r  r  r)  r1  r4  r7  rJ  rf  rr  rx  rz  r  r   r   r   s   @r\   r|   r|     s    */E&.&&!OT! ,0596:6:$)AD%AD %AD )	AD
 (3AD )4AD 4AD "AD 
ADF" 
"$
%
 J  J"2 H H5(5 5 	5
 &5 5 
#5n-*
R'
	'
$>>':>	>':	(0
 T1$T1/MT1
T1 T1l 
 ',ggkk	$ 0 $	
 
( & 
 ',ggkk	$ 0 $	
 
  V5 V	 VD O$O 0O
 
 O O84
,7$1$1 
$1L

"<8 LP66,6DH6	6
 ) )"0  
(@DFP
60"r^   r|   c                  $   e Zd ZU dZeZded<   d Zd ZeZ	eZ
d Zd Zd Zd	 Z	 d/	 	 	 d0dZd Z	 d1	 	 	 d2dZ	 	 d3dZe	 	 	 	 	 	 d4d       Z	 d5	 	 	 	 	 	 	 d6dZd7dZd8dZ	 	 	 	 d9dZd ZdddZ	 	 	 	 d:dZd;dZ	 	 	 	 	 	 d<dZdd
d	 	 	 d=dZd Z	 d5	 	 	 	 	 	 	 	 	 	 	 d>dZ d Z!e" e#jH                  d       d?d!              Z%e"	 	 	 	 	 	 d@d"       Z&e"	 	 	 	 	 	 dAd#       Z'e"	 	 	 	 	 	 	 	 dBd$       Z(e"	 	 dCd%       Z)e"	 	 	 	 	 	 	 	 	 	 dDd&       Z*e"	 	 	 	 	 	 	 	 dEd'       Z+e"	 	 	 	 	 	 	 	 dFd(       Z,e"e-j\                  j^                  d
f	 	 	 dGd)       Z0e"e-j\                  j^                  d
f	 	 	 dHd*       Z1d+ Z2dId,Z3	 dJ	 dKd-Z4d. Z5y
)LSIMDSchedulingzo
    Single Instruction Multiple Data parent class used for fusion across
    multiple different backends.
    z	type[Any]kernel_typec                &    t        d |D              S )Nc              3     K   | ]6  }t         j                  j                  j                  t	        |             8 y wrV   r  r   s     r\   r   z*SIMDScheduling.group_fn.<locals>.<genexpr>%  s*     PQQWW%%..}Q/?@Ps   <>r3  r  s     r\   group_fnzSIMDScheduling.group_fn$  s    P%PPPr^   c                X	   t        |t        j                        st        |t        j                        r t        j                  j                  ||      S |j                  \  }\  }|j                  \  }\  t        ||      }|j                         r)|j                         s|j                         rA |d       n8|j                         r(|j                         s|j                         r |d       |j                         r|j                         r|k(  xr k(  }|sddlm	} |j                  ||      }|s |d|       |r|j                         s|j                         ra|j                         s||}}| j                  |j                         |      t        fd|j                         D              s	 |d       y|S |j                         s|j                         s|k(  rk(  s|j                         s |d|       y|j                         D ]`  }|j                         r nN|j                         |j!                         z  s7|j                  \  }\  }	}
||	k(  r|
k(  rT |d	||	|
        y ||fD ]  }|j                         s y
 | j                  |j                         |      }| j                  |j                         |      }| j                  |j                         |j                         z   |      }t"        j$                  j&                  rVd
}t)        |      dkD  r%t)        |      dkD  r||cxk(  xr |k(  nc }n||k(  }nt)        |      dkD  r||k(  }|s |d|||       yy
|j                         s|j                         rɉdk(  rdk7  sJ |z  k(  rt        fd|j                         D              s	 |d       yt"        j$                  j*                  r\|j                         sLt-        | j                  |j                         |      j/                               |dfdffv }|s |d       |S y
|k7  r |d       |k(  S |j                         r|j                         rJ | j1                  ||      S )z
        Hook called by Scheduler to determine if the Triton backend
        can fuse node1 and node2.  These nodes might already be
        FusedSchedulerNodes.
        z&Split scan cannot fuse with reductionsr   )MixOrderReductionz1numel/rnumel mismatch (reduce) (%s, %s), (%s, %s)c              3     K   | ]8  }t         j                  j                         |j                                 : yw)r  N)r|   r  r   
get_ranges)r   n2rnumel1r  s     r\   r   z*SIMDScheduling.can_fuse.<locals>.<genexpr>_  s>        ,,' - s   >Az/invalid loop order and tiling for native matmulFz5numel/rnumel mismatch (non-reduce) (%s, %s), (%s, %s)z:numel/rnumel mismatch prologue mismatch (%s, %s), (%s, %s)Tr   ztiling mismatch (%s, %s, %s)r<   c              3  j   K   | ]*  }t         j                  f|j                                , y wrV   )r|   r  r  )r   r   numel2rnumel2s     r\   r   z*SIMDScheduling.can_fuse.<locals>.<genexpr>  s1       ,,fg->Os   03z"nodes numel/rnumel incompatibilityzinvalid tiling for reductionznodes numel incompatibility)r   r   ForeachKernelSchedulerNodecan_fuser  r.   is_split_scanrq   torch._inductor.schedulerr  r/  select_tiling	get_nodesr   is_templateused_buffer_namesget_buffer_namesr   rY    tiling_prevents_pointwise_fusionr    tiling_prevents_reduction_fusionr3  r   can_fuse_horizontal)rm   node1node2r  numel1whyreduction_can_fuser  r   	pro_numel
pro_rnumelr   tiling1tiling2tiling3condis_reduction_tiling_validr  r  r  r  s                    @@@@r\   r  zSIMDScheduling.can_fuse'  s    eYAABj977G
 77@@NN${{FG${{FGu% )<)<)>!!#<=  "5+>+>+@!!#<=E$6$6$8!'6!1!Hg6H%G%6%?%?u%M"%G "&&(E,B,B,D --/#(%5E ++EOO,=vwO  $oo/	  IJ %%!!#E,>,>,@f$G);((*O ! !& 1 )++-!  $557%:P:P:RR$59ZZ22Iz &) 3:8M \ & ) ' * $)#)& U^  ==? 
 (():FGLG(():FGLG((!EOO$55vwG }}==w<!#7|a'&'<W<&'1\A%"g-D6	 !!!#(:(:(<a<GqL00')) "__.  <= MMBB!--/05**5??+<fELLN1  !,1- 5:;4412V##!!#E,>,>,@@@''u55r^   c           
     z   g t        t        j                            t               t               d fd}fd}fd}fd}t        j                  fd       }fd}	|D ]  }
|
v rj                  |
        ||
      r? |	|
      r |       5  	 d d d        r ||
      sxs t              nd  ||
       ` ||
      r" |       5  j                  |
       d d d        t        d d d	|
j                  d
           S # 1 sw Y   |xY w# 1 sw Y   xY w)Nc                b    | j                   \  }\  }}|k(  xr |k(  xs |z  k(  xr |dk(  S Nr<   r  r   r  
node_numelnode_rnumelri   r  s       r\   fits_in_main_bodyz@SIMDScheduling.generate_node_schedule.<locals>.fits_in_main_body  sH    +,77(A(
K%'AK6,A efn,A1Ar^   c                N    | j                   \  }\  }}|k(  xr |dk(  xr dk7  S r  r  r  s       r\   fits_outside_reductionzESIMDScheduling.generate_node_schedule.<locals>.fits_outside_reduction  s4    +,77(A(
K&K;!+;K!Kr^   c                \    | j                   j                  D ]  }|j                  v s y yr  )read_writesreadsrf   )r   readcurrent_loop_buffer_usages     r\   expect_improved_memory_usagezKSIMDScheduling.generate_node_schedule.<locals>.expect_improved_memory_usage  s1    ++  99 99  r^   c                   j                  |        j                  |        j                  | j                  j                  D cg c]  }|j
                   c}       | j                         rt        | t        j                        rrt        | j                  t        j                        rNt        | j                  j                  t        j                        s j                  | j                                y j                  | j                  j                   D cg c]  }|j
                   c}       y c c}w c c}w rV   )r   r   updater  r  rf   rq   r   r   r1  r   r   r2  dataScanget_namewrites)r   rR   r  doner	  not_ready_yet_nodess     r\   schedule_node_in_loopzDSIMDScheduling.generate_node_schedule.<locals>.schedule_node_in_loop  s    HHQK  #%,,amm>Q>Q-Raff-RS
  q)"9"9:qvvr'8'89"166;;8#''

5)00!--BVBV1WQ!&&1WX .S 2Xs   D; E c               3  L  K   rd   t         u rj                          nj                  t               r1j	                  t               j	                  dz   t                d d  j                  t                j                           j                          y w)Nr  r<   )rE   popr   rD   insertclear)r  maybe_split_indexr	  r  s   r\   end_current_reduction_loopzISIMDScheduling.generate_node_schedule.<locals>.end_current_reduction_loop  s      r!2o!E!!#$$%56 $$%68HI$$%6%:OL$(!  1%%'%++-s   B!B$c                    dk(  ry| j                   z  sy|rt        |d   t        t        f      rJ t	              S )Nr<   Fr  )	ancestorsr   rE   rD   r   )r   r	  r  r  s     r\   #requires_closing_previous_reductionzRSIMDScheduling.generate_node_schedule.<locals>.requires_closing_previous_reduction  sN    {&7 b!O5E#F*   +,,r^   zunexpected group: (r   z) != r<   )
r   r   r,   r  r  r   r   r   r=  r  )rm   r   ri   r  r  r  r  r  r  r  r   r  r  r  r	  r  s     ``       @@@@@r\   generate_node_schedulez%SIMDScheduling.generate_node_schedule  sW   #%)5568 0:|5?\!(,		L		Y" 
	"	"	. 
#	.	-  	Dt|HHTN &6t]K35  -5QRV5W(9(OS=O% )-%%d+'-/1 /!((./ / *)%6(%

1O -	4 ' / /s   (D%&D1%D.	1D:	c                    |j                   |j                  }}|j                  |j                         z  s|j                  |j                         z  rJ | j	                  ||       y rV   )r  r  r  get_operation_names_codegen_mix_order_reduction)rm   r   r  r  s       r\   codegen_mix_order_reductionz*SIMDScheduling.codegen_mix_order_reduction,  s[    zz4::u OOe&?&?&AAOOe7799	
 
 	))%7r^   c                    |j                         }g }g }|D ]5  }|j                         r|j                  |       %|j                  |       7 ||fS rV   )r  rq   r   )rm   r   r   
reductions	epiloguess        r\   #_split_mix_order_reduction_epiloguez2SIMDScheduling._split_mix_order_reduction_epilogue6  s\     
	 	'D  "!!$'  &		'
 9$$r^   c           	        |j                   |j                  }}|j                  }| j                  |||dg|dddd      d   }|j                  sJ |j
                  sJ ||_        | j                  ||       |j                  j                  t        |j                        |j                  d   z  |j                  d   |j                  z   dz
  |j                  z  z  d	t        j                  
      \  }}	}
|
dk(  s
J d|
       |5  |j                          ddd       t!        j"                         }t%        j&                  |      5  |5  |r%|j)                  t+        j,                  d             |j/                         }ddd       ddd       |r)j1                  t3        t4        j6                        d      }||	fS # 1 sw Y   xY w# 1 sw Y   MxY w# 1 sw Y   QxY w)z
        for_benchmark:
            True if the generated code is for benchmarking. We need make
            sure benchmark harness code is generated.
        )rR   rS   NT)r  r(  r+  r8  r   rS   rR   r<   F)rW  zws_off=)benchmark_kerneltriton_)ri   r  r	  create_kernel_choicesr*  r+  r6  !codegen_node_schedule_with_kernelr   	workspacer   r7  r"  rW   r   r  r  	ExitStackr;   set_kernel_handlerenter_contextr   patchrz  replacerx   r3   KERNEL_NAME)rm   kernel_features
split_sizefor_benchmarkri   r  r	  rk   r  ws_namews_offstacksrc_codes                r\   -_generate_kernel_code_for_mix_order_reductionz<SIMDScheduling._generate_kernel_code_for_mix_order_reductionB  s    (--/N/Nv'55++()+!%'+15		
 	 ****))))'..}fE $[[22//0mmE"#c"V%7%77!;@R@RRT ++ 3 
7F {(wviL({ 	"!	" $$&!!&) 	/5 	/##FLL$$GH,,.H	/ 	/
 
  ''K,C,C(DiPHw((	" 	"	/ 	/ 	/ 	/s0   
GG8GGGG	GG(Nc                    t         rV   r<  )rm   modn_spills_threshold
node_namess       r\   benchmark_codegened_modulez)SIMDScheduling.benchmark_codegened_modulew  s
     "!r^   c                   #$ t         j                  j                        \  $}$fd} |       }t        xj                  dz  c_         j                  |      \  }}g }|D ]C  }	|	j                          |	j                         }
|
j                          |j                  |
       E  j                  j                         |z   $|      }t        |$|      #t        j                  j                  j                   sqt        j"                  j$                  Wt        j"                  j&                  s t        j(                  st        j*                  r# fd}t-        j.                  ||d      } j1                  #|d      \  }}}t3        |d   j4                  j6                        }i }|rZ|D ]  }	|	j9                         d   j4                  j;                         }|	j9                         d   j<                  d   j4                  j9                         d   j4                  j;                         }|||<    j                   sJ  j                   j>                  jA                  |	j9                         d   j<                  d   j4                  j;                                tB        jD                  jF                  jA                  |        |jH                  D ]-  }|jK                  |jL                  |jL                        |_&        /  jO                  |||      }||_(        tS        |      |_)        tC        jT                  |      5  #jW                         D ]@  }|j9                         d   j4                  j;                         |vs1|jY                          B 	 d d d        tB        jD                  jZ                  j]                  d        j_                  |d        |ja                  |jP                  d	       tB        jD                  xjF                  |jF                  z  c_#        tB        jD                  xjb                  |jb                  z  c_1        te        |      te        |jH                        k(  sJ tB        jD                  jZ                  jg                  $|z   dz
  |z        }ti        |jH                        D ]Q  \  }}|jL                  }d
| d| d}| d| }d
| d| }ddd}|jK                  |jj                  |jj                        }tB        jD                  jm                  |      }|d uxr& te        |jo                         jp                        dkD  } | d| d| d| d| d| d| d|  d}!tB        jD                  js                  |      x}"t        jt                  k7  r	|!d|" dz  }!tB        jD                  jZ                  jw                  |!       tB        jD                  jZ                  jx                  jA                  |       T |j{                          |r j}                  |        j                          y # 1 sw Y   xY w)Nc                 r   t         j                  j                  t         j                  j                  S t        j                  j                               } | j                  }|dz  }t        j                  j                  j                        }t        t        ||z        d      }t        |d      }|S )N         )r   rY   mix_order_reduction_split_sizer(   create
get_devicemulti_processor_countr;   r   r   r   rt  r*   min)device_propnum_smestimated_num_splits
numel_hintr  r  ri   s        r\   _pick_split_sizezESIMDScheduling._codegen_mix_order_reduction.<locals>._pick_split_size  s    }};;G}}CCC +11%2B2B2DEK 66F#)A: 
 ));;EBJ_Z;O-OPRTUJZ-Jr^   r<   c                    j                  | d      \  }}}t        j                  |      }j                  |      \  }}|S )NTr  r  )r  r   loadr  )candidate_split_sizer  r  r  msr  rm   s        r\   _benchz;SIMDScheduling._codegen_mix_order_reduction.<locals>._bench  sS    !%!S!S#3"& "T "1h
 "&&x077<A	r^   r  Fr	  r   z!# Call mix order reduction kernel)r!  (z) * (r   z * z + 1) * aminamax)r  rt  z = r  z : z].view(r   z).z(dim=0, keepdim=z.to()@r   r  get_numel_rnumelr
   r  r  cancel_reduction_splitextract_pw_from_reductionswap_pw_red_dimensionr   r  r  rH   rW   rX   r   deterministicrY   r  'mix_order_reduction_autotune_split_sizemax_autotunecoordinate_descent_tuningr'   autotune_single_fieldr  r   r   _split_sizeget_outputsr  usersremoved_opsr   r;   r   removed_buffersr7  r   r  define_kernelr  r   r  scheduler_nodesmark_runr  make_commentcodegen_commentr"  inplaced_to_remover   codegen_python_sizevarrh  r  rW  rR  r  r@  r   	writeline	allocatedr  _codegen_nodesfree_buffers_in_scheduler)%rm   r  r  r  r  r  node2_reductionsnode2_epilogueconverted_nodessubnode	convertedr	  r  rk   r  r  is_split_reductionrenamebufnameusernamepartial_accumr  r   nsplitrj  r  
stride_strrM  endreduction_type2opopnamebufferkeepdimfinal_reducebuffer_dtyper  ri   s%   ``                                 @@r\   r  z+SIMDScheduling._codegen_mix_order_reduction|  s   !33DDUKv	$ &'
 	++q0+ ,0+S+S,
(. ' 	.G**,99;I++-""9-		.
 33OO/
 -]E6J &&44<<DEE&&33 '<<J %)$V$V! %W %
! ""21"5":":"F"FG+ 5!--/277@@B'')!,U1T++-+ T((*	  #+w~~%~**..'')!,2215::CCE ''++G45 "(!@!@ ,2JJ!--}/H/H-)
 ((=&I($X.!!&) 	$'779 $ ##%a(--668FMMO$	$ 	
))*MN]D16--UC	6#9#99	""f&?&??" ?#s6+J+J'KKKK%%<<Z!#
2
 #,F,K,K"L 	<C'33KVHE&3Je3zl+EcU(:,/C! '**,,m.J.JF WW''4FD(NS1B1B1D1I1I-JQ-NG)]#gYawc#gfXUWX^W__abhaiiy  {B  zC  CD  EL !" 1 1+ >>5;;N$|nA 66GG  **<8 GG  **..{;9	<< 	$$&/&&(o	$ 	$s   	AYYYc                b   | j                   sJ |D cg c]+  }|j                         | j                   j                  vs*|- }}|sy t        |d       j                  \  }\  }}| j                  |||      }t        j                  d|       | j                  t        ||||            S c c}w )Nc                4    t        | j                               S rV   r   rq   rR   s    r\   r   z/SIMDScheduling._codegen_nodes.<locals>.<lambda>(  s    c!..:J6K r^   r   zSchedule:
 %s)
r   r  r  rt  r  r  schedule_logdebugcodegen_node_schedulerH   )rm   r   coalesce_analysisr   r  ri   r  r	  s           r\   r(  zSIMDScheduling._codegen_nodes  s    
 ~~~"
dmmoT^^=W=W&WD
 
  ,KLRR?E633E5&I+];))}eV=NO
 	

s
   +B,B,c                   | j                   sJ |j                         D cg c]*  }|j                         | j                   j                  vr|, }}t	        |      dk(  ryt
        j                  j                  j                  j                  r_t	        |      t	        j                               k7  r.| j                   sJ t        j                  | j                   |      }t        |      }nd}| j                  ||      S c c}w )zK
        Given a set of pre-fused nodes, generate a Triton kernel.
        r   N)r   r  r  r  r   rW   rX   r   rY   coalesce_tiling_analysisFusedSchedulerNoder   r(  )rm   r   r   rD  s       r\   codegen_nodezSIMDScheduling.codegen_node1  s     ~~~ (
}}dnn&@&@@ 
 

 u:???!!((AA5zS!122~~%~ 33DNNEJ 9$ ? $""5*;<<!
s   /Dc                   t        j                  t         j                        j                  }t	        |       sy|D cg c]0  }|j                         r|j                         j                         2 }}|D ]}  }|j                         rt        |t        j                        s/|j                         }||D cg c]0  }|j                         r|j                         j                         2 c}z  } t        d |D              syt        j                  j                  j!                  | |       |D ],  }t        j                  j                  j!                  ||       . yc c}w c c}w )NFc              3  2   K   | ]  }t        |        y wrV   )r0   )r   r  s     r\   r   z8SIMDScheduling.can_use_32bit_indexing.<locals>.<genexpr>e  s     FD)$/FrS  T)rW   iinfoint32rt  r0   has_tensor_outputrR  storage_sizer   r   MutationOutputget_mutation_buffersr   r;   r   r   	check_leq)ri   buffersint_maxr^  	buf_sizesmutated_bufsr  s          r\   can_use_32bit_indexingz%SIMDScheduling.can_use_32bit_indexingJ  s=   
 ++ekk*..%e, 
$$& NN))+
	 
  	C((*z#r?P?P/Q"779+,,. NN$113 		 FIFF 	
""5'2 	6DGG&&tW5	6/
s   5E&;5E+Fc                   | j                  ||       |sIt        j                  |      5  t        j                  |      D ]  }|j                           	 ddd       t        j                  xj                  |j                  z  c_        t        j                  xj                  |j                  z  c_        y# 1 sw Y   dxY w)ze
        Process a kernel by generating code for its node schedule and updating graph state.
        N)	r  r;   r  rG   r6  r!  r   r  r$  )rm   rk   r	  only_gen_src_coder   s        r\   process_kernelzSIMDScheduling.process_kernelo  s     	..}fE %%f- $.99-H $DMMO$$ 	
6#9#99	""f&?&??"	$ $s   +B99Cc                    i }|D ]N  }t        |t              s|j                  !|j                  j                         }|s>|j	                  |       P |S )zCollect and merge config_patches from all operations in the node schedule.

        This enables scoped config (e.g., coordinate_descent_tuning) for kernels
        that contain decomposition operations.
        )r   r,   r   get_config_patchesr  )rm   r	  merged_patchesr   patchess        r\   _collect_config_patchesz&SIMDScheduling._collect_config_patches  sW     *,! 	3D$ 12tyy7L))668"))'2		3
 r^   c                   |j                   }| j                  ||j                  |j                  |j                        \  }}| j                  ||g||d      }|D ]  }| j                  ||        t        j                  |       | j                  |      }|D ]  }t        j                  |      5  t        j                  di |5  |j                         }ddd       ddd       | j                  ||      }	t         j#                  d|	       |	|_        t'        |      |_         ~t)        |      dkD  rt        |      }
n|\  }
t        j                  |
      5  |j+                         D ]  }|j-                           	 ddd       |D cg c]  }t/        |t0              s| }}| j3                  ||
j$                         t        j4                  j6                  r\t        j8                  j:                  j=                          t        j8                  j:                  j?                  |
j$                  |       |
jA                  |
j$                         t        j4                  j6                  r(t        j8                  j:                  jC                          t        jD                  r|
jG                          t        jH                  r|
jI                  |d   j$                         t        j8                  xjJ                  |
jJ                  z  c_%        t        j8                  xjL                  |
jL                  z  c_&        t        j8                  j:                  jN                  rt        jP                  r|d   jR                  jU                         }|j+                         D ]  }|jW                         }||vr|jX                  J |jX                  j[                         }|Ct\        d   dxx   dz  cc<   t        j8                  j:                  j_                  d|j`                  d	| d
        | jc                          y# 1 sw Y   sxY w# 1 sw Y   xxY w# 1 sw Y   xY wc c}w )z<
        Generate code for nodes in kernel_features
        )r  r(  Nz+Generating kernel code with kernel_name: %sr<   r   inductorintermediate_hookszrun_intermediate_hooks(r   r   r   )2r	  get_tiling_and_scoresri   r  rD  r  r  rB   merge_workspaces_inplacer^  r;   r  r   r  rz  r  rU  rB  r  r   r   r   r!  r   r,   r#  cppenable_kernel_profiler   r   write_kernel_context_guard_beginwrite_kernel_context_guardr"  write_kernel_context_guard_endnan_assertsr  rf  r  r$  supports_intermediate_hooksgenerate_intermediate_hooksr   live_output_buffersr  r   get_origin_noder   r&  rf   r)  )rm   r  r	  r  tiling_scorekernelsrk   config_patchesr  r  final_kernelr   base_scheduler_nodes	live_outsrf   origin_nodes                   r\   rC  z$SIMDScheduling.codegen_node_schedule  s    (55#99!!++--	 
 ,,H(<H

  	JF22=&I	J,,W5 55mD 	3F%%f- 3v||/Mn/M 3!0023 3,,X}fMKIIC[Q!,F(2F	3  w<!&w/L%O\!!,/ 	 '779   	  + 
j?P.QD 
  
 	1<3K3KL::++GG  AACGG  ;;(($ 	  !9!9:::++GG  ??A**,!!(()?)?@	<#?#??	""l&E&EE" GG  <<22  
;;=I'779 
}}y(yy,,,"ii779*Z()=>!C>GG((221+2B2B1ERvQO
 	&&(y3 3 3 3	  	 
 
s<   2QQ
Q&Q$Q1(Q1
QQQ!	$Q.c                (     | j                   |i |gS rV   )r  )rm   r  kernel_argskernel_kwargss       r\   r  z$SIMDScheduling.create_kernel_choices  s)     D
 	
r^   c           	     <   |5  t        j                         }i }|D ]  }|t        u r |j                  |j	                                +|t
        u r|j                          D|j                          |j                  |j                               }|j                  t        j                  |j                  j                  |      j                                       |j!                  |j#                                |D ]  }|t        u r |j                  |j	                                +|t
        u r|j                          Dt%        |j                         |j                  |j                               }|j'                  |        	 d d d        y # 1 sw Y   y xY wrV   )r  r  rD   r  r  rE   closedecide_inplace_updater  r  r  r
  fromkeys_bodyindexing_from_argsr   rz  keysr&   r   )rm   r	  rk   r  all_indexingr   r   s          r\   r  z0SIMDScheduling.codegen_node_schedule_with_kernel  sS    	-((*EL & ++''(@(@(BC_,KKM..0!'!<!<T__=N!OJ '' JJ99*ELLN $$\%6%6%89 & 	-++''(@(@(BC_,KKM 6djjA!'!<!<T__=N!OJLL,	--	- 	- 	-s   FFFrX  c          	     J   i }|j                         }g }	|D ]  }
|
j                         }|	j                  |
       ||z  s*t        |      dk(  sJ |	|t	        t        |            <   |j                  j                  t	        t        |                   g }	 t        |	      dk(  sJ |j                  D ](  }|j                  j                  j                  |d       * |j                  | ||||t        |      }t        j                  rH|j                         dz  }|j!                          d| d|j#                  |      j%                          }g |||}|r|S t'        |j)                         D cg c]  }t+        |       c}      }t-        j.                  |      5  |j1                          |D ]   }t+        |      |vs|j1                          " |D ]  }|j1                           	 ddd       | j3                  |||      |_        |S c c}w # 1 sw Y   (xY w)zK
        Helper method to codegen a single template kernel variant
        r<   r   Ng    eArN  )r  r  r   r   r   iterprologue_fused_inputsr   r   rO  r  rE  r   r   r  rJ  imports_for_benchmark_kernelcodegen_kernel_benchmarkgetvaluer   rG  idr;   r  r!  r  r  )rm   rk   rD  r?  r@  rA  rX  rB  template_readsprologue_groupprologuenamesbuf_namer  num_gbr	  r   unfused_setr   s                      r\   _codegen_single_templatez'SIMDScheduling._codegen_single_template  s2    &("&88:& 	$H--/E!!(+~%5zQ&@N*4U+<=,,00d5k1BC!#	$ >"a''' 44 	:HKK%%))(D9	: //&(
 ""557#=F6689*B226:CCEFH  J.I-I.IO !1M1M1O!PA"Q%!PQ!!&) 	 ""$& $d8;.MMO$ '   	  "//-P "Q	  	 s   6H$#H)HH"c                   ddl m fdg }t        |j                        |gz   D ]S  }t	        |t        t
        f      r$|j                  t        fd|D                     =|j                   |             U t        |      S )Nr   r$   c                    t        |       sy t        | t        j                        r| j                         } | j	                         x}y t        d |D              S )Nc              3      K   | ]  }|  y wrV   r   r   s     r\   r   zKSIMDScheduling._get_multikernel_shapes.<locals>.get_size.<locals>.<genexpr>i  s     )q)s   )r   r   BaseViewunwrap_viewmaybe_get_sizer3  )r   r  r%   s     r\   get_sizez8SIMDScheduling._get_multikernel_shapes.<locals>.get_sizeb  sR    c6*#r{{+oo'**,,5)D)))r^   c              3  .   K   | ]  } |        y wrV   r   )r   _argr  s     r\   r   z9SIMDScheduling._get_multikernel_shapes.<locals>.<genexpr>n  s      @D$ @s   )r   r%   r  inputsr   r3  r   )rm   r   r  r   r%   r  s       @@r\   _get_multikernel_shapesz&SIMDScheduling._get_multikernel_shapes]  st     	 	* $v- 	*C#e}-

5 @C @@A

8C=)		*
 Szr^   c                H    | j                  |      }t        d |D              S )Nc              3  @   K   | ]  }t        d  |D                yw)c              3     K   | ];  }t        |t        j                        xr t        |t        j                          = y wrV   r   r   Exprr   r   s     r\   r   zFSIMDScheduling._kernel_has_dynamic_shapes.<locals>.<genexpr>.<genexpr>v  s9       1ejj)N*Q2N.NNs   AAN)r  )r   shapes     r\   r   z<SIMDScheduling._kernel_has_dynamic_shapes.<locals>.<genexpr>u  s,      

 	   
s   )r  r  )rm   r   shapess      r\   _kernel_has_dynamic_shapesz)SIMDScheduling._kernel_has_dynamic_shapess  s.    --d3 

  
 
 	
r^   c                N    | j                  |      }t        fd|D              S )zk
        Returns cache key for hint-based multi-graph; key is tuple of shapes with hint filled in.
        c              3  F   K   | ]  }t        fd |D                yw)c              3     K   | ]<  }t        |t        j                        rt        |t        j                        sn| > y wrV   r  )r   r   hints     r\   r   zASIMDScheduling._make_shape_cache_key.<locals>.<genexpr>.<genexpr>  s@        a,Z5==5Q s   AANr  )r   r  r  s     r\   r   z7SIMDScheduling._make_shape_cache_key.<locals>.<genexpr>  s/      
    	 
r  )r  r3  )rm   r   r  r  s     ` r\   _make_shape_cache_keyz$SIMDScheduling._make_shape_cache_key}  s1     --d3 
  
 
 	
r^   rX  hint_overridec          	        |j                   \  }\  }}|dk(  sJ t        |j                  t              r|j                  j                  rt        |j                  j                        dkD  r| j                  |j                        ri }	g }
|j                  j                  j                         D ]  \  }} ||j                  |      \  }}|r;| j                  |||||d      }t        |t              sJ |
j                  |       Z|]| j                  |||||d      }|dn| j                  |j                  |      }||	|<    |rdj                  |
      S t        j                  t        |	j!                                      t#        |	      }g |||}| j%                  ||j&                         |j)                  |j&                         t*        j,                  xj.                  |j.                  z  c_        t*        j,                  xj0                  |j0                  z  c_        | j3                          y|j                  j5                  |j                  |      \  }}|r| j                  |||||d      S | j                  |||||d      }g |||}| j%                  ||j&                         |j)                  |j&                  |j                         t*        j,                  xj.                  |j.                  z  c_        t*        j,                  xj0                  |j0                  z  c_        | j3                          y)z
        Codegen a triton template with multi-kernel dispatch support

        If `only_gen_src_code=True` the src code will be returned instead of being
        codegenned into the wrapper
        r<   )r  Tr  NFz

)r  r   r   r   _make_kernel_rendersr   r  ru   r  rx   r   r  r  rB   rc  r  r   rC   r#  r  r"  r;   r   r  r$  r)  make_kernel_render)rm   r?  r@  rA  rX  r  r  _numelr  ro  	src_codes	size_hintr  rk   rD  r  shape_cache_keymulti_kernelr	  s                      r\   codegen_templatezSIMDScheduling.codegen_template  sD     ,11FF{{ }))+>?""77M&&;;<q@//0B0BCGI
 ##88>>@%6 "!3!&&m" %#<<%&&*.  =  H &h444$$X. ( !::%&&*/ ; F %, !778J8JIV $ 06GO,K%6N !{{9--00gnn6F1GH.w7LMnMmMnMM  0H0HI$$\%=%=>GG##|'C'CC#GG&&,*I*II&**,*//BB""- C NFF !44!""&* 5   66!""&+ 7  !R. Q- Q. Q$$]F4F4FG""6#5#5}7I7IJ''6+A+AA'**f.G.GG*..0r^   c                    t         j                  j                  j                  t         j                  j                  j                                y rV   )r;   r   r  r&  
device_opssynchronizerp   s    r\   codegen_synczSIMDScheduling.codegen_sync  s-    	&&qww'9'9'E'E'GHr^   c           
        ddl m} ddlm} t	        | j
                  |      sJ |D cg c]  }|j                          }	}i }
t        ||	      D ]  \  }}t        |d       j                  \  }\  }}| j                  |||      }| j                  |||      }t        |||      }|j                         xr! t        j                  j!                  |d      }t#        ||||||      |
|<    |j%                  || ||
	      }t&        j)                  d
t+        |      |D cg c]  }t+        |       c}       g }|D ]k  }t+        |      dk(  rt+        |      dk(  r|
|d      }|r|j-                  dd|f       @| j                  |j.                  |j0                        }| j3                  ||j4                  |       t        j6                  |      5  |j9                         }ddd       |j-                  ||f        || j
                  ||      }|D ]g  }|
|   }|j;                  |j.                  |j0                  | | j
                        }| j3                  |j=                  |      |j4                  |       i |j9                         }|j-                  |||f       n |S c c}w c c}w # 1 sw Y   xY w)al  
        Generate kernel code for combo kernel partitions.

        Partitions subkernel_nodes using horizontal_partition(), then generates
        kernel code for each partition. Single-node partitions are generated as
        regular kernels, while multi-node partitions use ComboKernel.

        Returns a list of (src_code, kernel, node_group) tuples.
        r<   )TritonKernel)ComboKernelc                4    t        | j                               S rV   r?  r@  s    r\   r   z;SIMDScheduling.generate_combo_kernel_code.<locals>.<lambda>	      #ann>N:O r^   r   F)r'  )r	  r  ri   r  r  r  )r   triton_schedulingcustom_algorithmnode_info_mapz1ComboKernels: %d nodes partitioned into %s groupsr   Nr  )triton_kernel_clsenable_autotunemixed_sizes)r  optimize_maskr  )rY   r  triton_combo_kernelr  
issubclassr  r  r  rt  r  r  r  rH   rq   r;   choicesr)  r  horizontal_partitionrU  rB  r   r   r  r  rY  r	  r  rz  create_triton_kernelcreate_sub_kernel)rm   subkernel_nodescustom_part_algorithmr  r  rX  r  r  r   fused_node_listsnode_schedule_mappnr   r  ri   r  r	  r  r  r  
partitionspkernel_code_list
node_group	node_infork   r  	subkernels                               r\   generate_combo_kernel_codez)SIMDScheduling.generate_combo_kernel_code	  s   " 	)4 $**L9999HIDNN,II13_.>? 	IB!$U0O!P!V!VAv 77ufMM''ufEF)-GH%%' II==E >  $ %-+!(?%b!	( !55!"2+	 6 

 			? '(SV(	

 $ .	HJ:!#:!#-jm<	$$++T4,DE "--!((!*!3!3 . F ''	 7 79J --f5 ;#)#8#8#:; %++Xvz,JK %&*&6&6$3 +
 % B 1" 5I + @ @!((!*!3!3*5o*.*:*:	 !A !I ''00;!//) "002 ''6:(FG].	H`  e J> ),; ;s   J1#J6J;;K	c                   |j                         }|j                  }|j                  }t        j                  dkD  xs t        j                  dk(  xr |}| j                  ||||      }|D ]]  \  }}}	| j                  ||g|      }
| j                  |j                  |
       t        j                  d|
       |j                  |
       _ | j                          y )Nr<   z"ComboKernels: generated kernel %s.)get_subkernel_nodesuse_custom_partition_algor  r   combo_kernel_allow_mixed_sizesr  r  r#  snodesrU  rB  r"  r)  )rm   combo_kernel_noder  r  r  r  r  r  rk   r  r  s              r\   codegen_combo_kernelz#SIMDScheduling.codegen_combo_kernell	  s    +??A 1 K K+;;;;a? 
11Q6P;P 	  ::2O[
 $4 	,Hfa,,X8I7JFSK  !2!9!9;GII:KH{+		, 	&&(r^       c           
        
 dk(  }d 
fd}|j                         \  }
t        |      dk  rt        
      dk  st        |
z         rg S |j                         \  }
 |||r|n
|j                  |            }|D cg c]?  }t	         j                  |j                  |      |j                  |j                        A }	}|	S c c}w )Nr<   c                d   t        |j                        t        |      k(  sJ d|j                  d|       |j                  |j                  g}t	        d t
        j                  j                  |      D              sJ t
        j                  j                  |      D cg c]:  }|j                  t        j                  j                  vrt        |t              r|< }}t        |j                  D cg c]  }|j                   c}      }dd}t        j!                   ||      g|       dd      g}|D ]  }t        j                  j"                  j%                  |j&                  |j                        }	t        |	      t        |      k(  sJ 	 |	j'                  d      dz   }
|
t        |      k(  rt	        d	 |	|
d
 D              r	  ||d
|
        |||
d
       f}t        j                  j"                  j+                  t-        d t/        ||	      D                    }|j                  |v r|dz  }t        j1                  |d         r|dz  }t        j1                  |d         r|dz  }t        j                  j"                  j+                  |t-        t        j                  |            z
        dk\  s|j3                  t        j!                   ||d
|
        |||
d
       g      ||j                                |S c c}w c c}w # t(        $ r Y w xY w)zX
            Compute tiling candidates by dividing up the iteration ranges.
            zrw.range_vars=z ranges=c              3  H   K   | ]  }t        |t        t        f        y wrV   )r   r    r!   )r   rH  s     r\   r   zHSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.<genexpr>	  s$       3G 45s    "c                f    t         j                  j                  j                  t	        |             S rV   r  )r  s    r\   collapse_rangeszNSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.collapse_ranges	  s"    ww''00v1FGGr^   noner   )r  rf   scorer<   c              3  &   K   | ]	  }|d k(    ywr  r   r   s     r\   r   zHSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.<genexpr>	  s     ;a16;s   Nc              3  2   K   | ]  \  }}|d k7  s|  ywr  r   )r   r  rT  s      r\   r   zHSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.<genexpr>	  s       "!-vST"s   r   r  r  rf   )r  r  r~   r{   )r   
range_varsr  r  r   r#  r  r  rf   r;   r   r  r   r    r   CandidateTilingcreate_partial_tilingr   stride_hintsr   
ValueErrorr   r6   r  is_good_sizer   )is_pointwiser  rwdep_sourcesrH  depswrite_namesr  tilingsr/  splittiled_groupsr  r  r  reduction_rangess                r\   tile_rangesz5SIMDScheduling.candidate_tilings.<locals>.tile_ranges	  s    r}}%V4S8H	&6SS4 88RYY/K $??88E    %??88E88177#:#::sI. D  %"))%D3chh%DEKH
  44(01<  G  4''**77		2==Q7|s6{222
#MM!,q0EF+ ;756?;; ! < $F6EN3#F56N3  ((::! "14VW1E" 
 88{*QJE"//Q@QJE"//Q@QJE GG$$66ioofFV.W XX 
 NN'#&#<#<$3F6EN$C$3F56N$C!" !0$ #(!$
Q4l N[ &E: " s$   $?L8L"L"=L""	L/.L/r  )r  r   r~   list[CandidateTiling])	r  r   r   "pointwise_or_reduction_read_writesr  complete_partial_tilingr  r  rf   )r  r   ri   r  r  r  pointwise_rangespartial_tilingsr  full_tilingsr  s   `  `      @r\   candidate_tilingsz SIMDScheduling.candidate_tilings	  s     '!+\	| .2__->** !Q&$%*$%58H%HII .2__->**% ,2B33LA
 *	
  22MM5/ ll[[	
 	
 	
s   ACc           	         d}d}t        |      t        |      k  sJ t        |      t        |      k  sJ t        g t        |t        |       d |d      t        ||d            S )zK
        Create a tiling dict from pointwise and reduction splits.
        )rP   rQ   rR   )rS   rT   NF)strict)r   r   r  )r  	pw_tilingreduction_tilingpw_prefixesreduction_prefixess        r\   create_tilingzSIMDScheduling.create_tiling
  s     &+9~[!1111#$,>(????[#i.!23YuM')9%H
 	
r^   c                >    | j                  |r|ng |s|      S g       S rV   )r  )r  r  r  s      r\   r  z$SIMDScheduling.create_partial_tiling
  s0       "F&F
 	
,.
 	
r^   c                    t        |j                               }d|v }||z  }|t        |      z  g}|r||fn||f} | j                  | S )zb
        Given a tiling for only pointwise or reduction dimensions, adds the missing one.
        rR   )r  r   r6   r  )	r  r  ri   r  splitsr  total_numelmissing_tilingtiling_argss	            r\   r  z&SIMDScheduling.complete_partial_tiling
  sf     fmmo&f}o-%f(==> )5V^$>6:R 	 !s  +..r^   c                    	 	 	 	 	 	 dfd	 	 d	fd}dk(  }t        t        t        t        j                  f             }t        j                  |      D ]  }t        |t        j                        s|j                         }|st        |d         dk(  rC |d         }	 |d         }
|	|
fg}|j                  j                         D cg c],  }t        |t              rt        |j                        dkD  r|. }}|D ]  g j                  j!                         }t        j"                  j$                  }t&        j(                  j*                  }d}t-        |      D ]#  \  }\  }}||z  }|}|j/                  |      s# n |j1                  |      s|dz   }|d| }|rdn||d } |||d         }|rt        j"                  j$                  fn |||d         }t        |      st        |      dkD  s|j3                  ||f        t5        j6                  t9        |  D ]&  \  }}|j;                  | j=                  ||             ( 	 t?        |t        d      }|S c c}w )
z
        Creates N-dimensional tiling candidates, attempting to simplify loads/stores
        by tiling the kernel into higher dimensions.

        Returns a list of tilings ranked by dimensionality.
        c                ^   | s|fS t        d      }t        j                  j                  j	                  d      r6t        j                  j                  j                  d      rt        |d      }t        dt        |       |z
        }|dz   }t        | d|       }|ft        | |d       z   S )zU
            Collapse dimensions to the maximum allowed number of tiles.
            r   r<   r   N)r]   r;   r   r   r   r  r  rt  r   r6   r3  )dimsfallback_numelrZ   num_leading_dimsfirst_trailing_dimcollapsed_leading_dimpointwise_numelr  s         r\   collapse_dimsz4SIMDScheduling.get_nd_tilings.<locals>.collapse_dimsB
  s     &((%a(Iww77''""66J  	1-	"1c$i)&;<!1A!5$1$7J8J2K$L!)+eD9K9L4M.NNNr^   c                   g }| D ]  \  }}t        j                  j                  |      }t        d|j	                  t
              |j	                  t              z   t        |            }t        j                  ||||      }||d   n|g}	|j                  |	        |D 
cg c]F  }
t        j                  j                  j                  |
t        j                  j                         s|
H }}
 ||      S c c}
w )Nr   r   )r=   get_subexpr_involving_symbolr   rt  r$  r   r   r   match_mod_div_block_exprrw  r;   r   r   r   r   r   r   )rh   ranges_to_tiler	  r  varri   r   num_dimsmatch_resultr  dimr  rH  s              r\   tile_var_rangesz6SIMDScheduling.get_nd_tilings.<locals>.tile_var_rangesV
  s     F( $
U+HHTWX KK)EKK,HH'  3KK3x  +7*B|Ad#!$* "ww''??UWW[[Q F  !55s   $AC:r<   r   NT)r   reverse)r  r  r  r{   r~   tuple[sympy.Expr, ...])r~   r  ) r   r   rx   r   r  rE   r8  r   r   r1  r  r   r  reads_and_writesr    r  ru   r   r   r;   r   r   rh  statically_known_geqr   r   r#  productr  r   r  r  )r  r	  r  r  r  r  r  r   node_rangesdefault_pointwise_tilingdefault_reduction_tilingnode_tilingsrH  memory_depsall_var_rangespointwise_vars_numelr   pointwise_end_idxrj  _varri   reduction_start_idxpointwise_var_rangesreduction_var_rangespointwise_tilingr  ranked_tilingsr  s     ``        `              @r\   get_nd_tilingszSIMDScheduling.get_nd_tilings4
  s   	O&	O8B	O#	O(	6#	6@ '!+^CO<=?#**=9 F	SDdI$;$;< //+KCA$71$< (5[^_'U$'4[^_'U$57OPQL  ++<<>c9-#cjj/A2E K 
 # (N "73::#3#3#5!6',ww{{$77++$%!*3N*C &C$(E1((+%44,o   77(/  '8!&;#'56J7J'K$(Dn=P=Q.R % $3(+a./$ 
 $ WW[[N(,k!no ! '(S1A-BQ-F '')9;K(LMQ(NX 7@6G6Gl#7 S2 "2 C--.>@PQRSGF	ST  
 }s   1I;c                x   j                   sdnj                   j                  j                  j                  j                  j                  j                  j
                  }D cg c]  }||   	 c}D cg c]  }||   	 c}t        j                  j                  j                  }t        j                   |t                     |      k(  fd       t        j                   |t                     |      k(  fd       i g }	 	 	 d	 	 	 	 	 	 	 df	d}	|j                   |	d       |	d      f       r$|j                   |	fdd       |	d      f       j                  j                         z  }
|
D ]%  }|j                   |	|fd       |	d      f       ' t!        d	
      d	k(  rBdk(  r=t#        j$                  |
d      D ]$  }|j                   |	|d       |	d      f       & g }|D ]b  \  \  }}\  }}t'        | j)                  ||      t+        |      t+        |      z         }| j)                  ||      }|j                  ||f       d | j)                  gg      }ddt+        j,                  j/                               fd}t1        ||      D ]  \  }}| j3                  |j4                        s|j4                  |k(  rt7        |j4                        dk(  rdndz
  }|t!        d	
      kD  rDt8        j;                  d|t        j<                  j>                  j@                  jB                         |j4                  |fc S |j4                  |k(  s|j4                  |fc S  |dfS c c}w c c}w )zr
        Generates a tiling, and a score of each tile according to each tile's coalesced memory accesses.
        Nc                      d d  S Nr   r   )r	  r  	pw_rangess   r\   r   z8SIMDScheduling.compute_tiling_strategy.<locals>.<lambda>
  s    ykO#4B}oF r^   c                      d d  S r4  r   )r	  
red_rangesr  s   r\   r   z8SIMDScheduling.compute_tiling_strategy.<locals>.<lambda>
  s    zl"_$5RG r^   Fc                @  	 |rn}|rn}|s|r|gg fS g g fS t        |       ||f}j                  |      x}r|S |rn}g }g }	d}
d}t        ||      D ]  \  }}|| vr"|
|z  }
j                  j                  |d      }-|r|k(  rj                  }|J |j
                  }t        ||j
                        }|j                  |
|z         |	j                  |j                         |j                  |       |	j                  j                  j                  |d             d}
d}|
|z  }
|j                  |
       |	j                  j                  j                  |d             d}
 |
dk7  s|r0t        |      dk(  r"|j                  |
       |	j                  |       t        t        |            D ]S  }t        j                  j                  j                  ||   d      }t        |d      }t!        |	|   |z  dz        |	|<   U ||	f|<   ||	fS )z]
            Generate a tiling, and a tiling score, given vars to use as splits.
            r<   r   r  fallbackr  )r   r   r  coalesced_by_varsuggested_splittiling_factorr   r   r  r   ranger;   r   r   r   r  r   )vars_to_useuse_split_varr  r  target_numelr   r  splitting_varsr  split_scoresprodprev_var_coalesced_scorer,  v_range
var_tilingtile	remainderrJ  r   all_iter_varsall_red_varsrD  r  r5  r7  r  scored_sub_split
tiling_vars                      r\   process_node_varszASIMDScheduling.compute_tiling_strategy.<locals>.process_node_vars
  sX    #/YJF.:?L)NB//8O$m\BC&**3//s/
.:]NFLD'($ ".&9 
7K'GOD/@/Q/Q/U/U10,  Q*_!2!B!BJ%111%33D (*2J2J KIMM$"23 ''
(8(89MM$' ''(9(J(J(N(NqRS(TUD/0,d###$5$F$F$J$J1a$PQ;> qy\c&kQ.>d###$<= 3v;' ?GG$$66vay26N1I"%l1o&9A&=">Q?
 &,\$:S!L))r^   T)r  )r@  r  r   r[   r<   r   )r  gffffff?gGz?c                    d}| d   j                   j                         D ]"  }t        j                  |      s|z  }|z  }$ dz  }| d   j                  |z    |z  S )Ng      ?r   g?)r  r   r  r  r  )r  score_factor	tile_sizeuncoalesced_penalty"bad_size_additional_tiling_penaltygood_size_tiling_penaltytotal_uncoalesceds       r\   	score_modz9SIMDScheduling.compute_tiling_strategy.<locals>.score_mod|  sw    LqT[[//1 K	&33I>#/2T#TL#/2J#JL	K #4d":qTZZ"556EEr^   r   r   zmFound optimal tiling with %s tiles but torch._inductor.config.triton.max_tiles set to %s. Consider increasing)r   FF)r?  r  r@  r   r  r   r~   ztuple[list[int], list[int]])"r<  r  norm_read_writesr   reduce_varsrh   r;   r   r   r   rW   _checkr6   r   r;  r~  r]   r#  combinationsr  r  rT  uncoalesced_addrsr   r  tiling_is_compatibler  r   perf_hint_loginforX   r   rY   rZ   ) r  r	  r  r  rD  r  r,  get_hintscore_splitrN  overlapping_iter_varsr?  r  pw_splitpw_score	red_split	red_score	candidatern  default_tilingrW  cand
tiling_lenrJ  rK  rT  rU  r5  r7  rL  rM  rV  s     ````                  @@@@@@@@@r\   compute_tiling_strategyz&SIMDScheduling.compute_tiling_strategy
  s    %44 "2266 	 *::EE(99EE"33>>(561VAY6	)56AfQi6
 77##55]9-.(?2KKF	

 	]:./8O3LLG	
 DF  	
 35"'!&K	*/K	*K	* K	* )	K	* K	*\ 	!t4!u5	
 %#T &59	 ->>CCEE 	 ' 	A%qd>%59	 #q(_-A(556KQO "")+DI)u= RT<G 	68 Xx"89i'!!(I6(mc)n4I ,,XyALNNI|45	6 **O+<>OP .3*#(  1 C C J J LM	F #)i"@ 	1D,((!?OT[[ ;;.0 !-o6JPQR
a 88!&&9"..55??	 {{L00 {{n,{{L00/	12 t##K 76s   8N2N7c                T    t        t              sJ t        fd|D              S )Nc              3     K   | ]R  }t        |t        j                        r6t        j	                  j                         |j                                 T ywr  )r   r   r1  r|   r  r   r  )r   r   r  r  s     r\   r   z6SIMDScheduling.tiling_is_compatible.<locals>.<genexpr>  sO      
 $	 7 78	 $$!2O % 
s   AA)r   r
  r   )r  r	  ri   r  r  s      ``r\   r]  z#SIMDScheduling.tiling_is_compatible  s1     &$''' 
 &	
 
 	
r^   c                B    |D ]  }| j                  ||||      s|c S  y rV   )r]  )r  r	  ri   r  r0  r  s         r\   get_first_compatible_tilingz*SIMDScheduling.get_first_compatible_tiling  s1     % 	F''uovV	 r^   c                0    | j                  ||||      d   S r  )rb  )r  r	  ri   r  rD  s        r\   r  zSIMDScheduling.select_tiling  s)     ((5/3D

 	r^   c                   |dk(  }| j                  |g|g      }t        j                  |      D ]  }t        |j                  t
        j                        s(|j                  j                         dk(  sFt        j                  j                  sa|j                         }|d   }	|d   }
| j                  |	|
      }|dfc S  t        j                  j                  j                  j                  r0|r.t        j                  j                  s| j!                  ||||      S |st        j                  j"                  rt%        d      dk  rt&        j(                  t*        j,                  k  rt        j                  |      D ]i  }t        j                  j"                  rt/        | j1                  |||            dkD  s>t&        j3                  t5        j6                  d              |dfS  |dfS t9               }t;        j<                         }t        j                  |      D ]g  }| j1                  |||      D ]O  }|j>                  |v r|j>                  |jA                  |j>                         ||xx   |jB                  z  cc<   Q i |jE                         D cg c]  \  }}|jF                   }}}t%        d      dk\  r?|r=	 	 	 	 	 	 dd	}tI        dt/        |            D ]  } ||d   ||         }||g|z   } n t/        |      dkD  rt&        j3                  d
|       t        j                  j                  r| jK                  |||      |z   }| jM                  ||||      x}r|dfS |dfS c c}}w )z
        Heuristics to decide how to tile kernels.
        Currently, we tile based on stride-1 dimensions.

        Returns:
            `(tile1, tile2, reduction_numel)` s.t. `tile1 * tile2 == numel`

        r<   r  r   Nr   rO  z
                                Reduction over non-contiguous dims.
                                Consider setting config.triton.tile_reductions to True.
                                r   c                   | d   | j                  dd      }}|d   |j                  dd      }}t        j                  j                  j                  } |||z
        dk(  ry  |||z
        dk  r||f||fc\  }}\  }} |||z
        dkD  sJ t        j                  j                  j                  ||      sy |t        ||      || d   d}|S )NrR   rQ   r<   r   rS   )rP   rQ   rR   rS   )r   r;   r   r   r  r  r   )tiling0r  a0a1b0b1r  
new_tilings           r\   convert_tiling_to_3dzBSIMDScheduling.get_tiling_and_scores.<locals>.convert_tiling_to_3d-  s     !w{{3':B w{{3':B ww''>>R=A%R=1$*,bB8&HRhr2BG}q(((ww''DDRL !"b)"5>	
 "!r^   zpossibly bad tiling: %s)rs  r~  r  r~  r~   r  )'r  rE   r8  r   r   r   r2  r3  r   rY   r0  r  rW   rX   rF  prefer_nd_tilingrk  tile_reductionsr]   r^  levelloggingWARNINGr   r  r_  textwrapdedentr   collectionsr   rf   r   r  most_commonr  r>  r1  ro  )r  r	  ri   r  rD  r  rh  r   r#  	range_y_xrange_rr  
seen_namescandidate_tilescandidate_tilingr  r0  ry  rJ  new_3d_tilings                       r\   rb  z$SIMDScheduling.get_tiling_and_scores  s   " '!+ **E7_4EF $**=9 	(D$))R%6%67II002e;33 #'//"3K +AI)!nG ..y'BF!4<'	(  OO""))BB!MM22..uo7H  V]]%B%B}H
H ""goo5+22=A D"MM99 5 5dE? STWXX%**$OO!$ !4'' "4''&0l
4?4G4G4I#**=9 	LD$'$9$9$$W L #((J6%**6NN#3#8#89 015E5K5KK1L	L ,;+F+F+H7
' % ##7
 7

 #q(\"."9N"-"@ 1c.12  4"1%~a'8! !,&3_~%EN ~"8.I ==))""=%I ! 
 445/>
 
6 
 4<t##I7
s   "M;c                     y rV   r   rp   s    r\   flushzSIMDScheduling.flushg  r|  r^   c                     yr`  r   rp   s    r\   ready_to_flushzSIMDScheduling.ready_to_flushj  ra  r^   c                   t        d |D              st        |d       j                  \  }\  }}| j                  |||      }| j	                  |||      }| j                  |t        |||            }	| j                  ||	       | j                  |      }
||
d<   t        j                  d
i |
5  t        j                  |	      5  |	j                         }d d d        d d d        nM|d   j                  |      \  }}}t        j                  d|      5  | j                  |||d|      }d d d        j!                  t#        t$        j&                        d	      }|S # 1 sw Y   xY w# 1 sw Y   @xY w# 1 sw Y   LxY w)Nc              3  <   K   | ]  }|j                           y wrV   )r  )r   r   s     r\   r   zASIMDScheduling.generate_kernel_code_from_nodes.<locals>.<genexpr>p  s     2q1==?2s   c                4    t        | j                               S rV   r?  r@  s    r\   r   z@SIMDScheduling.generate_kernel_code_from_nodes.<locals>.<lambda>q  r  r^   r   r  r  r   Tr  r  r   )r  rt  r  r  r  r  rH   r  r^  r   r  r;   r  rz  get_prologue_template_epiloguer  r  rx   r3   r  )rm   r   r  r  r  ri   r  r	  r  rk   rp  r  r  templateepilogues                  r\   generate_kernel_code_from_nodesz.SIMDScheduling.generate_kernel_code_from_nodesm  s    2E22!$U0O!P!V!VAv 77ufMM''ufEF%%+M5&I & F 22=&I!99-HN1AN-..~.3$$V,3 "002	3 3 3 ,18+R+R,(Hh 02BC 00&*"/ 1  ##C(?(?$@)L)3 3 3 3 s0   2E.E"E.E:"E+	'E..E7:Fc                    t         rV   r<  )rm   r  r	  rk   s       r\   r  zSIMDScheduling.define_kernel  rY  r^   )r  N)r  zOrderedSet[str] | Noner~   ztuple[float, str]rV   )r   z!Sequence[scheduler.SchedulerNode]rD  CoalesceVarAnalysis | None)r   z6scheduler.FusedSchedulerNode | scheduler.SchedulerNode)ri   r{   rR  zCIterable[ir.Buffer | ir.TensorBox | ir.TorchBindObject | ir.IRNode]r~   r   )F)rk   r   r	  list[NodeScheduleEntry]rX  r   r~   r   )r	  r  r~   zdict[str, Any])r  rH   )r  rH   r~   zlist[SIMDKernel])r   r   r~   tuple[tuple[int, ...], ...])r   r   r~   r   )r   r   r  r   r~   r  )r  r   r~   
str | None)r  zlist[BaseSchedulerNode]r  r   r  r   r  r   rX  r   r~   z!list[tuple[str | None, Any, Any]])r~   r  )r  r  r  r  r~   immutable_dict[str, sympy.Expr])r  r  r  r   r~   r  )r  r~  ri   r{   r  r{   r~   r  )r~   z%list[immutable_dict[str, sympy.Expr]])
r	  r  r  r{   r  r{   rD  rL   r~   :tuple[dict[str, sympy.Expr], dict[str, sympy.Expr] | None])r	  r  ri   r{   r  r{   r  r~  )r	  r  ri   r{   r  r{   r0  zlist[dict[str, sympy.Expr]])rD  r  r~   r~  )rD  r  r~   r  r   )FN)r  r   )6r   r   r   r   r|   r  r  r  r  can_fuse_verticalr  r  r  r  r  r  r  r(  rH  r  rV  rY  r^  rC  r  r  r  r  r  r  r  r  r  r  r  r   r   r  r  r  r  r1  rk  r]  ro  r   r   r   r  rb  r  r  r  r  r   r^   r\   r  r    s   
 (K'Q`6D !"^@8
%2)l OS"5K"	"
_)H 9=
0
 6
(=J=2 ""T" 
" "P #(	@@ /@  	@
 
@"V)p
1
	
 -T  IV'	$,

'
/2
	$
.  $(n "n 
n`I #(i 0i   $i  	i 
 i   i  
+i V)( Y}  }~ 
,
@T
	(
 
$ 
$
 
 
)	
 
 /%/ / $	/
 
)/ /( S
 
/S Sj Y$.Y$ $Y$ $	Y$
 /Y$ 
DY$ Y$v 
.
 
 $	

 &
 
  .  $	
 4  
 8<	
 6	 
	 	 
 8<S$
 6S$ 
DS$ S$j JN#<F#J"r^   r  T)frozenc                  @    e Zd ZU ded<   ded<   dZded<   ed        Zy)	r  r~  r  r   r  Nr  rf   c                v    t         j                  j                  j                  | d      } | dk\  xr | dz  dk(  S )z@Somewhat arbitrary heuristic used to boost scores for some sizesi    r9  r  r   )r;   r   r   r   )r   s    r\   r  zCandidateTiling.is_good_size  s:     GG..q4.@Bw(AFaK(r^   )r   r   r   r  rf   r  r  r   r^   r\   r  r    s)    !!JD*) )r^   r  c                  $     e Zd Z fdZd Z xZS )r  c                >    t         |           || _        || _        y rV   )rd   re   r   r  )rm   r   r  rn   s      r\   re   zCantSplit.__init__  s    	"r^   c                8    | j                    d| j                   S )Nz not divisible by )r   r  rp   s    r\   __str__zCantSplit.__str__  s    )).t~~.>??r^   )r   r   r   re   r  r   r   s   @r\   r  r    s    #
@r^   r  )r   )r[   r   r~   r   )r   r  r~   rx   )
__future__r   r  r  dataclassesr   r#  r}  r   r  r  r   typingr   r   r   r   typing_extensionsr	   r   rW   torch._loggingtorch._inductorr
   torch._inductor.irr   torch._inductor.tiling_utilsr   %torch.fx.experimental.symbolic_shapesr   torch.fx.immutable_collectionsr   torch.utils._ordered_setr   torch.utils._sympy.functionsr   r   r   torch.utils._sympy.symbolr   r   r   r   _dynamo.utilsr    r   r   r   analyze_preserves_zero_maskr   	codecacher   r   dependenciesr    r!   r"   collections.abcr#   r%   optimize_indexingr&    runtime.coordinate_descent_tunerr'   runtime.hintsr(   runtime.runtime_utilsr)   r*   r+   r,   r-   r.   utilsr/   r0   r1   r2   r3   r4   r5   r6   r7   r8   virtualizedr9   r:   r;   block_analysisr=   commonr>   r?   r@   rA   r  rB   rC   simd_kernel_featuresrD   rE   rF   rG   rH   rI   rJ   rK   rL   	getLoggerr   rU  _logginggetArtifactLoggerr^  rA  
fusion_logdoprintr  rl  r]   	dataclassr`   r}   r   r   r   r  r  r|   r  r  	Exceptionr  r   r^   r\   <module>r     s   "           : : %    # 2 B G 9 / L L  & $ $ F . 6 6 ( A < , L L D D   - , / P P :  <<@ g!00<H~~//*E^^--hA
 	78;
 3+ 3+ 3+lH;/ H;V;'? ;'| +;T   
"z 
"U('/*B Upw"^ w"t; d#	) 	) $	)@	 @r^   