
    9j                   
   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlmZ d dlmZmZ d dlmZmZ d dlZd dlmZ d dlZd dlZd dlmc mZ d dlmZ d dl m!Z!m"Z"m#Z# d d	l$m%Z% d d
l&m'Z' d dl(m)Z) d dl*m+Z+m,Z, d dl-m.Z. d dl/m0Z0m1Z1m2Z2m3Z3m4Z4 d dl5m6Z6 d dl7m8Z8 d dl9m:Z: d dl;m<Z<m=Z= ddl>m?Z?m@Z@mAZA ddlBmCZC ddlAmDZDmEZE ddlFmGZG ddlHmIZI ddlJmKZKmLZLmMZM ddlNmOZOmPZP ddlmQZQmRZRmSZSmTZTmUZUmVZVmWZWmXZXmYZYmZZZm[Z[m\Z\ ddl]m^Z^ ddl_m`Z`maZambZbmcZcmdZdmeZe ddlfmgZg dd lhmiZi dd!ljmkZkmlZlmmZm er&d d"lmnZnmoZompZp d dlqZqdd#lrmsZs dd$lAmtZt dd%lumvZv dd&lwmxZx  ej                  ez      Z{ ec       j                  Z}e~ej                  ej<                  eeef   Ze~ej                  ej<                  ed'ef   ZeAj
                  edz  Zed(gdf   Zdud)Zdvd*Zdwd+Zd, Zeeef   Ze~eej.                  z  d-f   eege~ed-f   f   z  Z	 	 dx	 	 	 	 	 	 	 	 	 	 	 dyd.Z	 dz	 	 	 d{d/Zej"                   G d0 d1             Z G d2 d3      Z G d4 d(      Zej"                   G d5 d6e             Zej"                   G d7 d8e             Zej"                   G d9 d:e             Zej"                   G d; d<e             Zej"                   G d= d>e             Zej"                   G d? d@e             Z G dA dBe      Zej"                   G dC dDe             Zej"                   G dE dFe             Zej"                   G dG dHe             Zej"                   G dI dJe             Zej"                   G dK dLe             Zej"                   G dM dNe             Zej"                   G dO dPe             Zej"                   G dQ dRe             Zej"                   G dS dTe             Zej"                   G dU dVe             Zej"                   G dW dXe             Z G dY dZ      Zej"                   G d[ d\e             Zej"                   G d] d^e             Zej"                   G d_ d`e             Zej"                   G da dbe             Z G dc dde      Zej"                   G de dfe             Zej"                   G dg dhe             Zej"                   G di dje             Zej"                   G dk dle             Zej"                   G dm dne             Zej"                   G do dpe             ZeZeeXz  Z G dq drea      Z G ds dte      Zy)|    )annotationsN)Callable)chaincount)AnyTYPE_CHECKING)Exprdtype)countersdynamo_timedget_debug_dir)DebugPrinterManager)MultiKernelState)	cache_dir)get_opaque_obj_repris_opaque_value_type)trace_structured)CallMethodKeyConvertIntKeyDivideByKeyresolve_unbacked_bindingsSymTypes)_get_qualified_name)
OrderedSet)SingletonInt)symbol_is_typeSymT   )async_compileconfigir)output_code_log)IRNodeReinterpretView)triton_heuristics)DeviceProperties)DEFAULT_STREAMDEFAULT_STREAM_IDXSTREAM_NAME_TEMPLATE)get_raw_stream_nameget_stream_name)cache_on_selfDelayReplaceLineget_benchmark_nameget_dtype_sizeIndentedBuffer#is_codegen_graph_partition_subgraphis_using_cudagraph_partitionLineContextsympy_product	sympy_str
sympy_substriton_version_uses_attrs_dict)V   )ArgNameCodeGenDeferredLinePythonPrinterWorkspaceArgWorkspaceZeroMode)cexpr)CUSTOM_EXTERN_KERNEL_CODEGEN)	config_ofshould_unwrap_unspec_argsignature_to_meta)IterableIteratorSequence)GraphLowering)ExternKernel)BaseSchedulerNode)FxConverterzir.CommBufferTypeWrapperLinec                   t         j                  j                  |       }| j                         t         j                  j                  v}t         j                  j
                  j                  | j                               }| j                         | j                         t        t         j                  j                  j                  |            ||fS N)r9   graphget_allocation_storage_sizeget_nameunaligned_buffers	schedulerget_buf_streamget_device_or_error	get_dtyper6   sizevarssimplify)nodestorage_size	alignmentstreams       _/media/conek/DATA/Code/OCR/venv/lib/python3.12/site-packages/torch/_inductor/codegen/wrapper.pybuffer_reuse_keyr_   f   s    7766t<Lqww'@'@@IWW--dmmo>F  " 	!''""++L9:	 	    c                f   t         j                  j                  |       }| j                         }t	        |t
        j                        sJ | j                         | j                         t        t         j                  j                  j                  |            |j                  |j                  fS rO   )r9   rP   rQ   get_output_spec
isinstancer"   CommBufferLayoutrV   rW   r6   rX   rY   comm_buffer_type
group_name)rZ   r[   layouts      r^   comm_buffer_reuse_keyrh   v   s    7766t<L!!#Ffb11222  "!''""++L9: r`   c                   | j                         |j                         k7  ry| j                         |j                         k7  ryt        j                  j                  j                  t        j                  j                  |             }t        j                  j                  j                  t        j                  j                  |            }t        |      t        |      k(  sWt        j                  j                  j                  |d|z        r+t        j                  j                  j                  ||      ryy)NFgffffff?T)
rV   rW   r9   rP   rX   rY   rQ   r6   statically_known_geqstatically_known_leq)	input_buf
output_buf
input_sizeoutput_sizes       r^   can_match_buffer_sizerp      s     $$&**H*H*JJ
 4 4 66!!**	++I6J ''""++	++J7K 	*;!77 	
--k4*;LMGG11+zJr`   c                   t        | t        j                        r?| j                         }|j                  |j
                  |j                  |j                  dfS g }| }t        |t        j                  t        j                  t        j                  f      rj|j                         }|y|j                  |       |j                  }t        |t        j                  t        j                  t        j                  f      rjt        |t        j                        sy|D ]+  }|j                  |j                         j                  k7  s+ y |j                         }|j                  |j
                  |j                  |j                  dfS )a  
    Collapse a chain of ReinterpretView <- StorageBox
    <- ReinterpretView <- StorageBox.... <- buffer wrappers if every layer
    has the same offset as the innermost (base) buffer.

    Returns:
        (size, stride, offset, dtype, collapsible: bool)
    T)NNNNF)rc   r"   Buffer
get_layoutsizestrideoffsetr   	TensorBox
StorageBoxr%   appenddata)rz   laylayoutscurbase_lays        r^   codegen_reinterpret_view_helperr      s!    $		"ooxxSZZD@@G
C
S2<<8J8JK
Lnn;0shh S2<<8J8JK
L c299%,  1::)00001 ~~H==(//8??HNNDPPr`   .c                    t               dd	 d	 	 	 dfd}dd fd}d  } |d| d       r4t        j                  j                  rj                  j                         nt        j                         }j                         5  |5  t        j                  j                  rV|rTt        j                  j                  r:|t        j                  j                  v rt        j                  j                  |   }	nd gt        |      z  }	t        |      dk(  r" ||d   |	d         \  }
} |d	|
 d	|        nt        |      dkD  sJ t        |      t        |      k(  sJ t               }t        t        |||	      d
 d      D ]  \  }
}}g }|j                  r:|j                  D ]+  }|dvs|j!                  d| d|j                  |           - |rdj#                  |      }nd} ||
|      \  }
}d| d|
 }||v r~|j%                  |        ||d| d|         d d d        d d d        |j'                         fS # 1 sw Y   #xY w# 1 sw Y   'xY w)Nc                d    t        | t        j                        r| S t        j                  |       S rO   )rc   sympyr	   Integer)items    r^   _convert_to_sympy_exprz@user_defined_kernel_grid_fn_code.<locals>._convert_to_sympy_expr   s#    !$

3tLt9LLr`   c                    t        |       r| | fS t        fd| D              }|s|}j                  |      t        j                  j
                  r$j                  t        fd|D                    fS dfS )a'  
        This function return a tuple of two values: the first one is for the real grid
        which is used in the generated code; the second one is an example grid with
        concreate values which is used in the autotune block to run the generated
        kernels at compile time.
        Nc              3  .   K   | ]  } |        y wrO    ).0gr   s     r^   	<genexpr>zKuser_defined_kernel_grid_fn_code.<locals>.determine_grid.<locals>.<genexpr>   s     C1!4Cs   c              3  T   K   | ]  }j                  |t        |             ! y wrO   generate_example_arg_valuetype)r   r   wrappers     r^   r   zKuser_defined_kernel_grid_fn_code.<locals>.determine_grid.<locals>.<genexpr>   s*        ::1d1gF   %()callabletuplecodegen_python_shape_tupler!   tritonautotune_at_compile_time)gridexample_grid
sympy_gridr   r   s      r^   determine_gridz8user_defined_kernel_grid_fn_code.<locals>.determine_grid   s     ?htn:CdCC
%L..z: ==99 22 !- 
 	
 
 	
r`   c                    j                  |        rJt        j                  j                  r/j                  vr j
                  j                  |xs |        y y y y rO   )	writeliner!   r   r   kernel_autotune_nameskernel_autotune_calls)liner   nameoutputr   s     r^   r   z3user_defined_kernel_grid_fn_code.<locals>.writeline   sW    66G999))33L4HDI : 7 r`   grid_wrapper_for_def z(meta):r:   r   zreturn c                2    t        | d   j                        S Nr:   lenkwargsxs    r^   <lambda>z2user_defined_kernel_grid_fn_code.<locals>.<lambda>  s    c!A$++. r`   Tkeyreverse)matrix_instr_nonkdimwaves_per_eukpackzmeta['z'] == z and Trueif z	: return )r   int | sympy.Exprreturn
sympy.ExprrO   )r   
TritonGridr   zTritonGrid | None)r   strr   
str | None)r1   r!   r   r   r   indent
contextlibnullcontextr9   rP   autotuning_gridsr   r   sortedzipr   ry   joinaddgetvalue)r   configsgridsr   original_fxnode_namer   r   fn_namekernel_autotune_calls_indentexample_gridsr   r   seenc
guardslistkwargguards	statementr   r   s   `  `              @@r^    user_defined_kernel_grid_fn_coder      s    FM
 +/

'
>J J "$(GWIW%& v}}== 	%%,,.##% !
 
 .L6 .LMM22$(($(@(@@GG445IJM!FSZ/Mu:?!/a-:J!KD,v&',(@Au:>!>u:W---$.LD *0E7M2.* L%a
  
88!" W  ) 
 '--ugVAHHUOCT.UVW $\\*5F#F%3D,%G"l!&4&9	$#)s6()L>%JK1L-.L .L` FOO%%%a.L .L .L .Ls,   I!D%I<A7I3I!I	I!!I*c                  	 t               | j                  }|r|d   }j                  |d       ddlddlm ddlm ddlm	 t        | j                  g      	fd	 |        j                         S )
z
    Given a triton kernel function pointer collect the transitive closure of
    its dependencies

    epilogue_fusion: Optional[(fused epilogue node, modified kerel src code)]
    r:   Tstripr   N)JITFunction)	constexprr
   c           	        t        d t        j                  | j                        D              }| j                  j                  j                  di       }| j                  j                  j                  D ]  }|v r	|| j                  j                  v s"| j                  j                  |   }t        |	      rX
j                          
j                  d       
j                  |j                  d       j                  |        |       t        d      rWt        |j                  j                   j"                        r,d|j                  v rdvr~|j                  j                  j                  d      }|rWt        |d      rK|j$                  j'                  d	      r0
j                  d
|j$                   d       j                  d       
j                          
j                  d       
j                  |j                  d       ||j                  j(                  k7  r*
j                  | d|j                  j(                          j                  |        |       t        |t*        t,        t.        f      r
j                          t        |      rd|j0                  d}n|}|j                  |      x}rKt        |t2              rd|j$                   d|j(                   }nd|}
j                  | | d|        n
j                  | d|        j                  |       ||v s|dk7  st        |d      s|j$                  j'                  d	      st        |      r!
j                  | d|j4                          n:t        |d      r.
j                  d
|j$                   d|j(                   d|        j                  |        y )Nc              3  R   K   | ]  }|j                   d k(  r|j                   ! yw)LOAD_GLOBALN)opnameargval)r   insts     r^   r   z^user_defined_triton_kernel_transitive_closure_source_code.<locals>.traverse.<locals>.<genexpr>W  s(      '
{{m+ KK'
s   %'__annotations__z@triton.jitTr   constexpr_functionr   
__module__r   zfrom z import dtype as dtypez@triton.constexpr_function = ztl.constexpr(): .tlz = tl.__name__z import z as )r   disBytecodefn__globals__get__code__co_namesrc   newliner   splicesrcr   hasattrruntimejitConstexprFunctionr   
startswithr   intr   boolvaluer   r   )
cur_kernelunqualified_loadsglobal_annotationssymbol_namesymboldtype_symbol
symbol_str
annotationannotation_coder   compile_wrapperr   symbols_includedtraverser   triton_dtypes            r^   r  zKuser_defined_triton_kernel_transitive_closure_source_code.<locals>.traverseR  s   
 ' '
Z]]3'
 

 (]]66::;LbQ%==11:: K	6K..jmm777#22;?fk2#++-#--m<#**6::T*B$((5V$V%9:zNN&&88@
 &**,@P1P'-yy'<'<'@'@'I( 'l C , 7 7 B B8 L+55"'(?(?'@@V W -009#++-#--.JK#**6::T*B"fii&8&88'11*m3vyy/A/A.BC %((5V$c4(CD#++-!&)4'4V\\4DA%F
(.z
%7%;%;K%HHzH%j$7"$Z%:%:$;1Z=P=P<Q R , 13:..AO'11*mO+<C
|L (11[MZL2QR$((5#44#t+5 ))44X> "&,7'11[M}2UV 4'11#F$5$5#6hv>OtT_S`a %((5WK	6r`   )r1   r   r   r   r   triton.languager   triton.language.corer   r   r   r   )
kernelepilogue_fusion
kernel_srcr   r   r   r   r  r   r  s
      @@@@@@@r^   9user_defined_triton_kernel_transitive_closure_source_coder  :  s~     %&OJ$Q'
:T2 "): "6??"34V6 V6p V##%%r`   c                  (    e Zd ZU ded<   ded<   d Zy)SymbolicCallArgsympy.Symbolinnerr   
inner_exprc                ,    t        | j                        S rO   )r   r  selfs    r^   __str__zSymbolicCallArg.__str__  s    4::r`   N)r   r   __qualname__r   r  r   r`   r^   r
  r
    s    r`   r
  c                  Z     e Zd Z fdZddZd	dZd
dZddZddZ	 	 	 	 	 	 ddZ	 xZ
S )MemoryPlanningStatec                    t         |           t        j                  t              | _        t        j                  t              | _        d| _        y Nr   )super__init__collectionsdefaultdictlist
reuse_poolcomm_buffer_reuse_pooltotal_allocated_buffer_size)r  	__class__s    r^   r  zMemoryPlanningState.__init__  sG     ##D) 	 ##D) 	# 12(r`   c                L    t        | j                  j                  |d             S rO   )r   r  r   r  r   s     r^   __contains__z MemoryPlanningState.__contains__  s    DOO''T233r`   c                \    | j                   |   j                         }|j                  rJ |S rO   )r  pop	is_reusedr  r   r   s      r^   r$  zMemoryPlanningState.pop  s+    s#'')>>!!r`   c                \    |j                   rJ | j                  |   j                  |       y rO   )r%  r  ry   r&  s      r^   pushzMemoryPlanningState.push  s&    >>!!##D)r`   c                L    t        | j                  j                  |d             S rO   )r   r  r   r!  s     r^   comm_buffer_containsz(MemoryPlanningState.comm_buffer_contains  s     D//33C>??r`   c                \    | j                   |   j                         }|j                  rJ |S rO   )r  r$  r%  r&  s      r^   comm_buffer_popz#MemoryPlanningState.comm_buffer_pop  s-    **3/335>>!!r`   c                \    |j                   rJ | j                  |   j                  |       y rO   )r%  r  ry   r&  s      r^   comm_buffer_pushz$MemoryPlanningState.comm_buffer_push  s*     >>!!##C(//5r`   )r   ReuseKeyr   r   )r   r/  r   FreeIfNotReusedLine)r   r/  r   r0  r   None)r   CommBufferReuseKeyr   r   )r   r2  r   r0  )r   r2  r   r0  r   r1  )r   r   r  r  r"  r$  r(  r*  r,  r.  __classcell__r  s   @r^   r  r    s?    
24
*@
6%6-@6	6r`   r  c                      e Zd ZddZy)rM   c                0    t        dt        |              )Nz&FX codegen not yet supported for type )NotImplementedErrorr   r  	converters     r^   
codegen_fxzWrapperLine.codegen_fx  s    !$J4PT:,"WXXr`   Nr9  rL   r   FxConversionFuncr   r   r  r:  r   r`   r^   rM   rM     s    Yr`   c                  :    e Zd ZU ded<   ded<   d	dZd
dZddZy)EnterSubgraphLinePythonWrapperCodegenr   rI   rP   c                b    | j                   j                  | j                   j                         y rO   )r   push_computed_sizescomputed_sizesr  s    r^   __post_init__zEnterSubgraphLine.__post_init__  s    (()D)DEr`   c                n    | j                   j                  | j                         |j                          y rO   )r   push_codegened_graphrP   	do_indentr  codes     r^   codegenzEnterSubgraphLine.codegen  s"    ))$**5r`   c                    |j                   S rO   )_generate_enter_subgraphr8  s     r^   r:  zEnterSubgraphLine.codegen_fx  s    111r`   Nr   r1  rI  r1   r   r1  r;  r   r   r  r   rD  rJ  r:  r   r`   r^   r?  r?    s    !!F2r`   r?  c                  <    e Zd ZU ded<   ded<   ddZed	d       Zy)
ConditionalLiner@  r   zir.ConditionalrZ   c                    t        d      )NzOnly supports FX codegen)r7  rH  s     r^   rJ  zConditionalLine.codegen  s    !"<==r`   c                    | j                   S rO   )_generate_conditionalr9  s    r^   r:  zConditionalLine.codegen_fx  s    ...r`   NrN  r;  r   r   r  r   rJ  staticmethodr:  r   r`   r^   rQ  rQ    s'    !!
> / /r`   rQ  c                  2    e Zd ZU ded<   ddZedd       Zy)CommentLiner4   r   c                :    |j                  | j                         y rO   )r   r   rH  s     r^   rJ  zCommentLine.codegen  s    tyy!r`   c                    | j                   S rO   )_generate_commentrU  s    r^   r:  zCommentLine.codegen_fx	  s    ***r`   NrN  r;  rV  r   r`   r^   rY  rY    s!    
" + +r`   rY  c                  <    e Zd ZU ded<   ded<   ddZed	d       Zy)
DynamicScalarLiner@  r   zir.DynamicScalarrZ   c                N    | j                   j                  | j                         y rO   )r   _codegen_dynamic_scalarrZ   rH  s     r^   rJ  zDynamicScalarLine.codegen  s    ,,TYY7r`   c                    | j                   S rO   )_generate_dynamic_scalarrU  s    r^   r:  zDynamicScalarLine.codegen_fx  s    111r`   NrN  r;  rV  r   r`   r^   r^  r^    s'    !!
8 2 2r`   r^  c                  0    e Zd ZU ded<   ddZddZd	dZy)
ExitSubgraphLiner@  r   c                V    | j                   j                         | j                   _        y rO   )r   pop_computed_sizesrC  r  s    r^   rD  zExitSubgraphLine.__post_init__  s    &*ll&E&E&G#r`   c                X    | j                   j                          |j                          y rO   )r   pop_codegened_graphdo_unindentrH  s     r^   rJ  zExitSubgraphLine.codegen"  s    ((*r`   c                    |j                   S rO   )_generate_exit_subgraphr8  s     r^   r:  zExitSubgraphLine.codegen_fx&  s    000r`   NrM  rN  r;  rO  r   r`   r^   rd  rd    s    !!H1r`   rd  c                  2    e Zd ZU ded<   ded<   ddZd	dZy)
EnterDeviceContextManagerLiner   
device_idx
int | Nonelast_seen_device_guard_indexc                x   t         j                  j                  r|j                  d       t         j                  j                  rg| j
                  ;|j                  t         j                  j                  j                          d       y | j
                  | j                  k(  s{J d       | j
                  H|j                  t         j                  j                  j                          d| j                   d       y |j                  d| j                   d       y y |j                  dt         j                  j                  j                  | j                         d       |j                          |j                  t         j                  j                  j                  | j                               y )	N
z) stream_guard(stream, this->device_idx_);z4AOTInductor only supports running on one CUDA devicez device_guard(z);zdevice_guard.set_index(with :)r9   rP   cpp_wrapperr   aot_moderp  
device_opscpp_aoti_stream_guardrn  cpp_aoti_device_guarddevice_guardrG  
set_devicerH  s     r^   rJ  z%EnterDeviceContextManagerLine.codegen/  sP   77NN4 ww 44<NN77--CCEFFop  <<O NO 44<NN77--CCEFnUYUdUdTeegh NN%<T__<MR#PQ P NNU177#5#5#B#B4??#S"TTUVWNNNN177--88IJr`   c                    |j                   S rO   )&_generate_enter_device_context_managerr8  s     r^   r:  z(EnterDeviceContextManagerLine.codegen_fxL  s    ???r`   NrN  r;  r   r   r  r   rJ  r:  r   r`   r^   rm  rm  *  s    O",,K:@r`   rm  c                      e Zd ZddZddZy)ExitDeviceContextManagerLinec                Z    t         j                  j                  s|j                          y y rO   r9   rP   ru  ri  rH  s     r^   rJ  z$ExitDeviceContextManagerLine.codegenQ  s     ww"" #r`   c                    |j                   S rO   )%_generate_exit_device_context_managerr8  s     r^   r:  z'ExitDeviceContextManagerLine.codegen_fxU  s    >>>r`   NrN  r;  )r   r   r  rJ  r:  r   r`   r^   r  r  P  s    ?r`   r  c                  2    e Zd ZU ded<   ded<   ddZd	dZy)
ExternKernelAllocLiner@  r   ir.ExternKernelAllocrZ   c                    | j                   }g |j                         |j                         }| j                  j	                  | j                   |       y rO   )rZ   codegen_argscodegen_kwargsr   $_generate_extern_kernel_alloc_helper)r  rI  rZ   argss       r^   rJ  zExternKernelAllocLine.codegen^  sD    yy=""$=t':':'<=99$))TJr`   c                    |j                   S rO   )_generate_extern_kernel_allocr8  s     r^   r:  z ExternKernelAllocLine.codegen_fxc  s    666r`   NrN  r;  r~  r   r`   r^   r  r  Y  s    !!
K
7r`   r  c                  2    e Zd ZU ded<   ded<   ddZd	dZy)
ExternKernelOutLiner@  r   ir.ExternKernelOutrZ   c           	     D   | j                   }g |j                         |j                  d      }|j                         }t        j
                  j                  r|j                  dk(  rd}n|j                         }|j                         x}r|j                  nt        j
                  j                  }| j                  j                  ||j                         |j                  r|j                  j                         nd ||| j                   j                                y )NT)skip_outztorch::inductor::_mm_plus_mmaoti_torch__mm_plus_mm_out)rZ   r  r  get_kernel_namer9   rP   ru  cpp_kernel_name
get_devicer   device_typer   "_generate_extern_kernel_out_helpercodegen_referenceoutput_viewget_stack_traces)r  rI  rZ   r  kernel_nameddevices          r^   rJ  zExternKernelOutLine.codegenl  s    yyJ""$Jt':':D':'IJ**,GG$$(FF 7K..0K!%!22A29L9L77""$484D4DD..0$II&&(	
r`   c                    |j                   S rO   )_generate_extern_kernel_outr8  s     r^   r:  zExternKernelOutLine.codegen_fx      444r`   NrN  r;  r~  r   r`   r^   r  r  g  s    !!

,5r`   r  c                  .    e Zd ZU dZded<   ded<   ddZy)	ExternKernelMultiOutLinezCodegen line for multi-output .out() variant calls.

    Generates a kernel call with pre-allocated output buffers passed as
    keyword arguments. E.g. kernel(x, out0=buf0, out1=buf1).
    r@  r   ir.ExternKernelMultiOutrZ   c           	     (   | j                   }|j                         }g |j                         |j                         }t	        |j
                  |j                        D ])  \  }}|j                  | d|j                                 + |j                  |j                          d| ddj                  |       d       |j                  D ]B  }t        |j                  t        j                        s(|j                  | j                          D y )N=r   (, r   )rZ   r  r  r  r   out_arg_namesout_variant_output_nodesry   rR   r   r   rc   rg   r"   Layoutcodegen_size_assertsr   )r  rI  rZ   r  r  out_nameout_nodes          r^   rJ  z ExternKernelMultiOutLine.codegen  s    yy**,=""$=t':':'<="% = =#
 	=Hh KK8*Ah&7&7&9%:;<	=
 	$--/*#k]!DIIdO;LANO55 	<H(//2995--dll;	<r`   NrN  r   r   r  __doc__r   rJ  r   r`   r^   r  r    s     "!
!!<r`   r  c                  2    e Zd ZU ded<   ded<   ddZd	dZy)
FreeLiner@  r   BufferLike | ir.TorchBindObjectrZ   c                    | j                   j                         t        j                  j                  vsJ |j                  | j                  j                  | j                                y rO   )rZ   rR   r9   rP   removed_buffersr   r   make_buffer_freerH  s     r^   rJ  zFreeLine.codegen  sF    yy!!#177+B+BBBBt||44TYY?@r`   c                    |j                   S rO   )_generate_freer8  s     r^   r:  zFreeLine.codegen_fx      '''r`   NrN  r;  r~  r   r`   r^   r  r    s    !!
))A(r`   r  c                      e Zd ZU ded<   ded<   ded<   ded<   ded<   d	ed
<   ded<   ded<   ded<   ded<   ded<   ded<   dZded<   ddZddZy)KernelCallLiner@  r   r   r  ztuple[Any, ...]	call_argsraw_keysraw_args	list[str]	arg_typesr   r   zdict[str, Any]triton_metazdict[str, Any] | Noneinductor_metaztorch.devicer  
graph_namer   Nro  current_stream_idxc                B   | j                   j                  | j                  | j                  | j                  | j
                  | j                  | j                  | j                  | j                  | j                  | j                  | j                  | j                         y )N)
r   r  r  r  r  r  r  r  r   r  )r   _generate_kernel_call_helperr  r  r   r  r  r  r  r  r  r  r   r  rH  s     r^   rJ  zKernelCallLine.codegen  sw    11NN;;nn]]]]((,,;;!%!:!:#66 	2 	
r`   c                    |j                   S rO   )_generate_kernel_callr8  s     r^   r:  zKernelCallLine.codegen_fx      ...r`   rN  r;  )r   r   r  r   r  rJ  r:  r   r`   r^   r  r    s]    !!L((O%)
)
 /r`   r  c                  f    e Zd ZU ded<   ded<   ded<   dZded<   d	Zd
ed<   dZded<   ddZddZy)KernelDefinitionLiner@  r   r   r  kernel_bodyNr   metadataTr   gpucpp_definitionc                    | j                   j                  | j                  | j                  | j                  | j
                  | j                         y N)r  r  r  )r   _define_kernel_helperr  r  r  r  r  rH  s     r^   rJ  zKernelDefinitionLine.codegen  sB    **]].. 	+ 	
r`   c                    |j                   S rO   )_generate_kernel_definitionr8  s     r^   r:  zKernelDefinitionLine.codegen_fx  r  r`   rN  r;  )	r   r   r  r   r  r  r  rJ  r:  r   r`   r^   r  r    s<    !!HjC!%NJ%
5r`   r  c                  0    e Zd ZU ded<   ddZddZd	dZy)
MemoryPlanningLiner@  r   c                    | S )zFirst pass to find reuser   r  states     r^   planzMemoryPlanningLine.plan  s    r`   c                     y)zSecond pass to output codeNr   rH  s     r^   rJ  zMemoryPlanningLine.codegen  s    r`   c                r   g }t        j                  |       D ]t  }|j                  dk(  rt        | |j                        }|j	                  |j                   d|j
                  t        j                  u r|j                         n|        v t        |       j                   ddj                  |       dS )zF
        Emits a string representation that fits on one line.
        r   r  r  r  r   )dataclassesfieldsr   getattrry   r   r"   rr   rR   r   r   )r  r  fieldvals       r^   r  zMemoryPlanningLine.__str__  s      ''- 	EzzY&$

+CKK::,a%**		2IsST		 t*%%&a		$'8::r`   Nr  r  r   r  rN  r   r   )r   r   r  r   r  rJ  r  r   r`   r^   r  r    s    !!);r`   r  c                  d     e Zd ZU dZdZded<    ej                  e      Z	ded<   d	 fdZ
 xZS )
+EnterDeviceContextManagerWithStreamInfoLineu@  Enter a CUDA device context and retrieve user stream objects.

    Attributes:
        num_streams: Number of streams (determined by user annotations on nodes).
        stream_idx_to_user_obj_idx: Maps stream_idx → user_object_index for
            retrieving user stream objects via get_external_object_by_index.
    r:   r   num_streams)default_factoryzdict[int, int]stream_idx_to_user_obj_idxc                r   t         j                  j                  rt        |   |       yt        |   |       |j                  t         d       | j                  dkD  rVt        d| j                        D ]<  }| j                  |   }|j                  t        j                  |       d| d       > yy)z5Generate context switching and stream retrieval code.z = torch.cuda.current_stream()r:   
stream_idxz  = get_external_object_by_index(r   N)r9   rP   ru  r  rJ  r   r(   r  ranger  r*   format)r  rI  iuser_obj_idxr  s       r^   rJ  z3EnterDeviceContextManagerWithStreamInfoLine.codegen  s    77GOD!GOD!NNn--KLM!#q$"2"23 A#'#B#B1#ELNN/66!DE F::FqJ $r`   rN  )r   r   r  r  r  r   r  r  dictr  rJ  r3  r4  s   @r^   r  r    s7     K1B1B1BSW1XX r`   r  c                  (    e Zd ZU dZdZded<   ddZy)*ExitDeviceContextManagerWithStreamInfoLinezExit a CUDA device context.

    Attributes:
        num_streams: Number of streams that were allocated (must match Enter).
    r:   r   r  c                Z    t         j                  j                  s|j                          yy)zGenerate context exit code.Nr  rH  s     r^   rJ  z2ExitDeviceContextManagerWithStreamInfoLine.codegen+  s     ww"" #r`   NrN  )r   r   r  r  r  r   rJ  r   r`   r^   r  r  !  s     Kr`   r  c                  $    e Zd ZU dZded<   ddZy)EnterCudaStreamContextLinezEnter a context executed by respective CUDA Stream.

    Attributes:
        stream_idx: The index number corresponds to the entering CUDA Stream context.
    r   r  c                t    |j                  dt        | j                         d       |j                          y )Nzwith torch.cuda.stream(z):)r   r,   r  rG  rH  s     r^   rJ  z"EnterCudaStreamContextLine.codegen;  s-    01Q0RRTUVr`   NrN  r  r   r`   r^   r  r  1  s     Or`   r  c                      e Zd ZdZddZy)ExitCudaStreamContextLinez1Generate code to exit the current stream context.c                $    |j                          y rO   )ri  rH  s     r^   rJ  z!ExitCudaStreamContextLine.codegenD  s    r`   NrN  )r   r   r  r  rJ  r   r`   r^   r  r  @  s
    ;r`   r  c                  *    e Zd Zd ZddZddZddZy)EfficientPeakEstimatec                   ddl m}m} t        j                  j
                  j                  }t        t        j                  j                  j                               }t        t        j                  j                               } |||      } ||||      \  | _        }ddlm}  ||t        j                  t         d      | _        y )Nr   )estimate_peak_memoryget_freeable_input_bufr:   )SegmentedTreer   )memoryr  r  r9   rP   rT   nodesr   graph_inputskeysget_output_namesoverall_peak_memorysegmented_treer   operatorr   max)	r  r  r  scheduler_nodesr  graph_outputsnames_to_freeable_bufspeak_by_scheduler_noder   s	            r^   r  zEfficientPeakEstimate.__init__I  s    I''++11!!''"6"6";";"=>"177#;#;#=>!7!V;O"<
8 "8 	2+"HLL#q
r`   c                    t         j                  j                  j                  t         j                  j	                  |      d      t        |j                               z  S )Nr   fallback)r9   rP   rX   optimization_hintrQ   r0   rW   r  rZ   s     r^   	_get_sizezEfficientPeakEstimate._get_size\  sL    ww11GG//5 2 
4>>+,- 	-r`   c                n    | j                   j                  |j                  dz   |j                  dz
        S r   )r  summarize_rangescheduler_node_indexr  line_aline_bs      r^   peak_betweenz"EfficientPeakEstimate.peak_betweena  s6    ""22''!+V-H-H1-L
 	
r`   c                    |j                   dz   |j                   k(  ry | j                  j                  |j                   dz   |j                   dz
  | j                  |j                               y r   )r  r  update_ranger  rZ   r  s      r^   update_peak_betweenz)EfficientPeakEstimate.update_peak_betweenf  s^    &&*f.I.II((''!+''!+NN6;;'	
r`   N)rZ   
BufferLiker   r   )r  r0  r  AllocateLine)r   r   r  r  r  r  r  r   r`   r^   r  r  H  s    
&-



r`   r  c                  X    e Zd ZU dZded<   dZded<   d ZddZdd	Zdd
Z	ddZ
ddZy)r  z6Represents a buffer allocation during memory planning.r  rZ   Fr   comm_bufferc                   t         j                  j                  j                  J t         j                  j                  j                  j                  t         j                  j                  j                        | _        y rO   r9   rP   rT   current_noder  indexr  r  s    r^   rD  zAllocateLine.__post_init__w  T    ww  --999$%GG$5$5$;$;$A$AGG**%
!r`   c                    | j                   ry|j                  dz   | j                  k(  ry| j                  j                  j                  }| j                  j                  j                  ||       }||z   }||k  S )NTr:   )r!  r  r   estimate_peakr  r  )r  	free_linert   r  peak_memory_in_rangenew_peak_memorys         r^   should_reuse_bufferz AllocateLine.should_reuse_buffer}  su    ))A-1J1JJ"ll88LL#||99FFyRVW!55"555r`   c           	     (   | j                   j                         t        j                  j                  v rt        | j                        S | j                  r}t        | j                         }t        j                  rV|j                  |      rE|j                  |      }d|_        t        | j                  |j                   | j                   d      S | S t        | j                         }t        j                  r
||v r|j!                  |      }t        j                  j"                  j%                  t        j                  j'                  | j                         d      t)        | j                   j+                               z  }| j-                  ||      rXd|_        | j                  j.                  j1                  ||        t        | j                  |j                   | j                         S |j3                  ||       | S | j                   j5                         j6                  dk(  rh| j                  j9                  | j                         }|A|xj:                  t=        t?        j@                  tB        jD                  |d            z  c_        | S )NTr!  r   r  cpur:   )#rZ   rR   r9   rP   r  NullLiner   r!  rh   r!   allow_buffer_reuser*  r,  r%  	ReuseLiner_   r$  rX   r  rQ   r0   rW   r,  r(  r  r(  rV   r   static_shape_for_buffer_or_noner  r   	functoolsreducer  mul)r  r  r   r)  rt   static_shapes         r^   r  zAllocateLine.plan  s   99177#:#::DLL))'		2C((U-G-G-L!11#6	&*	# LL)..$))  K tyy)$$		#I77##5533DII> 6 tyy22456D ''	48&*	#**>>y$O y~~tyyII

3	*99((*//58<<GG		RL'11S$$X\\<C6 1 r`   c                   | j                   j                         t        j                  j                  vsJ | j
                  r| j                  |       y | j                  j                  | j                         }|j                  |       y rO   )
rZ   rR   r9   rP   r  r!  _codegen_comm_bufferr   make_buffer_allocationr   )r  rI  r   s      r^   rJ  zAllocateLine.codegen  sb    yy!!#177+B+BBBB%%d+<<66tyyADNN4 r`   c                @   | j                   j                         }| j                   j                         }||j                  
J d|        | j                   j	                         }t        | j                   j                               }t        | j                   j                               }| j                   j                         }t        |t        j                        sJ |j                  }|j                  }	|t        j                  j                  k(  rh| d| j                   j#                  |       d| j                   j#                  |       d| d|j                   d|	 dt%        j&                  dd	       d
}
nt)        d|       |j+                  |
       y)z*Generate allocation code for comm buffers.Nz9Comm buffer requires a valid CUDA device with index, got z = empty_strided_p2p(r  z, torch.device("cuda:z"), group_name="z", alloc_id=r   l    r   zUnsupported comm buffer type: )rZ   rR   r  r%  rW   r   get_size
get_striderb   rc   r"   rd   re   rf   CommBufferTypeSYMM_MEMr   codegen_shape_tuplerandomrandintr7  r   )r  rI  r   r  r   shaperu   rg   re   rf   r   s              r^   r9  z!AllocateLine._codegen_comm_buffer  s   yy!!#%%'!fll&> 	
GxP	
> 		##%dii((*+tyy++-.**,&""5"5666!22&&
r00999&-<<33E:;2<<33F;<B' &&,ll^ 4)l +"NN1i89<  &01A0BC  	tr`   c                J    | j                   r|j                  S |j                  S rO   )r!  _generate_comm_buffer_allocate_generate_allocater8  s     r^   r:  zAllocateLine.codegen_fx  s#    ;;;+++r`   N)r)  r0  rt   r   r   r   r  rN  r;  )r   r   r  r  r   r!  rD  r,  r  rJ  r9  r:  r   r`   r^   r  r  p  s4    @
K
6&P!>,r`   r  c                  R    e Zd ZU ded<   dZded<   dZded<   d ZddZdd	Zdd
Z	y)r0  r  rZ   Fr   r%  r!  c                   t         j                  j                  j                  J t         j                  j                  j                  j                  t         j                  j                  j                        | _        y rO   r#  r  s    r^   rD  z!FreeIfNotReusedLine.__post_init__  r&  r`   c                <   t        | j                  j                               dkD  r| S t        | j                  j                  t
        j                        r| S | j                  rJ | j                  j                         t        j                  j                  v rt        | j                        S t        j                  r\| j                   r)t#        | j                        }|j%                  ||        | S t'        | j                        }|j)                  ||        | S r  )r   rZ   get_inputs_that_alias_outputrc   rg   r"   MultiOutputLayoutr%  rR   r9   rP   r  r0  r   r!   r1  r!  rh   r.  r_   r(  )r  r  r   s      r^   r  zFreeIfNotReusedLine.plan  s    tyy55781<Kdii&&(<(<=K>>!!99177#:#::DLL))$$+DII6&&sD1  'tyy1

3%r`   c                   | j                   j                         t        j                  j                  vsJ | j
                  s| j                  j                  | j                         }| j                  rb| j                   j                         }t        |t        j                        sJ |j                  | d|j                  j                   d       y |j                  |       y y )Nz # z buffer free)rZ   rR   r9   rP   r  r%  r   r  r!  rb   rc   r"   rd   r   re   r   )r  rI  r   rg   s       r^   rJ  zFreeIfNotReusedLine.codegen  s    yy!!#177+B+BBBB~~<<00;D224!&"*=*=>>>$s6+B+B+H+H*IVWt$ r`   c                J    | j                   r|j                  S |j                  S rO   )r!  _generate_comm_buffer_free_generate_free_if_not_reusedr8  s     r^   r:  zFreeIfNotReusedLine.codegen_fx  s#    777555r`   Nr  rN  r;  )
r   r   r  r   r%  r!  rD  r  rJ  r:  r   r`   r^   r0  r0    s0    
ItK
$	%6r`   r0  c                  D    e Zd ZU ded<   ded<   ded<   d
dZddZddZy	)ReinterpretLiner  rZ   	reused_asz	ir.Layoutrg   c                    | S rO   r   r  s     r^   r  zReinterpretLine.plan  s    r`   c                @   t        | j                  t        j                        sJ t        | j                  j                  t        j
                        sJ | j                  j                  | j                  j                         | j                  j                         y rO   )
rc   rg   r"   NonOwningLayoutviewr%   r   codegen_deferred_allocationrR  rR   rH  s     r^   rJ  zReinterpretLine.codegen  sj    $++r'9'9:::$++**B,>,>???00NN##%t{{'7'7	
r`   c                    |j                   S rO   )_generate_reinterpretr8  s     r^   r:  zReinterpretLine.codegen_fx  r  r`   Nr  rN  r;  )r   r   r  r   r  rJ  r:  r   r`   r^   rQ  rQ    s#    

/r`   rQ  c                  V    e Zd ZU ded<   ded<   dZded<   dZded<   dd	Zdd
ZddZy)r2  r  rZ   rR  Tr   
delete_oldFr!  c                p   | j                   j                         t        j                  j                  v rK| j
                  j                         t        j                  j                  v sJ t        | j                        S | j
                  j                         t        j                  j                  vsJ | S rO   )rZ   rR   r9   rP   r  rR  r0  r   r  s     r^   r  zReuseLine.plan&  s    99177#:#::>>**,0G0GGGGDLL))~~&&(0G0GGGGr`   c                p   | j                   j                         t        j                  j                  vsJ | j
                  j                         t        j                  j                  vsJ |j                  | j                  j                  | j                   | j
                  | j                               y rO   )
rZ   rR   r9   rP   r  rR  r   r   make_buffer_reuser[  rH  s     r^   rJ  zReuseLine.codegen-  sz    yy!!#177+B+BBBB~~&&(0G0GGGGLL**499dnndooV	
r`   c                    |j                   S rO   )_generate_reuser8  s     r^   r:  zReuseLine.codegen_fx4  s    (((r`   Nr  rN  r;  )	r   r   r  r   r[  r!  r  rJ  r:  r   r`   r^   r2  r2    s1    
JK
)r`   r2  c                      e Zd ZddZy)r0  c                    |j                   S rO   )_generate_nullr8  s     r^   r:  zNullLine.codegen_fx9  r  r`   Nr;  r=  r   r`   r^   r0  r0  8  s    (r`   r0  c                  J    e Zd ZU dZded<   ded<   ded<   ded<   dd	Zdd
Zy)MultiOutputLinezU
    Given a MultiOutputLayout buffer, indexes actual buffer(s) from the result.
    r@  r   r   result_namearg_nameSequence[Any]indicesc                      fd  j                    j                        }|j                   j                  j                    j
                   d|  j                  j                          y )Nc                l   t        |      dkD  r|d   \  }}t        |t              r |  d| d|dd        S t        |t              r<j                  j                  | j                  t        |            } ||dd        S t        |t              r |  d| d|dd        S t        d|      | S )Nr   []r:   z['z']znon supported index type: )
r   
issubclassr  r   r   codegen_tuple_accessrf  r   r  AssertionError)basenameri  ityper  tuple_accesscodegen_list_tuple_accessr  s        r^   rt  z:MultiOutputLine.codegen.<locals>.codegen_list_tuple_accessI  s    7|a"1:qeT*4z1#Q5GQRQSUUu-#'<<#D#D $"2"2CF$L 5\712;OOt,4zA3b5I7STSU;WW()EuMMr`   r   )rg  ri  r   r   declarerf  ending)r  rI  r   rt  s   `  @r^   rJ  zMultiOutputLine.codegenH  s]    	 $ *$--F||##$T%5%5$6c%ATAT@UV	
r`   c                    |j                   S rO   )_generate_multi_outputr8  s     r^   r:  zMultiOutputLine.codegen_fx`  s    ///r`   NrN  r;  )r   r   r  r  r   rJ  r:  r   r`   r^   re  re  =  s*     "!M
00r`   re  c                  <    e Zd ZU ded<   ded<   ded<   d
dZddZy	)IndexPutFallbackLiner@  r   ir.IndexPutFallbackrZ   zlist[ir.IRNode | None]ri  c                   | j                   }t        j                  |j                        sJ d |j                  d d D        \  }}| j                  D cg c]*  }|r|j                         n| j                  j                  , }} | j                  j                  |j                         |||g|j                           y c c}w )Nc              3  <   K   | ]  }|j                           y wrO   r  r   ts     r^   r   z/IndexPutFallbackLine.codegen.<locals>.<genexpr>m  s     Fq**,F   r   )rZ   r"   is_node_sequenceinputsri  r  r   none_str_generate_index_put_fallbackr  codegen_const_args)r  rI  rZ   r   valuesidxri  s          r^   rJ  zIndexPutFallbackLine.codegenj  s    yy""4;;///Fdkk"1oFF ||
 (+C!!#0E0EE
 

 	211  "Aw	
9=9P9P9R	

s   /Cc                    |j                   S rO   )r  r8  s     r^   r:  zIndexPutFallbackLine.codegen_fxw  s    555r`   NrN  r;  r~  r   r`   r^   rz  rz  d  s    !!
##
6r`   rz  c                  2    e Zd ZU ded<   ded<   ddZd	dZy)
ScatterFallbackLiner@  r   ir.ScatterFallbackrZ   c           
     8   | j                   }t        j                  |j                        sJ |j                  rd |j                  D        \  }}}n%d |j                  D        \  }}|j
                  d   }|j                         x}r|j                  nt        j                  j                  }| j                  j                  |||j
                  d   ||g|j                  |j                  |j                  |j                  d   |j!                         |       y )Nc              3  <   K   | ]  }|j                           y wrO   r~  r  s     r^   r   z.ScatterFallbackLine.codegen.<locals>.<genexpr>  s     Jq224Jr  c              3  <   K   | ]  }|j                           y wrO   r~  r  s     r^   r   z.ScatterFallbackLine.codegen.<locals>.<genexpr>  s     EA!--/Er  r:   r   r5  )rZ   r"   r  r  src_is_tensorconstant_argsr  r   r9   rP   r  r   _generate_scatter_fallbackr  python_kernel_namer   r  )r  rI  rZ   r   r%  r   r  r  s           r^   rJ  zScatterFallbackLine.codegen  s    yy""4;;///JdkkJOQsEEJQ$$Q'C!%!22A29L9L//""1%uc2  ##KK!!		
r`   c                    |j                   S rO   )r  r8  s     r^   r:  zScatterFallbackLine.codegen_fx  s    333r`   NrN  r;  r~  r   r`   r^   r  r  {  s    !!

(4r`   r  c                  <    e Zd ZU ded<   ded<   ded<   d
dZddZy	)SymbolicCallArgLiner@  r   r
  argrI   rP   c                d    | j                   j                  | j                  | j                         y rO   )r   "_generate_symbolic_call_arg_helperr  rP   rH  s     r^   rJ  zSymbolicCallArgLine.codegen  s    77$**Mr`   c                    |j                   S rO   )_generate_symbolic_call_argr8  s     r^   r:  zSymbolicCallArgLine.codegen_fx  r  r`   NrN  r;  r~  r   r`   r^   r  r    s    !!	N5r`   r  c                  F    e Zd ZU ded<   ded<   ded<   ded<   dd	Zdd
Zy)UnbackedSymbolDefsLiner@  r   r   output_namer   outputs)dict[sympy.Symbol, pytree.KeyPath] | Noneunbacked_bindingsc                z    | j                   j                  | j                  | j                  | j                         y rO   )r   )_codegen_unbacked_symbol_defs_for_outputsr  r  r  rH  s     r^   rJ  zUnbackedSymbolDefsLine.codegen  s+    >>dllD,B,B	
r`   c                    |j                   S rO   )_generate_unbacked_symbol_defsr8  s     r^   r:  z!UnbackedSymbolDefsLine.codegen_fx  s    777r`   NrN  r;  r~  r   r`   r^   r  r    s#    !!L@@

8r`   r  c                  F    e Zd ZU ded<   ded<   ded<   ddZed	d       Zy)
AssertSizeStrideLiner   r   rt   ru   c           	     v    |j                  d| j                   d| j                   d| j                   d       y )Nzassert_size_stride(r  r   )r   r   rt   ru   rH  s     r^   rJ  zAssertSizeStrideLine.codegen  s1    ,TYYKr$))Bt{{mSTUVr`   c                    | j                   S rO   )_generate_assert_size_striderU  s    r^   r:  zAssertSizeStrideLine.codegen_fx  s    555r`   NrN  r;  rV  r   r`   r^   r  r    s+    
I
IKW 6 6r`   r  c                  Z    e Zd ZU dZdZded<    fdZe	 d	 	 	 	 	 	 	 dd       ZddZ	dd	Z
dd
ZddZddZedd       ZddZedd       ZddZedd       ZddZ	 	 ddZddZddZddZddZddZddZddZddZddZddZddZ ddZ!dd Z"d! Z#d" Z$d# Z%d$ Z&d% Z'dd&Z(	 	 d	 	 	 	 	 	 	 dd'Z)dd(Z*	 	 	 	 dd)Z+dd*Z,dd+Z-dd,Z.dd-Z/dd.Z0dd/Z1dd0Z2dd1Z3d2 Z4	 	 	 	 dd3Z5	 d	 	 	 	 	 	 	 	 	 	 	 	 	 dd4Z6dd5Z7dd6Z8dd7Z9d8 Z:dd9Z;d: Z<dd;Z=d< Z>	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd=Z?d> Z@dd?ZAeBj                  dd@       ZDddAZEdB ZFdC ZGdD ZHdE ZIddFZJ	 	 	 	 	 	 ddGZKdH ZLddIZMdJ ZNddKddLZOddKddMZPddNZQddOZRddPZSddQZT	 	 ddRZU	 d	 	 	 ddSZVddTZWddUZXdV ZYdW ZZdX Z[dY Z\dZ Z]d[ Z^	 	 	 d	 	 	 	 	 	 	 	 	 dd\Z_e	 d	 	 	 	 	 dd]       Z`	 	 	 d	 	 	 	 	 	 	 	 	 dd^Zadd_Zbecd`        Zd	 	 	 	 ddaZedddbZf	 	 	 	 	 	 ddcZgdddZhddeZidf Zjdg Zkdh Zldi Zmdj Zndk Zodl Zpdm Zqdn ZrddoZsdp Ztdddddddddq	 ddrZuddddddddsdddt
	 dduZvdv Zwdw Zxdx ZyddyZzddzZ{ed{        Z|	 dd|Z}d} Z~dd~ZddZddZddZ	 d	 ddZddZddZddZd ZddZd ZddZd Z	 	 	 	 	 	 	 	 ddZ	 	 	 	 	 	 	 	 ddZd Z	 	 	 	 ddZddZd Zd Zd Zd Zd Zd ZddZd Zed        Zed        Zed        Zed        Zed        Z	 	 	 	 ddZd Zd Z xZS )r@  zB
    Generate outer wrapper in Python that calls the kernels.
    Tr   supports_cachingc                    t                    i  _        t                _        t                _        i  _        t                _	        t                _
        t                _        t                _        t                _        t                _        t                _        t                _        t                _        t                _        i  _        d _        i  _        t                _        g  _        d _        d _        d _        d _        d _        t:        j<                  j>                  rdnd _         t:        j<                  j>                  rdnd _!        d  _"        d _#        i  _$        t                _%        t                _&        d  _'         jQ                          g  _)        g  _*         jW                          tY               s j[                           j]                          t:        j<                  j^                  sBt:        j<                  j`                  jc                         D ]  \  }} je                  ||        t        tf                   _4        t        tf                   _5        i  _6         to        jp                  d        jr                         _9        tn        jt                  d
 fd       }| _;        i  _<        t                _=        t}                _?        t                _@        i  _A        t        t        j                  j                  t        j                  j                  	       _G        g  _H        y )Nr    #r1  z
std::move(r   Tc                    j                   j                  |        t        j                  j                  rj
                  j                  |        y y rO   )importsr   r!   r   r   r   )r   r  s    r^   add_import_oncez6PythonWrapperCodegen.__init__.<locals>.add_import_once  s;    LL""4(}}55**44T: 6r`   )debug_printer_leveluse_array_ref)r   r   r   r1  )Ir  r  _pending_input_assertsr   _pending_alignment_copiesr   _names_iterargs_to_buffersr1   r  headerprefixsuffixkernel_declarationswrapper_callkernel_autotune_defsr   subgraph_definitionsr   kernel_autotune_example_argskernel_autotune_tmp_arg_idxsrc_to_kernelkernel_numel_exprlinesru  declare_maybe_referencerv  commentr  r9   rP   ru  
move_beginmove_endrp  supports_intermediate_hooksuser_defined_kernel_cacheunbacked_symbol_declsrC  launcher_fn_nameset_launcher_fn_namecodegened_graph_stackcomputed_sizes_stackwrite_headerr2   write_prefix!write_kernel_autotune_defs_headerrv  constant_reprsitemswrite_constant
BufferName	allocatedfreedreusesr4  	lru_cachewrite_get_raw_streamcacher  _metas
_meta_varsr   multi_kernel_statealready_codegened_subgraphsallocated_workspacesr   r!   aot_inductor debug_intermediate_value_printerallow_stack_allocationdebug_printeradditional_files)r  r   hashedr  r  s   `   r^   r  zPythonWrapperCodegen.__init__  s   BD#:D,&*/'  	 &'$&$&$&#1#3 *,$2$4!%3%5"$2$4!6@l" IK)01( .0HR!#
')$*+''*=*=,2 ww228<)+/(  	& L 	" 9C $!!# &("$&!248..0ww ! 6 6 < < > 2f##D&12 $J/1
+-
 57$=I$7$7$=%%%
! 
	; 
	;
  /&(+5<"2"4<FL(46! 1 & 3 3 T T --DD
 !#r`   Nc                D    | r|J |J t        |||      S t               S rO   )SubgraphPythonWrapperCodegenr@  )is_subgraphsubgraph_nameparent_wrapperpartition_signaturess       r^   createzPythonWrapperCodegen.create1  s?      ,,,!---/~/C  $%%r`   c                    d| _         y )Ncall)r  r  s    r^   r  z)PythonWrapperCodegen.set_launcher_fn_name@  s
     &r`   c                D    | j                   j                  | d|        y )Nz = None  # )r  r   )r  r   r  s      r^   r  z#PythonWrapperCodegen.write_constantD  s    k&:;r`   c                   t         j                  j                  j                         }d}||j                  d|j                   }d}t        t        j                  j                        dkD  rd}n0t         j                  j                  j                  j                  rd}| j                  j                  d| dt        j                   d	| d
d       | j                   j                  dd       	 ddlm} | j                   j                  dd       t        j*                  r| j                   j-                  d       t        j.                  j0                  r| j                   j-                  d       | j                   j-                  d       | j                   j-                  d       | j                   j-                  d       | j                   j-                  d       | j                   j-                  d       | j                   j-                  d       t        j.                  j2                  xs( t4        j6                  j9                  t;               d      }| j                   j-                  d| d       d| d}d| d}t        j.                  j<                  }t        j.                  j>                  }	t        j.                  j@                  }
| j                   j-                  d       | j                   j                  d| d | d!|	 d"|
 d#	       | j                   j-                  d$| d%       | j                   j-                  d&       yy# t&        t(        f$ r Y Qw xY w)'z>Write the header section of the generated Python wrapper code.r  Nz
# AOT ID: r   zRfrom torch._inductor.codegen.debug_utils import _print_debugging_tensor_value_infozFfrom torch._inductor.runtime.debug_utils import tracked_empty_strided
z
                aH  
                from ctypes import c_void_p, c_long, c_int
                import torch
                import math
                import random
                import os
                import tempfile
                from math import inf, nan
                from cmath import nanj
                from torch._inductor.hooks import run_intermediate_hooks
                from torch._inductor.utils import maybe_profile
                from torch._inductor.codegen.memory_planning import _align as align
                from torch import device, empty_strided
                from zq import AsyncCompile
                from torch._inductor.select_algorithm import extern_kernels
                z
            Tr   a  
                aten = torch.ops.aten
                inductor_ops = torch.ops.inductor
                _quantized = torch.ops._quantized
                assert_size_stride = torch._C._dynamo.guards.assert_size_stride
                assert_alignment = torch._C._dynamo.guards.assert_alignment
                empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
                empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
                empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
                empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
                empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
                reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
                alloc_from_pool = torch.ops.inductor._alloc_from_pool
                async_compile = AsyncCompile()
            )_SymmetricMemoryzs
                empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
                zfrom torch.cuda import nvtxz import triton.profiler as protonz%import triton.profiler.language as plzCfrom triton.profiler.hooks import HookManager as _ProtonHookManagerzimport tritonzimport atexitz	import oszetriton.set_allocator(lambda size, align, stream: torch.empty(size, dtype=torch.uint8, device='cuda'))protonzos.makedirs("z", exist_ok=True)zos.path.join("z", "inductor")z", "inductor.chrome_trace")z^from torch._inductor.runtime.proton_utils import process_proton_trace as _proton_process_tracez
                def _proton_finalize_and_postprocess():
                    proton.finalize()
                    _trace_path = z
                    if os.path.exists(_trace_path):
                        _proton_process_trace(
                            _trace_path,
                            group_by_sm=z0,
                            split_invocations=z0,
                            per_cta_occupancy=z,,
                        )
                z5if not _ProtonHookManager.active_hooks: proton.start(z], backend="instrumentation", data="trace"); atexit.register(_proton_finalize_and_postprocess)pl.enable_semantic("triton"))!torch_guardsTracingContexttry_getaot_graph_namer   r!   r  r  	_inductortest_configstrack_memory_lifecycler  r   r    r   r  torch._C._distributed_c10dr  AttributeErrorImportErrorannotate_trainingr   r   proton_profilingproton_output_dirospathr   r   proton_group_by_smproton_split_invocationsproton_per_cta_occupancy)r  contextaot_config_commentinductor_debug_utilsr  
output_dirproton_name
trace_pathgroup_by_smsplit_invocationsper_cta_occupancys              r^   r  z!PythonWrapperCodegen.write_headerG  s4   --..6687#9#9#E#-g.D.D-E!F!v""CCDqH#w __##00GG#l #$ % $,,- .%& '!$ ' 	 	
* 	 ! 	 	
$	 DKK 	   ##KK!!"?@==))KK!!"DEKK!!"IJKK!!U KK!!/2KK!!/2KK!!+.KK!!G  88 BGGLL=J KK!!M*=N"OP*:,nEK)*5PQJ --::K & F F & F FKK!!p KK# $., /) *5 6//@.A B//@.A B KK!!  +} -DD
 KK!!"@A[ *	 , 		s   6#M M+*M+c                     y rO   r   )r  r  s     r^   include_extra_headerz)PythonWrapperCodegen.include_extra_header      r`   c                    | j                   j                  dt        j                   d       	 ddlm} | j                   j                  dd       y # t        t        f$ r Y y w xY w)Na	  
                import torch
                from torch._dynamo.testing import rand_strided
                from torch._dynamo.utils import preserve_rng_state
                from torch._inductor.select_algorithm import AlgorithmSelectorCache
                from aH   import AsyncCompile

                async_compile = AsyncCompile()
                generate_example_value = AlgorithmSelectorCache.generate_example_value
                empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
                empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
            r   )_cuda_getCurrentRawStreamzU
                get_raw_stream = torch._C._cuda_getCurrentRawStream
                Tr   )r  r   r    r   torch._Cr  r   r  )r  r  s     r^   r  z6PythonWrapperCodegen.write_kernel_autotune_defs_header  sw    !!((
 $,,- .	

	:%%,, 	 -  ^, 		s   #A A%$A%c                   dt         j                   d}t        j                  j                  r]| j
                  j                  |       | j
                  j                  t        j                  j                  j                  d             t        j                  j                  s`| j                  j                  |d       | j                  j                  t        j                  j                  j                  d             y y )NzU
            import triton
            import triton.language as tl
            from z+ import start_graph, end_graph
            get_raw_streamTr   )r&   r   r!   r   r   r   r   r   r9   rP   rw  import_get_raw_stream_asru  r  r  
import_strs     r^   write_triton_header_oncez-PythonWrapperCodegen.write_triton_header_once  s     $,,- .

 ==11&&--j9&&00"";;<LM ww""LL
$7LL"""";;<LM #r`   c                   t         j                  j                  j                  d      }t        j
                  j                  r6| j                  j                  |      s| j                  j                  |       t         j                  j                  s8| j                  j                  |      s| j                  j                  |       y y y )Nr  )r9   rP   rw  r  r!   r   r   r   containsr   ru  r  )r  import_get_raw_stream_strs     r^   write_get_raw_stream_headerz0PythonWrapperCodegen.write_get_raw_stream_header  s    $%GG$6$6$O$O%
! ==11--667PQ**445NOww""<<(()BC&&'@A D #r`   c                $    | j                          y rO   )r!  r  s    r^    write_get_raw_stream_header_oncez5PythonWrapperCodegen.write_get_raw_stream_header_once  s    ((*r`   c                   t        |      }|| j                  vrdt        | j                         }|| j                  |<   | j                  j	                  | d|        t
        j                  j                  r;| j                  j	                  | d|        | j                  j                  |       | j                  |   S )Nmetar   )reprr  r   r  r   r!   r   r   r   r  r   )r  r%  vars      r^   add_meta_oncez"PythonWrapperCodegen.add_meta_once  s    Dzt{{"T[[)*+C #DKKKK!!SETF"34}}55**44uCv5FG##C({{4  r`   c                z    | j                         D cg c]  }|j                  | j                         c}S c c}w rO   )get_graph_outputsr  r  r  r   s     r^   get_output_refsz$PythonWrapperCodegen.get_output_refs  s<     =A<R<R<T
78A 1 12
 	
 
s   "8c                     y rO   r   r  s    r^   mark_output_typez%PythonWrapperCodegen.mark_output_type      r`   c                6    t         j                  j                  S rO   )r9   rP   r  r  s    r^   get_graph_inputsz%PythonWrapperCodegen.get_graph_inputs  s     ww###r`   c                6    t         j                  j                  S rO   )r9   rP   r  r  s    r^   r*  z&PythonWrapperCodegen.get_graph_outputs  s    ww$$$r`   c                   | j                         j                         D ]  \  }}t        |t        j                  t
        j                  t
        j                  t
        j                  f      rO|t        j                  j                  vrlt        |j                               dk(  r| j                  |j                               }| j                  |j                               }||f| j                   |<    y r  )r1  r  rc   r   Basicr"   TorchBindObjectGeneratorStateOpaqueObjectStater9   rP   graph_input_namesr5   r<  r   r=  r  )r  r   bufrt   ru   s        r^   codegen_input_size_assertsz/PythonWrapperCodegen.codegen_input_size_asserts  s    ..0668 	?ID#KK&&%%((	  177444 S\\^,1223<<>BD44S^^5EFF15vD''-+	?r`   c                `   | j                   j                  d       | j                         j                         D ]r  \  }}t	        |t
        j                  t        j                  f      r1d| d}| j                   j                  |       d| d}| j                   j                  |       t y )Nz(# make sure graph inputs are not nan/infzassert not z.isnan().any().item()z.isinf().any().item())	r  r   r1  r  rc   r   r4  r"   r5  )r  r   r9  r   s       r^   codegen_input_nan_assertsz.PythonWrapperCodegen.codegen_input_nan_asserts/  s    HI..0668 	(ID##R-?-?@A &;<DKK!!$' &;<DKK!!$'	(r`   c                :    | j                   j                  d       y )NzV

            async_compile.wait(globals())
            del async_compile
            )r  r   r  s    r^   write_async_compile_waitz-PythonWrapperCodegen.write_async_compile_wait9  s    	
r`   c                    dj                  |      }t        |      dk(  r|dz  }| j                  j                  | d       | j                  j                  d       y )Nr  r:   ,z = argszargs.clear())r   r   r  r   )r  input_nameslhss      r^   
write_argszPythonWrapperCodegen.write_argsB  sP    ii${q 3JCWo.n-r`   c                    t         j                  r| j                  j                  d       d}|S | j                  j                  d| j                   d       d}|S )Na  
                class Runner:
                    def __init__(self, partitions):
                        self.partitions = partitions

                    def recursively_apply_fns(self, fns):
                        new_callables = []
                        for fn, c in zip(fns, self.partitions):
                            new_callables.append(fn(c))
                        self.partitions = new_callables

                    def call(self, args):
                r   z
                def z(args):
                r:   )r!   graph_partitionr  r   r  r  prefix_indents     r^   !write_launcher_fn_call_get_indentz6PythonWrapperCodegen.write_launcher_fn_call_get_indentI  sm    !!KK M  KK**+ ,
 Mr`   c                6    t         j                  j                  S rO   )r9   rP   r8  r  s    r^   get_graph_input_namesz*PythonWrapperCodegen.get_graph_input_namese  s    ww(((r`   c                   | j                   J | j                          | j                         }| j                  j	                  |      5  t
        j                  j                  rA| j                  j                  t        j                  j                  j                                t        j                  j                         }t
        j                  r| j                  j                  d| d       | j                         x}r| j!                  |       | j#                          t%               rt'        |       r| j)                          d d d        y # 1 sw Y   y xY w)Nz0training_annotation = nvtx._device_range_start(''))r  r>  rH  r  r   r!   r   debug_sync_graphr   r9   rP   rw  synchronizeget_training_phaser  rJ  rC  codegen_inputsr3   r2   "codegen_input_size_and_nan_asserts)r  rG  phaser8  s       r^   r  z!PythonWrapperCodegen.write_prefixh  s   $$000%%'>>@[[. 	:}}--%%agg&8&8&D&D&FGGG..0E''%%FugRP %)$>$>$@@ @ 12!
 -.<TB779)	: 	: 	:s   
DEEc                    t         j                  r| j                          t         j                  r| j	                          y y rO   )r!   size_assertsr:  nan_assertsr<  r  s    r^   rQ  z7PythonWrapperCodegen.codegen_input_size_and_nan_asserts  s1    ++-**, r`   c                    |D ]K  }|| j                   v s| j                   j                  |      \  }}| j                  t        |||             M y rO   )r  r$  r   r  )r  rA  r   rt   ru   s        r^   codegen_deferred_input_assertsz3PythonWrapperCodegen.codegen_deferred_input_asserts  sQ     	IDt222#::>>tDf3D$GH	Ir`   c                   t         j                  j                  ryt         j                  j                  }|syt	        t         j                  j
                        }|D ]?  }||vst         j                  j                  |   }| j                  j                  |       A | j                  r1dt         j                  _	        | j                  j                  d       yy)zPopulate pending alignment copies for non-mutated inputs.
        Called from the scheduler after mutated_input_idxs is computed.NTz3from torch._C._dynamo.guards import copy_misaligned)r9   rP   ru  inputs_to_checkr   mutated_input_idxsr8  r  r   _defers_input_alignmentr  r   )r  rY  mutated_idxsr  r   s        r^   register_alignment_check_inputsz4PythonWrapperCodegen.register_alignment_check_inputs  s     77''11 "!''"<"<=" 	9C,&ww005..2248	9 )).2AGG+LL""E *r`   c                    t         j                  j                  ry|D ]C  }|| j                  v s| j                  j	                  |       | j                  | d| d       E y)z~Emit alignment check + clone just before the first kernel
        that reads each input, hiding the cost behind GPU execution.Nz = copy_misaligned(r   )r9   rP   ru  r  discardr   )r  rA  r   s      r^   !codegen_deferred_alignment_copiesz6PythonWrapperCodegen.codegen_deferred_alignment_copies  sa     77 	DDt555..66t<$':4&BC	Dr`   c                   | j                          t        |      }t        j                  j                  r=| j
                  j                  | d| d       t        j                  j                  r|S | j                  | d| d       |S )N = get_raw_stream(r   )
r!  r+   r!   r   r   r   r   r9   rP   ru  )r  rn  r  r   s       r^   r  z)PythonWrapperCodegen.write_get_raw_stream  s}    ((*":.==11&&00&*:,a8 ww""$1*Q?@r`   c                     | j                   d   S )N)r  r  s    r^   get_codegened_graphz(PythonWrapperCodegen.get_codegened_graph  s    ))"--r`   c                :    | j                   j                  |       y rO   )r  ry   )r  rP   s     r^   rF  z)PythonWrapperCodegen.push_codegened_graph  s    ""))%0r`   c                6    | j                   j                         S rO   )r  r$  r  s    r^   rh  z(PythonWrapperCodegen.pop_codegened_graph  s    ))--//r`   c                P    ddl m} | j                  j                   ||            S )Nr   )deepcopy)copyri  r  ry   )r  rC  ri  s      r^   rB  z(PythonWrapperCodegen.push_computed_sizes  s!    !((//0HIIr`   c                6    | j                   j                         S rO   )r  r$  r  s    r^   rf  z'PythonWrapperCodegen.pop_computed_sizes  s    ((,,..r`   c                .    t        | j                         S rO   )nextr  r  s    r^   next_kernel_suffixz'PythonWrapperCodegen.next_kernel_suffix  s    t''()*r`   c                   |dkD  rd|J d}| j                   j                  |      s| j                   j                  |       | j                  t        || j                  ||             n%| j                  t        || j                               t        j                  j                  r| j                          | j                  j                  dt        j                  j                  j                  |       d       | j                  j                          t!        |       r| j#                          | j                  j                  t%        |       d| d       || _        || _        y )Nr:   zLfrom torch._dynamo.graph_bytecode_inputs import get_external_object_by_indexrs  rt  rb  r   )r  r  r   r  rp  rm  r!   r   r   r  r   r9   rP   rw  rz  rG  r2   r!  r+   _num_streams)r  rn  r  r  import_lines        r^   codegen_device_guard_enterz/PythonWrapperCodegen.codegen_device_guard_enter  sE    ?-999/  <<((5&&{3NN;55.	 NN- A A
 ==11))+&&00**77
CDAF &&002248002&&00&z233Ej\QRS -7)!,r`   c                    t        | d      r4| j                  dkD  r%| j                  t        | j                               n| j                  t	                      t
        j                  j                  r| j                  j                          y y )Nrp  r:   )
r   rp  r   r  r  r!   r   r   r   ri  r  s    r^   codegen_device_guard_exitz.PythonWrapperCodegen.codegen_device_guard_exit  si    4(T->->-BNN:4;L;LM NN79:==11&&224 2r`   c                    t         j                  j                  j                  x}t	        d| dt        |             t        |      }| j                  |       |S )a  Generate data structure for entering a CUDA Stream context.

        Args:
            stream_idx: The index number of the entering CUDA Stream context.

        Raises:
            ValueError: If this function is called while the previous stream context isn't exited.
        z!Nested stream context switching: z -> r  )r9   rP   rT   current_stream_name
ValueErrorr,   r  r   )r  r  rv  ctx_entrances       r^   codegen_cuda_stream_enterz.PythonWrapperCodegen.codegen_cuda_stream_enter  sf     $%77#4#4#H#HHU34G3H":./1  2ZH|$r`   c                6    | j                  t                      y)z:Generate data structure for exiting a CUDA Stream context.N)r   r  r  s    r^   codegen_cuda_stream_exitz-PythonWrapperCodegen.codegen_cuda_stream_exit"  s    023r`   c                   |r,t         j                  r| j                  j                  ddj	                  |      z   dz          | j                  j                  d       | j                  j                          | j                  j                  d       | j                  j                          | j                  j                  d       | j                  j                  d       | j                  j                  d       | j                  j                  d	dj	                  |      z   dz          y | j                  j                  d
       y )Nzreturn_vars = (r  , )zfor var in return_vars:z!if isinstance(var, torch.Tensor):z#assert not var.isnan().any().item()z#assert not var.isinf().any().item()r   zreturn (z	return ())r!   rU  r  r   r   rG  ri  )r  output_refss     r^   generate_returnz$PythonWrapperCodegen.generate_return&  s   !!!!++%		+(>>F !!++,EF!!++-!!++,OP!!++-!!++,QR!!++,QR!!--a0''
TYY{5K(Ke(ST''4r`   c                     y rO   r   r  results     r^   generate_before_suffixz+PythonWrapperCodegen.generate_before_suffix8  r/  r`   c                    t         j                  rNdj                  | j                        t	        | j                        dk(  rdndz   }|j                  d| d       y y )Nr  r:   r@  r  z-
                runner = Runner(partitions=[z{])
                call = runner.call
                recursively_apply_fns = runner.recursively_apply_fns
                )r!   rE  r   all_partition_namesr   r   )r  r  all_partition_name_lists      r^   generate_after_suffixz*PythonWrapperCodegen.generate_after_suffix;  se    !!&*ii0H0H&I43349r'# MM--D,E F "r`   c                     y rO   r   r  s     r^   generate_endz!PythonWrapperCodegen.generate_endI  r/  r`   c                    |j                   }|1|t        v r)t        |   j                  }| ||| j                         y | j                  t	        | |             y rO   )r  rB   pythonr   r  )r  rZ   op_namecustom_codegens       r^   generate_fallback_kernelz-PythonWrapperCodegen.generate_fallback_kernelL  sV    ))7.J#J9'BIIN)tT^^4,T489r`   c                ~    |j                   D ]  }| j                  |        | j                  t        | |             y)z7Generate .out() call with pre-allocated output buffers.N)r  codegen_allocationr   r  )r  rZ   r  s      r^    generate_extern_kernel_multi_outz5PythonWrapperCodegen.generate_extern_kernel_multi_outV  s9    55 	.H##H-	./d;<r`   c                    |j                  |        | j                  t        | |             t        |j                  t
        j                        r|j                  |        y y rO   )codegen_commentr   r  rc   rg   r"   r  r  r  s     r^   generate_extern_kernel_allocz1PythonWrapperCodegen.generate_extern_kernel_alloc\  sI    T",T489dkk299-%%d+ .r`   c           
        t        |j                  t        j                        }|j	                         }|j                         }|j                         }| j                  }t        j                  r	d|v rd| }|r5| j                  | j                   | ddj                  |       d|        y | j                  | j                   | d| ddj                  |       d|        | j                  rKt        j                  r:|7t        d   dxx   d	z  cc<   | j                  d
|j                   d| d       y y y y )Nview_as_complex.clone()r  r  r   r   inductorintermediate_hooksr:   zrun_intermediate_hooks()rc   rg   r"   
NoneLayoutrR   get_origin_noder  rv  r!   memory_planningr   ru  r   r  generate_intermediate_hooksr   r   )r  extern_kernelr  	no_returnr  origin_noder  rv  s           r^   r  z9PythonWrapperCodegen._generate_extern_kernel_alloc_helperb  s;    }33R]]C	#,,.#335#335!!&7;&F  x(FNNdll^K=$))D/9J!F8TUNN<<.SQtyy>OqQWPXY 0066+$%9:a?:-k.>.>-AK=PQR , 7 1r`   c                \    |j                  |        | j                  t        | |             y rO   )r  r   r  r  s     r^   generate_extern_kernel_outz/PythonWrapperCodegen.generate_extern_kernel_out  s&     	T"*467r`   c                    t         j                  j                  j                  }|j	                  ||d d d       |j                  d|r|n|        |5  | j                  | ddj                  |       d       d d d        y # 1 sw Y   y xY w)Nexternzout=r  r  r   )r9   rP   wrapper_coder  set_printer_argsry   r   r   )r  r  outout_viewr  r  stack_tracesdebug_printer_managers           r^   r  z7PythonWrapperCodegen._generate_extern_kernel_out_helper  s     !" 4 4 B B..tVT4Rdx8S9:;" 	;NNfXQtyy&7q9:	; 	; 	;s   'BBc                    |j                   }|j                  }|rRt        j                  j                  j                  |      }t        j                  j                  j                  |      }|j                  j                          d}dj                   fd|D              }dj                   fd|D              }t        j                   |j                        }d}| d|j                   d}| d| d| d| }	| d|	 d	}
|
S )
Nz.data_ptr()r  c              3  J   K   | ]  }t         j                  |        y wrO   r@  val_to_arg_strr   dimr  s     r^   r   zRPythonWrapperCodegen._generate_tma_descriptor_call_experimental.<locals>.<genexpr>  s     XC-<<T3GX    #c              3  J   K   | ]  }t         j                  |        y wrO   r  r  s     r^   r   zRPythonWrapperCodegen._generate_tma_descriptor_call_experimental.<locals>.<genexpr>  s$      
?B //c:
r  z$triton.tools.experimental_descriptorz.create_d_tma_descriptorr  r   )dims
block_dimsr9   rP   rX   r  optimization_hintstensorr  r   r@  r  element_sizerank)r  descapply_size_hintsr  r  ptrr  r  r   r  r  s   `          r^   *_generate_tma_descriptor_call_experimentalz?PythonWrapperCodegen._generate_tma_descriptor_call_experimental  s    yy__
77##55d;D))<<ZHJ..01=yyXSWXXYY 
FP
 

 ,::4ARARS7xx		{*:;bbB|n=QtfAr`   c                    |j                   }|r)t        j                  j                  j	                  |      }d}| d}|j
                  j                          d| }| d| d}|S )Nz/triton.tools.tensor_descriptor.TensorDescriptorz.from_tensorr  r  r   )block_shaper9   rP   rX   r  r  r  )r  r  r  r  r  r   r  r  s           r^   $_generate_tma_descriptor_call_stablez9PythonWrapperCodegen._generate_tma_descriptor_call_stable  so    &&''**==kJKBx|$++//12"[MBQtfAr`   c                    t        |t        j                        r| j                  ||      S t        |t        j                        sJ | j                  ||      S rO   )rc   r"   TMADescriptorExperimentalr  TMADescriptorStabler  )r  r  r  s      r^   _generate_tma_descriptor_callz2PythonWrapperCodegen._generate_tma_descriptor_call  sW    dB889BB&  dB$:$:;;;<<TCSTTr`   c                    | j                  |      }|j                   d| | j                   }| j                  |       y Nr   )r  r   rv  r   )r  r  r  r   s       r^   generate_tma_descriptorz,PythonWrapperCodegen.generate_tma_descriptor  s:    11$7))Cvdkk]3tr`   c                :    | j                  t        | |             y rO   )r   r  r  s     r^   generate_scatter_fallbackz.PythonWrapperCodegen.generate_scatter_fallback  s    *467r`   c	                    | ddj                  t        t        |             }	|j                  d      r|	dj                  dg|z         z  }	n|r|	dt	        |       z  }	|	dz  }	| j                  |	       y )Nr  r@  zaten.scatter_reducer  r  z	, reduce=r   )r   mapr   r   r&  r   )
r  r   r  r  r  r  r5  r   r  r   s
             r^   r  z/PythonWrapperCodegen._generate_scatter_fallback  s{     %%QsxxC0@'A&BC(()>?DIIrdVm,,D)DL>22tr`   c                `   g }|j                   dd  }t        |      }t        |j                        D ]^  \  }}|j                  |   9t	        |      }t        |t        j                        sJ |j                  |       N|j                  d        ` | j                  t        | ||             y )Nr   )r  iter	enumerateri  rm  rc   r"   r$   ry   r   rz  )r  rZ   ri  valid_indicesiter_valid_indicesr  _r%  s           r^   generate_index_put_fallbackz0PythonWrapperCodegen.generate_index_put_fallback  s    *,AB!-0dll+ 	%DAq||A*/0!%333u%t$	% 	+D$@Ar`   c                |    ddj                  |       d}||||g}| j                  | j                  ||             y )Nrl  r  rm  )r   r   wrap_kernel_call)r  r  r   ri  r  
accumulateindices_strr  s           r^   r  z1PythonWrapperCodegen._generate_index_put_fallback  sA    $))G,-Q/;
3t,,VT:;r`   c           
     `    | j                  | d| ddj                   |              d       y )Nr   r  r  r   )r   r   )r  buf_namer  get_argsop_overloadr  r  s          r^   ,generate_fallback_kernel_with_runtime_lookupzAPythonWrapperCodegen.generate_fallback_kernel_with_runtime_lookup  s2     	(3'9&:!DIIhj<Q;RRSTUr`   c                f    t        d      5  | j                  |      cd d d        S # 1 sw Y   y xY w)NzPythonWrapperCodegen.generate)r   	_generater  is_inferences     r^   generatezPythonWrapperCodegen.generate  s,    9: 	0>>,/	0 	0 	0s   '0c                &    t         j                  ryy)Nr   r:   )r!   rE  r  s    r^   get_wrapper_call_indentz,PythonWrapperCodegen.get_wrapper_call_indent  s    !!r`   c              #  b   K   | j                   }	 || _         | || _         y # || _         w xY wwrO   r   )r  newolds      r^   set_writelinez"PythonWrapperCodegen.set_writeline  s.     nn	! DNI DNSDNs   /# /	,/c                    | j                   j                  }t        j                  j                  r| j
                  j                  |       y | j                  j                  |       y rO   )r  kernel_defsr!   r   r   r  r   r  )r  r  s     r^   _write_multi_kernel_defsz-PythonWrapperCodegen._write_multi_kernel_defs  sF    --99==11%%,,[9KK{+r`   c                	   t         j                  r| j                          t        j                         5 }|j                  | j                  j                                t         j                  r| j                  |       t         j                  r| j                          | j                  |       t         j                  j                  r*t         j                  j                  s| j                          | j!                  | j                  j"                        5  | j$                  D ]I  }t'        |t(              r|j+                  | j                         /| j                  j#                  |       K 	 d d d        | j-                          | j/                         }| j1                          t         j                  j2                  rA| j                  j#                  t4        j6                  j8                  j;                                t         j                  r| j=                          t         j                  j>                  r| jA                          t         j                  j                  r*t         j                  j                  s| jC                          t         j                  j                  r| jE                          t         jF                  r+t         jH                  s| j                  j#                  d       | jK                  |       d d d        tM               }|jO                  | jP                         |j#                  d       |jO                  | jR                         t4        j6                  jT                  r>t4        j6                  jH                  r$t4        j6                  jV                  r
tM               }|jO                  | jX                         | j[                          |jO                  | j\                         | j_                         }|j                  |      5  |jO                  | j                         d d d        | ja                  |       |jO                  | jb                         | je                  |       | jg                  |       | ji                  |       |jk                         | jl                  jk                         fS # 1 sw Y   ^xY w# 1 sw Y   xY w# 1 sw Y   xY w)Nz+nvtx._device_range_end(training_annotation)r  )7r!   profile_bandwidthr  r   	ExitStackenter_contextr  r   profiler_mark_wrapper_call#generate_profiler_mark_wrapper_callgenerate_start_graphrun_wrapper_ir_passesr   store_cubinr   !generate_reset_kernel_saved_flagsr  r   r  rc   rM   rJ  r  r,  r.  rM  r9   rP   rw  rN  generate_end_graphr  generate_proton_finalize generate_save_uncompiled_kernelsgenerate_and_run_autotune_blockr  ru  r  r1   r   r  r  rv  is_const_graphr  finalize_prefixr  r  r  r  r  r  add_benchmark_harnessgetvaluewithlinemapr  )r  r  stackr   r~  r  wrapper_call_indents          r^   r  zPythonWrapperCodegen._generate  s~   ##))+!!# .	.u 1 1 8 8 :;0088?''))+&&|4}}((1W1W668 ##D$5$5$?$?@ : JJ :D!$4T%6%67))33D9:: ))+..0K!!#}}--!!++AGG,>,>,J,J,LM'''')}}----/}}((1W1W557}}55446 ''0B0B!!++A   -].	.b  !dll#dkk" 77 3 38N8N#%F 	d//0dkk""::<]]./ 	-MM$++,	- 	##F+dkk"""6*&!""6* &&($$88:
 	
}: :.	. .	.B	- 	-s2   C%SAS4FS#S*S	SS'*S3c                \   | j                   j                  d       i }t        j                  j                  r_t
        j                  j                  rEt        t
        j                  j                        D ci c]  \  }}| j                  |      | }}}| j                   j                         dz   | j                  j                         z   t        j                  t        j                  k(  rkt!        j"                  t%               dd      5 }|j'                  j)                  d             |j*                  }ddd       t        j,                  d       t/        d	d
 fd       	 t1        |       yc c}}w # 1 sw Y   ExY w# t2        $ r}t5        d|       |d}~ww xY w)z
        Compose self.kernel_autotune_defs and self.kernel_autotune_calls into a single block of
        code and execute it to trigger Triton kernel compilation and auto-tuning
        zQ
            async_compile.wait(globals())
            del async_compile
        rr  z.pyF)dirr  deletezutf-8NzAuto-tuning code written to %sartifactc                     dddS )N&inductor_autotune_at_compile_time_codestring)r   encodingr   r   r`   r^   r   zFPythonWrapperCodegen.generate_and_run_autotune_block.<locals>.<lambda>  s    @$! r`   c                      S rO   r   )tuning_codes   r^   r   zFPythonWrapperCodegen.generate_and_run_autotune_block.<locals>.<lambda>  s    { r`   )metadata_fn
payload_fnz%Failed to run autotuning code block: )r  r   r!   r   r   r9   rP   autotuning_inputsr  get_autotuning_input_namer   r   r#   levelloggingDEBUGtempfileNamedTemporaryFiler   writeencoder   debugr   exec	ExceptionRuntimeError)r  scoper  vf	file_pather  s          @r^   r  z4PythonWrapperCodegen.generate_and_run_autotune_blockh  s   
 	!!((	
 ==11agg6O6O ((A(ABC ..s3Q6E 
 %%..0((1134 	
   GMM1 ,,Ke #**734FF		#
 !!0 	 +	
	Se$?# #(  	S!FqcJKQRR	Ss*   9E<-F/F F	F+F&&F+c                \    ddl m}  ||       j                  | j                        | _        y )Nr:   )MemoryPlanner)r  r  r  r  )r  r  s     r^   memory_planz PythonWrapperCodegen.memory_plan  s     2"4(--djj9
r`   c                   | j                         }t        j                  j                  |      }| j                  rt        | j                  d   t              r| j                  d   j                  j                  |vri| j                  j                          | j                  rCt        | j                  d   t              r&| j                  d   j                  j                  |vrit               g}g }t        t        | j                              D ]  }| j                  |   }t        |t              r"|j                  |d         | j                  |<   Dt        |t              r|j                  t                      nt        |t               s|j                  |j                                 |j                  |j                                t        |      dk(  sJ t#        d |D              }y )Nrd  r   c              3  4   K   | ]  }|j                     y wrO   )r  )r   ss     r^   r   z9PythonWrapperCodegen.memory_plan_reuse.<locals>.<genexpr>  s      +
./A))+
s   )r*  r9   rP   _get_output_namesr  rc   r  rZ   r   r$  r  r  r   r  r?  ry   rd  sum)r  r  	out_namesplanning_statespast_planning_statesr  r   _total_allocated_buffer_sizes           r^   memory_plan_reusez&PythonWrapperCodegen.memory_plan_reuse  s   ((*GG--g6	 JJ4::b>+=>

2##((	9 JJNN JJ4::b>+=>

2##((	9 /01!s4::' 	CA::a=D$ 23 $		/"*= >

1D"34&&':'<=D"23$++O,?,?,AB	C 	##O$7$7$9:?#q(((
 (+ +
3G+
 (
$r`   c                    |r!t         j                  r| j                          y t         j                  rt	               | _        | j                          y rO   )r!   r  r  r1  r  r(  r)  r  s     r^   r  z*PythonWrapperCodegen.run_wrapper_ir_passes  s9    F22((%:%<"""$r`   c           	     (  	 | j                   	t        j                  	fd       }t        j                  	fd       }t        |t        j
                        rGt        |t        j                        r||v ry 	j                  | d|        |j                  |       y t        |t        j                        rt        |j                               D ]V  \  }}t        |t        j                        s!||vs&	j                  | d ||       d| d       |j                  |       X t        |j                               D ]V  \  }}t        |t        j                        s!||vs&	j                  | d ||       d| d       |j                  |       X y t        |t        j                  t        j                  t        j                   f      ry t"        j$                  j&                  j(                  ry t+        dt-        |             )Nc                <    j                  |  d|  d       |  dS )Nz_size = z.size()_sizer  r   rI  s    r^   sizeofzDPythonWrapperCodegen.codegen_input_symbol_assignment.<locals>.sizeof  s(    NNdV8D69:V5>!r`   c                <    j                  |  d|  d       |  dS )Nz
_stride = z	.stride()_strider  r.  s    r^   strideofzFPythonWrapperCodegen.codegen_input_symbol_assignment.<locals>.strideof  s)    NNdV:dV9=>V7##r`   r   rl  rm  zUnknown value type: )r  r4  r  rc   r   r	   Symbolr   r   r"   rw   r  r<  r=  r5  r6  r7  r  r  r!   rE  rp  r   )
r  r   r   
bound_varsr/  r2  r  rt   ru   rI  s
            @r^   codegen_input_symbol_assignmentz4PythonWrapperCodegen.codegen_input_symbol_assignment  s    {{		" 
	" 
	$ 
	$ eUZZ(eU\\2ez6INNeWCv./NN5!r||,&u~~'78 )	TdELL1d*6LNNdV3vd|nAcU!#DENN4()  ))9)9);< +Vfell3j8PNNfXS$0@#a#HINN6*+ B&&(9(92;O;OP
 %%55$';DK=%IJJr`   c           	        t        t        j                            }| j                         }|j	                         D cg c]$  \  }}t        |t        j                        s!||f& c}}|j	                         D cg c]$  \  }}t        |t        j                        r!||f& c}}z   }|D ]  \  }}| j                  |||        	 	 	 	 dd}|D ])  \  }	}t        |t        j                        s! |||       + yc c}}w c c}}w )z$Assign all symbolic shapes to localsc                P   t        j                  | j                         | j                         g      D ]k  }t	        |t
              rt	        |t        j                        r.|j                  D cg c]	  }||vs| }}t        |      dkD  s[t        d| d| d       y c c}w )Nr   zFor z, expected z to have been codegen-ed.)r   from_iterabler<  r=  rc   r	   r   r3  free_symbolsr   rp  )r   r4  exprsymundefined_symbolss        r^   _verify_input_symbol_assignmentzLPythonWrapperCodegen.codegen_inputs.<locals>._verify_input_symbol_assignment	  s     ++U^^-=u?O?O?Q,RS 
!$-D%,,1O $(#4#4%:8MC%! % ()A-(tfK0A/BB[\ 
%s   0	B#:B#N)r   ir.TensorBoxr4  OrderedSet[sympy.Symbol])	r   r   r3  r1  r  rc   r5  r"   rw   )
r  r4  r  kr  r  r   r   r=  r  s
             r^   rP  z#PythonWrapperCodegen.codegen_inputs  s    -/
 ,,.+113
q!z!U\\7RQF
 , 2 2 4X1Jq%,,<WaVXY " 	JKD%00ujI	J		0	&  	?HAueR\\2+E:>	?3
Xs   "D"D>"D!Dc                ~   t        |t        j                        rt        |t        j
                        r|| j                  v ry | j                  j                  |       t        j                  j                  j                  |   }t        ||      }| j                  t        | |t        j                               y y y rO   )rc   r   r3  r   r   PRECOMPUTED_SIZErC  r   r9   rP   rX   inv_precomputed_replacementsr
  r   r  )r  r;  r:  r  s       r^   ensure_size_computedz)PythonWrapperCodegen.ensure_size_computed	  s    c5<<(^CAVAV-Wd)))##C(77##@@ED!#t,CNN.tS!''BC .X(r`   c                     y rO   r   r  s    r^   r  z$PythonWrapperCodegen.finalize_prefix$	  r  r`   rY   c                   t        d      )Nz8codegen_cpp_sizevar is only implemented for cpp_wrapper!)r  r  r   rY   s      r^   codegen_cpp_sizevarz(PythonWrapperCodegen.codegen_cpp_sizevar'	  s    UVVr`   c                   t        ||      S )NrF  )pexprrH  s      r^   codegen_python_sizevarz+PythonWrapperCodegen.codegen_python_sizevar*	  s    Q**r`   c                $    | j                  |      S rO   )rL  r+  s     r^   codegen_sizevarz$PythonWrapperCodegen.codegen_sizevar-	  s    **1--r`   c                    | d| dS )Nrl  rm  r   )r  rq  r   r%  s       r^   ro  z)PythonWrapperCodegen.codegen_tuple_access0	  s    1UG1%%r`   c                    g t        | j                  |      }t        |      dk(  ryt        |      dk(  r	d|d    dS ddj                  |       dS )Nr   ()r:   r  r}  r  r   )r  rL  r   r   )r  rC  partss      r^   r   z/PythonWrapperCodegen.codegen_python_shape_tuple3	  s^    :#d1159:u:?u:?uQxj$$499U#$A&&r`   c                $    | j                  |      S rO   )r   )r  rC  s     r^   r@  z(PythonWrapperCodegen.codegen_shape_tuple;	  s    ..u55r`   c                    dj                  dj                  |t        |      t        |      | j	                  |      | j	                  |      g            g fS )Nzalloc_from_pool({})r  )r  r   rK  r   r   )r  r   rv   r   rC  ru   s         r^   codegen_alloc_from_poolz,PythonWrapperCodegen.codegen_alloc_from_pool>	  s_     %++II&MJ33E:33F;

 
 
	r`   c                ~    t        |      \  }}}	}
} fd}|j                         }|xr ||	k(  }|r||k(  xr ||k(  }|
}n[||j                  j                  k(  xr4 ||j                  j                  k(  xr ||j                  j
                  k(  }|j                  }|r|||k7  r	d| d| dS | S  |||||||      S )Nc           	         
j                  |      }
j                  |      }
j                  |      }d|  d| d| d| d	}	|||k7  r	d|	 d| dS |	S )Nzreinterpret_tensor(r  r   aten.view.dtype()r   rN  )r   tgt_size
tgt_stride
tgt_offset
cast_dtype
base_dtyper"  stoffr:  r  s             r^   apply_reinterpretzHPythonWrapperCodegen.codegen_reinterpret_view.<locals>.apply_reinterpretl	  s|     //9A00<B&&z2C(b2bTC5BD%*
*B)$r*Q??Kr`   rX  r  r   )r   rR   rg   rt   ru   rv   r   )r  rz   rt   ru   rv   r   r   d_sized_strided_offsetd_dtypecollapsibler`  r   	collapsedsame_layoutr]  s   `                r^   codegen_reinterpret_viewz-PythonWrapperCodegen.codegen_reinterpret_viewM	  s    8 ,D1 	9(G[		 }}6Fh$6	&.?Vx-?K J ((( 1dkk0001dkk000 
 J Uj%8)$r%::V tVVUJOOr`   c                8    | j                  | d| d| d       y )Nz.copy_(r  r   r  )r  r   dstnon_blockings       r^   codegen_device_copyz(PythonWrapperCodegen.codegen_device_copy	  s!    #gcU"\N!<=r`   c                    |j                         }|j                  d      }| j                  t        | |||j                               y r  )rR   
input_namer   re  ri  )r  rZ   rf  rg  s       r^   codegen_multi_outputz)PythonWrapperCodegen.codegen_multi_output	  s6    mmo??1%t[(DLLQRr`   c           
     j   |j                    d|j                   d|j                    d|j                    }|rd|j                   d| d}| j                  |j                   d|j                   d|j
                   d| d	       | j                  j                  t        |j                               y )
N +  if z
 < 0 else zmax(0, min(r  z))r   z * (r   )	r%  rt   r   unbacked_offset_symbolbase_offsetbase_dim_strider  r   r   )r  rZ   clamp	index_strs       r^   codegen_dynamic_select_indexz1PythonWrapperCodegen.codegen_dynamic_select_index	  s    zzl#dii[TZZL
4::,W	%dii[9+R@I**+3t/?/?.@DDXDXCYY]^g]hhij	
 	""&&s4+F+F'GHr`   c                     fd} fd}j                   } |j                        } |j                        } j                  | d|         j                  | d|         || d| dj                        } j                  | d| d        j
                  j                  t        j                                y )	Nc           	     `   j                  t        j                  dt        j                  | j                                    }j                  t        j                  dt        j                  | j                  z   j                                    }j                  |       }| d| d| S )Nr   rr  z >= 0 else )rN  r   MaxMinrt   )r   posnegx_condrZ   r  s       r^   clamp_indexzDPythonWrapperCodegen.codegen_dynamic_slice_size.<locals>.clamp_index	  s    &&uyyEIIa4K'LMC&&		!UYYq499}dii@AC ))!,FU$vhk#77r`   c                Z    |dk(  r| d|  S j                  |      }d| d|  d| d| S )Nr:   z - r  rq  z	 - 1) // )rN  )	start_varend_varstepstep_r  s       r^   codegen_with_stepzJPythonWrapperCodegen.codegen_dynamic_slice_size.<locals>.codegen_with_step	  sJ    qy!#i[11((.Ewis9+SyHHr`   z	_start = z_end = _start_endz
 = max(0, r   )unbacked_size_symbolstartendr   r  r  r   r   )r  rZ   r  r  r;  r  r  	with_steps   ``      r^   codegen_dynamic_slice_sizez/PythonWrapperCodegen.codegen_dynamic_slice_size	  s    	8	I ''DJJ'$((##iw/0#gcU+,%VnTlDIIN	#j156""&&s4+D+D'EFr`   c                :    | j                  t        | |             y rO   )r   r^  r  s     r^   codegen_dynamic_scalarz+PythonWrapperCodegen.codegen_dynamic_scalar	  s    (t45r`   c                   d |j                   D        \  }t        |j                        dk(  r#| j                  |j                   d| d       nkt        |j                        dk(  r@t        |j                  d   t              r#| j                  |j                   d| d       nt        |j                        dk(  rt        |j                  d   t              r| j                  |j                   d| d       | j                  d	|j                   d
|j                  d   j                   d|j                   d|j                  d   j                   d	       | j                  |j                   d|j                   d|j                  d   j                          nt        d|j                         | j                  |j                          d       y )Nc              3  <   K   | ]  }|j                           y wrO   r~  r  s     r^   r   z?PythonWrapperCodegen._codegen_dynamic_scalar.<locals>.<genexpr>	  s     >Q1&&(>r  r   r   .item()r:   z = 1 if z.item() else 0z_undivided = zassert z_undivided % z
 == 0, f'{z_undivided} not divisible by 'z_undivided // unrecognized keypath  = None)r  r   keypathr   r;  rc   r   r   divisorrp  rR   )r  rZ   rz   s      r^   r`  z,PythonWrapperCodegen._codegen_dynamic_scalar	  s   >$++>t||!NNdhhZs4&89!#
4<<?M(RNNdhhZxv^DE!#
4<<?K(PNNdhhZ}TF'BCNN$((=a1H1H0I Jxxj >t||A?V?V>WWXZ NN88*CzQ8O8O7PQ !#8!GHH 	$--/*'23r`   c           
     ^     fd}fd}fd}j                  g d       j                         5  j                  dd       t        j                  j
                  j                         D ]U  \  }}j                  d|         |||j                         |j                         |j                  |j                         W t        t        j                  j                        d	kD  r^j                  d
       t        j                  j                  j                         D ]"  \  }}j                  d|         |||       $ t        j                  j                  j                         D ]  \  }}t        |t         j"                        rCt        t        j                  j$                  j&                  j)                  |d      t*              rdt        |t,        j.                        rj                  | d       t        |t         j0                        r3 ||t        j                  j$                  j3                  |d             t        |t         j4                        r ||d       t        |t,        j6                        r# ||d|j                  j8                   d       Bt        |t,        j:                        rj                  | d       rt        j                  j$                  j=                  |j?                         d      }t        j                  j$                  j=                  |jA                         d      }	 ||||	|jC                         |jE                                 j                  ddjG                  t        j                  j                  jI                                d       ddd       j                  g d       j                         5  j                  dd       ddd       y# 1 sw Y   IxY w# 1 sw Y   yxY w)z2Write out codegen for benchmarking the output codec                    j                  |  dj                  |       dj                  |       d| d| d
       y )Nz = rand_strided(r  
, device='	', dtype=r   )r   r   )r   rC  ru   r  r   r   r  s        r^   add_fake_inputzFPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_fake_input	  sT    &(2259:"226:; <!()E7!5r`   c                2    j                  |  d|        y r  r  )r   r  r   s     r^   add_expr_inputzFPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_expr_input	  s    vS./r`   c                2   |j                  |  d       y dd l}	 j                  |  d|j                  |      d       y # t        t        |j
                  f$ r7}j                  dt        |       d|  dt        |       d       Y d }~y d }~ww xY w)	Nr  r   z = pickle.loads(r   z.raise TypeError("Failed to pickle opaque type z for variable r   z"))r   pickledumps	TypeErrorr  PicklingErrorr   r   )r   r   r  r  r   s       r^   add_torchbind_inputzKPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_torchbind_input	  s    }  D6!12  D6)9&,,u:M9PPQ!RS~v/C/CD   DT%[MQ_`d_eeghklmhngooqr s   &A B-BB)r  r  zdef get_args():zP
                from torch._dynamo.testing import rand_strided
                Tr   zglobal r   zimport pickleNr  *   r  Fztorch.cuda.default_generators[z].graphsafe_get_state()zreturn [r  rm  )r  r  z9def benchmark_compiled_module(args, times=10, repeat=10):z
                from torch._inductor.utils import print_performance
                fn = lambda: call(list(args))
                return print_performance(fn, times=times, repeat=repeat)
                )%
writelinesr   r   r9   rP   	constantsr  r   rt   ru   r  r   r   torchbind_constantsr  rc   r   r3  rX   backed_var_to_valr   r   r"   r5  r	   r  r4  r6  r%  r7  r  r<  r=  r  rW   r   r  )
r  r   r  r  r  r   r   torchbind_objrC  ru   s
   ``        r^   benchmark_compiled_modulez.PythonWrapperCodegen.benchmark_compiled_module	  s   		0	 	56]]_ F	SMM 	    !ww00668 e   74&!12%**,ekk	 177../!3  1+,77+F+F+L+L+N ='D- $$wtf%56'm<	=  !ww3399; ,eeU\\2zGG$$66::5$G8 eR%7%78$$vW%56uzz2
 #agg..@@QS@T  u{{3 #4/r'8'89"89K9K8LLcd  r';';<$$vW%56GG,,??(2 @ E WW--@@((*R A F #((*)M,\ x		!''2F2F2K2K2M(N'OqQRMF	SR 	Q	
 ]]_ 	MM
   	 	YF	S F	SX	 	s   NP:P#P #P,c                    t         j                  sy| j                  |       |j                  g d       |j	                         5  |j                  dddt                dg       ddd       y# 1 sw Y   yxY w)zL
        Append a benchmark harness to generated code for debugging
        N)r  r  zif __name__ == "__main__":zBfrom torch._inductor.wrapper_benchmark import compiled_module_mainzargs = get_args()zcompiled_module_main('zU', lambda times, repeat: benchmark_compiled_module(args, times=times, repeat=repeat)))r!   benchmark_harnessr  r  r   r/   r  r   s     r^   r  z*PythonWrapperCodegen.add_benchmark_harnessB
  s}     ''&&v.@A]]_ 
	X'01C1E0F Gm m		
	 
	 
	s   !A00A9c           
     D    | j                  t        | |||||             y r  )r   r  )r  r  r  r  r  r  s         r^   define_kernelz"PythonWrapperCodegen.define_kernelX
  s*     	 !-		
r`   c                    t         j                  j                  r)|r't        j                  dd|t        j
                        }|r| dnd}d| |  d| }|S )Nz^// z# )flagsrr  r  z

r   )r!   r   r   resub	MULTILINE)r  r  r  metadata_commentbodys        r^   _format_kernel_definitionz.PythonWrapperCodegen._format_kernel_definitionk
  sZ     ==11h vvgtXR\\JH.6hZr?B&'}C}Er`   c                .   t         j                  j                  rL|rJ| j                  |||      }| j                  j                  |       t        j                  j                  ry | j                  |||      }| j                  j                  |       y )N)r  )
r!   r   r   r  r  r   r9   rP   ru  r  )r  r  r  r  r  r  r  s          r^   r  z*PythonWrapperCodegen._define_kernel_helperw
  s     ==11c11[8 2 D %%,,T2ww""--x . 
 	4 r`   c                N    | j                   j                  |j                         y rO   )r  r   r   )r  r   subgraph_codes      r^   define_subgraph_launcher_fnz0PythonWrapperCodegen.define_subgraph_launcher_fn
  s    !!(()<)<=r`   c                    ddl m} |S )Nr:   )TritonKernel)r   r  )clsr  s     r^   _get_triton_info_kernel_clsz0PythonWrapperCodegen._get_triton_info_kernel_cls
  s     	)r`   c                8  123456 ddl m}m}	m}
 ddlm1m}m}m}m	} |j                  }g 6i 4g 3g }36fd2d0124fd	}|j                  D cg c]  }|j                   }}|j                  D cg c]  }|j                  s|j                   }}t        |      D ]  \  }}||v r || 1|      d	       |vr#|   }|    || 1|      d
       @t!        |t"        j$                        r[t!        |t"        j&                        r'd|j(                  |j*                  j-                         fnd\  }}} || |||||             t!        |t"        j.                        r/ || |||j1                         |j-                                      t!        |t"        j2                        rO || |||j4                  j1                         |j-                         |j6                  j8                               gt!        |t:        t<        j>                  f      xr* t@        jB                  jD                  jG                  |d      } || |||      |        tI        6d 3|jJ                  D cg c]  }tM        |       c}      }|tO        jP                  t@        jB                  jS                               i 4tT        jW                  |d      tY        63      gd}|rt[        |      |d<   |rt[        |      |d<   t]        |      dk(  r0|	j_                         }g ta        t<        jb                  |d         } nd15fd}!i 5|D "cg c]  }"g ta        |!|"       }}"|rt]        |      t]        |      k(  sJ g }#te        tg        ||      d d      D ]Q  \  }"}$|#ji                   ||$      g ta        tj        |"      g ta        tl        |"      g ta        tj        |"      d       S |
j                  |#g ta        tn        5jq                               d}g 5js                         } |r|D %cg c]  }%||%   	 c}%|d<   tu        |jv                        g}&t]        |      dkD  rQjq                         D ]>  }t!        |t"        j.                  t"        j2                  f      r.|&ji                  |       @ |&ji                  to        |             |&jy                  to        |             t[        |&      }&|&| jz                  v r| jz                  |&   \  }'}}(|'||(| fS | dt]        | jz                         }'t}               })t~        j                  j                  r|)j                  d|'d       n|)j                  d|d       |'|d <   | j                         }*|j                  |*j                                |)j                  |*j                                t~        j                  j                  r|)j                  d!       |)j                  d"g ta        ||      d#|d$|d%       t        ||      }+t~        j                  j                  r|+j                  d&| d'd&|' d'      }+t~        j                  r|+j                  d(d)      }+|+j                  d*d+      }+|)j                  |+       t@        jB                  jS                         },|)j                  d,|,j                   d-       t        j                  |jv                        \  }-}.t        j                  |jv                        }/d.|/ d/|. }0| j                  |'|)j                         |0       |'||f| jz                  |&<   |'||| fS c c}w c c}w c c}w c c}"w c c}%w )2Nr   )config_to_dict	FixedGridPrecomputedGridr:   )ConstexprArgKernelArgTypeSizeArg	TensorArgTMADescriptorArgc                J    j                  |       j                  |        y rO   )ry   )r  r  arg_indices	signatures     r^   add_to_signaturezPPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.add_to_signature
  s    S!s#r`   c                   |r?t               r	 | |       |j                  v r|j                     |j                  <   y y |j                  v sJ |r>t               r |  |j                               n	 | |       d|j                  <   y |r4t               r |  |j                               d |j                  <   y  | |       y )Nr   r:   )r8   r   )	r  r  is_constexprequals_1equals_noner  r  r  r   s	        r^   add_argzGPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.add_arg
  s    13 %S#.88v% +1*:Ichh' & xx6)))57
 )l.IJ(c2*+Ichh' 57 )l.IJ*.Ichh'$S#.r`   r  T)r  )r  stable)experimentalNN)r   api_typer  r   )r   bufferr   )r   r  r   rv   )r  )
size_dtyperi  argdefs)ri  )r  r  r  r   restore_valuereset_to_zeror   c                N   t        | t        j                        rdg | j                  }|s| S |j	                  t
               |D ]+  }|v rt        j                  dt                     |<   - t        |       S t        | t              sJ t        j                  |       S )N)r   _launcher_s)rc   r   r	   r9  sortr   r3  r   r7   r   r   )r:  symbolsr;  extra_launcher_argss      r^   rename_sizes_for_launcherzYPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.rename_sizes_for_launcherD  s    dEJJ/2 1 12G"#LLSL)& "55$38<<)#.A*B)CD4+C0 &d,?@@!$,,,}}T**r`   c                2    t        | d   j                        S r   r   r   s    r^   r   zHPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.<lambda>Z  s    3qt{{3C r`   r   )r!   r  cpppython_slow)	grid_typeprecomputed_gridsr  declared_constexpr_namesr  zasync_compile.triton(z, '''r  r  zG
            @triton_heuristics.user_autotune(
                configs=z ,
                inductor_meta=z,
                triton_meta=z{,
                filename=__file__,
                custom_kernel=True,
            )
            @triton.jit
            r   r  z"""z\"\"\"z'''z\'\'\'z''', device_str='rL  z# Original path: rt  )FFF)r:  r   r   r   )Rruntime.triton_heuristicsr  r  r  commonr  r  r  r  r  r   paramsr   r  numr  rc   r"   TMADescriptorr  r  r  rW   rr   rR   r%   rz   rg   rv   r   r   r   r9   rP   rX   statically_known_equalsrE   	arg_namesr;   r'   r  get_current_device_or_throwr  fromkeysrC   r   r   setup_grid_as_argsr  sympifyr   r   ry   rK  rA   r   r  r  idr   extendr  r1   r!   r   unique_user_kernel_namesr   r  updateinductor_meta_commonr   gen_common_triton_importsr  r  replaceru  r   inspectgetsourcelinesgetsourcefiler  r   )7r  r  r   r   restore_value_argsreset_to_zero_argsr   r  r  r  r  r  r  r  r  original_nameequal_to_1_argsr  pr  
constexprsr  r   r  r  r  r   r  r   triton_signaturer  r  extra_launcher_call_argsr  r   r  cfgr  	cache_keyr   cached_inductor_metar   triton_info_kernel_clsr  current_devicer  linenosrcfiler  r  r  r  r  r  r  s7      `                                             @@@@@@r^   !define_user_defined_triton_kernelz6PythonWrapperCodegen.define_user_defined_triton_kernel
  sp   	
 	

	
 	
 )+	$&	!#%'	$"	/ "	/H &,]]3QVV3	3%+]]EannaeeE
E!), 9	GHCj \s3$G& +Cc{"\s3Fc2#3#34 &c2+A+AB "3??CJJ4H4H4JK9 1Hk5
 (!$%-(3"'	  RYY/!!$#&<<>"%--/  R%7%78 !!$#&88#4#4#6"%--/#&::#4#4	  *c5==1   ''**BB  Cc!2XFs9	Gv -)/)9)9:AWQZ:	
 *&--agg.Q.Q.ST--3
 ''
, +01C+DK(+01C+DK(u:?,5,H,H,JM'FU]]E!H)E'F$+  EGINO<s4d;<OEOSZ3w<777 "#E7#)CT 
	c "(("0"5"5Ct$4"52UD!12':UD)9':	
 -55%6'PS2E2L2L2N)O'PM
 (E)<)A)A)C'D$&09!"	!9M45
 VYY-	w<!}} *!#		23E3E'FG$$S)* 	[)*]+,)$	6666:6T6T73D+3 $(	   #d&D&D"E!FG(*==11%%(=dXU&KL%%(=m=Ne&TU'+m$!%!A!A!C3HHJK5OOQR==))%%&DE83~w78; <,/ 0(O ,			
 OO

 ==11#++d=/,CtD6QR^TJ $++E;?J''{;
z*<<>!!$5n6I6I5J""MN**6995	6''		2&wiq9$$&	
 6:;4V&&y1[-1IIIy 4E@ ;j P.9s$   ^4^^?^^^c                    | d|j                    d}||d| z  }t        j                  |dd      }t        ||j                        }|dk(  }|s*| j                  t        | |t        j                               |S )Nr  numelT)
is_integeris_positiver  )	r  r   r3  r
  r  r   r  r9   rP   )r  r  treer  sym_namer;  r  is_benchmark_kernels           r^   generate_numel_exprz(PythonWrapperCodegen.generate_numel_expr  s    !]!DKK=6!F8$Hll8$G c4::.)R/"NN.tS!''BC
r`   c                j    | j                  |j                   dt        |j                                y r  )r   r  rK  r  )r  r  rP   s      r^   r  z7PythonWrapperCodegen._generate_symbolic_call_arg_helper  s)     	#))Ccnn(='>?@r`   c                   |j                         }t        | |      }|j                  t        j                  k(  r| j                  |       n1|j                  t        j                  k(  r2| j                  |       | j                  | j                  |             n|j                  t        j                  k(  r| j                  j                  |      }|rRt        |t              rt        |j                  t              sJ t        j                  |j                  |      |_        nV| j                  |       | j                  | j                  |             || j                  |<   nt        |j                        t         j"                  j$                  r| j&                  j                  t(        j+                  | ||j,                  |j.                  t0        j2                  j4                  j7                  |j8                        fd             |j                  t        j                  k7  r0| j&                  j                  t(        j                  | |             y y y )N)r:   )rC  ru   )rR   r  	zero_moder@   UNINITIALIZEDr   ZERO_ON_CALLmake_zero_bufferZERO_PER_GRAPHr  r   rc   rZ   r?   maximumrp  r!   r   r   r   r@  make_allocationr  r   r9   rP   rX   r  r   )r  wsr   r   priors        r^   generate_workspace_allocationz2PythonWrapperCodegen.generate_workspace_allocation  s   {{}D"%<<,:::NN4 \\.;;;NN4 NN40067\\.===--11$7E!%6:JJ<   *11%**bA
t$t44T:;26))$/ ..==11&&00$44IIHH77++==bhhGI 5 	 ||0>>>**44(99$E ? 2r`   c                v    |j                   t        j                  k7  r| j                  t	        | |             y y rO   )r  r@   r  r   r0  )r  r  s     r^   generate_workspace_deallocationz4PythonWrapperCodegen.generate_workspace_deallocation  s.    <<,;;;NN.tR89 <r`   c                $    | d| j                    S )Nz.zero_())rv  )r  r   s     r^   r  z%PythonWrapperCodegen.make_zero_buffer  s    x}--r`   c                H    | ddj                  |       d| j                   S )Nr  r  r   )r   rv  )r  r   r  s      r^   r  z%PythonWrapperCodegen.wrap_kernel_call  s'    q9-.a}==r`   c                    | j                   j                  d       | j                   j                  dt        j                  j                   d       |j                  | j                   j                                y )Nz*from torch.profiler import record_functionzwith record_function('graph_z_inductor_wrapper_call'):)r  r   r9   rP   graph_idr  r   )r  r  s     r^   r  z8PythonWrapperCodegen.generate_profiler_mark_wrapper_call  sb    ##$PQ##*177+;+;*<<UV	
 	D--4467r`   c                :    | j                   j                  d       y )Nzstart_graph())r  r   r  s    r^   r  z)PythonWrapperCodegen.generate_start_graph  s    ##O4r`   c                ^    | j                   j                  dt        j                  d       y )Nz
end_graph(r   )r  r   r!   profile_bandwidth_outputr  s    r^   r  z'PythonWrapperCodegen.generate_end_graph  s'    ##j1P1P0SST$UVr`   c                    | j                   j                  t        j                  j                  j                                y)z<Synchronize GPU to ensure proton captures all kernel events.N)r  r   r9   rP   rw  rN  r  s    r^   r  z-PythonWrapperCodegen.generate_proton_finalize  s)    ##AGG$6$6$B$B$DEr`   c                ^    | j                   j                  dt        j                   d       y )NU
            for kernel in globals().values():
                if isinstance(kernel, zU.CachingAutotuner):
                    kernel.cuda_kernel_saved = False
            r  r   r&   r   r  s    r^   r  z6PythonWrapperCodegen.generate_reset_kernel_saved_flags  s2      ''8'A'A&B C	
r`   c                ^    | j                   j                  dt        j                   d       y)a[  
        Precompile and save the CUBINs of the Triton kernels that haven't
        been precompiled and saved as a side effect of running the generated
        JIT model (Python wrapper). This can happen when the model contains
        control flow: only one pass through the control flow operators covers
        the kernels that are saved, the remaining kernels are not launched,
        hence not saved. The main purpose of this codegen is to compile and
        save the Triton kernels outside the active control flow path for
        subsequent AOTInductor code generation and compilation.
        r,  a  .CachingAutotuner):
                    if not kernel.cuda_kernel_saved:
                        if len(kernel.launchers) == 0:
                            kernel.precompile()
                        kernel.save_gpu_kernel(
                            stream="stream",  # use dummy stream
                            launcher=kernel.launchers[0],
                        )
            Nr-  r  s    r^   r  z5PythonWrapperCodegen.generate_save_uncompiled_kernels"  s4     	  ''8'A'A&B C
	
r`   c                >    d }|D cg c]
  } ||       c}S c c}w )Nc                    t        | t              rt        |       r| dz   S | S t        | t        t        t
        t        f      rt        |       S t        t        j                  j                  j                  |             S )Nr  )rc   r   rD   r   floatr   r
  rK  r9   rP   rX   rY   )r  s    r^   wrap_argzAPythonWrapperCodegen.prepare_triton_kernel_call.<locals>.wrap_arg<  s^    #s#*B3*GsYPSPC#udO!DE3xQWW--66s;<<r`   r   )r  r  r2  r  s       r^   prepare_triton_kernel_callz/PythonWrapperCodegen.prepare_triton_kernel_call;  s!    	= *33#333s   c                p    t        |t              rKt        |t        j                        r.|j	                         j                         } j                  |   }n\ j                  j                  |      r|} j                  |   }n/|J d       d j                   }|} xj                  dz  c_        |
J d|        t        j                  j                  j                  |j                               }t        j                  j                  j                  t        j                  j                  |            }t        j                  j                  j                  |j                               }|j!                         }	|j#                         }
t        j                  j                  j%                  |j'                         j(                        }d| d| d|	 d|
 d| d| d	} j*                  j-                  | d
|        t        |t        j                        r5 j/                  |d      }|} j*                  j-                  | d
|        |S t1        |t2        j4                        st        |t6              rt        |t8              r| j:                  v r|S |y|}t        |t6              r|j<                  }|t        j                  j                  j>                  v r't        j                  j                  j>                  |   }t9        t        j                  j                  j%                  |            S t        |t8        t@        tB        tD        f      rt9        |      S t        |tF              rddjI                   fd|D               dS tK        dtM        |             )NzBV.graph.get_buffer(arg) and raw_arg can't be None at the same timetmp_arg_r:   z Failed to find a buffer for arg zgenerate_example_value(r  z, 'z', r   r   T)r  r  r1  rl  c              3  T   K   | ]  }j                  |t        |             ! y wrO   r   )r   ar  s     r^   r   zBPythonWrapperCodegen.generate_example_arg_value.<locals>.<genexpr>  s#      ZQR!@!@DG!L Zr   rm  zUnsupported type )'rc   torch_dtyper"   r  
get_tensorrR   r  r   r  r9   rP   rX   r  r<  get_allocation_sizer=  r  rW   r  rs   rv   r   r   r  rn  r   r4  r
  r   r  r  rC  r   r1  r   r  r   r7  r   )r  r  arg_typeraw_argr  r9  rt   allocation_sizeru   r  r   rv   r   s   `            r^   r   z/PythonWrapperCodegen.generate_example_arg_valueG  s   h,'2#3#34"--/88:**3/%%))#.**3/* X* &d&F&F%GH00A50?L&Fse$LL?77##66s||~FDgg..AA++C0O WW%%889IJF^^%FMMOEWW%%778H8O8OPF-dV2fXSE7RTU[T\\^_n^oopqE&&00H:S1HI'2#3#34 :: %) ;  **44zUG5LMO%++.*S/2R#s#$//)J?!#/nnagg&&CCCgg&&CCCHqww''99#>??c3t45s8OT"tyy ZVY ZZ[[\]]%(9$s)&EFFr`   c                z     t        |t              r ddj                   fd|D              z   dz   S t        |      S )Nrl  r  c              3  @   K   | ]  }j                  |        y wrO   )_grid_dim_str)r   r   r  s     r^   r   z5PythonWrapperCodegen._grid_dim_str.<locals>.<genexpr>  s     RT 2 24 8Rs   rm  )rc   r  r   rK  )r  grid_per_dims   ` r^   r@  z"PythonWrapperCodegen._grid_dim_str  s<    lD)diiR\RRRUXX &&r`   )r  r   r  r  r  r  r  r   c                  | j                   j                  |D ci c]2  }t        |t              r |t        j
                  j                  |      4 c}       |xs t        j
                  j                         }t        j
                  j                  j                  }| j                  t        | ||||||||	|t        j
                  j                  |
|             yc c}w )z
        Generates kernel call code.

        triton: Defines whether the backend uses Triton for codegen. Otherwise it uses the CUDA language when gpu=True,
                and C++ when gpu=False.
        )r  r  r  r  r  r   r  r  r  r  r   r  N)r  r  rc   r   r9   rP   try_get_bufferr  rT   r  r   r  r   )r  r  r  r  r   r  r  r  r  r  r   r  r  s                r^   generate_kernel_callz)PythonWrapperCodegen.generate_kernel_call  s    . 	## %c3' QWW++C00	
 @177>>@WW..AA'#!!#'+77<<%9#5%	
s   7Cr  )
r  r   r  r  r  r  r  r  r   r  c       
   
     
    |xs t         j                  j                         }|s|j                  dvr{|j                  dk(  r" j	                   j                  |             y |j                  dk(  r" j	                   j                  |             y t        d|j                   d       j                  |      }dj                  |      }|=|t        k7  r4 j                          d} j	                  | d|j                   d	       n!t        j                   |j                  |
      }|s$d
| d	} j	                   d d| d| d	       y  j                          t        j                   j"                  rn j$                  vr_|t'        |      t'        |      k(  sJ d       d |rDt         j                  j(                  r*t         j                  j(                  j+                  |d       d fd} fd}g }|(|J d       d gt'        |      z  }d gt'        |      z  }nt'        |      t'        |      k(  sJ d       i }t-        t/        ||||            D ]p  \  }\  }}}}d }t1        |t2              r!dt3        |      v r|j5                  d      \  }}d }r|v r j7                  |         }|rB|}t1        |t8              st;        |t<        j>                        st1        |t@              r|||<   n|dk(  r |||||      r||   }nt1        |t8              r_tC        jD                  d|      r|}n4| jF                  vr jI                  |||      }n jF                  |   d   }|f jF                  |<   n jI                  |||      }t1        |t2              rtK        |      r|dz  }|jM                  ||n| d|        s  jN                  j	                  dt         j                  jP                  jS                  |j                         d        jN                  jU                           jN                  j	                   ddj                  |       d| d	        jN                  jW                           jN                  j	                  tY        d|d              j$                  j[                         t         j                  j\                  ry t         j                  j^                  j`                  }|jc                  ||d        |5   j	                   d| d| d	       d d d         j                          y # 1 sw Y   xY w)N)cudaxpur/  mpszdevice z nyir  
raw_streamrb  r   z	c_void_p(r   r  z$call_args and arg_types do not matchc                     j                   j                         D  cg c]  \  } }|k(  r|  }} }|rddj                  |       dS yc c}} w )a  After all the autotune kernel calls have been written (i.e.
                self.kernel_autotune_example_args is complete), returns a deletion call
                for all autotune example tensors that are unnecessary after kernel_name
                is called.del r  rr  r  )r  r  r   )r  kntensors_to_deleter  r  s      r^   get_autotune_deletion_callzUPythonWrapperCodegen._generate_kernel_call_helper.<locals>.get_autotune_deletion_call  se     '+&G&G&N&N&P%"[( %! %
 %!$)),=">!?rBB%s   Ac                j   ||   }||v ryt        t        | |            D ]  \  }\  }}||k(  st        |t              sd}r|v rj	                  |         }|dk(  rA	 |j                         }	t        |	j                        D ]  \  }
}||k(  s| d|
 d||<     y  y# t        $ r Y w xY w)zWe try to infer raw_arg (i.e. raw_args[idx]) from remaining raw_args.
                This is particularly useful for jagged cases, where the dimension is often
                being passed in as an input.Tr  z.shape[rm  F)r  r   rc   r$   r  rs   rt   r7  )r  r  r  reused_args
target_argr  raw_keyr<  triton_inputrg   r  r"  autotune_argsr  s               r^   infer_arg_by_inputszNPythonWrapperCodegen._generate_kernel_call_helper.<locals>.infer_arg_by_inputs  s    
 &c]
,-6s8X7N-O !)A)Cxz'6'B #%L$M)A'+'E'E)'2( $r) 	!!(!3!3!5&/&< ,FC J=IN'RUQVVW:XJ 7'+,!, 	 / ! !!s   #0B&B&"B&&	B21B2zkeys are not None but args arez#call_args and raw_args do not matchr  r  z^(workspace|semaphore)r   r  rs  rt  z.run(z	, stream=z
<del_call>r  )2r9   rP   r  r   r   r  r  r3  r   r)   r!  r%  r@  r  r  r!   r   r   r   r   autotuning_mappingr   r  r   rc   r   splitr  r8  rn  r   r4  r
  r  matchr  r   rD   ry   r   rw  rz  rG  ri  r.   r   ru  r  r  r  )r  r  r  r  r   r  r  r  r  r  r  r   r  call_args_strstream_name
stream_ptrrN  rU  all_argsrP  r  r  r;  rR  r<  r   rS  arg_strr  rT  s   ``                           @r^   r  z1PythonWrapperCodegen._generate_kernel_call_helper  s>     @177>>@&++_<{{e#t44[)LM  %t44[)LM  #WV[[M#>?? 77	B		-0).@DV.V ,,.&KNNk]*<V\\N!LM.CCfllJK $[M3JNN-qQ}oR
|1M %%' MM224#=#== (S^s9~-M 6M !M#(B(B ! : : > >($!B H'I)II' 6C	N2 6C	N28}I6 96 K8AIy(H=9 -P44C7G c3'C3s8O"yy~HC+/ W%=#'#A#A%g.$L  *G%h<"8U[[9%c?;/6G,]':h;(
 *'2G+6 xx 93?"%D$E$EE"&"A"A7# #'"C"CC"H"K>E{=SD55c:"==c8WUGc3',DS,Iy(G3;se1WI<NO[-P` &&00**77EFaH &&002&&00-uTYYx%8$9;-qQ &&224&&00 /I<X &&**;7ww"" !" 4 4 B B..y+yRVW" 	XNNk]%i}TUVW	X%%'	X 	Xs   ,U  U)c                :    | j                   j                  |       y rO   )r  ry   r  r   s     r^   r   zPythonWrapperCodegen.writeline  s    

$r`   c                4    |D ]  }| j                  |        y rO   r  )r  r  r   s      r^   r  zPythonWrapperCodegen.writelines  s     	!DNN4 	!r`   c                L    | j                   j                  t        |             y rO   )r  ry   r4   )r  ctxs     r^   r  z"PythonWrapperCodegen.enter_context  s    

+c*+r`   c                   	 ddl m}  |       rdd l}t        |t              rt        |j                  j                        S t        |t        j                        rt        |      S t        |t        t        f      rAt        j                   G d d             	t         t        |      	 fd|D                    S t        |t         j"                  j$                        rt'        |      S t        |t(        j*                  t(        j,                  t.        f      r|j1                         S  |       r+t        |j2                  j4                        rt        |      S t        |t(        j6                  t(        j8                  f      r|j1                         S t;        t        |            rEt=        |      \  }}|j?                         D ]"  \  }}|t@        jB                  jD                  |<   $ |S t        |      S )Nr   )has_triton_packagec                      e Zd ZU ded<   d Zy)1PythonWrapperCodegen.val_to_arg_str.<locals>.Shimr   refc                    | j                   S rO   )rg  r  s    r^   __repr__z:PythonWrapperCodegen.val_to_arg_str.<locals>.Shim.__repr__  s    88Or`   N)r   r   r  r   ri  r   r`   r^   Shimrf    s    $r`   rj  c              3  V   K   | ]   } t         j                  |             " y wrO   r  )r   r7  rj  r  s     r^   r   z6PythonWrapperCodegen.val_to_arg_str.<locals>.<genexpr>  s$     Vq1@@qIJVs   &))#torch.utils._tritonrd  r   rc   r   rK  rZ   r:  r   r	   r   r  r  	dataclassr&  r   r  _ops
OpOverloadr   r"   rr   
MutableBoxr%   r  languager   r6  r7  r   r   r  r9   rP   opaque_value_type_classes)
r  r"  type_rd  r   obj_repropaque_typesnr  rj  s
   `        @r^   r  z#PythonWrapperCodegen.val_to_arg_str  s   :a"%%5::&8OE4=)""$ $ #$ QVTUVV  5::001&q))BIIr}}oFG&&((!jFOO4I4I&J7NB--r/C/CDE&&((!$q'*%8%;"Hl$**, 917811!49O7Nr`   c           	     `   |j                         }|j                         }t        |j                               }t        t        j
                  j                  |            }t        |j                               }|j                         }| j                  |j                         ||||||      S rO   )r  rW   r   r<  r9   rP   r:  r=  get_is_pinnedr  rR   )r  r  r  r   rC  allocation_shaperu   	is_pinneds           r^   r:  z+PythonWrapperCodegen.make_buffer_allocation  s    ""$  "foo'( !<!<V!DEv((*+((*	##OOvueV=My
 	
r`   c                x    d}t         j                  j                  s| j                  j	                  |d       y y )Nzi
            from torch._inductor.runtime.debug_utils import check_memory_step, track_tensor
            Tr   )r9   rP   ru  r  r   r  s     r^   "write_memory_track_allocation_oncez7PythonWrapperCodegen.write_memory_track_allocation_once  s4    
 ww""LL
$7 #r`   c                   ||}| j                  |      }| j                  |      }	| j                  |      }
t        j                  j                  j                  j
                  r| d|	 d|
 d| d|j                   d| d}ne|j                  dk(  r|r| d|	 d|
 d| d	}nE|j                  d
v r| d|j                   d|	 d|
 d| d	
}n| d|	 d|
 d|j                   d| d	
}||	k7  r|d| d|
 d	z   }|S )Nz = tracked_empty_strided(r  z, dtype=r  z	', name='rL  r/  z = empty_strided_cpu_pinned(r   )r/  rF  rG  mtiaz = empty_strided_r  z = empty_strided(r  z.as_strided()r   r  r  r!   r  r  r   )r  r   r  r   rC  ru   ry  rz  r@  codegen_allocation_shape_tuplecodegen_stride_tupler  s               r^   r  z$PythonWrapperCodegen.make_allocation  s    #$"==eD)-)H)H*
&  $>>vF??!!..EE&112"'( )  !;;- (b"  [[E!i&412"'('  [[:: &)&++a12"'('  &)12"'( )!;;-yq:  "@@,':&;2>R=SSTUUC
r`   c                8    | j                  t        |             y rO   )r   rY  r_  s     r^   make_commentz!PythonWrapperCodegen.make_comment  s    {4()r`   c           	     `    | j                    | d| | j                   d| j                   d| 	S )Nr      )ru  rv  r  )r  new_nameold_namer  s       r^   make_tensor_aliasz&PythonWrapperCodegen.make_tensor_alias  s6    ,,zXJt{{m2dll^STU\T]^^r`   c                (    d|j                          S )NrK  )rR   )r  r  s     r^   r  z%PythonWrapperCodegen.make_buffer_free  s    foo'())r`   c                8    ddj                  d |D               S )NrK  r  c              3      K   | ]  }|  y wrO   r   )r   r   s     r^   r   z:PythonWrapperCodegen.make_free_by_names.<locals>.<genexpr>  s     >>s   )r   )r  names_to_dels     r^   make_free_by_namesz'PythonWrapperCodegen.make_free_by_names  s    dii>>>?@@r`   c           	     `    | j                    | d| | | j                   d| j                   d	S )Nr   r   reuse)r  rv  r  )r  r  r  del_lines       r^   codegen_exact_buffer_reusez/PythonWrapperCodegen.codegen_exact_buffer_reuse
  s@    ../zXJxjQUQ\Q\P]]_`d`l`l_mmsttr`   c                P    |$| j                  | j                   d| d|        y y )Nz [Provenance debug handles] rt  )r   r  )r  r  debug_handles      r^   write_provenance_debug_handlez2PythonWrapperCodegen.write_provenance_debug_handle  s4    
 #NN<<. <[M<.Y $r`   c                r   |j                         |j                         k(  sJ |j                         }|j                         }d}|t        j                  j	                         vr|rd| j                  |       }|j                         |j                         k(  r4|j                         |j                         k(  r| j                  |||      S | j                  ||j                         |j                         d| j                  j                        }| j                   | d| | d| j                   dS )N;z; r   r   r  r  )rW   rR   r9   rP   r  r  r<  r=  r  rh  r  r   ru  r  )r  r  r  r[  r  r  r  reinterpret_views           r^   r^  z&PythonWrapperCodegen.make_buffer_reuse  s   }}#--/111<<><<>1773355*D11#678H<<>S\\^+0@CNNDT0T228XxPP88!11d6G6G6Q6Q
 ,,z-=,>xj4<<.X^__r`   c                    | j                  t        || j                   | d|j                          | j                   d| j
                   d             y )Nr   r  z alias)r   r=   ru  r  rv  r  )r  r   rV  s      r^   rW  z0PythonWrapperCodegen.codegen_deferred_allocation'  sS    <<.c$*@*@*B)CDKK=PRSWS_S_R``fg	
r`   c                t   |j                         }|t        j                  j                  v sG|| j                  v s9t        |t        j                  t        j                  t        j                  f      ry | j                  j                  |       t        |j                         t        j                  t        j                  f      r|j                         sy |j                         }t        |t        j                         ry t        |t        j"                        ry t        |t        j$                        r+t        |j&                  t        j(                        s*J dt+        |j&                         d|j&                          |j&                  j,                  }t        |t        j.                        sJ t+        |             |j,                  }t        |t        j0                  t        j(                  f      sJ t+        |             t        |t        j(                        rdfd |      }| j3                  |       | j5                  t7        | |||             y t        |t        j8                        r| j5                  t;        | |d             y | j5                  t;        | |             y )Nzunexpected r   c                   t        | t        j                        r | j                               S t        | t        j                        r | j
                        S t        | t        j                        sJ t        |              | S rO   )rc   r"   BaseViewunwrap_viewrp  rz   rr   r   )targetunwrap_viewss    r^   r  z=PythonWrapperCodegen.codegen_allocation.<locals>.unwrap_viewsS  sd    !&"++6+F,>,>,@AA!&"--8+FKK88%fbii8F$v,F8!Mr`   Tr.  )r   	ir.Buffer)rR   r9   rP   r  r  rc   r"   DonatedBufferSubgraphBufferInputBufferr   get_defining_opExternKernelAllocMultiOutputshould_allocaterb   MutationLayoutSHOULDREMOVEr  rU  rV  r%   r   rz   rx   rr   r  r   rQ  rd   r  )r  r  r   rg   boxinput_bufferr  s         @r^   r  z'PythonWrapperCodegen.codegen_allocation/  s     AGG+++t~~%&2#3#3R5F5F"WX4 &&(%%r~~6 **,'')fb;;<fbmm,fb001fkk2+=+=> d6;;/06;;-@> ++""Cc2==1<49<188LlRYY8J8J,KL dO L ,(:(:;"  ,L9##L1NN?4vvNOfb112NN<f$GH|D&12r`   c                   |j                         }t        |t        j                  t        j                  f      r| j                  t        | |             y t        |j                         t        j                        r| j                  t        | |d             y | j                  |      sy | j                  j                  |       | j                  t        | |             y )NTr.  )rR   rc   r"   r  r5  r   r  rb   rd   r0  	can_reuser  r   )r  r  r   s      r^   codegen_freez!PythonWrapperCodegen.codegen_freef  s      fr~~r/A/ABCNN8D&12f,,.0C0CD NN.tVNO~~f%

t*489r`   c                   |j                         }|t        j                  j                  v xs |t        j                  j                  v xr6 t        t        j                  j                  |   t        j                         xsh |t        j                  j                  v xsJ |t        j                  j                  v xs, |t        j                  j                  v xs || j                  v  S rO   )rR   r9   rP   r  r  rc   graph_inputs_originalr"   r  r  r  never_reuse_buffersr  )r  r  output_bufferr   s       r^   r  zPythonWrapperCodegen.can_reusez  s    $$&AGG+++ 
",,, "GG11$79I9I 
" qww(((
" qww222
" qww222
" tzz!
 	
r`   c                    |j                         | j                  v xr. | j                  |j                            |j                         k(  S rO   )rR   r  )r  r  reused_buffers      r^   	did_reusezPythonWrapperCodegen.did_reuse  sC     OO, KFOO-.-2H2H2JJ	
r`   c                t   t        ||      sJ | j                  |       | j                  j                  |j	                                | j
                  j                  |j	                                |j	                         | j                  |j	                         <   | j                  t        | ||             y rO   )	rp   r  r  r   rR   r  r  r   r2  )r  r  r  s      r^   codegen_inplace_reusez*PythonWrapperCodegen.codegen_inplace_reuse  s    $\=AAA-

|,,./=11340<0E0E0GM**,-y|]CDr`   c                    t        |      }|| j                  v r|S | j                  j                  |       | j                  |z   S rO   )r   r  r   ru  )r  r   r   s      r^   codegen_unbacked_symbol_declz1PythonWrapperCodegen.codegen_unbacked_symbol_decl  sC    6{4---K &&**40<<$&&r`   c                    t        t        j                  j                  j                  |      }| j                  t        | |||             y rO   )r   r9   rP   rX   	shape_envr   r  )r  r  r  r  s       r^   (codegen_unbacked_symbol_defs_for_outputsz=PythonWrapperCodegen.codegen_unbacked_symbol_defs_for_outputs  sB     6GG&&(9
 	"4g?PQ	
r`   c                    |sy |j                         D ]I  \  }dfdfd}| j                  | j                  |       d |        | j                          K y )Nc                   |dk(  r| S t        |      dk\  r_t        |d   t              rLt        |d   t        j                        r/ |  d|d   j
                   d|d   j                   d|dd        S t        |d   t              r |  d|d   j
                   d|dd        S t        |d   t        j                        rYt        j                  j                  r  d	|d   j                   d
|  d|dd        S  |  d|d   j                   d|dd        S t        |d   t              r |  d|d   j                   d|dd        S t        d|       )Nr   r   r   r:   r   r  r   rQ  z	std::get<z>(rl  rm  z.__floordiv__(r  )r   rc   r   pytreeSequenceKeyr   r  r9   rP   ru  r   r  rp  )r:  r  gos     r^   r  zJPythonWrapperCodegen._codegen_unbacked_symbol_defs_for_outputs.<locals>.go  s   b=K LA%"71:}="71:v/A/AB&'!*//!2!GAJNN3C1Ewqr{   
M:a
'8;WQR[II
F,>,>? 77.. Ywqz~~&6ba@'!"+N  4&'!*..)9 ;WQR[I
  
K8 nWQZ5G5G4HJGTUTVKXX(+@	)JKKr`   c                    t         j                  j                  rt              dk(  rZd   }  d   j	                         t        | t        j                        r!t        | j                        dk7  r	dd        S       S t        d   t        j                        sJ  d   j                     j	                         dd        S        S )Nr:   r   )r9   rP   ru  r   rR   rc   r"   r  ri  r  r  r  )r  r  r  r  r  s    r^   go_outerzPPythonWrapperCodegen._codegen_unbacked_symbol_defs_for_outputs.<locals>.go_outer  s    77&&
 7|q(%aj  "#AJ//1)#r~~>3s{{CSWXCX $ABK   ")	    *'!*f6H6HIII!''!*.."9"B"B"DgabkRRk733r`   r   )r:  r   r  zpytree.KeyPath)r  r   r  rv  )r  r  r  r  r"  r  r  r  s    ``   @@r^   r  z>PythonWrapperCodegen._codegen_unbacked_symbol_defs_for_outputs  sg     ! ,113 <	JAw
L<4. NN44Q78HJ<}Uu<	r`   c                     fd} fd}	  j                  j                          j                   j                   dj                           |        t
        j                  }t        j                  j                        5  j                  j                  |       d d d         |         j                          y # 1 sw Y   !xY w#  j                          w xY w)Nc                    t        j                  j                        t              k(  sJ t        j                  j                        D ]3  \  } }j	                  j
                   |  d| j                          5 y r  )r   rP   r  r   r   ru  rv  )inner_inputouter_inputouter_inputsr  subgraphs     r^   _codegen_subgraph_prefixzSPythonWrapperCodegen.codegen_subgraph_by_inlining.<locals>._codegen_subgraph_prefix
  sy    x~~223s<7HHHH,/++\- ([ ||n[M[M$++Or`   c                    t        j                  j                        t              k(  sJ t        j                  j                        D ]5  \  } }j	                  | d| j                          j                          7 y r  )r   rP   r  r   r   r  rv  )inner_outputouter_outputouter_outputsr  r  s     r^   _codegen_subgraph_suffixzSPythonWrapperCodegen.codegen_subgraph_by_inlining.<locals>._codegen_subgraph_suffix  s{    x~~334M8JJJJ.1,,m/ *l #nC(F(F(H'I$++Wr`    subgraph: )parent_graph)	rF  rP   r   r  r   r9   set_graph_handlercodegen_subgraphrh  )r  r  r  r  r  r  r  s   ````   r^   codegen_subgraph_by_inliningz1PythonWrapperCodegen.codegen_subgraph_by_inlining  s    			'%%hnn5NNdll^;x}}oFG$&77L$$X^^4 //!- 0  %&$$&  $$&s$   A;C C,C CC C*c           	        |j                   }|j                  }t        |j                               |j                  D cg c]  }|j
                   c}z   }dj                  |      t        |      dk(  rdndz   }|D cg c]  }|j                          }	}dj                  |	      t        |      dk(  rdndz   }
| j                  d| d| d       |j                         D cg c]
  \  }}|s	| }}}|r#| j                  ddj                  |              | j                  d	|
 d
| d| d       | j                  d| d       yc c}w c c}w c c}}w )z'Generate code to call a graph partitionr  r:   r@  r  	partition	_args = [rm  rK  r  z) = self.partitions[z](partition_args)zdel partition_argsN)input_deallocationoutput_nodesr  r  symbol_inputsr   r   r   rR   r   r  )r  partition_idr  r  r  symbol_inputrA  r  rZ   output_namesr  r   
deallocater  s                 r^   codegen_partition_callz+PythonWrapperCodegen.codegen_partition_call)  sj    2DD+88-22452F2T2T9
".L9
 
 ;'#k2Ba2G3RP4@ADAA))L)C4E4JSPRS 	<.	&CD *<)A)A)C
%T:zD
 
 NNT$))L"9!:;< 	y,\N+l^SYZ	
 	|nE:;-9
 B
s   E?E'
E2Ec                P    t        |      D cg c]  }d| 	 c}| _        y c c}w )N
partition_)r  r  )r  num_partitionsr  s      r^   set_all_partition_namesz,PythonWrapperCodegen.set_all_partition_namesJ  s$    BGBW#X3j$6#X #Xs   #c           	     p   dj                  |      t        |      dk(  rdndz   }dj                  |      t        |      dk(  rdndz   }| j                  |j                  j                   d| d       | j                  d| d|j                  j                   d|j                  j                   d	       y )
Nr  r:   r@  r  r  rm  r  z) = r  )r   r   r   rP   r   )r  r  r  outer_flattened_outputsouter_output_namesouter_input_namess         r^   ,codegen_subgraph_call_with_flattened_outputszAPythonWrapperCodegen.codegen_subgraph_call_with_flattened_outputsM  s     "YY'>?./14C"
 !IIl3|$)Cr
 	(..--.i8I7J!LM 	"#4(;(;'<Ahnn>Q>Q=RRXY	
r`   c                v   dj                  |      t        |      dk(  rdndz   }| j                  |j                  j                   d| d       t
        j                  j                  j                          | j                  | d|j                  j                   d|j                  j                   d	       y )
Nr  r:   r@  r  r  rm  r   r  r  )r   r   r   rP   r   r9   rT   free_buffers)r  r  r  outer_buffer_namer  s        r^   codegen_subgraph_callz*PythonWrapperCodegen.codegen_subgraph_call_  s     IIl3|$)Cr
 	(..--.i8I7J!LM 	
&&( 	 !X^^%8%8$98>>;N;N:OvV	
r`   c                    | j                  |j                         | j                  d       | j                  | j                   d|j                          t
        j                  }|j                  |j                  _        |j                  |j                  _        |j                  j                  | j                  vrt        j                  |j                        5  t        j                  dd      5  |j                  j                         \  }}d d d        d d d        |j                  j                  }| j                  j                  |       | j                  |       y y # 1 sw Y   VxY w# 1 sw Y   ZxY w)Nr  r  rE  F)rF  rP   r  r  r   r9   ru  
fx_wrapperr  r  r!   patchrJ  r   r  )r  r  r  r  r  r  s         r^   codegen_subgraph_commonz,PythonWrapperCodegen.codegen_subgraph_commonp  s/   !!(..1"T\\N+hmm_EFww%1%=%="$0$;$;!>>d&F&FF $$X^^4 @\\"3U; @'/~~'='='?$M1@@
 %NN//M,,00?,,]MJ G
@ @@ @s$   E45E(E4(E1	-E44E=c                L    | j                  |       | j                  |||       y rO   )r  r  )r  r  r  r  s       r^   'codegen_subgraph_with_flattened_outputsz<PythonWrapperCodegen.codegen_subgraph_with_flattened_outputs  s(     	$$X.99l$;	
r`   c                L    | j                  |       | j                  |||       y rO   )r  r  )r  r  r  r  s       r^   r  z%PythonWrapperCodegen.codegen_subgraph  s%     	$$X.""8\;LMr`   c                   |j                         }| j                  | dt        |j                                |j                  D cg c]  }|j                          }}t        j                  j                  rOt        t        |j                              D cg c]
  }| d| d }}| j                  |j                  ||       y | j                  |j                  ||       y c c}w c c}w )N = [None] * rl  rm  )rR   r   r   r  r  r  r9   rP   rv  r  r  r  r  )r  invoke_subgraphr   r9  r  r  r  s          r^   codegen_invoke_subgraphz,PythonWrapperCodegen.codegen_invoke_subgraph  s    '')$|C0G0G,H+IJK;J;Q;QRC--/RR77(-c/2I2I.J(K#$4&!AM  --((, !!/":":L$O Ss   C(C-c                   |j                         }|j                  D cg c]  }|j                          }}|j                  j                         }t	        |j                  t
        j                        s| d}| j                  | dt        |j                                | j                  d| d       | j                  t        | |j                  j                               t        j                  j                  rOt        t        |j                              D cg c]
  }| d| d }}| j!                  |j                  ||       n| j#                  |j                  ||       | j                  t%        |              | j                  d       | j                  t        | |j&                  j                               t        j                  j                  rOt        t        |j                              D cg c]
  }| d| d }}| j!                  |j&                  ||       n| j#                  |j&                  ||       | j                  t%        |              y c c}w c c}w c c}w )Nr  r  r   rt  rl  rm  zelse:)rR   operandsr  	predicaterc   r"   ShapeAsConstantBufferr   r   r  r?  true_subgraphrP   r9   rv  r  r  r  rd  false_subgraph)r  conditionalr   r9  r  r  r  r  s           r^   codegen_conditionalz(PythonWrapperCodegen.codegen_conditional  s   ##%;F;O;OPC--/PP));;=	+//1I1IJ$+W-I$|C0C0C,D+EFGYKq)*({/H/H/N/NOP775:3{?R?R;S5TUvQqc^UMU--))< !!+";";\4P'-.w({/I/I/O/OPQ775:3{?R?R;S5TUvQqc^UMU--**L- !!+"<"<lDQ'-.9 Q V Vs   I)"I.I3c                     fd}|j                         }|j                  D cg c]  }|j                          }}|j                  D cg c]  }|j                          }}t	        |      } j                  | dt	        |              |r  j                  | dt	        |       d       t        |      D ]  \  }	}
 j                  | d|	 d|
          g t        t	        |            D 	cg c]
  }	| d|	 d c}	|}| dg}t        |      }|d	t	        |       } ||j                  ||        j                  d
|d            j                  d       |rwt        |      D ]h  \  }	} j                  t         |j                  j                                j                  | d|	 d| d        j                  t                      j nvt        |      D ]h  \  }	} j                  t         |j                  j                                j                  | d|	 d| d        j                  t                      j  j                  d        j                  t         |j                  j                                ||j                  ||        j                  t                      |r j                  t         |j                  j                               t        t	        |            D ]"  }	 j                  | d|	|z    d| d|	 d       $  j                  t                       j                  t         |j                  j                                ||j                  ||        j                  t                       j                  d|d           |r j                  d       t        t	        |            D ]  }	 j                  d| d|	|z    d        j                  t         |j                  j                                j                  | d|	 d| d|	|z    d        j                  t                       y	y	c c}w c c}w c c}	w )z1while_loop is codegened as a host side while_loopc                    t         j                  j                  rj                  | ||       yj	                  | ||       y)z3Helper method to deduplicate subgraph codegen logicN)r9   rP   rv  r  r  )r  r  r  r  s      r^   r  zAPythonWrapperCodegen.codegen_while_loop.<locals>.codegen_subgraph  s7    ww11(L-X<<lMr`   r  z.extend([[] for _ in range(z)])rl  z] = rm  _cond_resultNzshould_loop = r   zif not should_loop:z.unsqueeze(0).clone()r  zwhile should_loop:z	].append(z])z    should_loop = z%# Stack outputs after loop completionzif len(z]) > 0:z] = torch.stack(z	], dim=0))rR   carried_inputsr  additional_inputsr   r   r  r  r  cond_subgraphr?  body_subgraphrP   rd  )r  
while_loopstack_outputr  r   r9  outer_carried_inputsouter_additional_inputs
ckp_offsetr  inpcond_outer_inputscond_outer_outputsbody_outer_inputsbody_outer_outputscarried_inputs   `               r^   codegen_while_loopz'PythonWrapperCodegen.codegen_while_loop  s   	 ""$/9/H/H 
(+C!!# 
  
 0:/K/K#
(+C!!##
 #
 -.
$|C0D,E+FGHNN&3C8L4M3NcR   45 	3FAsNNdV1QCtC512	3
&+C0D,E&FGas!nG
$
 "&l34 
 //J5I1JK$$&79K	
 	(:1(='>?@,-$-.B$C 7 =0z7O7O7U7UVW$q4>STU/567
 %..B$C 7 =0z7O7O7U7UVW$q4hGH/567
 	+,(z/G/G/M/MNO$$&79K	
 	'-. NN,T:3K3K3Q3QRS3345 P$qZ(8	$q2NOPNN+D12 	(z/G/G/M/MNO$$&79K	
 	'-.+,>q,A+BCD NNBC3345 7aJ/?wGH0z7O7O7U7UVWfAaS 0aJ7GyQ /567 Q 
#
  Hs   Q+Q04Q5c                    	 t        | dd       ry t        | t              r| S t        j                  j
                  j                  |       }||S t        |      S # t        $ r Y y w xY w)Nr9  )r  rc   r   r9   rP   
_shape_env_maybe_evaluate_staticr  )r   r  s     r^   statically_known_int_or_nonez1PythonWrapperCodegen.statically_known_int_or_none#  sf    	q.$/ !S!''$$;;A>C{
s8O 		s!   A A ,A 
A 	A&%A&c                l    g }| D ],  }t         j                  |      }| y |j                  |       . |S rO   )r@  r  ry   )lstr  r   r  s       r^   %statically_known_list_of_ints_or_nonez:PythonWrapperCodegen.statically_known_list_of_ints_or_none3  sA     	A&CCAFC{MM#		
 r`   c                0    t         j                  |       d uS rO   )r@  r  )r  s    r^    is_statically_known_list_of_intsz5PythonWrapperCodegen.is_statically_known_list_of_ints=  s     !FFsKSWW	
r`   c                H    t         j                  | j                               S rO   )r@  r  r<  r  s    r^   r3  z4PythonWrapperCodegen.static_shape_for_buffer_or_noneC  s    #IIOO
 	
r`   c                0    t         j                  |       d uS rO   )r@  r3  r  s    r^   !can_prove_buffer_has_static_shapez6PythonWrapperCodegen.can_prove_buffer_has_static_shapeI  s    #CCFKSWWWr`   c                     y rO   r   )r  r  node_schedules      r^   write_kernel_context_guardz/PythonWrapperCodegen.write_kernel_context_guardM  s    
 	r`   c                     y)z<
        Mark the beginning of kernel context guard
        Nr   r  s    r^    write_kernel_context_guard_beginz5PythonWrapperCodegen.write_kernel_context_guard_beginT       	r`   c                     y)z6
        Mark the end of kernel context guard
        Nr   r  s    r^   write_kernel_context_guard_endz3PythonWrapperCodegen.write_kernel_context_guard_end\  r%  r`   rO   )r  r   r  r   r  PythonWrapperCodegen | Noner  !ir.GraphPartitionSignature | NonerM  )r   r   r  r   r   r1  )r  r   )r%  TritonMetaParamsr   r   r   r  )r   z9dict[str, ir.TensorBox | ir.TorchBindObject | sympy.Expr]r   zlist[IRNode])rA  r  r   r   )rA  zIterable[str]r   r1  )rn  r   r  r   r   r   r  )r:   N)rn  r   r  r   r  zdict[int, int] | Noner   r1  )r  r   r   r  )r~  r  r   r1  r  r1   r   r1  )rZ   zir.FallbackKernelr   r1  )rZ   r  r   r1  )rZ   r  )rZ   r  r   r1  )r  r   r  r   r  r   r  r  r  r   r  zOrderedSet[str] | Noner   r1  )F)rZ   r  )rZ   r{  r   r1  )r  r   r  r   r  zCallable[[], Sequence[str]]r  z6torch._ops.OpOverload | torch._ops.HigherOrderOperatorr  rh  r  zSequence[ir.Buffer]r   r1  )r  Callable[..., None]r   zIterator[Callable[..., None]])r  r   )r   r   r   r>  r4  r?  )r;  r  )r   r	   rY   r   r   r   )r   r	   r   r   )rq  r   r   r   r%  r   r   r   )rC  zSequence[Expr]r   r   )r   ztuple[str, list[str]])r   r/  r   r   )rk  z
bool | str)rZ   zir.MultiOutput)NTN)
r  r   r  r   r  r   r  r   r  r   )r  r   r  r   r  r   )r   r   )r   zlist[list[int | sympy.Expr]]r  $tuple[ir.ComputedBuffer, str] | None)r  r   r  r   )r  r
  rP   rI   r   r1  )r  r?   )r  r   )r  r  )NF)r  )r  r  )r  r  )r  r   r  r   r  r   )r  ro  )r  r  r  r  r[  r   )r   r   rV  zir.ReinterpretViewr   r1  r  r  )r  r  r  r  )r  r   r  r   r  r  r   r1  )r  r   r  zir.GraphPartitionSignature)r  r   )r  r   r!  z*Sequence[BaseSchedulerNode] | ExternKernel)r   r   r  r  r  r   r  rW  r  r  r  r  r  r  r-   r  r!  r#  r(  r,  r.  r1  r*  r:  r<  r>  rC  rH  rJ  r  rQ  rW  r]  r`  r  re  rF  rh  rB  rf  rn  rr  rt  ry  r{  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   contextmanagerr  r  r  r  r  r)  r  r5  rP  rD  r  rI  rL  rN  ro  r   r@  rU  rh  rl  ro  rx  r  r  r`  r  r  r  r  r  r  classmethodr  r  r  r  r   r"  r  r  r  r  r  r  r  r  r3  r   r@  rD  r  r   r  r  r  r:  r|  r  r  r  r  r  r  r  r^  rW  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r3  r  r"  r$  r'  r3  r4  s   @r^   r@  r@    s    "d!a#F 
 CG	&&!& 4& @	& &'<oBb8  "	B + +! 
 

$	B$
%?0(
.8):6-I*D.10J
/+ <@	*-*- *- %:	*-
 
*-X5 
$,45$:=,:8 8 
8 04;; ; 	;
 ; ; -; 
; (	U
8(B<
	V	V  	V .		V
 L	V  	V %	V 
	V0 ! !,S
j.S`:
 
D%(K(K (K -	(KT'?RD @D W CG +.&'6	, <P '<P 
<P|>S
IG264*rh4  $%)

 
 	

 
 #
& CG		'*	6@	 	  $%)!! ! 	!
 ! #!,>  aJ ,aJ >aJF	*A"A+8A	A
%N:.>85WF

2
4=G~' !6
6
z !A(A(F !,%P	
 8 8 TY.`*_*Au $( !` 
53n:(
 
E'

 
 E	

 

JJ J E	J
 
JX+'Z<< 9<BY
$
"K*
NP /B]7~     
 

 
 

 X X Br`   r@  c                       e Zd ZdZ	 d	 	 	 	 	 d fdZddZddZd Zd Zd Z	ddZ
dd	Zdd
ZddZ	 	 ddZddZddZd  fdZ	 d	 	 	 	 	 d!dZd"dZddZedd       Zedd       Zed#d       Zd Z xZS )$r  a  
    A wrapper codegen that generates code for a subgraph. For most of the
    methods, we rely on the implementation in the PythonWrapperCodegen. But we
    override a few functions to produce cleaner code (like avoiding writing
    imports twice in the output code)
    c                    || _         || _        || _        t        |           | j                         }|j                  | _        |j                  | _        |j                  | _        |j                  | _	        y rO   )
r  r  r  r  r  get_root_graphr  r   r  r  )r  r  r  r  rootr  s        r^   r  z%SubgraphPythonWrapperCodegen.__init__m  sn     +,$8!""$$($=$=!%)%?%?"!//)-)G)G&r`   c                &    | j                   | _        y rO   )r  r  r  s    r^   r  z1SubgraphPythonWrapperCodegen.set_launcher_fn_name  s     !% 2 2r`   c                     y rO   r   r  s    r^   r  z)SubgraphPythonWrapperCodegen.write_header  r  r`   c                     y rO   r   r  s     r^   r  z2SubgraphPythonWrapperCodegen.add_benchmark_harness  r  r`   c                     y rO   r   r  s     r^   r  z6SubgraphPythonWrapperCodegen.benchmark_compiled_module  r  r`   c                     y rO   r   r  s    r^   r>  z5SubgraphPythonWrapperCodegen.write_async_compile_wait  r  r`   c                6    | j                   j                         S rO   )r  rn  r  s    r^   rn  z/SubgraphPythonWrapperCodegen.next_kernel_suffix  s    ""5577r`   c                     y rO   r   r  s     r^   r  z2SubgraphPythonWrapperCodegen.generate_after_suffix  r/  r`   c                \    | j                   j                  d| j                   d       d}|S )Nz
            def z(args):
            r:   )r  r   r  rF  s     r^   rH  z>SubgraphPythonWrapperCodegen.write_launcher_fn_call_get_indent  s<    &&' (	

 r`   c                     yr   r   r  s    r^   r  z4SubgraphPythonWrapperCodegen.get_wrapper_call_indent  s    r`   c                    | j                   x}r3|j                  |j                  D ci c]  }t        |      | c}z  }|S t        j
                  j                  }|S c c}w rO   )r  input_nodesr  r   r9   rP   r  )r  r  r"  r  s       r^   r1  z-SubgraphPythonWrapperCodegen.get_graph_inputs  sh     11191**#,#:#:.A	. F
  WW))F.s   Ac                    | j                   x}rJt        |j                  j                               |j                  D cg c]  }|j
                   c}z   }|S t        j                  j                  }|S c c}w rO   )	r  r  rB  r  r  r   r9   rP   r8  )r  r  r  namess       r^   rJ  z2SubgraphPythonWrapperCodegen.get_graph_input_names  sr    11191..33566?6M6M:&2!!: E
  GG--E:s   A5c                r    | j                   x}r|j                  }|S t        j                  j                  }|S rO   )r  r  r9   rP   r  )r  r  r  s      r^   r*  z.SubgraphPythonWrapperCodegen.get_graph_outputs  s;    11191,,G  gg++Gr`   c                ~    |j                         }| j                  x}r||j                  v ry t        |   |       y rO   )rR   r  rB  r  r  )r  r  r   r  r  s       r^   r  z/SubgraphPythonWrapperCodegen.codegen_allocation  s?     222I2	@U@U8U "6*r`   c                    | j                          t        j                  j                  j                  x}| d}| j                  | d| d       |S t        |      }| j                  | d| d       |S )N_rawr   z.cuda_streamrb  r   )r  r9   rP   rT   rv  r   r+   )r  rn  rP   rv  r   s        r^   _write_get_raw_streamz2SubgraphPythonWrapperCodegen._write_get_raw_stream  s     	%%'#$77#4#4#H#HHU)*$/DNNdV3':&;<HI  'z2DNNdV#5j\CDr`   c                .    | j                  d| d       y)z#Generate NVTX range push for graph.z"torch.cuda.nvtx.range_push('graph rL  Nr  )r  post_grad_graph_ids     r^   codegen_graph_nvtx_range_pushz:SubgraphPythonWrapperCodegen.codegen_graph_nvtx_range_push  s    ;<N;OrRSr`   c                &    | j                  d       y)z"Generate NVTX range pop for graph.ztorch.cuda.nvtx.range_pop()Nr  r  s    r^   codegen_graph_nvtx_range_popz9SubgraphPythonWrapperCodegen.codegen_graph_nvtx_range_pop  s    45r`   c                8    | j                   j                          y rO   )r  r  r  s    r^   r  z5SubgraphPythonWrapperCodegen.write_triton_header_once  s     	446r`   c                8    | j                   j                          y rO   )r  r#  r  s    r^   r#  z=SubgraphPythonWrapperCodegen.write_get_raw_stream_header_once  s     	<<>r`   c                    | }t        |t              r|j                  }t        |t              rt        |t              sJ |S rO   )rc   r  r  r@  )r  r7  s     r^   r6  z+SubgraphPythonWrapperCodegen.get_root_graph  sB    DH;<&&D ;< $ 4555r`   c                     y rO   r   r  s    r^   r  z<SubgraphPythonWrapperCodegen.generate_and_run_autotune_block  s    r`   rO   )r  r   r  r@  r  r)  rM  r  r.  r-  )r   z@dict[str, ir.TensorBox | ir.TorchBindObject | sympy.Expr | None]r+  r,  r1  )rn  r   rP   zGraphLowering | Noner   r   )rK  r   r   r1  )r   r@  )r   r   r  r  r  r  r  r  r  r>  rn  r  rH  r  r1  rJ  r*  r  rI  rL  rN  r-   r  r#  r6  r  r3  r4  s   @r^   r  r  e  s     CG	HH -H @	H.38		I	+ >B&:	T6 7 7 ? ?  r`   r  )rZ   r  r   r/  )rZ   r  r   r2  )rl   r  rm   r  )NN)r   r   r   zlist[triton.Config]r   zlist[TritonGrid]r   r(  r   r   r   ztuple[str, str]rO   )r  r0  r   r   )
__future__r   r  r   r  r   r4  r  r  r  r  rA  r  r  collections.abcr   	itertoolsr   r   typingr   r   r   r	   r  
torch._opstorch.utils._pytreeutils_pytreer  r   r8  torch._dynamo.utilsr   r   r   #torch._inductor.codegen.debug_utilsr   $torch._inductor.codegen.multi_kernelr   %torch._inductor.runtime.runtime_utilsr   torch._library.opaque_objectr   r   torch._loggingr   %torch.fx.experimental.symbolic_shapesr   r   r   r   r   torch.fx.noder   torch.utils._ordered_setr    torch.utils._sympy.singleton_intr   torch.utils._sympy.symbolr   r   r  r    r!   r"   	codecacher#   r$   r%   r   r&   runtime.hintsr'   stream_constantsr(   r)   r*   stream_utilsr+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   virtualizedr9   r  r;   r<   r=   r>   r?   r@   	cpp_utilsrA   custom_extern_kernel_codegenrB   triton_utilsrC   rD   rE   rF   rG   rH   r   rP   rI   rJ   rT   rK   wrapper_fxirrL   	getLoggerr   logdoprintrK  r   r  r   r   r   r/  r2  rr   r  r<  r_   rh   rp   r   r  r*  r   r   r  rm  r
  r  rM   r?  rQ  rY  r^  rd  rm  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r0  rQ  r2  r0  re  rz  r  r  r  r  r  Liner@  r  r   r`   r^   <module>rs     s<   "    
     	  	  $ " %     $ $ & E E C A ; R +  . / 9 : ( ( ' ( ' , W W ?       F P P <<%!-) g! u{{Cs:;5<<c;NPSST YY%
]OT12  >QF S> 	#


C
 8-=,>c3h,O#PP  ,0'+k&
k& k& k& )	k&
 %k& k&^ EIq&Aq&q&h   %6 %6PY Y
 2 2 2 	/k 	/ 	/ ++ + + 	2 	2 	2 1{ 1 1 "@K "@ "@J?; ? 
7K 
7 
7 5+ 5 5< <{ < <6 	({ 	( 	(  /[  /  /F 5; 5 5* ; ; ;2 2O  : 1M        %
 %
P h,% h, h,V +6, +6 +6\ /( / /& )" ) )0(! (
 #0k #0 #0L 6; 6 6, 4+ 4 48 	5+ 	5 	5 8[ 8 8 
6; 
6 
6 
K'[.7 [.|\\#7 \r`   