
    9j                   P   U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlmZmZ d dlmZmZ d dlmZ d dlmZmZmZmZmZmZ d dlmZmZ d dlZd dlZd dl Zd dl!m"Z" d d	l#m$Z% d d
l&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z. d dl/m0Z0m1Z1m2Z2 d dl3m4Z4m5Z5 ddl6m7Z7m8Z8 ddl9m:Z: ddl;m<Z<m=Z= ddl>m?Z? ddl@mAZAmBZBmCZCmDZDmEZEmFZFmGZGmHZHmIZImJZJmKZKmLZL ddlMmNZNmOZOmPZPmQZQmRZRmSZSmTZT erid dlUmVZVmWZWmXZXmYZY d dl mZZZ ddl[m\Z\ ddl]m^Z^m_Z_m`Z`maZa ddlbmcZc ddldmeZemfZfmgZg ddl>mhZh ddlimjZj  ed       ZkeVefdz  geef   Zlemej   Zneoej                  z  ZqeoZrej                  j                  eud!      Zv ej                  eu      Zxdd"Zyej                   G d# d$             Z{ G d% d&ej$                        Z| G d' d(e      Z} eFd)*       G d+ d,e}             Z~ G d- d.      Zej                   G d/ d0             Zej                   G d1 d2             Zej                   G d3 d4             Zej                   G d5 d6             Zej                   G d7 d8             Ze~ez  ez  ez  ez  Zi Zd9ed:<    G d; d<      Zi Zd=ed><   d?ai Zd@edA<   i ZdBedC<   	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddDZ G dE dFe      Z	 	 	 	 ddGZ	 	 	 	 	 	 ddHZddIZ	 d	 	 	 	 	 	 	 ddJZddKZddLZej*                  ddM       Z	 	 	 	 	 	 	 	 ddNZ	 	 	 	 	 	 ddOZdP ZddQZej6                  ej8                  ej:                  ej8                  iej<                  ej>                  ej@                  ejB                  ejD                  ejF                  ejH                  ejJ                  ejL                  ejN                  ejP                  fD  ci c]  } | |  c} ZdRedS<   	 	 	 	 	 	 	 	 ddTZ	 	 	 	 	 	 	 	 ddVZ	 	 	 	 	 	 	 	 d dWZd!dXZ G dY dZ      Z G d[ d\e.      Z- G d] d^      Z ej`                  d_ejb                  `      Zd"daZ G db dce<eePe         Zej                   G dd de             Z ed#i df ee"jn                  dg dhi      dj ee"jn                  dk dl dmn      do ee"jn                  dp dq drn      ds ee"jn                  dt du dvn      dw ee"jn                  dx dy dzn      d{ ee"jn                  d| d} d{~      d ee"jn                  d d dn      d ee"jn                  d d d d      d ee"jp                  d d dn      d ee"jn                  d di      d ee"jn                  d di      d ee"jn                  d di      d ee"jn                  d di      d ee"jn                  d d d d      d ee"jn                  d d d~      d ee"jn                  d d dn      d ee"jn                  d di      d ee"jn                  d di      d ee"jn                  d d dn      d ee"jn                  d d dn      d ee"jn                  d di      d ee"jn                  d di      d ee"jn                  d di      d ee"jn                  d di      d ee"jn                  d di      d ee"jn                  d di      d ee"jn                  d dìi      d ee"jn                  dń dƬi      d ee"jn                  dȄ dɬi      d ee"jn                  d˄ d̬i      d ee"jn                  d΄ dϬi      d ee"jn                  dф dҬi      d ee"jn                  dԄ dլi      d ee"jn                  dׄ dجi      d ee"jn                  dڄ d۬i      d ee"jn                  d݄ dެi      d ee"jn                  d di      d ee"jn                  d di      d ee"jn                  d di      d ee"jn                  d di      d ee"jn                  d di      Zded<   d$dZ G d deB      Z G d deE      Z G d de      Zej                   G d d             Z G d d      Z e       Z G d d      Z G d d      Z edeo       Z edUee      Zereej                  eReeedf   z  f   Z G d deeef         Z G d d      Z G d deee         Zej                   G d	 d
             Zej*                  d%d       Z G d d      Z G d de=      Zyc c} w (&      )annotationsN)ABCabstractmethod)autoEnum)chain)AnycastClassVarGeneric
NamedTupleTYPE_CHECKING)SelfTypeVar)ELEMENTWISE_TYPE_PROMOTION_KIND)_pytree)ConfigModule)
OrderedSet)int_oo)PythonPrinter)free_symbol_is_typesymbol_is_typeSymT)bound_sympyValueRanges   )configmetrics)DtypePropagationOpsHandler)BasicMathOpsMixinDefaultHandler)ShapePropagationOpsHandler)boolean_opsDeferredLineBasegenerate_assertget_current_backendIndentedBufferir_dataclass
ScopedDict	sympy_dotsympy_index_symbol
sympy_substriton_typeunique)NullHandlerops
OpsHandlerOpsValueReductionType	StoreModeV)CallableIteratorMutableMappingSequence)GraphModule)CustomGraphModulePass)BufferChoiceCallerFixedLayoutIRNodeLoopBody)BaseScheduling	SchedulerSchedulerNode)BlockShapeType   PythonWrapperCodegen_Tschedulec                x    t         j                  t        j                        rt         j	                  d|        y y )NzData type propagation: %s)schedule_logisEnabledForloggingDEBUGdebug)msgs    ^/media/conek/DATA/Code/OCR/venv/lib/python3.12/site-packages/torch/_inductor/codegen/common.pydata_type_loggerrS   X   s*      /6< 0    c                  Z    e Zd ZU dZded<   ded<   ddZedd       ZddZedd	       Z	y
)FileBackedGraphModulez
    Output of FX wrapper codegen. Exposes the same methods as ModuleType, but these
    map back to a GraphModule instead of Python source.
    r:   gmzCallable[..., Any]compiled_fnc                &   t        j                  ddd      | _         t        j                  t        j
                  | j                   j                         | j                   5 }|j                  | j                         d d d        y # 1 sw Y   y xY w)Nzw+z.pyF)modesuffixdelete)	tempfileNamedTemporaryFileatexitregisterosremovenamewritevalue)selffs     rR   __post_init__z#FileBackedGraphModule.__post_init__g   si     !33eE
 			4==#5#56]] 	 aGGDJJ	  	  	 s   "BBc                .    | j                   j                  S N)r]   rc   rf   s    rR   __file__zFileBackedGraphModule.__file__q   s    }}!!!rT   c                      | j                   | S rj   )rX   rf   argss     rR   callzFileBackedGraphModule.callu   s    t&&rT   c                .    | j                   j                  S rj   )rW   coderk   s    rR   re   zFileBackedGraphModule.valuex   s    ww||rT   NreturnNonert   str)ro   	list[Any]rt   r	   )
__name__
__module____qualname____doc____annotations__rh   propertyrl   rp   re    rT   rR   rV   rV   ]   sF    
 	O##  " "'  rT   rV   c                  <    e Zd ZdZdZdZedd       Zedd       Zy)	WorkspaceZeroModer   rF   r   c                    | |k(  s|t         j                  k(  r| S | t         j                  k(  r|S t        d| d|d      )NzWorkspaceZeroMode.combine(, ))r   UNINITIALIZEDNotImplementedErrorabs     rR   combinezWorkspaceZeroMode.combine   sK    6Q+999H!///H!$>qe2aU!"LMMrT   c                F    | rt         j                  S t         j                  S rj   )r   ZERO_ON_CALLr   )	zero_fills    rR   	from_boolzWorkspaceZeroMode.from_bool   s    $111 ...rT   N)r   r   r   r   rt   r   )r   boolrt   r   )	ry   rz   r{   r   r   ZERO_PER_GRAPHstaticmethodr   r   r   rT   rR   r   r   }   s9    MLNN N / /rT   r   c                  4    e Zd ZdZedd       Zedd       Zy)CodegenSymbolzP
    An IR object possibly corresponding to a variable in the wrapper code.
    c                     y rj   r   rk   s    rR   get_namezCodegenSymbol.get_name       rT   c                     y rj   r   rk   s    rR   get_examplezCodegenSymbol.get_example   r   rT   Nrv   rt   ztorch.Tensor | sympy.Symbol)ry   rz   r{   r|   r   r   r   r   rT   rR   r   r      s/        rT   r   T)frozenc                  &   e Zd ZU dZded<   ded<   ded<   ded	<   d
Zded<   ej                  Zded<   e	ddd       Z
e	d d       Ze	d!d       Ze	d!d       Zd"dZeZd#dZd$dZd%dZed%d       ZeZeZeZd&dZd'dZd'dZd(dZd)dZd*dZy)+WorkspaceArga2  A temporary buffer used for a single kernel, then discarded.

    Not registered as a traditional buffer since there are no users,
    so it would be dead code eliminated.

    Args:
        nbytes: The size of the buffer in bytes.
        zero_fill: Whether the buffer should be initialized to zero.

    
sympy.Exprcountr   	zero_modetorch.devicedevicerw   
outer_namews_ptr
inner_nametorch.dtypedtypec                P    |  t        t        j                  j                         S rj   )nextr5   graphworkspace_id)prefixs    rR   unique_namezWorkspaceArg.unique_name   s!    $qww334566rT   c                    | j                   |j                   k(  xr4 | j                  |j                  k(  xr | j                  |j                  k(  S rj   )r   r   r   r   s     rR   can_joinzWorkspaceArg.can_join   s@     LLALL(XQWW-?XAHHPQPXPXDX	
rT   c                    t        | j                  |j                  z   t        j                  | j                  |j                        | j
                  | j                  | j                  | j                        S N)r   r   r   r   r   r   )	r   r   r   r   r   r   r   r   r   r   s     rR   joinzWorkspaceArg.join   sS    ''AGG#'//Q[[I''88||||
 	
rT   c                   | j                   |j                   k(  r2| j                  |j                  k(  r| j                  |j                  k(  sJ t        t	        j
                  | j                  |j                        t        j                  | j                  |j                        | j                   | j                  | j                  | j                        S r   )r   r   r   r   sympyMaxr   r   r   r   r   r   s     rR   maximumzWorkspaceArg.maximum   s     GGqww188qxx#7ALLALL<X	
X))AGGQWW-'//Q[[I''88||||
 	
rT   c                    | j                   S rj   r   rk   s    rR   
get_devicezWorkspaceArg.get_device   s    {{rT   c                    | j                   S rj   r   rk   s    rR   	get_dtypezWorkspaceArg.get_dtype   s    zzrT   c                >    | j                         j                         S rj   )
get_layoutr   rk   s    rR   r   zWorkspaceArg.get_example   s     ,,..rT   c                f    ddl m}  || j                  | j                  | j                  gdg      S )Nr   )r>   rF   )r   r   sizestride)irr>   r   r   r   )rf   r>   s     rR   r   zWorkspaceArg.get_layout   s.    $;;****3	
 	
rT   c                "    | j                         S rj   )r   rk   s    rR   layoutzWorkspaceArg.layout   s      rT   c                6    t         j                  j                  S rj   )r   SZerork   s    rR   
get_offsetzWorkspaceArg.get_offset   s    ww||rT   c                    | j                   gS rj   )r   rk   s    rR   get_sizezWorkspaceArg.get_size   s    

|rT   c                8    t         j                  j                  gS rj   )r   r   Onerk   s    rR   
get_stridezWorkspaceArg.get_stride   s    }rT   c                    | j                   S rj   )r   rk   s    rR   r   zWorkspaceArg.get_name   s    rT   c                     y)NFr   rk   s    rR   get_is_pinnedzWorkspaceArg.get_is_pinned   s    rT   c                    g S rj   r   rk   s    rR   get_inputs_that_alias_outputz)WorkspaceArg.get_inputs_that_alias_output  s    	rT   N)
workspace_)r   rw   rt   rw   )r   r   r   r   rt   r   )r   r   r   r   rt   r   )rt   r   )rt   r   r   )rt   r>   )rt   r   )rt   list[sympy.Expr]rv   )rt   r   )rt   	list[str])ry   rz   r{   r|   r}   r   torchuint8r   r   r   r   r   r   r   get_device_or_errorr   r   r   r~   r   get_output_specmaybe_get_output_specmaybe_get_layoutr   r   r   r   r   r   r   rT   rR   r   r      s    	   OJE;$7 7 
 

 
 
 
 
 %/
 ! ! !O&!rT   r   c                      e Zd ZddZddZy)TritonScratchWorkspacec                     || _         || _        y rj   )r   _generate_dtype_str)rf   r   generate_dtype_strs      rR   __init__zTritonScratchWorkspace.__init__  s    	#5 rT   c                "    | j                         S rj   )r   rk   s    rR   r   z)TritonScratchWorkspace.generate_dtype_str  s    ''))rT   N)r   intr   Callable[..., str]rv   )ry   rz   r{   r   r   r   rT   rR   r   r     s    6*rT   r   c                  p    e Zd ZU ded<   ded<   ded<   ej
                  j                  Zded<   dZd	ed
<   y)	TensorArgrw   rc   bufferr   r   r   offsetN
str | Nonealias_of)	ry   rz   r{   r}   r   r   r   r   r   r   rT   rR   r   r     s.    
IKFJ%HjrT   r   c                  4    e Zd ZU ded<   ded<   edd       Zy)SizeArgrw   rc   r   exprc                     y rj   r   rk   s    rR   r   zSizeArg.alias_of  s    rT   Nrt   r   )ry   rz   r{   r}   r~   r   r   rT   rR   r   r     s    
I
 rT   r   c                      e Zd ZU ded<   y)ConstexprArgrw   rc   Nry   rz   r{   r}   r   rT   rR   r   r   #  s    
IrT   r   c                  6    e Zd ZU ded<   ded<   ded<   ded<   y)	TMADescriptorArgrw   rc   api_typezlist[sympy.Expr] | Noneblock_shapetorch.dtype | Noner   Nr   r   rT   rR   r   r   (  s    
IM((rT   r   c                  >    e Zd ZU ded<   ded<   dZded<   dZded<   y)	DeviceCodegenSchedulingConstructor
schedulingWrapperConstructorwrapper_codegenNWrapperConstructor | Nonecpp_wrapper_codegenfx_wrapper_codegen)ry   rz   r{   r}   r   r   r   rT   rR   r   r   0  s&    %%''59294818rT   r   zdict[str, DeviceCodegen]device_codegensc                      e Zd ZddZddZddZddZddZddZddZ	ddZ
dd	Zdd
ZddZddZddZddZddZddZ	 d	 	 	 	 	 	 	 ddZy)DeviceOpOverridesc                    t         rj   r   rf   rc   s     rR   import_get_raw_stream_asz*DeviceOpOverrides.import_get_raw_stream_as>      !!rT   c                    t         rj   r  rf   
device_idxs     rR   
set_devicezDeviceOpOverrides.set_deviceA  r  rT   c                    t         rj   r  rk   s    rR   synchronizezDeviceOpOverrides.synchronizeD  r  rT   c                    t         rj   r  r	  s     rR   device_guardzDeviceOpOverrides.device_guardG  r  rT   c                    t         rj   r  rk   s    rR   cpp_device_guardz"DeviceOpOverrides.cpp_device_guardJ  r  rT   c                    t         rj   r  rk   s    rR   cpp_aoti_device_guardz'DeviceOpOverrides.cpp_aoti_device_guardM  r  rT   c                    t         rj   r  rk   s    rR   cpp_stream_guardz"DeviceOpOverrides.cpp_stream_guardP  r  rT   c                    t         rj   r  rk   s    rR   cpp_aoti_stream_guardz'DeviceOpOverrides.cpp_aoti_stream_guardS  r  rT   c                    t         rj   r  rk   s    rR   cpp_getStreamFromExternalz+DeviceOpOverrides.cpp_getStreamFromExternalV  r  rT   c                    t         rj   r  rk   s    rR   kernel_headerzDeviceOpOverrides.kernel_headerY  r  rT   c                    t         rj   r  rk   s    rR   kernel_driverzDeviceOpOverrides.kernel_driver\  r  rT   c                    t         rj   r  rk   s    rR   cpp_stream_typez!DeviceOpOverrides.cpp_stream_type_  r  rT   c                    t         rj   r  rk   s    rR   aoti_get_streamz!DeviceOpOverrides.aoti_get_streamb  r  rT   c                    t         rj   r  rk   s    rR   cpp_kernel_typez!DeviceOpOverrides.cpp_kernel_typee  r  rT   c                    t         rj   r  rk   s    rR   cpp_device_ptrz DeviceOpOverrides.cpp_device_ptrh  r  rT   c                    t         rj   r  rk   s    rR   tma_descriptor_helpersz(DeviceOpOverrides.tma_descriptor_helpersk  r  rT   Nc                    t         rj   r  )rf   idx	workspacer   s       rR   cpp_scratchzDeviceOpOverrides.cpp_scratchn  s
     "!rT   rc   rw   rt   rw   )r
  r   rt   rw   rv   rj   )r)  r   r*  r   r   r   rt   ztuple[list[str], str] | None)ry   rz   r{   r  r  r  r  r  r  r  r  r  r  r  r  r!  r#  r%  r'  r+  r   rT   rR   r  r  =  s~    """""""""""""""" QU""#9"CM"	%"rT   r  zdict[str, DeviceOpOverrides]device_op_overrides_dictFz'dict[str, CustomGraphModulePass | None]custom_backend_passeszdict[str, ConfigModule | None]custom_backend_codegen_configsc                    t        ||||      t        | <   |t        | <   |r)t        |t              r|t
        usJ d|dt
               |t        | <   y )Nzdevice_custom_config=z: cannot be the same as the default inductor config config=)r   r   r.  
isinstancer   r   r/  )r   device_schedulingdevice_wrapper_codegendevice_cpp_wrapper_codegendevice_fx_wrapper_codegendevice_custom_passdevice_custom_configs          rR   register_backend_for_devicer8    sv     ,"!	OF %7&!+\:$F2	
 %#%%`Y_Xab		
3
 .B"6*rT   c                      e Zd Z e       Z e       Z e       Z e       Z e       Z e       Z	 e       Z
 e       Z e       Z e       Zy)BackendFeatureN)ry   rz   r{   r   FOREACH	BUCKETIZEINPLACE_BUFFERSMASKED_SCATTER_WITH_INDEXSCANSORTTUPLE_REDUCTIONPREFER_STORE_LOOP_ORDERTRITON_TEMPLATESREDUCE_TO_SINGLE_ELEMENTr   rT   rR   r:  r:    sL    fGIfO $6D6DfO"fv#vrT   r:  c                :   | 
t               S t                t        | t        j                        r| j
                  }n7t        | t              sJ t        |              | }t        j                  |      } t        |      }|sJ  |d       }|j                  |       S rj   )	r   init_backend_registrationr1  r   r   typerw   get_scheduling_for_deviceget_backend_features)r   device_typescheduling_ctorr   s       rR   rI  rI    s     ~|&%,,'kk&#&4V4&k*/<O? &J**622rT   c                @    t        |t              sJ |t        |       v S )zSee also V.graph.has_feature)r1  r:  rI  )r   features     rR   has_backend_featurerN    s%     g~...*6222rT   c                <    | t         v rt         |    j                  S d S rj   )r   r   r   s    rR   rH  rH    s     17?1J?6"--TPTTrT   c                v    | t         v r1t         |    }|r|j                  S |r|j                  S |j                  S y rj   )r   r   r   r   )r   cpp_wrapper
fx_wrapperwrapper_codegen_objs       rR   get_wrapper_codegen_for_devicerT    sD      -<V-D&999&:::&666rT   c                ,    t         j                  |       S rj   )r.  getr   s    rR   "get_custom_backend_pass_for_devicerW    s     $$V,,rT   c                ,    t         j                  |       S rj   )r/  rV  r   s    rR   $get_custom_backend_config_for_devicerY    s    )--f55rT   c                 ~   ddl m}  ddlm} ddlm} ddlm} ddlm	} ddl
m} ddlm} dd	lm} dd
lm} ddlm}	 ddlm}
 ddlm} ddlm} ddlm} t9        d      5| ||
|dt;        dfd|t<        j>                  j@                  r|n||       t9        d      |||dt;        dfd|||       t9        d      t;        d||       t9        d      t;        d||||       t9        d      d|it;        dfd|       t9        d      t;        d||||       t9        d      t;        d|
|	||       tB        jD                  jG                         }|dk7  rLt9        |      @ddl$m%} 	  |d       } |d!      } |d"      } |d#      }|r|r|rt;        |||||       yyyyyy# tL        $ r Y yw xY w)$z
    Register the backend for different devices, including the scheduling
    for kernel code generation and the host side wrapper code generation.
    rF   )CppScheduling)CppWrapperCpu)CppWrapperCpuArrayRef)CppWrapperGpu)CppWrapperMps)CUDACombinedScheduling)HalideScheduling)MetalScheduling)PallasScheduling)PythonWrapperMtia)TritonSchedulingrG   )WrapperFxCodegen)XPUCombinedSchedulingcpuN)cpphalidetritonpallasc                6     t         j                     |       S rj   )r   cpu_backend)r   cpu_backendss    rR   <lambda>z+init_backend_registration.<locals>.<lambda>	      ?|F,>,>?
K rT   cuda)rk  rj  rl  c                6     t         j                     |       S rj   )r   cuda_backend)r   cuda_backendss    rR   rp  z+init_backend_registration.<locals>.<lambda>  s    A}V-@-@A*M rT   tpuxpurl  c                6     t         j                     |       S rj   )r   tpu_backend)r   tpu_backendss    rR   rp  z+init_backend_registration.<locals>.<lambda>8  rq  rT   mpsmtiaprivateuseoner   )_get_custom_mod_func
SchedulingrH   CppWrapperCodegenrf  )'ri  r[  cpp_wrapper_cpur\  cpp_wrapper_cpu_array_refr]  cpp_wrapper_gpur^  cpp_wrapper_mpsr_  cuda_combined_schedulingr`  rj  ra  r{  rb  rl  rc  python_wrapper_mtiard  rk  re  wrapperrH   wrapper_fxirrf  xpu.xpu_combined_schedulingrg  rH  r8  r   aot_inductorallow_stack_allocationr   _C_get_privateuse1_backend_name torch.utils.backend_registrationr~  RuntimeError)r[  r\  r]  r^  r_  r`  ra  rb  rc  rd  re  rH   rf  rg  private_backendr~  r2  r   r   r   ro  ru  rz  s                       @@@rR   rF  rF    s    #.@..@($(6(-.B '/ &&&	
 	$K ""99 "	
 !(0 -&&

 	$M 	
 !'/# 	
 !'/#! 	
 !'/&
 	$K 	
 !'/# 	
 !(0#	
 hh<<>O?*%o6>I	 4\ B23IJO"67J"K!56H!I _9L+#%#'& :M_  ? 	+$  		s   55F0 0	F<;F<c                L    ddl m} g | t        ||j                  |            S )Nr   )FlexibleLayout)r   r  r*   contiguous_strides)index
index_varssizesr  s       rR   index_prevent_reorderingr  f  s,    
 $ UUTIj.*K*KE*RSTTrT   c                    |t         | <   y rj   )r-  )r   device_op_overridess     rR   register_device_op_overridesr  q  s     (;V$rT   c                 t    t         ry ddlm}  ddlm} ddlm} ddlm} ddlm} t        d |              da y )NrF   )mps_device_op_overrides)CpuDeviceOpOverrides)r  rv  T)
 _device_op_overrides_initialized r  cpu_device_op_overridesr  rr  r  r|  rw  r  )r  r  r  mtia_op_overridesxpu_op_overridess        rR   _initialize_device_op_overridesr  w  s/     ()=)>< !(<(>?'+$rT   c                h    t        | t              sJ t        |              t                t        |    S rj   )r1  rw   rG  r  r-  r   s    rR   get_device_op_overridesr    s,    fc"0DL0"#%#F++rT   zdict[torch.dtype, torch.dtype]DTYPE_TO_COMPUTATION_DTYPEc                r   | t               v rt        j                  S | dv rd|v r|d   S |d   S | dv rt        j                  S | dv rt        j                  S | dk(  rd|v r|d   S |d   S | dk(  rd|v r|d   S |d   S | d	v r$|d   }t
        j                  j                  |      S | d
k(  rd|v r|d   S |d   S y)zK
    Given op name and a list of input dtypes, deduce the output dtype
    )to_dtype
index_exprr   )randrandn)	get_index	randint64	load_seed	reductionrF   constant)loadstorestore_reductionto_dtype_bitcastN)r#   r   r   floatint64r5   r   r   )op_namero   kwargsbuf_names       rR   deduce_output_dtype_by_namer    s
    +-zz	  
 #*V"3vgAbA	  
 {{	  

 {{	K	")V"3vg@a@	J	")V"3vgAbA	  

 7ww  **	&	&")V"3vgAbArT   CSEVariableTypec                   t               }t        j                  j                  r'|dk(  r"| j	                  d| dt        |       d       y t        j                  j                  r|dk(  rddlm}m	} t        ||      sJ t        |             |t        j                  k(  r|j                  rd| d	}n.d
| d| d}n$d| d}|j                  rd| d}d| d||    d}| j	                  d| d       y y y )Nrk  tl.static_assert(z
.dtype == r   ri  rF   )CppCSEVariableDTYPE_TO_CPPzIsVecMaskType<decltype(z	)>::valuezstd::is_same_v<decltype(z$), bool> || std::is_same_v<decltype(z), int>z	decltype(z	typename z::value_typezstd::is_same_v<r   >zstatic_assert(z);)r&   r   test_configsruntime_triton_dtype_assert	writeliner-   static_cpp_dtype_assert	cpp_utilsr  r  r1  rG  r   r   is_vec)r   varr   backendr  r  
is_same_dt
c_var_types           rR   check_dtyper    s    "#G667h;N,SEK<N;OqQR				4	4E9I;#~.9S	9.EJJzz6se9E
  8u<`ad`eelm
$SE+Jzz(LA
*:,be9L8MQOJ>*R89! :J	4rT   c                    t               }|J t        j                  j                  rM|dk(  rGt	        |      dk7  rdj                  d |D              n|d    d}| j                  d| d| d	       y y y )
Nrk  rF   r   c              3  2   K   | ]  }t        |        y wrj   rw   ).0ds     rR   	<genexpr>zcheck_shape.<locals>.<genexpr>  s     ,c!f,s   r   ,r  z.shape == ()))r&   r   r  runtime_triton_shape_assertlenr   r  )r   r  shaper  	shape_strs        rR   check_shaper    s     "#G667h;N03E
aDII,e,,azQR^ 	 	,SEYKrJK	 <O6rT   c                j    t               }|dk(  r$d}| j                  d| d| d| d| d| d       y y )	Nrk  zNaN or Inf foundztl.device_assert((z == ) & (z != float('inf')) & (z != float('-inf')), 'z'))r&   r  )r   r  r  rQ   s       rR   	check_nanr    sS    !#G(  T#eC58McURghkgllno	
 rT   c                  `    e Zd Zd
dZddZddZddZddZddZe	dd       Z
e	dd       Zy	)DataTypePropagationc                    || _         d|j                  j                  i| _        |j                  j                         D ]  \  }}|j                  | j                  |<     y Nroot)body
root_blockr   graphs	subblocksitems)rf   r  kvs       rR   r   zDataTypePropagation.__init__  sT    	DOO))<
 NN((* 	%DAqWWDKKN	%rT   c                   |j                   }|D cg c]9  }t        |t        j                  j                        s(|j
                  dk7  s8|; }}t        |      dk(  ry t        d |D              }|sy t        j                  t        j                  |D cg c])  }|j                  t        j                     j                  + c}      S c c}w c c}w )Nplaceholderr   c              3     K   | ]K  }t         j                  |j                  v xr) |j                  t         j                     j                  d u M y wrj   )OptimizationContextkeymetar   )r  ns     rR   r  zBDataTypePropagation.deduce_node_dtype_by_inputs.<locals>.<genexpr>  sS      )
   ##qvv- B*../55TAB)
s   AA)all_input_nodesr1  r   fxNodeopr  all	functoolsreducepromote_typesr  r  r  r   )rf   nodeinputsr  input_nodesall_input_nodes_propagateds         rR   deduce_node_dtype_by_inputsz/DataTypePropagation.deduce_node_dtype_by_inputs  s    %%
Auxx}}!=!$$-BWA
 
 {q %( )
 !)
 &
"
 *<GHqQVV'++,22H
 	

  Is   )CCC.C
c                b    | j                   |j                     }| j                  |      }|sJ |S rj   )r  targetpropagate_graph)rf   r  	sub_graphr   s       rR   deduce_node_dtype_by_subgraphz1DataTypePropagation.deduce_node_dtype_by_subgraph  s0    KK,	$$Y/urT   c                   |j                   dk(  ry |j                  dk(  rt        |j                        dk7  ry |j                  t        j
                  u rT|j                  d   }t        |t        j                  j                        sJ t        |             | j                  |      S t        |j                  t              sJ t        |j                               |j                  j                  d      r| j                  |      S t        |j                  g|j                  i |j                   x}	 |S | j#                  |      S )Nr  outputrF   r   masked_subblock)r  r  r  ro   operatorgetitemr1  r   r  r  rG  deduce_node_dtyperw   
startswithr  r  r  r  )rf   r  node_argoutput_dtypes       rR   r  z%DataTypePropagation.deduce_node_dtype"  s   77m#;;("s499~':;;(***yy|Hh6FXF6))(33$++s+>T$++->>+;;!!"3455d;; 8 ++ L
   //55rT   c                n   |j                   sJ d }|j                   D ]  }t        j                  |j                  v r|j                  t        j                     }n
t               }| j	                  |      |_        ||j                  t        j                  <   |j                  dk(  s|j
                  } |S )Nr  )nodesr  r  r  r  r   r  )rf   r   graph_dtyper  opt_ctxs        rR   r   z#DataTypePropagation.propagate_graph?  s    {{{*. KK 		,D"&&$))3))$7$;$;<-/ 2248GM18DII)--.{{h&%mm		, rT   c                >    | j                  | j                  d         S r  )r   r  rk   s    rR   	propagatezDataTypePropagation.propagateQ  s    ##DKK$788rT   c                .     | |      j                         S rj   )r  )clsr  s     rR   propagate_loopbodyz&DataTypePropagation.propagate_loopbodyT  s    4y""$$rT   c                    ddl m} ddlm} t	        ||      sJ t        |             t	        |j                  |      sJ t        |j                               t        j                  |j                        S )Nr   r@   )rD   )		loop_bodyrA   	schedulerrD   r1  rG  _bodyr  r  )r  r  rA   rD   s       rR   propagate_scheduler_nodez,DataTypePropagation.propagate_scheduler_nodeX  sX    (-$.:T
:.$**h/Adjj1AA/"55djjAArT   N)r  rA   rt   ru   )r  torch.fx.Nodert   r   )r  r  rt   r   )r   ztorch.fx.Graphrt   r   )rt   r   )r  rA   rt   r   )r  rD   rt   r   )ry   rz   r{   r   r  r  r  r   r  classmethodr  r  r   rT   rR   r  r    sJ    %
*6:$9 % % B BrT   r  c                  D     e Zd Zddd	 	 	 	 	 	 	 d fdZdd fdZ xZS )r   T)simplifypc                   |r]t        |t        j                        rCt        t        j
                  d      r)t        j
                  j                  j                  |      }t        | %  |      S )Nsizevars)
r1  r   Exprhasattrr5   r   r   r  superdoprint)rf   r   r  r  	__class__s       rR   r$  zPythonPrinter.doprintc  sK     
44*9U77##,,T2Dwt$$rT   c                    t        |t        j                        rd| j                  |       dS t        |   |||      S N(r   )r1  r   Mod_printr#  parenthesize)rf   itemlevelstrictr%  s       rR   r+  zPythonPrinter.parenthesizek  s@    dEII& t{{4()++7'eV<<rT   )r   r   r  r   r  r   rt   rw   )F)r,  r   r-  r   r.  r   rt   rw   )ry   rz   r{   r$  r+  __classcell__r%  s   @rR   r   r   b  s7    48D%%-1%=A%	%= =rT   r   c                  x   e Zd ZdZedd       Zedd       Zedd       Zedd       Zedd       Z	edd       Z
edd       Zedd	       Zedd
       Zedd       Zedd       Zedd       Zedd       Zedd       Zedd       Zedd       Zedd       Zedd       Zedd       Zedd       Zy)OpDecompositionsz!
    Decomposes inductor ops
    c                    | S rj   r   )re   s    rR   identityzOpDecompositions.identityz  s	     rT   c                r    t        j                  t        j                  dt        j                        |       S )Ng      ?)r0   truedivr  r   float32xs    rR   
reciprocalzOpDecompositions.reciprocal  s$     {{3<<U]];Q??rT   c                .    t        j                  | |       S rj   r0   mulr8  s    rR   squarezOpDecompositions.square  s    wwq!}rT   c                    t        j                  t        j                  dt        j                        t        j
                  |             S NrF   )r0   subr  r   r7  erfr8  s    rR   erfczOpDecompositions.erfc  s*    wws||Au}}5swwqzBBrT   c                    t        j                  t        j                  t        j                  |             t        j                  |             S rj   )r0   r=  expr>  rC  r8  s    rR   erfcxzOpDecompositions.erfcx  s,    wwswwszz!}-sxx{;;rT   c                    t        j                  t        j                  |       t        j                  dt        j
                              S r@  )r0   rA  rE  r  r   r7  r8  s    rR   expm1zOpDecompositions.expm1  s*    wwswwqz3<<5==#ABBrT   c           	         t        j                  t        j                  |       t        j                  dt	        j                  d      z  t
        j                              S )NrF   
   r0   r=  logr  mathr   r7  r8  s    rR   log10zOpDecompositions.log10  s7    wwswwqz3<<DHHRL0@%--#PQQrT   c           	         t        j                  t        j                  |       t        j                  dt	        j                  d      z  t
        j                              S )NrF   r   rK  r8  s    rR   log2zOpDecompositions.log2  s6    wwswwqz3<<DHHQK#OPPrT   c           
         t        j                  t        j                  | t        j                  t	        j
                  d      t        j                                    S )Nr   )r0   rE  r=  r  rM  rL  r   r7  r8  s    rR   exp2zOpDecompositions.exp2  s3    wwswwq#,,txx{EMM"JKLLrT   c           	         t        j                  t        j                  | t        j                  dt        j
                                    S r@  )r0   rL  addr  r   int32r8  s    rR   log1pzOpDecompositions.log1p  s+    wwswwq#,,q%++">?@@rT   c                    t        j                  dt        j                        }t        j                  |t        j
                  |t        j                  t        j                  |                         S r@  )r0   r  r   rU  r6  rT  rE  neg)r9  ones     rR   sigmoidzOpDecompositions.sigmoid  sC    ll1ekk*{{3SWWSWWQZ-@ ABBrT   c                r    t        j                  | t        j                  dt        j                              S Nr   )r0   r   r  r   rU  r8  s    rR   reluzOpDecompositions.relu  s"    {{1cll1ekk:;;rT   c                V    t        j                  t        j                  | |      |      S rj   )r0   rT  r=  r9  yzs      rR   fmazOpDecompositions.fma  s     wwswwq!}a((rT   c                .    t        j                  | |      S rj   r<  r9  r`  s     rR   mul_rnzOpDecompositions.mul_rn  s     wwq!}rT   c                .    t        j                  | |      S rj   r0   r6  rd  s     rR   div_rnzOpDecompositions.div_rn  s     {{1a  rT   c                T    t        j                  t        j                  |       |      S rj   )r0   r  floorr   r   s     rR   floor_to_intzOpDecompositions.floor_to_int      ||CIIaL%00rT   c                T    t        j                  t        j                  |       |      S rj   )r0   r  ceilrk  s     rR   ceil_to_intzOpDecompositions.ceil_to_int  s    ||CHHQK//rT   c                T    t        j                  t        j                  |       |      S rj   )r0   r  truncrk  s     rR   trunc_to_intzOpDecompositions.trunc_to_int  rm  rT   c           	        t        j                  | |      }t        j                  t        j                  |t        j                  dt
        j                              t        j                  t        j                  |      t        j                  |                  }t        j                  |t        j                  ||      |      S r\  )
r0   modand_ner  r   rU  signbitwhererT  )r   r   rconds       rR   	remainderzOpDecompositions.remainder  sy    GGAqMxxFF1cll1ekk23FF3;;q>3;;q>2
 yyswwq!}a00rT   c                T    t        j                  t        j                  |       |      S rj   )r0   r  roundrk  s     rR   round_to_intzOpDecompositions.round_to_int  rm  rT   N)re   OpVarTrt   r  r9  r  rt   r  )r9  r  r`  r  ra  r  rt   r  r9  r  r`  r  rt   r  )r   r  r   r   rt   r  r   r  r   r  rt   r  )ry   rz   r{   r|   r   r4  r:  r>  rC  rF  rH  rN  rP  rR  rV  rZ  r]  rb  re  rh  rl  rp  rs  r|  r  r   rT   rR   r2  r2  u  s      @ @
   C C < < C C R R Q Q M M A A C C < < ) )   ! ! 1 1 0 0 1 1 1 1 1 1rT   r2  z[a-z0-9_.]+|\([^)]*\)|)flagsc                    | d   dk7  st        |       dk  ryd}t        | dd        D ]3  \  }}|dk(  r|dz  }n
|dk(  r|dz  }|dk(  s!|t        |       dz
  k7  s3 y |dk(  sJ y)Nr   r(  r   FrF   r   T)r  	enumerate)stringr   ichars       rR   _all_in_parensr    s    ayC3v;?EVABZ( 43;QJES[QJEA:!s6{Q. A::rT   c                  ^   e Zd Zed"d       Zed#d       Zed$d       Zed%d       Zed&d       Zed&d       Z	ed&d       Z
ed&d       Zed&d	       Zed'd
       Zed(d       Z	 	 d)	 	 	 	 	 	 	 	 	 d*dZ	 	 	 	 	 	 	 	 	 	 d+dZd,dZ	 d-	 	 	 	 	 	 	 	 	 d.dZd/dZd0dZ	 	 	 	 	 	 	 	 	 	 d1dZ	 	 	 	 	 	 	 	 d2dZ	 	 	 	 	 	 	 	 	 	 d3dZ	 	 d4	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d5dZd6dZd&dZdej8                  dddd	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d7dZd8dZd9dZed:d       Z e!d;d        Z"e!d<d!       Z#y)=OpOverridesc                r    t        | t              s t        j                  |       st	        |       r| S d|  dS r'  )r1  CSEVariable_RE_PAREN_NOT_NEEDED	fullmatchr  )r  s    rR   parenzOpOverrides.paren  s9     v{+#--f5f% M6(!}rT   c                    t        |       S rj   )repr)re   r   s     rR   r  zOpOverrides.constant  s    E{rT   c                2    dt         j                  |        S )N~r  r  r8  s    rR   bitwise_notzOpOverrides.bitwise_not  s     ;$$Q'())rT   c                2    t         j                  |        dS )Nz == 0r  )r   s    rR   logical_notzOpOverrides.logical_not   s     ##A&'u--rT   c                \    t         j                  |        dt         j                  |       S )Nz & r  rd  s     rR   bitwise_andzOpOverrides.bitwise_and  -     ##A&'s;+<+<Q+?*@AArT   c                \    t         j                  |        dt         j                  |       S )Nz | r  rd  s     rR   
bitwise_orzOpOverrides.bitwise_or
  r  rT   c                \    t         j                  |        dt         j                  |       S )Nz ^ r  rd  s     rR   bitwise_xorzOpOverrides.bitwise_xor  r  rT   c                \    t         j                  |        dt         j                  |       S )Nz << r  rd  s     rR   bitwise_left_shiftzOpOverrides.bitwise_left_shift  -     ##A&'tK,=,=a,@+ABBrT   c                \    t         j                  |        dt         j                  |       S )Nz >> r  rd  s     rR   bitwise_right_shiftzOpOverrides.bitwise_right_shift  r  rT   c                .    t        j                  | |      S rj   rg  r   s     rR   int_truedivzOpOverrides.int_truediv  s     {{1a  rT   c                T    t        j                  | t        j                  |            S rj   )r0   r  r   Integer)rc   r   s     rR   r  zOpOverrides.load_seed&  s    xxemmF344rT   Tc                *    t        t        |            S rj   )r+   rw   )rf   r  r   checkwrap_negs        rR   indirect_indexingzOpOverrides.indirect_indexing*  s     "#c(++rT   c                D    t        t        |       j                   d      )Nz,: check_bounds should be handled by CSEProxyr   rG  ry   rf   r   r   loweruppers        rR   check_boundszOpOverrides.check_bounds3  s'     "Dz""##OP
 	
rT   c                D    t        t        |       j                   d      )Nz$: load should be handled by CSEProxyr  rf   rc   r  s      rR   r  zOpOverrides.load:  s%    !Dz""##GH
 	
rT   Nc                D    t        t        |       j                   d      )Nz%: store should be handled by CSEProxyr  rf   rc   r  re   rZ   s        rR   r  zOpOverrides.store?  s'     "Dz""##HI
 	
rT   c                D    t        t        |       j                   d      Nz3: device_assert_async should be handled by CSEProxyr  rf   r{  rQ   s      rR   device_assert_asynczOpOverrides.device_assert_asyncF  %    !Dz""##VW
 	
rT   c                D    t        t        |       j                   d      )Nz/: store_reduction should be handled by CSEProxyr  rf   rc   r  re   s       rR   r  zOpOverrides.store_reductionK  s%    !Dz""##RS
 	
rT   c                D    t        t        |       j                   d      )Nz): reduction should be handled by CSEProxyr  rf   r   	src_dtypereduction_typere   s        rR   r  zOpOverrides.reductionP  s'     "Dz""##LM
 	
rT   c                D    t        t        |       j                   d      )Nz$: scan should be handled by CSEProxyr  rf   dtypes
combine_fnvaluess       rR   scanzOpOverrides.scan[  s'     "Dz""##GH
 	
rT   c                D    t        t        |       j                   d      )Nz$: sort should be handled by CSEProxyr  rf   r  r  stable
descendings        rR   sortzOpOverrides.sorth  s'     "Dz""##GH
 	
rT   c                D    t        t        |       j                   d      )Nz): bucketize should be handled by CSEProxyr  rf   r  
boundariesboundary_indicesindexing_dtyperightsortersorter_indicess           rR   	bucketizezOpOverrides.bucketizes  s'     "Dz""##LM
 	
rT   c                D    t        t        |       j                   d      )Nz2: halide_clamp only implemented for Halide backendr  )rf   re   r   r  s       rR   halide_clampzOpOverrides.halide_clamp  s%    !Dz""##UV
 	
rT   c                D    t        t        |       j                   d      )Nz): dot only implemented for Triton backendr  )rf   r9  r`  s      rR   dotzOpOverrides.dot  s%    !Dz""##LM
 	
rT   rF   )constraintsr   is_purepackinput_dtypesc               D    t        t        |       j                   d      )Nz<: inline_asm_elementwise only implemented for Triton backendr  )rf   asmr  r   r  r  r  r  s           rR   inline_asm_elementwisez"OpOverrides.inline_asm_elementwise  s'     "Dz""##_`
 	
rT   c                D    t        t        |       j                   d      )Nz.: ops.output should not appear at codegen timeAssertionErrorrG  ry   rn   s     rR   r  zOpOverrides.output  s%    Dz""##QR
 	
rT   c                D    t        t        |       j                   d      )Nz3: ops.placeholder should not appear at codegen timer  rf   r  s     rR   r  zOpOverrides.placeholder  s%    Dz""##VW
 	
rT   c                0     d fd} |_         d|_        |S )Nc                J    t        t        |       j                   d       )Nz does not implement ops.r  )rf   ro   r  rc   s      rR   unimplementedz1OpOverrides._unimplemented.<locals>.unimplemented  s*    %:&&''?vF rT   T)rf   r  ro   r	   r  r	   rt   r  )ry   is_unimplemented)rc   r  s   ` rR   _unimplementedzOpOverrides._unimplemented  s     	
 "&)-&rT   c                p    t        | |d       }t        t        |d       }| xs ||k(  xs t        |dd      S )Nr  F)getattrr1   )r  rc   fn
default_fns       rR   _is_unimplementedzOpOverrides._is_unimplemented  s?    S$%Zt4
vSz)SWR9KU-SSrT   c                P   |dv sJ |       t         j                         D ]  \  }}t        ||      }|/| j                  |      s&t	        | || j                  |             C|| j                  vsJ d| d| j                          ||_        t	        | |t        |              y )N)rk  ri  cppvecrj  r{  zmultiple definitions of z on )	pointwise_overrides_datar  r  r  setattrr  __dict__ry   r   )r  r  funcnamedataimpls        rR   _initialize_pointwise_overridesz+OpOverrides._initialize_pointwise_overrides  s    EEMvME6<<> 
	;NHd4(D|((2C3+=+=h+GHs||3 .xjS\\NK3 !)X|D'9:
	;rT   )r  r  rt   r  )re   zbool | float | intr   r   rt   r  r  )r   r  rt   r  r  r  )rc   rw   r   r  rt   r  TT)
r  r  r   sympy.Expr | intr  r   r  r   rt   sympy.Symbol
r   r   r   r   r  r   r  r   rt   ru   )rc   rw   r  r   rt   r  rj   )
rc   rw   r  r   re   r  rZ   r4   rt   ru   r{  r  rQ   rw   rt   ru   )rc   rw   r  r   re   r  rt   ru   )
r   r   r  r   r  r3   re   OpVarT | tuple[OpVarT, ...]rt   r  )r  tuple[torch.dtype, ...]r  zFCallable[[tuple[OpVarT, ...], tuple[OpVarT, ...]], tuple[OpVarT, ...]]r  tuple[OpVarT, ...]rt   r  )
r  r   r  r  r  r   r  r   rt   r  NN)r  r  r  .tuple[str, sympy.Expr, sympy.Expr, sympy.Expr]r  r  r  r   r  r   r  tuple[str, sympy.Expr] | Noner  zOpVarT | Nonert   r  )re   r  r   r   r  r   rt   r  )r  r  r  rw   r  r   r   r   r  r   r  r   r  ztuple[torch.dtype, ...] | Nonert   r  )ro   r  rt   ru   )r  r   rt   r  )rc   rw   rt   zCallable[..., OpVarT]rc   rw   rt   r   )r  rw   rt   ru   )$ry   rz   r{   r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r7  r  r  r  r  r  r  r  r   rT   rR   r  r    s>   	 	   * * . . B B B B B B C C C C ! ! 5 5 ,, , 	,
 , 
,

&0
9=
FJ
	

 NR

 *
39
AJ
	




	
	
 	
 &		

 +	
 
%	

'


 #
 

	
'	
 #	
 		

 	
 
	
$ 15(,

 C
 !	

 $
 
 .
 &
 




 #'"]]7;

 
  	

 
 
 
 5
 





   T T
 ; ;rT   r  c                  |    e Zd ZU ded<   ded<   dZded<   dZded<   ej                  Zd	ed
<   dZ	ded<   dZ
ded<   y)OverridesDatarw   rc   r   ri  NzCallable[..., str] | Nonerk  r  r   type_promotion_kindrj  r{  )ry   rz   r{   r}   rk  r  r   DEFAULTr  rj  r{  r   rT   rR   r  r    sQ    
I	(,F%,(,F%,'// 8  )-F%,%)C	")rT   r  airy_aic                    d|  dS )Nzairy_ai_forward(r   r   r8  s    rR   rp  rp    s    (1- rT   special_airy_ai)r  ri  rc   	bessel_j0c                    d|  dS )Nzbessel_j0_forward(r   r   r8  s    rR   rp  rp        *1#Q/ rT   c                    d|  dS )Nzlibdevice.j0(r   r   r8  s    rR   rp  rp        =1- rT   special_bessel_j0)r  ri  rk  rc   	bessel_j1c                    d|  dS )Nzbessel_j1_forward(r   r   r8  s    rR   rp  rp    r  rT   c                    d|  dS )Nzlibdevice.j1(r   r   r8  s    rR   rp  rp    r  rT   special_bessel_j1	bessel_y0c                    d|  dS )Nzbessel_y0_forward(r   r   r8  s    rR   rp  rp    r  rT   c                    d|  dS )Nzlibdevice.y0(r   r   r8  s    rR   rp  rp    r  rT   special_bessel_y0	bessel_y1c                    d|  dS )Nzbessel_y1_forward(r   r   r8  s    rR   rp  rp    r  rT   c                    d|  dS )Nzlibdevice.y1(r   r   r8  s    rR   rp  rp    r  rT   special_bessel_y1digammac                    d|  dS )Nzcalc_digamma(r   r   r8  s    rR   rp  rp    s    aS* rT   c                    |  dS )Nz
.digamma()r   r8  s    rR   rp  rp    s    A3j) rT   )r  ri  r  rc   rF  c                    d|  dS )Nzcalc_erfcx(r   r   r8  s    rR   rp  rp        A3a( rT   c                    d|  dS )Nzlibdevice.erfcx(r   r   r8  s    rR   rp  rp    s    +A3a0 rT   special_erfcxrb  c                    d|  d| d| dS )Nz	std::fma(r   r   r   r_  s      rR   rp  rp    s    is"QCr!A6 rT   c                    d|  d| d| dS )Nzfmadd(r   r   r   r_  s      rR   rp  rp    s    s"QCr!A6 rT   c                    d|  d| d| dS )Nztl.fma(r   r   r   r_  s      rR   rp  rp  
  s    2aS1#Q7 rT   )r  ri  r  rk  rc   re  c                    d|  d| dS )Nr(  ) * (r   r   rd  s     rR   rp  rp    s    1QCuQCq) rT   c                Z    t         j                  j                  r	d|  d| dS d|  d| dS )Nr(  r*  r   zlibdevice.mul_rn(r   )r   versionhiprd  s     rR   rp  rp    s<    ==  s%s!,  2aS* rT   igammac                    d|  d| dS Nzcalc_igamma(r   r   r   rd  s     rR   rp  rp        <s"QCq1 rT   igammacc                    d|  d| dS Nzcalc_igammac(r   r   r   rd  s     rR   rp  rp  #      =2aS2 rT   gammaincc                    d|  d| dS r0  r   rd  s     rR   rp  rp  (  r1  rT   special_gammainc	gammainccc                    d|  d| dS r4  r   rd  s     rR   rp  rp  -  r5  rT   special_gammaincci0c                    d|  dS )Nzcalc_i0(r   r   r8  s    rR   rp  rp  2      1o rT   c                    d|  dS Nzlibdevice.cyl_bessel_i0(r   r   r8  s    rR   rp  rp  3      3A3a8 rT   c                    |  dS )Nz.i0()r   r8  s    rR   rp  rp  4  s    A3e rT   )r  ri  rk  r  rc   i0ec                    d|  dS )Nz	calc_i0e(r   r   r8  s    rR   rp  rp  9      	!A& rT   c                    |  dS )Nz.i0e()r   r8  s    rR   rp  rp  :  s    A3f rT   special_i0ei1c                    d|  dS )Nzcalc_i1(r   r   r8  s    rR   rp  rp  ?  r>  rT   c                    d|  dS Nzlibdevice.cyl_bessel_i1(r   r   r8  s    rR   rp  rp  @  rA  rT   
special_i1i1ec                    d|  dS )Nz	calc_i1e(r   r   r8  s    rR   rp  rp  E  rE  rT   special_i1elog_ndtrc                    d|  dS )Nzcalc_log_ndtr(r   r   r8  s    rR   rp  rp  J  s    qc+ rT   special_log_ndtrmodified_bessel_i0c                    d|  dS )Nzmodified_bessel_i0_forward(r   r   r8  s    rR   rp  rp  P      3A3a8 rT   c                    d|  dS r@  r   r8  s    rR   rp  rp  Q  rA  rT   special_modified_bessel_i0modified_bessel_i1c                    d|  dS )Nzmodified_bessel_i1_forward(r   r   r8  s    rR   rp  rp  V  rU  rT   c                    d|  dS rK  r   r8  s    rR   rp  rp  W  rA  rT   special_modified_bessel_i1modified_bessel_k0c                    d|  dS )Nzmodified_bessel_k0_forward(r   r   r8  s    rR   rp  rp  \  rU  rT   special_modified_bessel_k0modified_bessel_k1c                    d|  dS )Nzmodified_bessel_k1_forward(r   r   r8  s    rR   rp  rp  a  rU  rT   special_modified_bessel_k1ndtrc                    d|  dS )Nz
calc_ndtr(r   r   r8  s    rR   rp  rp  g  s    
1#Q' rT   special_ndtrndtric                    d|  dS )Nzcalc_ndtri(r   r   r8  s    rR   rp  rp  l  r#  rT   special_ndtri	polygammac                *    |  d| d|  d| d| d|  dS )Nz == 0 ? calc_digamma(z) : (z == 1 ? trigamma(z) : calc_polygamma(r   r  r   rd  s     rR   rp  rp  q  s8    S%aSaS0A!DWXYWZZ\]^\__ab rT   scaled_modified_bessel_k0c                    d|  dS )Nz"scaled_modified_bessel_k0_forward(r   r   r8  s    rR   rp  rp  y      :1#Q? rT   !special_scaled_modified_bessel_k0scaled_modified_bessel_k1c                    d|  dS )Nz"scaled_modified_bessel_k1_forward(r   r   r8  s    rR   rp  rp  ~  rl  rT   !special_scaled_modified_bessel_k1spherical_bessel_j0c                    d|  dS )Nzspherical_bessel_j0_forward(r   r   r8  s    rR   rp  rp    s    4QCq9 rT   special_spherical_bessel_j0zetac                    d|  d| dS )Nzzeta(r   r   r   rd  s     rR   rp  rp    s    52aS* rT   special_zetachebyshev_polynomial_tc                    d|  d| dS )Nzchebyshev_polynomial_t_forward(r   r   r   rd  s     rR   rp  rp        :1#Rs!D rT   special_chebyshev_polynomial_tchebyshev_polynomial_uc                    d|  d| dS )Nzchebyshev_polynomial_u_forward(r   r   r   rd  s     rR   rp  rp    ry  rT   special_chebyshev_polynomial_uchebyshev_polynomial_vc                    d|  d| dS )Nzchebyshev_polynomial_v_forward(r   r   r   rd  s     rR   rp  rp    ry  rT   special_chebyshev_polynomial_vchebyshev_polynomial_wc                    d|  d| dS )Nzchebyshev_polynomial_w_forward(r   r   r   rd  s     rR   rp  rp    ry  rT   special_chebyshev_polynomial_wlegendre_polynomial_pc                    d|  d| dS )Nzlegendre_polynomial_p_forward(r   r   r   rd  s     rR   rp  rp        9!BqcC rT   special_legendre_polynomial_pshifted_chebyshev_polynomial_tc                    d|  d| dS )Nz'shifted_chebyshev_polynomial_t_forward(r   r   r   rd  s     rR   rp  rp        B1#Rs!L rT   &special_shifted_chebyshev_polynomial_tshifted_chebyshev_polynomial_uc                    d|  d| dS )Nz'shifted_chebyshev_polynomial_u_forward(r   r   r   rd  s     rR   rp  rp    r  rT   &special_shifted_chebyshev_polynomial_ushifted_chebyshev_polynomial_vc                    d|  d| dS )Nz'shifted_chebyshev_polynomial_v_forward(r   r   r   rd  s     rR   rp  rp    r  rT   &special_shifted_chebyshev_polynomial_vshifted_chebyshev_polynomial_wc                    d|  d| dS )Nz'shifted_chebyshev_polynomial_w_forward(r   r   r   rd  s     rR   rp  rp    r  rT   &special_shifted_chebyshev_polynomial_whermite_polynomial_hc                    d|  d| dS )Nzhermite_polynomial_h_forward(r   r   r   rd  s     rR   rp  rp    s    82aSB rT   special_hermite_polynomial_hhermite_polynomial_hec                    d|  d| dS )Nzhermite_polynomial_he_forward(r   r   r   rd  s     rR   rp  rp    r  rT   special_hermite_polynomial_helaguerre_polynomial_lc                    d|  d| dS )Nzlaguerre_polynomial_l_forward(r   r   r   rd  s     rR   rp  rp    r  rT   special_laguerre_polynomial_lzdict[str, OverridesData]r  c                     t         fdt        j                  j                  t        j                  j                  t        j                  j
                  t        j                  j
                  fD              S )Nc              3  &   K   | ]  }|v  
 y wrj   r   )r  r9  rc   s     rR   r  z$is_buffer_removed.<locals>.<genexpr>  s       		   )anyr5   r   removed_bufferskernelinplaced_to_removerc   s   `rR   is_buffer_removedr    sU      GG##HH$$GG&&HH''	
  rT   c                  4     e Zd ZdZd fdZddZddZ xZS )DeferredLinezHA line that can be 'unwritten' by adding name to V.graph.removed_buffersc                V    t         |   |       || _        t        |t              rJ y rj   )r#  r   rc   r1  r$   )rf   rc   liner%  s      rR   r   zDeferredLine.__init__  s+    	d$45555rT   c                F    t        | j                        s| j                  S y rj   )r  rc   r  rk   s    rR   __call__zDeferredLine.__call__  s     +99rT   c                .    t        | j                  |      S rj   )r  rc   )rf   r  s     rR   	_new_linezDeferredLine._new_line  s    DIIt,,rT   )rc   rw   r  rw   r   )r  rw   rt   r  )ry   rz   r{   r|   r   r  r  r/  r0  s   @rR   r  r    s    R6

-rT   r  c                      e Zd ZdddZy)BracesBufferc                H     t         j                  d fd       } |       S )Nc               3    K   t              D ](  } j                  d       xj                  dz  c_        * t               D ](  } xj                  dz  c_        j                  d       * d  t               D ](  } j                  d       xj                  dz  c_        * t              D ](  } xj                  dz  c_        j                  d       * y w)N{rF   })ranger  _indent)_r   rf   s    rR   ctxz BracesBuffer.indent.<locals>.ctx  s     6] "s#!" F7^ $!s#$ F7^ "s#!" 6] $!s#$s   C C#)rt   Iterator[None])
contextlibcontextmanager)rf   r   r  s   `` rR   indentzBracesBuffer.indent  s$    		"	"	$ 
#	$ urT   N)rF   )r   r   rt   z'contextlib.AbstractContextManager[None])ry   rz   r{   r  r   rT   rR   r  r    s    rT   r  c                  "    e Zd ZU ded<   ded<   y)InplacedBufferrw   r   r   other_namesNr   r   rT   rR   r  r    s    OrT   r  c                  .    e Zd ZU ded<   dZded<   ddZy)	ArgNamerw   rc   Fr   is_constexprc                B    | j                    | j                  rd S d S )Nz : tl.constexprr  )rc   r  rk   s    rR   	full_namezArgName.full_name  s*    ))$2C2C.LMMLMMrT   Nrv   )ry   rz   r{   r}   r  r  r   rT   rR   r  r    s    
IL$NrT   r  c                      e Zd ZddZy)
RemovedArgc                     y)NREMOVEDr   rk   s    rR   __str__zRemovedArg.__str__  s    rT   Nrv   )ry   rz   r{   r  r   rT   rR   r  r    s    rT   r  c                     e Zd Ze	 	 	 	 	 	 	 	 dd       ZddZddZedd       ZddZddZ	ddZ
ej                  f	 	 	 	 	 	 	 ddZdd	Zdd
ZddZd dZd!dZd"dZd#dZ	 d$	 	 	 d%dZ	 	 d&dZd'dZd(dZd)dZy)*
KernelArgsc                ~    |j                  |t              }t        |t              r|  t	        |       x||<   }|S |S rj   )rV  r  r1  r  r  )r   odictrc   result
new_results        rR   _lookupzKernelArgs._lookup  sD     $)99T7#;fj)*0#e*'>>E$K*rT   c                J    i | _         i | _        i | _        i | _        g | _        y rj   )input_buffersoutput_buffersinplace_buffersr   workspace_argsrk   s    rR   r   zKernelArgs.__init__!  s)    -/;=GI/124rT   c                    dj                  dj                  t        t        | j                  | j
                  | j                  | j                  g                  S )NzKernelArgs({})r   )formatr   mapr  r  r  r  r   rk   s    rR   __repr__zKernelArgs.__repr__(  sS    &&II**++,,	

 	
rT   c                "    t        | t              S rj   )r1  r  r  s    rR   _buffer_is_marked_removedz$KernelArgs._buffer_is_marked_removed7  s     $
++rT   c                :   t         j                  j                  r4t         j                  j                  j                  j	                  ||      }|t         j                  j
                  vsJ |       || j                  v rt        t        | j                  |         S || j                  v r't        t        | j                  |         j                  S |j                  d      r| j                  d| j                  |      S | j                  d| j                  |      S )Nseedin_ptr)r5   r   r  mutation_real_namerV  r  r  r
   rw   r  r  r   r	  r  r  r  s     rR   inputzKernelArgs.input<  s    7777$$77;;D$GD1772228D824&&&T006774'''(<(<T(BCNNN??6"<<(:(:DAA||Hd&8&8$??rT   c                   t         j                  j                  r4t         j                  j                  j                  j	                  ||      }|t         j                  j
                  vsJ |       || j                  v r't        t        | j                  |         j                  S | j                  d| j                  |      S )Nout_ptr)r5   r   r  r  rV  r  r  r
   r  r   r  r  r  s     rR   r  zKernelArgs.outputH  s    7777$$77;;D$GD1772228D824'''(<(<T(BCNNN||It':':DAArT   c                   |t         j                  j                  v r)t         j                  j                  j                  |       || j                  vsJ |       || j                  v rL| j                  |   }t        |t              rJ |j                  j                  |       || j                  |<   y | j                  j                         D cg c]  }t        |t              s| }}| j                  j                         D cg c]  }t        |t              r| }}t        t        |            t        |      z   }t        d| ||g      }|| j                  |<   || j                  |<   y c c}w c c}w )N
in_out_ptr)r5   r   unaligned_buffersrT  r  r1  r  r  appendr  r  r.   r  )rf   
input_nameoutput_namebufvalalive_buffersr  inplace_buffer_idxs           rR   make_inplacezKernelArgs.make_inplaceP  sk   222GG%%))+6$"6"66CC6---&&z2C!#z222OO"";/03D  -  //668!#z2 M   //668c:. O 
 "%VM%:!;c/>R!R /01[)C 03D  ,03D  -!
s   E3E8c                x   t        |t        j                  |      t        j                  j                         t         j                         |      }t        | j                        D ]  \  }}t         j                  ||      rJ|j                  }t         j                  ||      | j                  |<   |j                  |j                  |fc S |j                  |j                  k7  r|j                  |j                  k7  rJ |        | j                  j                  |       |j                  |j                  dfS )aZ  
        Allocate or extend a workspace buffer of nelem elements.

        This function manages the allocation of a workspace buffer. It either creates
        a new WorkspaceArg or extends an existing one.

        Note:
        - Calling this function will in-place mutate the args by adding or updating
        a WorkspaceArg.
        - The codegen for generating the Python argdefs and call_defs will check
        this field and allocate the buffer accordingly.
        - A new argument "ws_ptr" will be present in the generated code.

        Args:
            nelem (sympy.Expr): The number of elements to allocate.
            zero_fill (bool): Whether to initialize the buffer to zero.
            dtype (torch.dtype): the dtype of the workspace tensor

        Returns:
            Tuple[str, str, int]: A tuple containing:
                - "ws_ptr": A string identifier for the workspace pointer.
                - "workspace_{i}": agraph level unique identifier for
                    the workspace tensor.
                - offset: An integer representing the item offset in the workspace.
        )r   r   r   r   r   r   )r   r   r   r5   r   get_current_device_or_throwr   r  r  r   r   r   r   r   r  )rf   nelemr   r   argr  existing_argr   s           rR   r*  zKernelArgs.workspacel  s   8 '11)<77668#//1
  ))<)<= 	OA|$$\37%++)5):):<)M##A&#..0G0GOO''3>>9 ++s~~= >	 	""3'~~s~~q00rT   c           
        t         j                  j                         }t        |t        j
                  t        j                  dd|j                   d|j                   |      }| j                  D ]*  }|j                  |j                  k(  s||k(  r#J ||f        | j                  j                  |       |j                  S )a  
        Lazily allocate a graph-wide semaphores buffer with at least min_size.  This is a single buffer shared by
        all kernels and zero initialized once at graph start.  Each kernel must leave the buffer zeroed on exit.

        Warning: multiple calls to this function will return the same buffer.

        Args:
            min_size: the number of int32 semaphores required

        Returns:
            name of the semaphores buffer
        sem_ptrsemaphores_r  )r   r   r   r   r   r   )r5   r   r  r   r   r   r   uint32rG  r  r  r   r  )rf   min_sizecurrent_devicer  r  s        rR   
semaphoreszKernelArgs.semaphores  s     <<>'66,, $^%8%8$9>;O;O:PQ!
 !// 	@L&&#..8l*?S,,??*	@ 	""3'~~rT   c                f   t        |t              sJ t        |      |f       t        j                  |      }|| j
                  v r| j
                  |   S | j
                  j                         v r0 t        fd| j
                  j                         D               | j
                  |<   S )Nc              3  F   K   | ]  }|j                        sd   yw)rF   N)r	  )r  r  rc   s     rR   r  z)KernelArgs.seed_offset.<locals>.<genexpr>  s     U1!,,tBTQUs   !!)r1  r   rG  r   r  r   r  sum)rf   rc   re   s    ` rR   seed_offsetzKernelArgs.seed_offset  s    %%;UU';;%e$DMM!==''4==''))&U(<(<(>UUVW   $erT   c                    t        |t        j                        sJ t        |      |f       |j                  dk(  rd| j
                  |<   y| j                  d| j
                  |      S )Nr  ks)r1  r   SymbolrG  rc   r   r  r  s     rR   r   zKernelArgs.size  sX    $-AT
D/AA-99"(DMM$||D$--66rT   c                    t        | j                  j                         | j                  j                         | j                  j                               S rj   )r   r  keysr  r   rk   s    rR   
call_nameszKernelArgs.call_names  sA    ##%t':':'?'?'A4==CUCUCW
 	
rT   c                   | j                   j                  |d      }|t        |t              s|j                  S | j
                  j                  |d      }|t        |t              s|S | j                  j                  |d      S )z;
        Returns inner name of a given outer name.
        N)r  rV  r1  r  r   r  r  )rf   rc   inplacedr  s       rR   arg_namezKernelArgs.arg_name  s}     ''++D$7
8Z(H&&&))--dD9":k:+N!!%%dD11rT   c                    |S rj   r   )rf   r  r   s      rR   wrap_ptr_argzKernelArgs.wrap_ptr_arg  s    
rT   c                    t        |      S rj   r  )rf   r   s     rR   wrap_size_argzKernelArgs.wrap_size_arg  s    4yrT   Nc                   ddl m} |ddl m} |}g }g }g }t        | j                  j                               D ]  }t        |t              r|j                  d   }|j                  }	t        j                  j                  |      }
||
   }|j                  | d|	        |j                  | j                  ||
             |j                  | d        | j                  j!                         D ]  \  }}	|| j                  v rt        j                  j                  |      }
||
   }|j                  d| d|	        |j                  | j                  ||
             |j                  d| d        | j"                  j!                         D ]  \  }}|| j                  v st        |t              r%t        j                  j                  |      }
||
   }|j                  | d|        |j                  | j                  ||
             |j                  | d        | j$                  j!                         D ]  \  }}	t        |t&        j(                        r@t+        |t,        j.                        r&|j                  d|	        |j                  d	       n+|j                  d| d
|	        |j                  d|        |j                  | j1                  |             t        j                  j2                  st        j                  j2                  j5                  |        | j6                  rJ d       |||fS )NrF   )
INDEX_TYPE)r  r  z* *zconst zconst float zconst float zWorkspace not supported on CPU )r  r  r  r.   r  r  r1  r  r  r   r5   r   r   r  r  r  r  r  r   r   r   r   r   UNBACKED_FLOATr
  wrapper_codeensure_size_computedr  )rf   dtype_to_cpp_typer  r  	call_argsarg_defs	arg_typesr  outerinnerr   	cpp_dtypemaybe_inners                rR   cpp_argdefszKernelArgs.cpp_argdefs  s    	*$/ ,		t33::<= 		.H(J/((,E''EGG%%e,E)%0IOOykE734T..ue<=	{!_-		. !..446 	4LE5,,,GG%%e,E)%0IOOfYKr%9:T..ue<=vi[23	4 #'"5"5";";"= 	.E;,,,
;
0SGG%%e,E)%0IOOykK=9:T..ue<=	{!_-	. !MM//1 	ALE5%.>++4 ,ug 67  /&AeW =>  6*!67T//67ww##$$99%@	A &&I(II&I--rT   c                   g }g }g }g }t        | j                  j                               D ]  }t        |t              r|j                  t        |j                               |j                  |j                  d          |j                  t        j                  j                  |j                  d                |j                  t        |j                  |j                  d   t        j                  j                  |j                  d                       t        | j                  j                         | j                   j                               D ]  \  }}|| j                  v st        |t              r%|j                  t        |             |j                  |       |j                  t        j                  j                  |             |j                  t        ||t        j                  j                  |                    | j"                  j                         D ]  \  }}|j                  t        |             |j                  |       |j                  t%        |             |j                  t'        ||             t        j                  j(                  st        j                  j(                  j+                  |        | j,                  D ]m  }|j                  t        |j                               |j                  |j.                         |j                  |       |j                  |j0                         o ||||fS )Nr  )rc   r   r   )r.   r  r  r1  r  r  r  r   r  r5   r   r   r   r   r  r  r  r   rG  r   r  r  r  r   r   )	rf   r  r  r  precompile_argsr  r  r  r  s	            rR   python_argdefszKernelArgs.python_argdefs  s    #%!	!	/1t33::<= 	H(J/OOGH$7$789X11"56QWW..x/C/CB/GHI""!,,#//3''++H,@,@,DE	 "$$&%%'
 	LE5
 ,,,
5*0MOOGEN+U#QWW..u56"" ''++E2	" !MM//1 	ALE5OOGEN+U#T%[)""75%#89ww##$$99%@	A && 	(COOGCNN34S^^,""3'SYY'		(
 OY>>rT   c              #    K   t        | j                  j                               D ]  }t        |t              r|j
                  D ]  }|t        j                  j                  v s|t        j                  j                  v r<|| j                  v r| j                  |   |j                  f || j                  v svt        t        | j                  |         |j                  f   y wrj   )r.   r  r  r1  r  r  r5   r   r  r  r  r   r  r
   rw   )rf   r  others      rR   aliaseszKernelArgs.aliasesK  s     t33::<= 	UH(J/!-- 	UQWW777 ; ;;D...,,U3X5H5HHHD///sD$7$7$>?ATATTT	U	Us   B9C,<0C,c                    t        | j                  j                  |t              t              xr. t        | j
                  j                  |t              t              S rj   )r1  r  rV  r  r  r  r  s     rR   
is_removedzKernelArgs.is_removedZ  sK    ##D'2J
 N--11$@*M	NrT   c                l   t               }t        | j                  j                               D ]1  }t	        |t
              r|j                  |j                  d          3 | j                  j                         D ]5  \  }}|| j                  v st	        |t
              r%|j                  |       7 |S )Nr  )
r   r.   r  r  r1  r  rT  r  r  r  )rf   	live_outsr  r  r  s        rR   live_output_bufferszKernelArgs.live_output_buffersb  s    %/\	t33::<= 	4H(J/MM(..r23	4 !//557 	!LE5,,,
5*0MMM% 	! rT   )r   rw   r  z*dict[_T, str | RemovedArg] | dict[_T, str]rc   rI   rt   rw   rs   rv   )rc   r	   rt   r   r,  )r  rw   r  rw   rt   ru   )r  r   r   r   r   r   rt   ztuple[str, str, int])r  r   rt   rw   )rc   rw   re   r   rt   rw   )rc   r  rt   rw   )rt   zIterator[str])rc   rw   rt   r   )r  rw   r   r   rt   rw   )r   
SymbolLikert   rw   rj   )r  zdict[torch.dtype, str] | Nonert   z&tuple[list[str], list[str], list[str]])rt   z?tuple[list[ArgName], list[str], list[KernelArgType], list[Any]])rt   zIterator[tuple[str, str]]r  )rt   zOrderedSet[str])ry   rz   r{   r   r  r   r  r  r  r  r  r   r   r*  r  r  r   r  r  r  r
  r  r  r   r"  r%  r   rT   rR   r  r    s    		9	 	 
		 	5
 , ,
@B4: HM{{-1-1,0-19D-1	-1^87


2 BF4.!>4.	/4.l1?	H1?fUN
rT   r  c                  `     e Zd ZdZ	 	 d	 	 	 	 	 	 	 d	 fdZd
dZddZddZddZd
dZ	 xZ
S )r  aD  A CSEVariable is just a name for an expression but it is useful to be able to annotate them on a backend dependent basis.
    To do so, the backends can simply overload `Kernel.create_cse_var`
    The "CSEVariable.update_on_args" method gives you a hook for annotations
    See example of TritonCSEVariable in triton.py
    c                    t         |           t        |t              sJ t	        |             || _        || _        d| _        || _        || _	        y r@  )
r#  r   r1  r   rG  rc   bounds	use_countr   r  )rf   rc   r)  r   r  r%  s        rR   r   zCSEVariable.__init__v  sL     	&+.<V<.	

rT   c                    | j                   S rj   r  rk   s    rR   r  zCSEVariable.__str__  s    yyrT   c                ,    t        | j                        S rj   )hashrc   rk   s    rR   __hash__zCSEVariable.__hash__  s    DIIrT   c                X    t        |t              xr |j                  | j                  k(  S rj   )r1  r  rc   )rf   r  s     rR   __eq__zCSEVariable.__eq__  s!    %-I%**		2IIrT   c                     y rj   r   )rf   rc   ro   r  s       rR   update_on_argszCSEVariable.update_on_args  s    rT   c                N    | j                   j                   d| j                  dS r'  )r%  ry   rc   rk   s    rR   r  zCSEVariable.__repr__  s$    ..))*!DII=::rT   r  )rc   rw   r)  ValueRanges[Any]r   r   r  rE   rv   )rt   r   )r  objectrt   r   )rc   rw   ro   r	   r  r	   rt   ru   )ry   rz   r{   r|   r   r  r.  r0  r2  r  r/  r0  s   @rR   r  r  o  sU     %) $ ! "	
 J;rT   r  AugmentedKeyT)default)boundr7  .c                  P   e Zd ZdZ	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 ddZddZddZddZddZddZ	dd	Z
dd
ZddZ ej                         ddddd	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZ ej                         ddf	 	 	 	 	 	 	 ddZ ej                         ddf	 	 	 	 	 	 	 	 	 ddZy)CSEz Common subexpression eliminationNc                    || _         || _        i | _        || _        |xs i | _        |xs i | _        |xs t        j                         | _        t               | _
        |xs i | _        y rj   )r   r[   _cachename_prefixstore_cachereduction_cache	itertoolsr   iter_buffer_idsr   invalidated_storesvarname_map)rf   r   r[   r=  iter_buffersr>  r?  rC  s           rR   r   zCSE.__init__  sm     FH&ALARPR!r 	 6B5VY__EV3=<7B7HbrT   c                6   g | j                   j                         D ]2  \  }}||vs| j                   |= | j                  j                  |       4 |r9| j                  j                         D ci c]  \  }}||v s|| c}}| _        y i | _        y c c}}w rj   )r>  r  rB  rT  r<  )rf   	keep_varsrc   tmpr  r  s         rR   
invalidatezCSE.invalidate  s    44++1134 	2ID#)#$$T*''++D1	2 ,0KK,=,=,?RDAq1	>1a4RDKDK Ss   1B>Bc           	          t        |       | j                  | j                  | j                  | j                  | j
                  | j                  | j                        S )N)r   r[   r=  rD  r>  rC  r?  )rG  r   r[   r=  rA  r>  rC  r?  rk   s    rR   clonez	CSE.clone  sP    tDz;;;;((--(((( 00
 	
rT   c                    | j                         }t        | j                        |_        t        | j                        |_        t        | j                        |_        |S )zNReturn a copy of using ScopedDict so changes to *_cache aren't visible in self)rJ  r)   r<  r?  r>  )rf   new_cses     rR   scoped_copyzCSE.scoped_copy  sH    **,#DKK0",T-A-A"B()9)9:rT   c                "    t        t        |      S )z@Override this method to augment cache key with backend specifics)r
   r6  rf   	cache_keys     rR   augment_keyzCSE.augment_key  s    M9--rT   c                @    || j                   | j                  |      <   y rj   r<  rQ  )rf   rP  r  s      rR   putzCSE.put  s    36D$$Y/0rT   c                <    | j                  |      | j                  v S rj   )rQ  r<  rO  s     rR   containszCSE.contains  s    	*dkk99rT   c                X    | j                   j                  | j                  |      d       S rj   )r<  rV  rQ  rO  s     rR   try_getzCSE.try_get  s"    {{t//	:DAArT   c                >    | j                   | j                  |         S rj   rS  rO  s     rR   rV  zCSE.get  s    {{4++I677rT   T)r)  rd   
assignmentr   r  c          	        t        |t              r|j                  }|s|sJ t        |t              rE|j                  j                  |      |_        |xj                  dz  c_        t        t        |      S t        |t              r|j                         }n1t        |t              r|j                  }nt        |t              sJ |}| j                  |      }	||sd}|	s| j                  |||      }	| j!                  ||	       |rt"        j$                  j&                  r+t"        j$                  j&                  j)                  |d       t        |t              rP|r |j+                  | j,                   |	 d       |j/                  |       |j+                  | j0                         |	S t        |t              rM|sJ |j+                  |j3                  | j,                   |	 d|j                   | j0                                |	S |r | j,                   |	 d| | j0                   }
n| | j0                   }
|j+                  |
       |rPt4        j6                  j8                  st4        j6                  j:                  r|t=               dk7  rt?        ||	|       |	S |	j                  j                  |      |	_        |	xj                  dz  c_        |	S )NrF   r   T)	only_oncez =z = ri  ) r1  r2   re   r  r)  tightenr*  r
   r  r'   getvaluer$   r  rw   rX  newvarrT  r5   r  current_nodecodegen_originating_infor  r   splicer[   r  r   r  r  r  r&   r  )rf   r   r   r)  rd   rZ  r   r  rP  r  r  s              rR   generatezCSE.generate  sl    dH%::D
""dK( ++--f5DKNNaN..n-I./		IdC(((Ill9%= E++feU3CHHY$88((HH))BB$ C  dN3!((DKK=R)@AMM$'$$T[[1: 
9  &67%%:$$$++se3tyyk$++'WX4 
- ""&++se3tfT[[MJ"&}5$$T* #"//KK%22JJ!-/1U:#FC7 
 ++F3CJMMQM
rT   c                    | j                    t        | j                         }t        j                  j                  ||||      }|| j                  |<   |S rj   )r=  r   rA  r5   r  create_cse_varrC  )rf   r)  r   r  var_namer  s         rR   r_  z
CSE.newvar4  sT     &&'T-A-A(B'CDhh%%huE%("
rT   c                    t        j                  | j                  vfd       t        j                  j                  |||      }|| j                  <   |S )Nc                     d  S )Nzduplicate name: r   r  s   rR   rp  zCSE.namedvar.<locals>.<lambda>G  s    4DTF2K rT   )r   _check_valuerC  r5   r  re  )rf   rc   r)  r   r  r  s    `    rR   namedvarzCSE.namedvar?  sU     	(((*K	
 hh%%dFE5A!$
rT   )r  r  rG  NNNN)r   rw   r[   rw   r=  rw   rD  zitertools.count[int] | Noner>  z+MutableMapping[str, CSEVariableType] | Noner?  z9MutableMapping[ReductionCacheKey, CSEVariableType] | NonerC  z!dict[str, CSEVariableType] | None)rF  zOrderedSet[CSEVariable]rt   ru   rt   r   )rP  rw   rt   r6  )rP  rw   r  r  rt   ru   )rP  rw   rt   r   )rP  rw   rt   zCSEVariableType | None)rP  rw   rt   r  )r   r'   r   z@str | CSEVariable | OpsValue | IndentedBuffer | DeferredLineBaser)  r4  rd   r   rZ  r   r   r   r  rE   rt   r  )r)  r4  r   r   r  rE   rt   r  )
rc   rw   r)  r4  r   r   r  rE   rt   r  )ry   rz   r{   r|   r   rH  rJ  rM  rQ  rT  rV  rX  rV  r   unknownrc  r_  rj  r   rT   rR   r:  r:    s   *  48CG9=II I 	I
 2I AII 7I.	
.7:B8 $7;#6#6#8$( $KK OK
 !K K K "K K 
K^ $7;#6#6#8$( $		 	 "	 		
 
	 $7;#6#6#8$( $ ! "	
  
rT   r:  c                  0     e Zd Zd fdZddZddZ xZS )CodeGenc                T    t         |           t        j                         | _        y rj   )r#  r   r  	ExitStack
exit_stackrf   r%  s    rR   r   zCodeGen.__init__O  s    $..0rT   c                :    | j                   j                          | S rj   )rq  	__enter__rk   s    rR   rt  zCodeGen.__enter__S  s    !!#rT   c                >    | j                   j                  |||       y rj   )rq  __exit__)rf   exc_typeexc_valexc_tbs       rR   rv  zCodeGen.__exit__W  s      7F;rT   rs   rk  rw  r	   rx  r	   ry  r	   rt   ru   )ry   rz   r{   r   rt  rv  r/  r0  s   @rR   rn  rn  N  s    1<rT   rn  c                  T    e Zd ZU dZded<   dZded<   dZded<   	 d#	 	 	 	 	 d$ fdZej                  d%d	       Z
ej                  	 	 d&	 	 	 	 	 	 	 d'd
       Z	 	 	 	 	 	 	 	 d(dZd)dZd)dZd*dZ	 d+	 	 	 	 	 	 	 	 	 d,dZd-dZ	 	 	 	 	 	 	 	 	 	 d.dZ	 	 	 	 	 	 	 	 	 	 d/dZ	 	 	 	 	 	 	 	 d0dZ	 	 	 	 	 	 	 	 	 	 d1dZd2dZ	 	 d&	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d3dZed4d       Z	 d+	 	 	 	 	 	 	 	 	 d5dZ	 	 	 	 	 	 	 	 	 	 d6dZd7dZd8 fdZd9 fdZd:dZd;dZ d;dZ!	 	 	 	 d<d Z"d=d!Z#d>d"Z$ xZ%S )?Kernelr  rw   newvar_prefixr[   Nz$Callable[[], OpsHandler[Any]] | None	overridesc                4   t         |           |rt        xj                  dz  c_        |xs
 t	               | _        t               | _        t               | _        t               | _	        d| _
        d| _        d| _        d| _        t        | j                  | j                         | _        t%               | _        t%               | _        d | _        d | _        d | _        d | _        t%               | _        t%               | _        i | _        d| _        d | _        y )NrF   Fr   )r#  r   r   generated_kernel_countr  ro   r'   loadscomputestoresatomic_add_foundnum_load	num_storenum_reductionr:  r}  r[   cser   must_keep_buffersstore_buffer_names
_load_mask_load_otherr`  node_to_boundsr  r  inplace_update_buffersmin_elem_per_threadkernel_name)rf   ro   increase_kernel_countr%  s      rR   r   zKernel.__init__`  s     	 **a/*(JL	#%
%'$& %.1$2D2Ddkk.R2<,3=<&*/326LP0:3=<
 79##$ '+rT   c              #     K   | j                   }|| _         |j                  j                         j                         | _        	 d  || _         y # || _         w xY wwrj   )r`  r  r)  
get_boundsr  )rf   r  priors      rR   set_current_nodezKernel.set_current_node  sO     !! "jj//1<<>	& %DDs   AAA A	AAc              #    K   ||}|d u x}r
t               }| j                  }| j                  }| j                  }| j                  }|| _        || _        || _        |j                         | _        	 d  || _        || _        || _        || _        |r
|rJ d       y y # || _        || _        || _        || _        |r
|rJ d       w w xY ww)Nz$unexpected store inside swap_buffers)r'   r  r  r  r  rM  )	rf   lbcbsbdisallow_storesr  r  r  r  s	            rR   swap_bufferszKernel.swap_buffers  s      :B Dj(?(!B

,,hh
??$		FDJ"DL DKDHEEEv2  DJ"DL DKDHEEEv2 s   A/C2B 6)C*C		Cc                     y)zOverride kernel emission. Return True if overridden, False to use default.

        External template handlers (e.g. Helion) can override this method
        to implement custom kernel emission to the wrapper.
        Fr   )rf   r  src_coder  node_schedulekernel_pathget_kernel_metadatas          rR   emit_kernel_overridezKernel.emit_kernel_override  s     rT   c                    t         rj   r  r  s      rR   r  zKernel.load  r  rT   c                    | j                   }	 | j                  | _         | j                  ||      || _         S # || _         w xY w)z+A load the depends on an index we have read)r  r  r  )rf   rc   r  r  s       rR   indirect_loadzKernel.indirect_load  s8    

	DJ99T5)DJDJs	   "8 	Ac                    t         rj   r  r  s       rR   r  zKernel.store_reduction  r  rT   c                    t         rj   r  r  s        rR   r  zKernel.store  
     "!rT   c                D    t        t        |       j                   d      r  r  r  s      rR   r  zKernel.device_assert_async  r  rT   c                    t         rj   r  r  s        rR   r  zKernel.reduction  
     "!rT   c                    t         rj   r  )rf   rc   r  re   
extra_metas        rR   partial_accumulatezKernel.partial_accumulate  r  rT   c                    t         rj   r  r  s       rR   r  zKernel.scan  s
     "!rT   c                    t         rj   r  r  s        rR   r  zKernel.sort  r  rT   c                    t         rj   r  rk   s    rR   
var_rangeszKernel.var_ranges  r  rT   c                    t         )z3
        See [Note: Inductor bucketize op]
        r  r  s           rR   r  zKernel.bucketize  s
     "!rT   c                    t         rj   r  rk   s    rR   assert_functionzKernel.assert_function	  s    !!rT   c           	     v   t        |t              rt        |      }t        |t              sJ t        |             |t        |t              sJ |t        |t              sJ |r|rd| d| d| d| d	}| d| d| }n|r
| d| }|}n|sJ | d| }|}|r	d| d| d}| j                   d| d| dS )	Nr(  z <= r  z < r   z) | ~(z, "index out of bounds: z"))r1  r  rw   rG  r  )rf   r  r  r  maskr{  
cond_prints          rR   indirect_assertzKernel.indirect_assert	  s    c;'c(C#s#.T#Y.#}
5# 666}
5# 666U ugT#eC5E7!<D!7$se3ug6JWD&DJL5U#eW%DJtfF4&*D&&'q.FzlRTUUrT   c                    t         rj   r  r  s        rR   r  zKernel.check_bounds0	  r  rT   c                    t         rj   r  r  s     rR   index_to_strzKernel.index_to_str5	  r  rT   c           	     (   t         |           | j                  sJ | j                  j	                  t        j                  t        | | j                                            | j                  j	                  t        j                  |              | S rj   )	r#  rt  r~  rq  enter_contextr5   set_ops_handlerCSEProxyset_kernel_handlerrr  s    rR   rt  zKernel.__enter__8	  sl    ~~~%%htT^^-=>?	
 	%%a&:&:4&@ArT   c                H    | j                          t        | 	  |||       y rj   )remove_kernel_local_buffersr#  rv  )rf   rw  rx  ry  r%  s       rR   rv  zKernel.__exit__A	  s     ((*7F3rT   c                   t         j                  j                  syt        fd| j                  D              }t               | j                  D ]c  }|| j
                  vs|| j                  j                  vs+j                  ||      s>| xj                  dz  c_	        j                  |       e D ]  }|| j                  j                  v rw| j                  j                  |   }t        |t              rEt        fd|j                  D              }|r| j!                  |       | j"                  j                  |       | j%                  |        y)z
        Any buffers that are both created and have a last use in the
        same kernel can be removed.

        Note that V.graph.scheduler can be None when codegening triton template
        kernels.
        Nc              3  t   K   | ]/  }|j                   v rj                   |   j                          1 y wrj   )name_to_bufdefining_op_name)r  r  r  s     rR   r  z5Kernel.remove_kernel_local_buffers.<locals>.<genexpr>P	  s;      &
i+++ !!#&779&
s   58rF   c              3  &   K   | ]  }|v  
 y wrj   r   )r  r  names_to_removes     rR   r  z5Kernel.remove_kernel_local_buffers.<locals>.<genexpr>f	  s     KaQ/1Kr  )r5   r   r  r   r  r  ro   r  $can_buffer_be_removed_through_fusionr  rT  r  r1  r  r  r  remove_inplace_bufferr  remove_buffer)rf   fused_node_namesrc   r  rb   r  r  s        @@rR   r  z"Kernel.remove_kernel_local_buffersE	  s3    GG%%	% &
..&
 

 ,6<++ 		*DD222		 7 77BB* !###D)		* $ 
	)Dtyy000ii//5c:.K3??KK..t4''++D1""4(
	)rT   c                    t         j                  d|       t        | j                  j                  |<   | j
                  j                  |       y )Nzremove_buffer(%r))rL  rP   r  ro   r  r  rT  r  s     rR   r  zKernel.remove_bufferm	  s;     			%t,)0		  &  &rT   c                    t         j                  d|       t        | j                  j                  |<   | j
                  j                  |       y )Nzremoving_inplace_buffer(%r))rL  rP   r  ro   r  r  rT  r  s     rR   r  zKernel.remove_inplace_bufferu	  s9    		/6*1		!!$'  &rT   c           
        t        |t        t        f      r|D cg c]  }| j                  |       c}S t        j
                  j                  j                  |      }t        |j                  d       }|D ci c]f  }t        |t        j                  t        j                  t        j                  t        j                  f      r|| j                   j#                  |      h }}t%        ||      S c c}w c c}w )Nc                    | j                   S rj   r  )ss    rR   rp  z(Kernel.rename_indexing.<locals>.<lambda>	  s
    !&& rT   )r  )r1  listtuplerename_indexingr5   r   r   r  sortedfree_symbolsr   r   UNBACKED_INTSIZEPRECOMPUTED_SIZEr  ro   r   r,   )rf   r  r9  sorted_symbolsreplacementss        rR   r  zKernel.rename_indexingz	  s    
 edE]+5:;D((+;;  ))%0 2 28HI $
%%II))''	 tyy~~a  
 
 %..! <
s   C4;A+C9c                    t        |i |S rj   )r  )rf   ro   r  s      rR   re  zKernel.create_cse_var	  s    D+F++rT   c                Z    |y| j                   j                  |j                               S )zC
        Returns arg name of a given input or output node.
        N)ro   r  r   )rf   r  s     rR   r  zKernel.arg_name	  s'     <yy!!$--/22rT   )NT)ro   zKernelArgs | Noner  r   rt   ru   )r  rD   rt   r  r  )r  r'   r  IndentedBuffer | Noner  r  rt   r  )r  rw   r  rw   r  rw   rt   r   rc   rw   r  r   rt   r  rc   rw   r  r   re   r  rt   ru   rj   
rc   rw   r  r   re   r  rZ   r4   rt   ru   r  
r   r   r  r   r  r3   re   %CSEVariable | tuple[CSEVariable, ...]rt   r  )
rc   rw   r  r3   re   r  r  dict[str, Any]rt   ru   r  r   r  zUCallable[[tuple[CSEVariable, ...], tuple[CSEVariable, ...]], tuple[CSEVariable, ...]]r  tuple[CSEVariable, ...]rt   r  
r  r   r  r  r  r   r  r   rt   r  )rt   zdict[sympy.Symbol, sympy.Expr]r  r  r  r  r  r  r  r   r  r   r  r  r  zCSEVariable | Nonert   r  rv   )
r  zCSEVariable | strr  r   r  r   r  zCSEVariable | str | Nonert   rw   r  )r  r   rt   rw   rk  rz  rs   )rc   rw   rt   ru   )r  z6list[sympy.Expr] | tuple[sympy.Expr, ...] | sympy.Exprrt   r   )ro   r	   r  r	   rt   r  )r  r?   rt   r   )&ry   rz   r{   r}  r}   r[   r~  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r~   r  r  r  r  rt  rv  r  r  r  r  re  r  r/  r0  s   @rR   r|  r|  [  s   M3FC6:I3: MQ#,%#,EI#,	#,J & &  %)$(	FF "F "	F
 
F F:  	  
 "" SW"" *"3>"FO"	"


"" " &	"
 5" 
/""" &" 	"
 #" 
""'"
" (" 
!""'" (" 	"
 " 
!"" 15-1"" C" &	"
 $" " ." +" 
" " " *.VV V 	V
 'V 
V<""&0"9="FJ"	"
"4&)P''
/K/	/0,3rT   r|  c                  8    e Zd ZU dZded<   dZded<   dZded	<   y)
r  r  zClassVar[str]r  Nr   r   r  rw   ops_name)ry   rz   r{   r  r}   r   r  r   rT   rR   r  r  	  s!    "C" $E$HcrT   r  c                 b    	 dd l } | j                  | j                        S # t        $ r Y y w xY w)Nr   )	undefined)jinja2EnvironmentStrictUndefinedImportError)r  s    rR   
jinja2_envr  	  s?    !!,, " 
 	
  s   " 	..c                      e Zd ZdZe	 d	 	 	 	 	 	 	 dd       Zedd       Ze	 	 	 	 dd       ZdddZe	dd       Z
e	dd       Zdd	Z	 	 	 	 	 	 dd
ZddZy)KernelTemplatezj
    Base class for defining kernel templates.

    Children classes: TritonTemplate, CUTLASSTemplate
    c                    | j                  d      }t        |      dkD  r|dd  D cg c]  }d|z  |z  |z    c}|dd  dj                  |      S c c}w )NTrF   r  r  )
splitlinesr  r   )sourcenum_indentsindents_spacinglinesr  s        rR   indent_except_firstz"KernelTemplate.indent_except_first	  sd     !!$'u:>INqrAE&4<E!"I wwu~s   Ac                    t               }|y t        j                  |j                  d<   ddlm} 	 |j                  |       S # |$ r} G d d|      } ||      |d }~ww xY w)Nr  r   )TemplateSyntaxErrorc                  (     e Zd Zd fdZddZ xZS )IKernelTemplate._template_from_string.<locals>.DetailedTemplateSyntaxErrorc                    t         |   |j                  |j                  |j                  |j
                         || _        y rj   )r#  r   messagelinenorc   filenameoriginal_error)rf   r  r%  s     rR   r   zRKernelTemplate._template_from_string.<locals>.DetailedTemplateSyntaxError.__init__	  s>    G$&..&--&++&//	 +9D'rT   c                F   d| j                    d}|d| j                   dz  }t        | j                  d      r| j                  j                  j                  d      }|dz  }t        d| j                   dz
        }t        t        |      | j                   dz         }t        ||      D ]s  }|| j                   dz
  k(  rN||dz    d	||    dz  }t        | j                  d
      s=|dd| j                  j                  dz
  z  z   dz   z  }c||dz    d||    dz  }u |S )NzError in template at line 
zError message: r  z	Context:
r   r   rF   z: --> columnz     r  z^
z:     )r  r   r"  r  r  splitmaxminr  r  r  )rf   
error_infor  startendr  s         rR   r  zQKernelTemplate._template_from_string.<locals>.DetailedTemplateSyntaxError.__str__	  sA   #=dkk]"!MJODLL>"DDJt22H= $ 3 3 : : @ @ F"l2
 #At{{Q 7!#e*dkkAo>!&uc!2 
KA DKK!O3 *QveAhZr.J J
#*4+>+>#I$.(/*-1D1D1K1Ka1O*P)Q*/)0%&J !+QveAhZr.J J

K &%rT   )r  r  rt   ru   rv   )ry   rz   r{   r   r  r/  r0  s   @rR   DetailedTemplateSyntaxErrorr  	  s    9&rT   r  )r  r  r  filtersr  r  from_string)r  envr  er  s        rR   _template_from_stringz$KernelTemplate._template_from_string	  sj    l;-;-O-O)*.#	8??6**" !	8&.A &> .a0a7C!	8s   A A!AA!c                   t         j                  j                  t        | t        t
        f      r.| D ci c]!  }|j                         |j                         # c}n | j                         | j                         idfd}|S c c}w )Nc                >    j                  |       }||S  |       S rj   )rV  )rc   r  _get_dtype_reallookups     rR   r   z1KernelTemplate._fake_get_dtype.<locals>.get_dtype	  s'    ZZ%F!"4((rT   )rc   rw   rt   r   )r5   r   r   r1  r  r  r   )	fake_outsr  r   r  r  s      @@rR   _fake_get_dtypezKernelTemplate._fake_get_dtype	  sr     ''++i$/AJK#cllncmmo5KF((*I,?,?,ABF	)  Ls   &B
Nc                     || _         || _        y rj   )rc   _hash)rf   rc   r-  s      rR   r   zKernelTemplate.__init__
  s    	
rT   c                    | j                   S )a  
        entry point to override for templates to ensure a uid e.g. through a prefix

        the purpose of this is that every KernelTemplate/ExternKernelChoice is unique
        in the system, but reproducible e.g. restarting pytorch should yield the same id
        r  rk   s    rR   uidzKernelTemplate.uid
  s     yyrT   c                    | j                   S )a  
        source hash for a Template.

        Templates can optionally provide a src hash to make it easier to cache/validate that
        a template has not changed from one version to another. Override this if that detection
        is different for your specific Template
        )r  rk   s    rR   src_hashzKernelTemplate.src_hash
  s     zzrT   c                X    g } | j                   |fi |}|t        |      dk(  r|d   S y)z
        Maybe generates a new ChoiceCaller and returns it, or None if generation fails.

        kwargs: Additional kwargs to be passed to self.generate() to generate a new ChoiceCaller.
        NrF   r   )maybe_append_choicer  )rf   r  temp_choicesr  s       rR   choice_or_nonezKernelTemplate.choice_or_none
  s>     #%))),A&A>c,/14?"rT   c                   	 |j                   | j                  di |       y# t        $ rQ}t        j	                  d|t        |       t        j                         t        j                  k         |cY d}~S d}~ww xY w)a%  
        Maybe generates a new ChoiceCaller and appends it into existing choices.
        Returns None if success, otherwise returns the error.

        choices: A list of ChoiceCallers.
        kwargs: Additional kwargs to be passed to self.generate() to generate a new ChoiceCaller.
        Nz3Cannot Append Choice: %s. KernelTemplate type is %s)
stack_infor   )	r  rc  r   rL  inforG  getEffectiveLevelrN   INFO)rf   choicesr  r  s       rR   r   z"KernelTemplate.maybe_append_choice)
  sn    
	NN=4==2623" 	HHET
002W\\A	   H	s   !$ 	A>AA93A>9A>c                    t         )zM
        Generates a ChoiceCaller instance from the given arguments.
        r  )rf   r  s     rR   rc  zKernelTemplate.generate@
  s
    
 "!rT   )   )r  rw   r  r   r  r   rt   rw   )r  rw   rt   r	   )r  zlist[Buffer] | Bufferrt   zCallable[[str], torch.dtype]rj   )rc   rw   r-  r   rt   ru   rv   r   )r  r	   rt   zChoiceCaller | None)r(  rx   r  r	   rt   zNotImplementedError | None)r  r	   rt   r=   )ry   rz   r{   r|   r   r  r  r  r   r~   r  r  r"  r   rc  r   rT   rR   r  r  	  s     >?"%8;	  *8 *8X (	% "    
 ,/	#."rT   r  c                  6    e Zd ZdZd Zd fdZddZddZ	 	 d	 	 	 	 	 	 	 	 	 ddZ	 	 	 	 	 	 	 	 	 	 ddZ	ddZ
ddZ	 d	 	 	 	 	 	 	 	 	 dd	Zdd
ZddZddZ	 	 	 	 	 	 	 	 	 	 ddZ	 	 	 	 	 	 	 	 ddZ	 	 	 	 	 	 	 	 	 	 d dZ	 	 d!	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d"dZ xZS )#r  z~A ops handler that proxies calls to `kernel` and its
    handler and returns `CSEVariable`s with correct shape and dtype.
    c                b    t         |           ddlm}  |       | _        || _        || _        y )Nr   ValueRangeAnalysis)r#  r   r)  r.  vr_analysisr  parent_handler)rf   r  r0  r.  r%  s       rR   r   zCSEProxy.__init__O
  s+    /-/,rT   c           	     d  
  | j                   gi  t        | j                        i }t               }t	               }t               
t        |      }d d dk(  r
dk(  r|j                  |j                  nydk(  rT
dk(  rOt        j                  j                  j                  j                  t        j                  d       j                  d n 
dv rt        |      } |i  |i 
dv rJ dd
fd}	t        j                   |	|      S )	Nmaskedrk  ri  )rk  ri  r{  )rk  ri  r   c                   t        	t        t        f      r	
   n	}t        t        t        f      r,t              dkD  rt        d   t        t        f      r
   n}
dz  
t        | t              r+dk(  r| j
                  || _        | j                  || _        t        j                  j                  j                  t        j                  j                  | 	      }|j                         t        j                  j                  st        j                  j                   r)|J t#        t        j                  j                  ||       t        j                  j$                  r)J t'        t        j                  j                  |       t        j(                  r$t+        t        j                  j                  |       |S )Nr   rF   ri  r)  r   r  )r1  r  r  r  r  r   r  r5   r  r  rc  r  r2  r   r  r  r  r  r  r  runtime_triton_nan_assertsr  )r  	var_dtype	var_shapecsevarro   r  r)  r  rc   r  
output_idxoutput_shapes       rR   do_csez!CSEProxy._default.<locals>.do_csex
  s   
 lT5M: Z(!  lT5M:%)|Au> Z( "  !OJ ![)e#'AG77?'AGXX\\**  "" + F !!$f5 ##??&&>> ,,,AHH,,fi@"">>#///AHH,,flC00!((**F3MrT   )r  zstr | CSEVariablert   r  )_bound_variabler  r0  r   r"   r&   r   r  r5   interpreterr`  r  rV  r  r  pytreetree_map)rf   rc   ro   r  re   dtype_handlershape_handlershape_opdtype_opr;  r  r)  r  r9  r:  s    ```      @@@@@rR   _defaultzCSEProxy._defaultW
  s@   %%%d<T<V<2++T2DCFC2424%'=$/88 3 ;;L ;;LX'U"2==55::>>#''e   L00}d3H#T4V4L#T4V4L''+++
0	 0	d vu--rT   c                  	 ddl m} ddlm} ddlm} t        t        j                  |      rt        j                         S t        t        j                  |      rt        j                         S t        t        j                  t              rt        j                         S t        j                  j                  		j                  |k(  r| j                  j                  t        | j                  j                  t               s$J t#        | j                  j                               | j                  j                  j%                  	t        j                               S t&        j(                  rjt+        ||      r^t-        	fddD              rt        j                         S |rJ d	d}t/        t1        ||            } t3        | j4                  |      | S t        j                         S )
z
        If the variable comes from an FX node, we forward the bound we have already computed
        Else, if the variable when codegen'ing another op, we try to compute its bounds
        r   r-  )TritonTemplateKernelrF   )CUTLASSTemplateKernelc              3  :   K   | ]  }|j                   v   y wrj   )r  )r  r  fx_nodes     rR   r  z+CSEProxy._bound_variable.<locals>.<genexpr>
  s     V11&Vs   )set_indirectr  r  c                    t        | t              r| j                  S t        | t        j                        rt        |       S | S rj   )r1  r  r)  r   r!  r   r8  s    rR   arg_to_boundz.CSEProxy._bound_variable.<locals>.arg_to_bound
  s2    a-88O5::.&q>)HrT   )r9  r	   rt   r	   )r)  r.  select_algorithmrF  cutlass.kernelrG  r1  r5   r  r   rl  r=  r/   r`  r  r  dictrG  rV  r   compute_all_boundsr"  r  r  r  r  r/  )
rf   rc   ro   r  r.  rF  rG  rL  
arg_boundsrI  s
            @rR   r<  zCSEProxy._bound_variable
  sm   
 	0;9ahh 45&&((ahh 56&&((amm[1&&((--,,>>T!dkk&@&@&Ldkk88$? **B ? ;;--11';;N;N;PQQ&&73Et+L V0UVV"**,, : c,56J274++T2J??""$$rT   c                P   t        |t              rt        j                  |      }t        |t        j                        sJ t        |      |f       |j                  j                  dk  r|rt        j                  |t        j                  |t        j                              }|j                  j                  dk\  r0t        j                  |d      }t        j                  |||      }n|}t!        j"                         }|j                  t!        j"                         k7  rt        |t        j$                        r|j                  t!        t&         d      z  }t!        |j                  |z   |j                  |z         }|j                  j                  dk\  r"|j                  t!        dt&              z  }	||	z  }| j(                  j*                  j-                  | j(                  j.                  |||j0                  |j2                        }| j4                  j7                  |||      }
t9        |      ro|j                  j                  dk\   }t        |t        j$                         xs |j                  j                  |k   }| j(                  j;                  |
|||       |
S )Nr   r  r4  )r1  r   r   r  r!  rG  r)  r  r0   rT  r  r   longr  ltry  r   rl  Numberr   r  r  rc  r  r   r  r0  r  r%   r  )rf   r  r   r  r  stmrT  
new_bounds
neg_boundspos	sympy_varassert_lowerassert_uppers                rR   r  zCSEProxy.indirect_indexing
  s    dC ==&D$

+?d4j$-??+ ::aggc3>>$

#CD::##q(QB))BS1C %,,.Jzz[0022z$7U !ZZ+vgr*BB
($$t+Z-=-=-D
 ::##q(**{1f'==C!+c!1J++//**##!iiii + C ''99#tUK	5! #

 0 0A 56L)$== 

  4'BL KK$$YlLQrT   c                >    | j                   j                  ||||      S rj   )r  r  r  s        rR   r  zCSEProxy.check_bounds  s     {{''dE5AArT   c                   || j                   j                  j                  v r)t        j                   j                  j                  |       t        |t        j                        r| j                   j                  ||      S | j                   j                  j                  }||v r||   S | j                   j                  ||      }|j                  dk(  r| j                   xj                  dz  c_        |S r@  )r  r  rB  r5   r  rT  r   r   TMPr  r>  r  r*  r  )rf   rc   r  r>  outs        rR   r  zCSEProxy.load  s    4;;??555 HH&&**40udhh/;;,,T599kkoo11;t$$kktU+ ==AKK  A% 
rT   c                l   || j                   j                  j                  |<   | j                   j                  r{|t        j
                  j                  v r^| j                   j                  j                  |      }|j                         D ]%  }|| j                   j                  j                  |<   ' y y y rj   )	r  r  r>  r`  r5   r   name_to_buffer
get_outputget_mutations)rf   rc   re   r  
other_names        rR   _update_store_cachezCSEProxy._update_store_cache)  s    ,1##D);;##0F0F(F++**55d;C!//1 @
:?++J7@ )G#rT   c                2   | j                   j                  j                  |       |dk7  r| j                  ||       |t        j
                  j                  vr?| j                   j                  ||||       | j                   xj                  dz  c_        y y )N
atomic_add)rZ   rF   )	r  r  rT  rf  r5   r   r  r  r  r  s        rR   r  zCSEProxy.store0  s{     	&&**40<$$T51qww...KKdE5t<KK!!Q&! /rT   c                <    | j                   j                  ||       y rj   )r  r  r  s      rR   r  zCSEProxy.device_assert_async;  s    ''c2rT   c                6     | j                   j                  |  y rj   )r  r  rn   s     rR   r  zCSEProxy.partial_accumulate?  s    &&&-rT   c                "   | j                   j                  j                  |       | j                  ||       |t        j
                  j                  vr<| j                   xj                  dz  c_        | j                   j                  |||      S y r@  )	r  r  rT  rf  r5   r   r  r  r  r  s       rR   r  zCSEProxy.store_reductionB  so    &&**40  u-qww...KK!!Q&!;;..tUEBB /rT   c                |    | j                   xj                  dz  c_        | j                   j                  ||||      S r@  )r  r  r  r  s        rR   r  zCSEProxy.reductionJ  s4     	!!Q&!{{$$UI~uMMrT   c                <    | j                   j                  |||      S rj   )r  r  r  s       rR   r  zCSEProxy.scanT  s     {{
F;;rT   c                >    | j                   j                  ||||      S rj   )r  r  r  s        rR   r  zCSEProxy.sort_  s     {{
CCrT   c           	     D    | j                   j                  |||||||      S )a  
        [Note: Inductor bucketize op]

        Inputs:
        -------
        values: the values to be bucketized.
        boundaries: a tuple containing
          (a) the name of the boundaries tensor (which must be sorted, unless
          the sorting tensor is present),
          (b) the length of the tensor in the last dimension (i.e. the length of
          one set of boundaries),
          (c) the number of elements in the underlying storage (i.e. the length
          of the flattened tensor, ignoring striding), and
          (d) the stride of the tensor in the last dimension.
        boundary_indices: indices into a flattened version of the boundaries
        tensor, of the same size and shape as "values".  Each index points to
        the first element in the set of boundaries to be used for the
        corresponding value.
        indexing_dtype: the dtype to use when indexing into the boundaries
        tensor.  This must be int64 or int32.  This additionally specifies the
        dtype of the return value.
        right: see "Details" below.
        sorter: an optional tuple containing
          (a) the name of an optional sorting tensor, used to access unsorted
          boundaries without reordering the boundaries tensor, and
          (b) the stride of the tensor in the last dimension.
        The values in the sorting tensor are used as indices into the *last*
        dimension of the boundaries tensor, with all other indices matching.
        The size of the sorting and boundaries tensors must be equivalent.
        sorter_indices: must be present if the sorting array is present; see
        "boundary_indices" for the equivalent definition for the boundaries
        tensor.

        Output:
        -------
        The buckets each value belongs in, within a given set of boundaries.  0
        indicates a position before the first boundary, and len(boundaries_set)
        represents a position after the last boundary.

        Details:
        --------
        Given a value and a set of boundaries, calculate the bucket that each
        value belongs to.  This works differently in 1-D and N-D cases.

        for values [[-1, 0, 1, 2], [3, 4, 5, 9]], boundaries [0, 4, 4, 8], right=True
        return =   [[ 0, 1, 1, 1], [1, 3, 3, 4]].

        for values [[-1, 0, 1, 2], [3, 4, 5, 9]], boundaries [[0, 4], [4, 8]], right=True
        return =   [[ 0, 1, 1, 1], [0, 1, 1, 2]]

        Note that in the N-D boundaries case, the shape of "values" and
        "boundaries" must match in every dimension _except_ the last.

        When right == False, bucket i refers to range (boundaries[i], boundaries[i+1]].
        When right == True,  bucket i refers to range [boundaries[i], boundaries[i+1]).

        Boundaries must be non-decreasing, or a sorter must be provided which
        would re-index offsets in a non-decreasing order (e.g. the second output
        of torch.sort(offsets)).  Otherwise, the result is undefined.
        )r  r  r  s           rR   r  zCSEProxy.bucketizeh  s1    L {{$$
 	
rT   )r  zKernel[Any]r0  zOpsHandler[Any])rc   rw   ro   ztuple[Any, ...]r  r  rt   r	   )rc   rw   ro   r	   r  r	   rt   r4  r  )
r  r  r   r  r  r   r  r   rt   r  r  r  )rc   rw   re   r  rt   ru   rj   r  r  )ro   r	   rt   ru   r  r  r  r  r  r  )ry   rz   r{   r|   rc   r   rD  r<  r  r  r  rf  r  r  r  r  r  r  r  r  r/  r0  s   @rR   r  r  H
  s    D-S.j.%h 55 5 	5
 5 
5nBB&0B9=BFJB	B
"@ SW	'	' *	'3>	'FO	'		'3.CNN N &	N
 5N 
/N	<'	<
	< (	< 
!	<D'D (D 	D
 D 
!D  15-1N
N
 CN
 &	N

 $N
 N
 .N
 +N
 
N
rT   r  )rQ   rw   rt   ru   )NNNN)r   rw   r2  r   r3  r   r4  r   r5  r   r6  CustomGraphModulePass | Noner7  ConfigModule | Nonert   ru   )r   torch.device | str | Nonert   zOrderedSet[BackendFeature])r   rr  rM  r:  rt   r   )r   rw   rt   zSchedulingConstructor | None)FF)r   rw   rQ  r   rR  r   rt   r   )r   rw   rt   rp  )r   rw   rt   rq  rs   )r  Sequence[sympy.Expr]r  rs  r  rs  rt   r   )r   rw   r  r  rt   ru   )r   rw   rt   r  )r  rw   ro   r	   r  r	   rt   r   )r   r'   r  r  r   r   rt   ru   )r   r'   r  r  r  rE   rt   ru   )r   r'   r  r  rt   ru   )r  rw   rt   r   r   r  )rt   r	   )
__future__r   r_   r  dataclassesenumr  r@  rN   rM  r  ra   rer]   abcr   r   r   r   r   typingr	   r
   r   r   r   r   typing_extensionsr   r   r   r   torch.fxtorch._prims_commonr   torch.utilsr   r>  torch.utils._config_moduler   torch.utils._ordered_setr   torch.utils._sympy.numbersr   torch.utils._sympy.printersr   _PythonPrintertorch.utils._sympy.symbolr   r   r   torch.utils._sympy.value_rangesr   r   r  r   r   dtype_propagationr   ops_handlerr    r!   shape_propagationr"   utilsr#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   virtualizedr/   r0   r1   r2   r3   r4   r5   collections.abcr6   r7   r8   r9   r:   custom_graph_passr;   r   r<   r=   r>   r?   r  rA   r  rB   rC   rD   rE   r  rH   rI   r   rG  r   rw   r   r&  r  _logginggetArtifactLoggerry   rL   	getLoggerrL  rS   	dataclassrV   r   r   r   r   r   r   r   r   r   KernelArgTyper   r}   r  r-  r  r.  r/  r8  r:  rI  rN  rH  rT  rW  rY  cacherF  r  r  r  r  bfloat16r  float16r   r7  float64int8int16rU  r  r   uint16r  uint64r  r  r  r  r  r  r2  compile
IGNORECASEr  r  r  r  rO  INT_TO_FLOATr	  r  r  r  r  r  r  r  r  r  r  r6  r  r  r   ReductionCacheKeyr:  rn  r|  r  r  r  r  r   s   0rR   <module>r     sQ   "          	 	  #   J J +    ? ) 3 / - G O O D  : ; :      LL$9>>$DD2-	B$i$&6%7%GH23u||#J F~~//*Eg!=
   >/		 /(C  Td= d dN* *                9 9 9 y(725EET,.) .5" 5"p :< 6 ;#(  AC > CAC  > C8 =A;?7;04BB,B /B !:	B
  9B 5B .B 
B4
&T 
&3%33$3%30>3	3U
 @E"8<-6 w wtUU$U  U 	U;;&7;	;,&, 
NNEKK	MM5;;> JJMMMMJJKKKKKKKKLLLLLL
 	u> : ,''' ' 	'T::!0:9D:	:2	L	L!0	L9G	L		L
aB aBH=N =&_1 _1D "rzz";2==Q  X;#%5z# X;v * * *  6: r6;HH-r6 ;HH/- 	r6 ;HH/- 	r6$ ;HH/- 	%r60 ;HH/- 	1r6< ;HH*)	=r6L ;HH(0	Mr6X 	;HH66
 8
	Yr6z ;CC)+ {r6L ;HH1Mr6V ;HH2Wr6` ;HH1ar6j ;HH2 kr6t ;HH%8$ur6B 	;HH&%		Cr6N ;HH%8	Or6Z 	;HH&	[r6d ;HH+er6p %;HH88)	qr6| %;HH88)	}r6H %;HH8)Ir6R %;HH8)Sr6^ 
;HH'
_r6h ;HH(ir6r ;HHc	sr6B ,;HH?0Cr6L ,;HH?0Mr6X &;HH9*Yr6b 
;HH*
cr6l );HHD-mr6v );HHD-wr6@ );HHD-Ar6J );HHD-Kr6T (;HHC,Ur6^ $1;HHL5$_r6h $1;HHL5$ir6r $1;HHL5$sr6| $1;HHL5$}r6F ';HHB+Gr6P (;HHC,Qr6Z (;HHC,[r6 2 rj	-# -"> *Z 
 N N N 
 ,X Xv
#; #;L 5+;TeK,--	/k'/=0
1 k\
< 
<@3Wgo. @3F
     S" S"ln
~ n
k=s   "
f#