
    9j*[                      U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZmZmZmZmZm Z m!Z! d dl"m"Z" d dlm#Z# d dl	m$Z$ d dl%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0 d dl1m2Z2m3Z3m4Z4 d d	lm5Z5 d dl6Z6d dl7Z7d dl8m9c m:Z; d d
l<m=Z= d dl>m?Z? d dl@mAZA d dlBmCZC d dlDmEZE d dl8mFZFmGZG ddgZHd dlImJZJ d dlKmLZLmMZMmNZNmOZO e-rhd dlmPZPmQZQmRZR d dlSmTZT d dl7mUZUmVZVmWZW d dlXmYZY d dlZm[Z[ d dl\m]Z] d dl^m_Z_ ddl`maZa ddlbmcZc ddldmeZe ddlfmgZg dd lhmiZimjZjmkZkmlZlmmZmmnZn dd!lompZp dd"lqmrZrmsZs g d#Zt e0d$      Zuej                  dLd%       Zwd d&lxmyZy d d'lzm{Z{ d d(l|m}Z} d d)l~mZ d d*lmZ d d+lmZ d d,lmZmZmZmZmZ d d-lmZmZ d d.lmZmZ dd/lmZ dd0lmZ ej                  d1k(  Z ej,                  e      Z e0d2      Zee6j6                  e6j6                  f   Ze7j:                  ez  e7j                  z  dz  Zerd3n ej@                  d4d5      Zd6d7d8e d9Zd:Zd:Zd:Zd;Z eEe7jN                  e7jP                  e7jR                  e7jT                  e7jV                  e7jX                  e7jZ                  e7j\                  e7j^                  e7j`                  e7jb                  e7jd                  e7jf                  e7jh                  e7jj                  g      Zd<ed=<   d>Zeedz
  z  d k(  red?k\  sJ d@       dMdAZdNdBZ G dC dDe6jv                        Z ejz                  dEF       G dG dH             ZdOdPdIZ	 	 	 dQ	 	 	 	 	 	 	 	 	 dRdKZ	 	 	 dQ	 	 	 	 	 	 	 	 	 dRdLZej                  dSdM       ZdTdNZÐdUdOZĐdVdPZŐdWdQZƐdXdRZdYdSZ	 	 	 	 dZdTZȐd[dUZɐd\dVZ	 	 	 	 d]dWZːd^dXZdY f	 	 	 	 	 d_dZZ	 	 	 	 	 	 	 	 d`d\Zΐdadbd]Z	 	 dc	 	 	 	 	 	 	 	 	 ddd^Z	 	 	 	 	 de	 	 	 	 	 	 	 	 	 	 	 	 	 dfd_Zѐdgd`ZҐdhdaZӐdidbZԐdjdcZՐdkddZ e3de      Z e0dfdEg      Zee(e&ef   ef   Z G dh die,e)eef         ZڐdldjZ	 	 	 	 dldkZ	 	 	 	 dmdlZ	 	 	 	 dndmZ	 	 	 	 	 	 dodnZ	 	 	 	 	 	 dpdoZ	 dq	 	 	 	 	 drdpZ	 	 	 	 	 	 dsdqZdtdrZdudsZdvdtZdwduZdxdvZdydwZdzdxZd{dyZd|dzZ eg d{      Z	 	 	 	 d}d|Zd~d}Zdd~Zd dlZddZg Zd[ed<   ddZddZej                  dd       Zej                  	 	 	 d	 	 	 	 	 	 	 dd       ZeZeZeZdJdddZdJd	 	 	 	 	 	 	 ddZ ejF                  d?      dd       Z G d de+      Zejz                   G d d             Z  G d d      Z G d de      Zej                  dd       Z G d d      Z G d de      Zej                  ddd       ZejF                  dd       ZejF                  dSd       ZddZ		 dq	 	 	 	 	 	 	 ddZ
dddZ	 	 	 	 	 	 ddZddZddZdJdJdEd	 	 	 	 	 	 	 	 	 ddZddJd	 	 	 	 	 	 	 ddZ	 d	 	 	 	 	 ddZdJd	 	 	 	 	 	 	 ddZdJd	 	 	 	 	 	 	 ddZ	 	 	 	 	 	 	 	 ddZ ejF                  d      dSd       Z ejF                  d      dSd       Zd Z ejF                  d      dSd       Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZddZ	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZddZee6j6                  z  Zded<   ej                  	 d	 	 	 	 	 	 	 	 	 dd       Zej                  dd       Zej                  dd       Z ej                  dd       Z!ej                  dd       Z"ddZ#ddZ$ddZ%ddZ&ddZ'	 	 	 	 	 	 	 	 ddZ(	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZ)dSdZ* G d d      Z+	 	 	 	 	 	 	 	 ddZ,	 	 	 	 	 	 	 	 ddÄZ-ddĄZ.ddńZ/ddƄZ0	 	 	 	 	 	 	 	 ddǄZ1	 	 	 	 	 	 	 	 ddȄZ2ej                  	 	 	 	 	 	 ddɄ       Z3	 dq	 	 	 	 	 ddʄZ4dd˄Z5dd̄Z6dd̈́Z7dd΄Z8ddτZ9ddЄZ:ej                  ddф       Z;dd҄Z<ej                  ddӄ       Z=ej                  ddԄ       Z>ej                  ddՄ       Z?ddքZ@ddׄZAdd؄ZBddلZCdSdڄZDdSdۄZEdd܄ZFd|d݄ZG G dބ dej                        ZI	 	 	 	 	 	 	 	 	 	 ddZJddZK	 	 	 	 ddZL	 dq	 	 	 	 	 ddZMddZN	 dq	 	 	 	 	 ddZOddZP	 	 	 	 	 	 ddZQ	 	 	 	 	 	 	 	 ddZRd f	 	 	 	 	 	 	 	 	 	 	 ddZSd f	 	 	 	 	 	 	 	 	 	 	 ddZTddZUddZVddZWejz                   G d d             ZXej                  dd       ZYddZZddZ[dSdZ\ddZ]ddZ^	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZ_ddZ`ddZaddZbddZc	 	 	 	 	 	 	 	 ddZdddZe	 	 	 	 	 	 	 	 ddZfdАd Zg	 dq	 	 	 	 	 	 	 dѐdZh	 	 	 	 	 	 dҐdZidӐdZj	 	 	 	 	 	 dԐdZkdSdZldĐdZmddd	d
ddddZnenj                         D  ci c]  \  } }|| 
 c}} Zp ej                  d      ZrdՐdZsd֐dZtdאdZudאdZvej                  dؐd       Zwejz                   G d d             Zxi Zyded<   	 	 	 	 	 	 	 	 dِdZz eE       Z{ded<   dڐdZ|dqdېdZ}dܐdZ~ e0d      Z e0d      Z G d  d!e eef         Z e2dE"      dqdEdFdݐd#       Zdސd$Z G d% d&ej                        Zej                  dߐd'       ZdSd(Zdd)Zdd*Zdd+Zdd,ZdLd-Zdd.ZdSd/Zdd0Zd1Zdd2Zdd3Zdd4Z	 	 d	 	 	 	 	 	 	 	 	 dd5Zdd6Zdd7ZdSd8Zdd9Zdd:Z ejz                  dEF       G d; d<             Zed=e&f   Zeeegef   Z G d> d?      Z e       Zdd@ZddAZddBZddCZddDZ eEg dE      ZddFZe#ddG       ZddHZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddIZ	 	 	 	 	 	 ddJZ	 d	 	 	 	 	 	 	 ddKZyc c}} w (      )annotationsN)Callable
Collection	GeneratorIteratorMappingMutableMapping
MutableSet)datetime)	lru_cache)StringIO)AnycastConcatenateGenericLiteral
NamedTupleProtocolTYPE_CHECKING	TypeAlias	TypeGuardTypeVar)dataclass_transform	ParamSpecSelf)mock)datasheet_tops)DeviceProperties)_needs_inductor_compile)dtype_abbrs)
OrderedSet)tree_flattentree_map_only!activation_quantization_aten_passinductor_autotune_lookup_table_sympy_subs)free_symbolsfree_unbacked_symbolsIterateExprsShapeEnv)IterableSequence
ValuesView)Path)SymBoolSymFloatSymInt)ELEMENTWISE_TYPE_PROMOTION_KIND)GraphModule)Node)ScalingType   )WorkspaceArgPythonWrapperCodegen)DepGraphLowering)BufferExternKernelIRNodeLayout	OperationReinterpretViewCompiledFxGraph)BaseSchedulerNodeSchedulerBuffer)cudampsxpumtiaTc                     t         D  cg c]#  } t        t        |       j                         s"| % }} t	        |      dk  sJ t	        |      dk(  rd}|S |j                         }|S c c} w )Nr7   r   rH   )	GPU_TYPESgetattrtorchis_availablelenpop)x
avail_gpusgpu_types      U/media/conek/DATA/Code/OCR/venv/lib/python3.12/site-packages/torch/_inductor/utils.pyget_gpu_typerX   j   sg    &K'%*;*H*H*J!KJKz?aZA-vHO 4>>>3CHO Ls
   #A'A')get_interface_for_device)detect_fake_mode)
DeviceType)	EventList)GraphTransformObserver)	ShapeProp)CeilDivCleanDivFloorDivIdentityModularIndexing)make_symbolSymT)bound_sympyValueRangesconfig)ceildivwin32_TspvTORCHINDUCTOR_XPU_KERNEL_FORMATzebinz.cubinz.hsaco.)rH   hiprJ         zOrderedSet[torch.dtype]_TMA_SUPPORTED_DTYPES@      zmust be power of 2c                *    | t         z   dz
  t          z  S )z/Round up to the nearest multiple of ALIGN_BYTESr7   )ALIGN_BYTES)nbytess    rW   _alignrz      s    [ 1$44    c                   t        | t        j                  t        j                  f      r#t	        t        t        | j                              S t        | t              xs! t        j                  | t              t        k(  S )z:v can be statically proven to be a multiple of ALIGN_BYTES)
isinstancesympyAddMaxallmap_is_alignedargsaligngcdrx   )vs    rW   r   r      sQ    !eii+,3{AFF+,,aK599Q#<#KKr{   c                  *    e Zd ZdZdZdZedd       Zy)r   z<Symbolically round up to the nearest multiple of ALIGN_BYTESr7   Tc                    t        |t        t        j                  f      rt	        t        |            S t        |      r|S y N)r}   intr~   Integerrz   r   )clsvalues     rW   evalz
align.eval   s6    ec5==12#e*%%uL r{   N)r   
sympy.Exprreturnzsympy.Expr | None)__name__
__module____qualname____doc__nargs
is_integerclassmethodr    r{   rW   r   r      s!    FEJ r{   r   T)frozenc                  :    e Zd ZU dZded<   ded<   ded<   ded<   y	)
GraphPartitionMapzP
    Mapping from the partition info (e.g., input/output) to the graph info
    r   idzlist[int | None]input_index_mappingoutput_index_mapping	list[str]constant_namesNr   r   r   r   __annotations__r   r{   rW   r   r      s$    
 	G *)** r{   r   c           
         |         t         j                  j                          t        j                  t	        d      t         j
                  d      }t         j                  j                  d      }t         j                  j                  d      }|j                          t        d      D ]  }|j                           |          |j                          t         j                  j                          |j                  |      dz  }t        dt	        ||z              }t        dt	        ||z              }	t        |      D ]	  } |          t        |	      D cg c]"  }t         j                  j                  d      $ }}t        |	      D cg c]"  }t         j                  j                  d      $ }}t         j                  j                  t         j                  j                  j                  g      5 }
t         j                  j                          t        |	      D ]q  }|j                          ||   j                          t         j                  j                   j                  d	      5   |         d
d
d
       ||   j                          s t         j                  j                          t        j"                  t%        ||      D cg c]  \  }}|j                  |       c}}      }d
d
d
       t        j&                        j)                         }t*        j-                  d       t*        j-                  
j/                         j1                  dd             t3        |
j5                         D cg c]A  }|j6                  t8        j                  k(  r"t;        j<                  d|j>                        |C c}      }|r"|tA        j&                  d |D              dz  z  }t*        j-                  d|       |S c c}w c c}w # 1 sw Y   xY wc c}}w # 1 sw Y   3xY wc c}w )R  
    Returns benchmark results by examining torch profiler events.
    This could be more accurate as it doesn't count CPU side overhead.
    However, this also requires manually excluding irrelevant event, e.g.
    vectorized_elementwise_kernel which is used to fill L2 cache,
    various CUDA events, etc, so could also be fragile.
        ArH   dtypedeviceTenable_timing   r7   
activitiesRunCudaModuleN
raw eventsself_device_time_totalsort_by	row_limitzfused_abs_max_\dc              3  4   K   | ]  }|j                     y wr   device_time_total.0events     rW   	<genexpr>zfp8_bench.<locals>.<genexpr>(  s     QE33Q        @@profiling results: %s ms)!rP   rH   synchronizeemptyr   float16Eventrecordrangezero_elapsed_timemaxprofilerprofileProfilerActivityCUDAnvtxtensorzipmeanitemlogdebugkey_averagestabler\   eventsdevice_typer[   rematchname
statistics)fnwarmuprepcachestart_event	end_event_estimate_msn_warmupn_repeatpisetimesresr   filtered_eventss                     rW   	fp8_benchr      sT    D	JJKKJu}}VLE **"""6K

  t 4I1X 
 	JJ**959K 1c&;./0H1c#+,-H 8_ 
 BGxQA5::##$#7QKQ?DXO!!!!5OIO			NN++00
 
  
 
 


 x 	"AKKMN!!#&&7 aL!	" 	

 +.{I+FG41aQ^^AG

" **U

 
 
"CIIlIIann$$-EQS$TU 	
!!Z__4HH0%**=I	 	
	O OOQQQ	

 II(#.JO RO 
 H
 
*	
sE   "'P'PA9P2=PAP2P,9P2AP?P)$P22P<Fc                8    ddl m}   |t              | |||      S )Nr   )may_distort_benchmarking_result)$torch._inductor.runtime.benchmarkingr   _do_bench_using_profiling)r   r   r   is_vetted_benchmarkingr   s        rW   do_bench_using_profilingr   0  s(    " UE*+DE
FC/ r{   c           
        |sddl m}  |        t               }|j                         }t	        |      } |         |j                          t        j                  t        d      t        j                  |      }|j                  d      }	|j                  d      }
|	j                          t        d      D ]  }|j                           |          |
j                          |j                          |	j                  |
      dz  }t        dt        ||z              }t        dt        ||z              }t        |      D ]	  } |          |j                          t        j                  j!                  t#        t        j                  j$                  |      g	      5 }t        |      D ]  }|j                           |          |j                          d
d
d
       t&        j)                  d       t&        j)                  j+                         j-                  dd             t/        |j1                         D cg c]0  }|j2                  t#        t4        |      k(  r|j6                  dk7  r|2 c}      }t/        |D cg c]  }d|j6                  vs| c}      }t9        |      dk(  rt;        d| dt9        |       d|       |j=                          |j+                         }t&        j)                  d       t&        j)                  |j-                  d             t?        d |D              dz  |z  }t&        j)                  d|       |S # 1 sw Y   xY wc c}w c c}w )r   r   )may_ban_benchmarkingr   r   Tr   r   r7   r   Nr   r   r   r   zContext SyncFillFunctorzDFailed to capture any events after filtering cache clearing events. z	 events: z, repeats: zprofiling time breakdown)r   c              3  4   K   | ]  }|j                     y wr   r   r   s     rW   r   z,_do_bench_using_profiling.<locals>.<genexpr>  s     A%e%%Ar   r   r   ) r   r   rX   upperrY   r   rP   r   r   r   r   r   r   r   r   r   r   rO   r   r   r   r   r   r\   r   r   r[   r   rR   RuntimeError_build_treesum)r   r   r   r   r   r   device_type_upperdevice_interfacer   r   r   r   r   r   r   r   r   r   actual_eventsr   s                       rW   r   r   H  s
    "M.K#))+/<D  "KKJuyyME #((t(<K &&T&:I1X 
   "**959K 1c&;./0H1c#+,-H 8_ 
   "			ENN335FG
 
  
 ' 
x 	AKKMD		 	$$&' IIlIIann$$-EQS$TU 	
  GJ8I$JJ

n, 	
O +O5}EJJ/NOM =QRm9S%9$:+hZQ
 	
 !..0MII()IIm!!B!/0
A=A
AF
JX
UCII(#.J[' '$	
 	Ps   $8M5M*	M/M/M'c                    	 ddl m}  t        j                  j	                  dd       | d uxr% t        t        t        j                  dd       d      S # t        $ r Y yt        $ r}dt        |      v sJ Y d }~yd }~ww xY w)	Nr   )	roi_alignztorchvision::nmsMetatorchvisionr  Fztorchvision::nms does not exist)torchvision.opsr  rP   _C%_dispatch_has_kernel_for_dispatch_keyhasattrrO   opsImportErrorr   str)r  r   s     rW   has_torchvision_roi_alignr    s|    -667I6R$ 
EII}d3[*
 	
   0CF:::s   AA 	A?A?&A::A?c                b   | t        j                  d      j                  S t        | t              rt        j                  |       } | j
                  dvrZ| j                  Nt        | j
                        }t        j                  | j
                  |j                  j                               S | S )Ng        )cpumeta)index)
rP   r   r   r}   r
  typer  rY   Workercurrent_devicer   r   s     rW   decode_devicer    s    ~||C '''&#f%{{/)fll.B3FKK@||FKK/?/F/F/U/U/WXXMr{   c                |    t        j                  t        j                  | t        j
                  j                        S r   )	functoolsreduceoperatormulr~   SOne)its    rW   sympy_productr    s#    HLL"eggkk::r{   c           	         t        |       t        |      k(  sJ t        j                  t        d t	        | |      D                    S )Nc              3  ,   K   | ]  \  }}||z    y wr   r   )r   abs      rW   r   zsympy_dot.<locals>.<genexpr>  s     >daAE>s   )rR   r~   expandr   r   )seq1seq2s     rW   	sympy_dotr%    s8    t9D	!!!<<>c$o>>??r{   c                \    | D ci c]  }t        |      | c}j                         S c c}w r   )r   values)r  rT   s     rW   uniquer(    s'     !BqE1H!((**!s   )c           
     n   t        | t        j                        st        |t        j                        r2t        t        j                  |       t        j                  |            S t        | t
              rt        |t
              s$J |  dt        |        d| dt        |              t        | |      S )Nz: , )r}   r~   Exprr_   sympifyr   r  runtime_ceildiv)numberdenoms     rW   rj   rj     s    &%**%E5::)Fu}}V,emmE.BCC fc"z%'= ("T&\N"UG2d5k];= 65))r{   c                j   | yt        |       j                  d      d   }i dddddd	d
ddddddd	dddddddddddddddddd d!d"d#d$dd%d&d'd(}|j                  t        |j	                               D ci c]  }|| c}       t        | t               r| S d)||    S c c}w )*Nz*i8rp   r   booli1
float8e4nvfp8e4nvfloat8e5fp8e5float8e4b15fp8e4b15float8e4b15x4
fp8e4b15x4float8_e4m3fnfloat8_e5m2float8_e4m3fnuzfp8e4b8float8_e5m2fnuzfp8e5b16float8_e8m0fnuu8float4_e2m1fn_x2r   fp16bfloat16bf16float32fp32float64fp64int8i8int16i16i32i64u16u32u64)int32int64uint8uint16uint32uint64*)r
  splitupdatelistr'  r}   )key	dtype_strtysr   s       rW   _type_ofra    sX   
 {Cs#B'Ii 	G 	z	
 	 	 	w 	9 	: 	$ 	D 	6 	F  	6!" 	6#$ 	%& 	'( 3C8 JJd3::<01112S#&3@aI/?,@@ 2s   
B0c                R    | D cg c]  }t        j                  |       c}S c c}w )z
    Gets the shape and stride of a tensor. For non-symbolic tensors, this is
    trivial. But for symbolic tensors, we need to map from SymIntNode into
    sympy.Expr.
    )r~   r,  lstr   s     rW   convert_shape_to_inductorre    s!     '**EMM!***s   $c                f    t        | t        j                        r| j                  j                  S | S )z
    Convert SymInt to sympy.Expr, leave int as is.

    Unlike sympy.sympify() which converts int to sympy.Integer,
    this function preserves int as int and only converts SymInt to Expr.
    )r}   rP   r2   nodeexprvals    rW   convert_symint_to_exprrk    s%     #u||$xx}}Jr{   c                    ddl m} t        | t              r| S t        | t        j
                        rt        |       S |j                  j                  j                  j                  | d      S )zL
    Like convert_shape_to_symint, but operates on a single expression.
    r7   VN)hint)
virtualizedrn  r}   r   r~   r   graphsizevars	shape_envcreate_symintnode)r   rn  s     rW   convert_to_symintru    se      a 	

 !U]]+ F	 !!++==ad=Kr{   c                >    | D cg c]  }t        |       c}S c c}w )zz
    Takes a list of shapes from Inductor and converts them into symints (or just
    ints if all shapes are static).
    )ru  rc  s     rW   convert_shape_to_symintrw  *  s     +..Qa ...s   c                N    t        d | j                  j                  D              S )z-
    Does this op overload have aliasing
    c              3  8   K   | ]  }|j                   d u  y wr   )
alias_infor   r   s     rW   r   zis_view.<locals>.<genexpr>8  s     FAq||4'Fs   )any_schema	argumentsops    rW   is_viewr  4  s     F1E1EFFFr{   c                     yNFr   )r   s    rW   <lambda>r  =      r{   c                   | j                   dk7  ryt        | j                  t        j                  j
                        s| j                  t        j                  u syt        t        j                  j
                  | j                        }|t        j                  u st        |      rt        fd| j                  D              S t        j                  j                  |j                  v xs  |      S )z
    Do all uses of this op have torch.Tag.pointwise or return True for optional `is_pointwise_fn`

    Uses in views ops will follow the views uses
    call_functionFc              3  6   K   | ]  }t        |        y wr   )is_pointwise_use)r   uis_pointwise_fns     rW   r   z#is_pointwise_use.<locals>.<genexpr>N  s     KA#A7Ks   )r  r}   targetrP   _ops
OpOverloadr  getitemr   r  r   usersTag	pointwisetags)user  r  s    ` rW   r  r  ;  s     vv 3::uzz445xGWGW9W%**''4F!!!WV_KKKK99&++-H1HHr{   	list[Any]c           	        t         j                  j                         g dfd} j                  | gt	        t         j
                  |||f       }t        | j                  j                        dk(  r2t        | j                  j                  d   j                        dk(  r|f}j                  |       t         j                  j                  i       }|fS )Nc                `    j                  |        j                  dt                     S )Narg)appendplaceholderrR   )r  g
graph_argss    rW   add_tensor_argz)gen_gm_and_inputs.<locals>.add_tensor_argY  s,    #}}s3z?"3455r{   r7   r   Tensor)r  torch.Tensorr   r5   )rP   fxGraphr  r#   r  rR   r}  returnsr
  r  outputr4   )r  r   kwargsr  rg  gmr  r  s         @@rW   gen_gm_and_inputsr  S  s     	A%'J6 1??u||^dF^LD 	FNN""#q(&&q)../8;wHHTN			b!	$Bz>r{   c                h    | dk(  ry t        |       }|j                         r|j                          y y Nr  )rY   rQ   r   r  s     rW   r   r   k  s4    /7$$&$$& 'r{   c                    t        |       t        j                  d       t        j                         }t        |      D ]  } | | }t        |        t        j                         }J ||z
  S )Ni9  )r   rP   manual_seedtimeperf_counterr   )modelexample_inputsr   r   t0r   resultt1s           rW   timedr  s  sr     	d				B5\ 'F 
			B7Nr{   c                    t        j                  t        |      D cg c]  }t        | |||       c}      }t        j                  |      |z  }t        ||z  d       |j                         S c c}w )Nz.6f)rP   r   r   r  medianprintr   )	r  r  r   repeatbaseliner   r   timingstooks	            rW   print_performancer    sg     ll>CFmLuneV	4LG << 5(D	TH_S!#99;	 	Ms   A1c                H     t        | |             t        | |fd       y)zKReplace obj.method() with a new method that returns a precomputed constant.c                      S r   r   )r  s   rW   r  z#precompute_method.<locals>.<lambda>  s     r{   N)rO   setattr)objmethodr  s     @rW   precompute_methodr    s     !WS&!#FC(r{   c                *    |D ]  }t        | |        y)zFReplace methods with new methods that returns a precomputed constants.N)r  )r  methodsr  s      rW   precompute_methodsr    s     '#v&'r{   c                <    t        | |kD        t        | |k        z
  S r   )r   r   r!  s     rW   cmpr    s    q1u:AE
""r{   c                ~    t        | t              r| g|z  S t        |       dk(  r t        |       | d   g      |z  S | S )Nr7   r   )r}   r   rR   r  )rT   sizes     rW   pad_listliker    sC    !SsTz
1v{tAw!v%%Hr{   c                D    t        |       dk(  rg S dd}t        | |      S )Nr   c                n    t        | t              r| S ddlm} t        | |      sJ | j	                         S )Nr7   )rF   )r}   r
  	schedulerrF   get_name)elemrF   s     rW   	sort_funcztuple_sorted.<locals>.sort_func  s1    dC K0$ 1222}}r{   r^  )r  rl   r   r
  )rR   sorted)rT   r  s     rW   tuple_sortedr    s&    
1v{	 !##r{   PRV)	covariantc                  &    e Zd Zedd       ZddZy)CachedMethodc                     y r   r   )r   s    rW   clear_cachezCachedMethod.clear_cache  s    ),r{   c                     y r   r   selfr   r  s      rW   __call__zCachedMethod.__call__  r  r{   N)r   r   r   None)r   P.argsr  P.kwargsr   r  )r   r   r   staticmethodr  r  r   r{   rW   r  r    s    , ,Dr{   r  c           	         | j                   }d| dd| i}t        d| d d dj                         |        t        j                  |       || d         }d
fd	}||_        |S )N___cacher   z        def zC_cache_on_self(self):
            try:
                return self.zy
            except AttributeError:
                pass
            rv = fn(self)
            object.__setattr__(self, "z%", rv)
            return rv
        _cache_on_selfc                8    t        |       rt        |        y y r   r  delattrr  r^  s    rW   r  z"cache_on_self.<locals>.clear_cache  s    4D# r{   r  r   r   r  r   execlstripr  wrapsr  )r   r   ctxwrapperr  r^  s        @rW   cache_on_selfr    s    ;;DtfF
C *CF  E "' (+e ,			 FH "ioob!#n&=">?G &GNr{   c                    t        |       S )z]
    Variant of cache_on_self for properties. The only difference is the type signature.
    )r  )r   s    rW   cache_property_on_selfr    s     r{   c                     	 	 	 	 d fd}|S )Nc           	         d d| j                    dd| i}t        d d d dj                         |        t        j                  |       |d	         }dfd
}||_        |S )Nr  r   r  r   z            def inner(self: Any, *args: P.args, **kwargs: P.kwargs) -> RV:
                args_kwargs = (args, tuple(sorted(kwargs.items())))

                if not hasattr(self, "z2"):
                    object.__setattr__(self, "z%", {})

                cache = self.z

                try:
                    return cache[args_kwargs]
                except KeyError:
                    pass

                rv = fn(self, *args, **kwargs)

                cache[args_kwargs] = rv
                return rv
            innerc                8    t        |       rt        |        y y r   r  r  s    rW   r  z<cache_on_self_and_args.<locals>.wrapper.<locals>.clear_cache  s    tS!c" "r{   r  r  )r   r  r  r  r^  
class_names       @rW   r  z'cache_on_self_and_args.<locals>.wrapper  s     :,a}F3 Rj' (+e ,//2e 4!U #$ )	
, $	#CL1	# (r{   )r   FN_TYPE[P, RV]r   r  r   )r  r  s   ` rW   cache_on_self_and_argsr    s     
$$	$L Nr{   c           
     ^   ddl m} t        | t              rgt	        j
                  t        j                  | D cg c]0  }t        |d      r"|j                  r|j                  j                  2 c}t                     S t        | |j                        r| j                  S t               S c c}w )Nr7   irrg  ) r  r}   r]  r  r  r  or_r  rg  originsr!   r?   )node_scheduler  rg  s      rW   aggregate_originsr     s     -&LL * 4(TYY 		!! L	
 		
 
M2??	3$$$|s   5B*
c                   t        |       }|dk(  rYd }|D cg c]6  }|j                  dk(  r%d|j                  v r|j                  d    ||      8 }}t        t	        |            }n|dk(  rg }|D ]  }|j                  dk(  sd }d}d|j                  v r|j                  d   d   }n"d|j                  v r|j                  d   d   }d	}|s]t        |d
   t              r|j                  |d
   |z          |j                  |d
   j                  |z           t        t	        |            }n5|dk(  r*|D cg c]  }|j                  dk(  s|j                    }}nt        dj                  dg|z         S c c}w c c}w )Noriginal_atenc                   | j                   d   }d}t        |t        j                  j                        r|j
                  j                  }|S t        |t        j                  j                        rt        |j                               }|S )Nr  r  )
r  r}   rP   r  r  _overloadpacketr   HigherOrderOperatorr
  r   )originr  r^  s      rW   get_origin_meta_strz2get_fused_kernel_name.<locals>.get_origin_meta_str=  so    "KK8MC-)>)>?#33<< J M5::+I+IJ-,,./Jr{   r  rP   r  source_fn_stackr   fwd_source_fn_stackbackwardr7   inductor_noder   fused)r  r  r  r  r!   r}   r
  r  r   r   NotImplementedErrorjoin)r  descriptive_namesall_originsr	  r  sources	source_fnsuffixs           rW   get_fused_kernel_namer  6  s    $M2KO+	 &
yyO+6;;.O,8	  '
 
 G,-	g	%! 	CFyyO+ 	$3 &,= >r BI*fkk9 &,A B2 FI'F ilC0NN9Q<&#89NN9Q<#8#86#AB	C" G,-	o	-&1
"VYY/5QFKK
 
 "!88WI'((G
<
s   ;E=<FFc                   !" t        |       }|D cg c]  }|j                  dk(  s| }}t        j                  t              }t        j                  t              }d!|rt        d |D              }t        |      dk(  r_|d   j                  !t        !d      s/t        !j                        D 	ci c]  \  }}	|	|
 }
}}	|
!_        |j                  !fd       |D ]Z  }d	|j                  v r|j                  d	   |j                  d	   }d}t        |t        j                   j"                        rt%        |j&                        }n=t        |t        j                   j(                        rt%        |j+                               }|r||   j-                  |j*                         d
|j                  v r<|j                  d
   d   j*                  }||   j-                  |j*                         |j                  j/                  d      dk(  s3||j*                     j-                  |j*                         ] !dnd}j0                   d| ddj3                  |j5                                ddj3                  |j5                                d}j0                   dg}t7        |j9                               D ]@  \  }}|j-                  j0                   d| ddj3                  t7        |                    B !ddlm  |j-                  j0                   d       t               }g }t        |  j>                        sddl m!} 	 	 	 	 	 	 d* fd}d+d"d,"fd}| D ]  }	t        |	d      r|	jD                  t        |	jD                  d      r|	jD                  jF                  |	jD                  jF                  D ]  }|j*                  |v r|jI                  |j*                         |j                  jK                  |j*                        }|U |||j*                        \  }}|j-                  j0                   d| d  ||       d!| d        t        |	jD                  d"      s|	jD                  jL                  )|	jD                  jL                  D ]T  }|j                  jK                  |j*                        }|+ |||j*                        \  }}|j-                  d#|z          V  |D ]y  }|jO                  d$%      }|Ct        jP                  jR                  r)|jU                  fd&|jW                         D               Z|j-                  j0                   d|        { |j-                  j0                   d'd(j3                  |              |d)j3                  |      fS c c}w c c}	}w )-aH  
    Retrieves metadata information for a kernel.
    Args:
        node_schedule (Union[Sequence[BaseSchedulerNode], ExternKernel]):
            Either a sequence of BaseSchedulerNode objects or an ExternKernel instance.
        wrapper (PythonWrapperCodegen):
            An instance of PythonWrapperCodegen, used to define the code comment format.
    Returns:
        tuple[str, str]:
            A tuple containing two strings:
                - The first string represents the kernel's metadata.
                - The second string represent the kernel's detailed metadata.
    r  Nc              3  4   K   | ]  }|j                     y wr   )rq  )r   ns     rW   r   z&get_kernel_metadata.<locals>.<genexpr>  s     "Cq177"Cr   r7   r   )_inductor_kernel_metadata_node_to_idx_mapc                "    j                   |    S r   )r  )r  single_graphs    rW   r  z%get_kernel_metadata.<locals>.<lambda>  s    lTTUVW r{   r  r  	from_nodepartitioner_tagis_backwardzTopologically SortedUnsorted z Source Nodes: [r*  z], Original ATen: []z" Source node to ATen node mapping:   z => r  z Graph fragment:rm  c                >   t        | j                        rAt        | j                  j                        r!| j                  j                  j                  }n| j                  }||}n|j
                  }	 | j                         }||fS # t        $ r d }Y ||fS w xY wr   )r}   	TensorBoxdata
StorageBoxorigin_noder   
get_layoutr  )bufferrw_namer(  r   layoutr  s        rW   get_buffer_infoz,get_kernel_metadata.<locals>.get_buffer_info  s     fbll3
KK9 #)++"2"2">">K"("4"4K&"D&++D"#..0F V|# + "!FV|#"s   7B BBc           	     d    ddj                  | D cg c]  }t        |       c}       dS c c}w )N[r*  r"  )r  r
  )shaperT   s     rW   stringify_shapez,get_kernel_metadata.<locals>.stringify_shape  s-    499e%<c!f%<=>a@@%<s   -
c                    | y | j                          } | j                         }| j                   }dt        | j                      | | | dS )Nr  ")r  strider   r    r   )r,  shape_annotationstride_annotationdevice_annotationr1  s       rW   stringfy_layoutz,get_kernel_metadata.<locals>.stringfy_layout  sl    >&5fkk&B%C '6v}}'E&F!'-}}o! FLL123C2D()*;)<A?r{   read_writesreadsz   %z
 : Tensor z = PlaceHolder[target=writes%T)include_tensor_metadatac              3  @   K   | ]  }j                    d |   yw)r#  N)comment)r   liner  s     rW   r   z&get_kernel_metadata.<locals>.<genexpr>  s)      ) 's4&1)s   z
   return ,
)r*  z-ir.TensorBox | ir.Buffer | ir.TorchBindObjectr+  r
  r   ztuple[str, ir.Layout | None])r0  zIterable[int]r   r
  )r,  zir.Layout | Noner   r
  ),r  r  collectionsdefaultdictr]  r!   rR   rq  r  	enumeratenodesr  sortr  r}   rP   r  r  r
  r  r  r   r  getr?  r  keysr  itemsr  r  r?   rp  rn  r9  r:  addtry_get_bufferr;  format_nodeversionrq   extend
splitlines)#r  r  r  r  inductor_nodesfrom_node_dictoriginal_aten_dictunique_graphsidxr  node_to_idx_maprg  r  r^  sort_strmetadatadetailed_metadataoriginal_noderF  	all_reads
all_writesrn  r-  r8  rr*  
input_namer,  woutput_namer   formatted_noder  r  r1  s#    `                              @@@rW   get_kernel_metadatarb  m  sZ   $ $M2K+6W&)):VfWNW ,,T2N$006
 L""CN"CC}")!,22L<)TU8A,BTBT8U"Vfc11c6"V"VIXFW     8dii'DIIo,F,R IIo6MC-)>)>?-778M5::+I+IJ-,,./"3'..tyy9$))#))K(+00C3&&tyy1YY]],->499%,,TYY78  *6)A%zH??
1XJ&6tyyATATAV7W6X Y99%7%<%<%>?@	C  $OO,,NOP &~';';'= > 
u  s=/diiu6N5OP	

   GOO#44D!EF%/\	 "
-9&$E$PS$-$(A
 # =q-0AMM4I1=='2q}}7J7J7V]]00 66Y.$!aff-!"!7!7!?!>$-<VQVV-L*
F)00&/tJ<z.v677Mj\YZ\ AMM84,,8]]11 =!"!7!7!?!>$)8)HQ"))#*;<=-=< # 	RD!--d-KN)emm.?.? "(( ) . 9 9 ;) 
 "((GOO+<C?O)PQ	R 	  GOO#4Jsxx
?S>T!UVTYY0111Y X #Ws   W	W	8Wc                    t        |       } t        |       }| rV| j                         }|j                  D ]4  }|r	 ||      r||vs|j	                  |       | j                  |       6 | rV|S )zJReturns the set of nodes whose values depend on those within initial_queue)r]  r!   rS   r  rK  r  )initial_queueskip_filterdominated_setrg  users        rW   dominated_nodesrh    sz    
 'M}-M
  "JJ 	+D{40=(!!$'$$T*	+  r{   c                4  	 ddl m d	fd	t        |      \  }}|D cg c]  } 	|      s|j                   }}t        |       \  }}|D cg c]  } 	|      s|j                   }}t	        t        j                  g ||       S c c}w c c}w )Nr7   r  c                F   t        | j                        r | j                        S t        | j                        r | j                        S t        | j                        xr9 t        | j
                  j                  j                  j                  f       S r   )	r}   r%  r&  r'  r@   ComputedBufferInputsKernelInputBufferTemplateBuffer)r  r  is_unrealized_nodes    rW   ro  z*gather_origins.<locals>.is_unrealized_node(  s    a&%aff--a'%aff--!RYY' 

!!!!	1
 -
 	
r{   )r  r@   r   r1  )r  r  r"   r   r!   	itertoolschain)
r   r  kwargs_flattenr   rj  kwargs_originsargs_flattenargs_originsr  ro  s
           @@rW   gather_originsrv  #  s     
" %V,NA-;Wc?QRU?VckkWNW"4(OL!+7SC;Mc;RCKKSLSiooE|EnEFF XSs   BBB Bc                J    dddfddfddfd |       S )z
    Normal sympy str is very slow, this is a lot faster.  The result are
    somewhat worse, as it doesn't do as much simplification.  So don't
    use this for final codegen.
    c                    t        | t        j                        xr, t        | j                        dk(  xr | j                  d   dk(  S )N   r   r   )r}   r~   MulrR   r   )rh  s    rW   is_neg_leadzsympy_str.<locals>.is_neg_leadG  s:    tUYY'VC		Na,?VDIIaLTVDV	
r{   c                `   t        | t        j                        rt        | j                        dk(  rO | j                  d         r: | j                  d          d | j                  d   j                  d          S dj                  t        | j                              S  |       S )Nry  r7   r   z - z + )r}   r~   r   rR   r   r  r   )rh  r{  sympy_str_muls    rW   sympy_str_addz sympy_str.<locals>.sympy_str_addL  s    dEII& 499~"{499Q<'@'		!56c-		RSHYHYZ[H\:]9^__zz#mTYY"?@@ &&r{   c                    t        | t        j                        rE |       rd | j                  d          S dj	                  t        | j                              S  |       S )N-r7   z * )r}   r~   rz  r   r  r   )rh  r{  sympy_str_atoms    rW   r}  z sympy_str.<locals>.sympy_str_mulW  s[    dEII&4  >$))A,7899zz#ndii"@AA!$''r{   c                   t        | t        j                        r| j                  S t        | t        j                  t        j
                  f      rd |        dS t        | t        t        t        t        f      rC| j                  j                   ddj                  t        t        | j                               dS t!        |       S )N()r*  )r}   r~   Symbolr   r   rz  rc   r`   ra   rb   funcr   r  r   	sympy_strr   r
  )rh  r~  s    rW   r  z!sympy_str.<locals>.sympy_str_atomb  s    dELL)99uyy%))45}T*+1--(HMNii(()499SDII5N+O*PPQRRt9r{   )rh  r   r   r1  rh  r   r   r
  r   )rh  r{  r~  r  r}  s    @@@@rW   r  r  @  s$    

	'	( r{   c                    ddl m} t        j                  r3t	        |j
                  dd       x}r|j                  dk7  rt        |       S t        j                         S )Nr7   rm  current_node
index_expr)
rp  rn  ri   compute_all_boundsrO   interpreterr  rf   rg   unknown)r  rn  fx_nodes      rW   get_bounds_index_exprr  o  sN     	!!~tDDWDNNl*5!!""$$r{   c                    | d   dk(  S )Nr   r]  r   )prefixs    rW   prefix_is_reductionr  }  s    !9r{   c                J    | t         j                  k7  sJ t        | |dd      S )9
    Used to generate an integer-nonnegative symbol.
    Tintegernonnegative)re   SIZErd   )r  rU  s     rW   sympy_index_symbol_with_prefixr    s)     TYY vsDdCCr{   c                N    | xs t         j                  xr t         j                  S r   )ri   debug_index_assertsassert_indirect_indexing)checks    rW   generate_assertr    s    /V//TV5T5TTr{   c                F    | d   dk7  sJ t        j                  | dd      S )r  r   r   Tr  )r~   r  r   s    rW   sympy_index_symbolr    s)     7c>> <<d==r{   c                    t        | |      S )z
    When the passed replacement symbol v is a string, it is converted to a symbol with name v that
    have the same replaced expression integer and nonnegative properties.
    r&   )rh  replacementss     rW   
sympy_subsr    s    
 t\**r{   c                    t        | t        j                        xs( t        | t        j                        xr | j                  S r   )r}   rP   r2   r  _has_symbolic_sizes_strides)r   s    rW   is_symbolicr    s3    a& 1ell#E(E(Er{   c                 &    t        d | D              S )Nc              3  2   K   | ]  }t        |        y wr   )r  r{  s     rW   r   z"any_is_symbolic.<locals>.<genexpr>  s     ,!{1~,   r|  )r   s    rW   any_is_symbolicr    s    ,t,,,r{   )z,aten._fused_moving_avg_obs_fq_helper.defaultz7aten._fused_moving_avg_obs_fq_helper_functional.defaultzfbgemm.dense_to_jagged.defaultz%fbgemm.jagged_to_padded_dense.defaultrun_and_save_rng_staterun_with_rng_statezaten._local_scalar_densezaten._assert_scalarc                    ddl m} | j                  j                  D ];  }t	        |      r|c S |j
                  j                  d      x}0 ||      s9|c S  y )Nr   )r)   rj  )%torch.fx.experimental.symbolic_shapesr)   rq  rF  is_cudagraph_unsafe_fx_noder  rH  )r  r)   rg  rj  s       rW   %get_first_incompatible_cudagraph_noder    sV     L &t,K99==''C49Ns9SK r{   c                    t        t        t        | j                  j                                    }|j
                  dk(  sJ |S )z$Get the output node from an FX graphr  )nextiterreversedrq  rF  r  )r  	last_nodes     rW   output_noder    s6    T(288>>234I<<8###r{   c                    | j                   j                  d      }t        d |D              }t        |       j                  d   }t        |t              r|n|f}t        d |D              }||z  S )Nr  r  c              3     K   | ]P  }t        |j                  j                  d       t        j                        r|j                  d    j
                   R ywrj  N)r}   r  rH  rP   r  r   )r   rg  s     rW   r   z"get_all_devices.<locals>.<genexpr>  sB      9diimmE*ELL9 			%9s   AAr   c              3     K   | ]t  }t        |t        j                  j                        rNt        |j                  j                  d       t        j                        r|j                  d    j                   v ywr  )r}   rP   r  r5   r  rH  r  r   )r   r  s     rW   r   z"get_all_devices.<locals>.<genexpr>  sS      7c588==)sxx||E*ELL9 	7s   A:A<)rq  
find_nodesr!   r  r   r}   tuple)r  placeholder_nodesinput_devicesout_argout_argsout_devicess         rW   get_all_devicesr    s}    ++}+=.8 9%9 /M "o""1%G$We4w7*H,6 77 -K ;&&r{   c                 |   t        t        j                  j                               D ]  } | j	                  d      st        j                  |    }|j
                  D ]  }|j	                  d      st        ||      }t        |t        j                  j                  j                  j                        sZ|j                  D ]i  }t        |t        j                  j                  j                  j                        s<|j                  j                   j"                  j%                          k  t        j                  | =  dt        j                  v rRt        j                  d   }t'        |j(                  j*                  j,                        `|j(                  j*                  `t1        j2                          y )Nz&torch._inductor.runtime.compile_tasks.triton_ztriton.runtime.driver)r]  sysmodulesrI  
startswith__dict__rO   r}   rP   	_inductorruntimetriton_heuristicsCachingAutotunercompile_resultsTritonCompileResultkernelrunmod__del__r  driveractiveutilsinstancegccollect)module_namem	attr_namer  r  r  s         rW   unload_xpu_triton_pydsr    sK   CKK,,./ %%%&NOKK$ 	<I##I. I.EOO33EEVV #)"8"8 <%"!OO33EEYY
 #MM--1199;<	< KK$#%( #++-kk12""(()2JJ#JJLr{   _registered_cachesc                    t        | d      rt        | j                        st        |  d      t        j                  |        | S )zh
    Use this decorator to register any caches that should be cache_clear'd
    with fresh_cache().
    cache_clearz# does not have a cache_clear method)r  callabler  AttributeErrorr  r  r  s    rW   clear_on_fresh_cacher    s?    
 3&hs.Gu$GHIIc"Jr{   c                 :    t         D ]  } | j                           y)z&
    Clear all registered caches.
    N)r  r  r  s    rW   clear_cachesr    s     " r{   c              #  \  K   t         j                  j                  |       }	 |t         j                  | <   d |!t         j                  j                  | d       y|t         j                  | <   y# |!t         j                  j                  | d       w |t         j                  | <   w xY ww)a  Thread-safe env var set/restore using atomic C-level lookups.

    We avoid mock.patch.dict(os.environ, ...) because it internally calls
    os.environ.copy(), which iterates all env var keys then fetches values in
    separate steps. That approach is not atomic and can race with background threads
    (e.g. Triton async compilation) modifying the environment, causing KeyError,
    so we use os.environ.get() for individual keys which is an atomic C-level lookup.
    N)osenvironrH  rS   )r^  r   olds      rW   _set_envr  $  sz      **..
C"

3;JJNN3%!BJJsO ;JJNN3%!BJJsOs    B,A1 7B,18B))B,c              #    K   t                ddlm}  |t        j                  |            	 t        d      5  t        j                  d        |t        j                  j                  d            }t        d|      5  d t        | t              rt        |       dk(  sJ d	       t        j                  j                  |      rtt        j                  |      }| j!                  |D ci c]D  }d
|vr>|t        j                  j#                  t        j                  j                  ||            F c}       ddd       ddd       |rUt%               r(t&        j(                  j+                         r
t-                t/        j0                  t%               fd       t                yc c}w # 1 sw Y   xxY w# 1 sw Y   |xY w# t2        $ r t        j5                  d        w xY w# t                w xY ww)z
    Contextmanager that provides a clean tmp cachedir for pt2 caches.

    Optionally, pass a dict as 'cache_entries' to get a list of filenames and sizes
    generated with this cache instance.
    r   )normalize_path_separator)dirTORCHINDUCTOR_CACHE_DIRzUsing inductor cache dir %stritonTRITON_CACHE_DIRNz!expected empty cache_entries dictz.lockc                4    t         j                  d|      S )Nz*Failed to remove temporary cache dir at %s)exc_info)r   warning)r  pathr  inductor_cache_dirs      rW   r  zfresh_cache.<locals>.<lambda>g  s    S[[@&% 6A 6 r{   )ignore_errorsonerrorz(on error, temporary cache dir kept at %s)r  torch._inductor.cpp_builderr  tempfilemkdtempr  r   r   r  r  r  r}   dictrR   existslistdirr\  getsize
is_windowsrP   rJ   rQ   r  shutilrmtree	Exceptionr  )cache_entriesr  deleter  triton_cache_dirfilesfr  s          @rW   fresh_cacher  9  s     ND1(2B2Bs2KL'/1CD 	II35GH7/:  ,.>? mT2}-2W4WW2ww~~&67 "

+; <%,, */$%#*!#3 !"277??277<<@PRS3T#U U	$ |		 6 6 8&(MM" )l  	5 	 	D  >@RS 	sm   -HG A	GA-F:3A	F5<F:GAG *H5F::G	?GGG !G33G6 6HH)reversec                   | j                   }t        t        |             }t        t	        ||d            }|st        t        |            S |S )NTr^  r  )__getitem__r   rR   r]  r  r  )seqr  gettera_rsort_idxs        rW   argsortr  z  sE    __F
C/C F3FD9:HHX&''Or{   c          	     4    d fd}t        |      D cg c]9  \  }}|t        |t        j                        r|j                  j
                  n|f; }}}t        |t        j                  |      |      }|D cg c]  \  }}|	 }}}|S c c}}w c c}}w )Nc                n    | \  }}|\  }}dfd} |||k        ry |||kD        ry||k  ry||kD  ryy)Nc                N    t        | t              r| S j                  | d      S )NT)size_oblivious)r}   r1  evaluate_expr)rh  rs  s    rW   evaluatez*argsort_sym.<locals>.cmp.<locals>.evaluate  s(    $%**4*EEr{   r   r7   r   )rh  z bool | torch.SymInt | sympy.Exprr   r1  r   )r   r!  a_idxa_valb_idxb_valr  rs  s          rW   r  zargsort_sym.<locals>.cmp  sT    uu	F
 EEM"EEM"
 5=5=r{   r  )r   tuple[int, sympy.Expr]r!  r  r   r   )	rE  r}   rP   r2   rg  rh  r  r  
cmp_to_key)	rs  r  r  r  rU  r   exprsr   r  s	   `        rW   argsort_symr"    s    4  nC 
Z5<<8affkka@E  5i2237IE %&fc1c&F&M
 's   >B=Bc                t    | t         j                  k(  ryt        j                  d|       j                         S )Nrv   r   r   )rP   rY  r   element_sizer$  s    rW   get_dtype_sizer&    s-     ;;r'4466r{   c                      e Zd ZU ded<   y)LineContextr   contextNr   r   r   r   r   r{   rW   r(  r(    s    Lr{   r(  c                  "    e Zd ZU ded<   ded<   y)ValueWithLineMapr
  r   zlist[tuple[int, LineContext]]line_mapNr*  r   r{   rW   r,  r,    s    J++r{   r,  c                      e Zd ZdZdddZej                  dd       ZddZddZ	ddZ
d ZddZdd	Zdd
ZddZddZddZd d!dZd d"dZd d"dZd#d$dZd%dZddZd&dZd'dZy)(IndentedBuffer   c                     g | _         || _        y r   )_lines_indent)r  initial_indents     rW   __init__zIndentedBuffer.__init__  s    BD%r{   c              #  b   K   | j                   }	 || _         d  || _         y # || _         w xY wwr   )tabwidth)r  r7  prevs      rW   set_tabwidthzIndentedBuffer.set_tabwidth  s,     }}	!$DM DMDDMs   /# /	,/c                   t               }d}g }| j                  D ]  }t        |t              r
 |       }|1t        |t              r|j                  ||j                  f       K|}t        |t              sJ |j                  |       |j                  d       |d|j                  d      z   z  } t        |j                         |      S )Nr7   rB  )r   r2  r}   DeferredLineBaser(  r  r)  r
  writecountr,  getvalue)r  bufr   linemaplir@  s         rW   getvaluewithlinemapz"IndentedBuffer.getvaluewithlinemap  s    j13++ 	&B"./t<B,2::/dC(((IIdOIIdOTZZ%%%A	&  88r{   c                6    | j                         j                  S r   )rB  r   r  s    rW   r>  zIndentedBuffer.getvalue  s    '')///r{   c                f   t               }| j                  D ]  }t        |t              r
 |       }|t        |t              r.|}t        |t
              sJ |j                  d      r|j                  |d d        h|j                  |       |j                  d        |j                         S )N\r   rB  )	r   r2  r}   r;  r(  r
  endswithr<  r>  )r  r?  rA  r@  s       rW   getrawvaluezIndentedBuffer.getrawvalue  s    j++ 	 B"./t<B,dC(((}}T"		$s)$		$		$	   ||~r{   c                    | j                   S r   r2  rD  s    rW   get_lines_refzIndentedBuffer.get_lines_ref  s    {{r{   c                8    | j                   j                          y r   )r2  clearrD  s    rW   rM  zIndentedBuffer.clear  s    r{   c                ,    t        | j                        S r   )r1  r2  rD  s    rW   __bool__zIndentedBuffer.__bool__  s    DKK  r{   c                :    d| j                   | j                  z  z  S )Nr!  )r3  r7  rD  s    rW   r  zIndentedBuffer.prefix  s    dllT]]233r{   c                &    | j                  d       y )NrB  	writelinerD  s    rW   newlinezIndentedBuffer.newline  s    tr{   c                   t        |t              r| j                  j                  |       y t        |t              r9| j                  j                  |j                  | j                                      y |j                         r.| j                  j                  | j                          |        y | j                  j                  d       y Nr  )r}   r(  r2  r  r;  with_prefixr  stripr  r@  s     rW   rS  zIndentedBuffer.writeline  s    dK(KKt$./KKt//>?ZZ\KK$++-78KKr"r{   c                4    |D ]  }| j                  |        y r   rR  )r  linesr@  s      rW   
writelineszIndentedBuffer.writelines  s     	!DNN4 	!r{   c                H     t         j                  d fd       } |       S )Nc               3     K   xj                    z  c_         	 d  xj                    z  c_         y # xj                    z  c_         w xY wwr   r3  )offsetr  s   rW   r  z"IndentedBuffer.indent.<locals>.ctx  s9     LLF"L'&&s   A4 AAAr   Iterator[None])
contextlibcontextmanager)r  r`  r  s   `` rW   indentzIndentedBuffer.indent  s$    		"	"	' 
#	' ur{   c                .    | xj                   |z  c_         y r   r_  r  r`  s     rW   	do_indentzIndentedBuffer.do_indent'      r{   c                .    | xj                   |z  c_         y r   r_  rg  s     rW   do_unindentzIndentedBuffer.do_unindent*  ri  r{   c           	        t        |t              rt        d      }|j                  D ]E  }t        |t              r|st        |t        |      t        |j                               z
        }G t        j                  |      rd}|j                  D ]P  }t        |t              r| j                  j                  |       /t        j                  | |t        |      d         R y t        j                  |      }|r|j                         }|sy |j                         }|j!                  d      D ]  }| j                  |        y )Ninfr   rB  )r}   r/  floatr2  r(  minrR   r  mathisinfr  rS  r   textwrapdedentrstripr[  )r  
other_coderX  rs  r@  r   s         rW   splicezIndentedBuffer.splice-  s   j.15\F")) I!$4 TS5G)GHFI zz&!")) HdK0KK&&t,",,T4F3FG	H "4J'..0
#**,J%%d+ "q!"r{   c                    t        | j                        }| j                  D cg c]
  } ||       c}|_        |S c c}w N)r4  )r/  r3  r2  )r  r  r   r@  s       rW   r   zIndentedBuffer.mapE  s4    DLL9-1[[9Td4j9

 :s   >c                @    t        |        d| j                          dS )Nr  r  )r  r>  rD  s    rW   __repr__zIndentedBuffer.__repr__J  s     t*Qt}}/q11r{   c                    | j                   |j                   k(  sJ t        | j                         }|j                  | j                         |j                  |j                         |S rx  )r3  r/  r\  r2  )r  otherr   s      rW   __add__zIndentedBuffer.__add__M  sK    ||u}},,,DLL9t{{#u||$
r{   c                    || j                   v S r   rJ  )r  new_lines     rW   containszIndentedBuffer.containsU  s    4;;&&r{   Nr   )r4  r   r   r  )r7  r   r   rb  )r   r,  r   r
  r   r  r   r1  )r@  z$LineContext | DeferredLineBase | strr   r  )r[  z.Sequence[LineContext | DeferredLineBase | str]r   r  r   )r`  r   r   'contextlib.AbstractContextManager[None])r`  r   r   r  F)ru  zIndentedBuffer | strrX  r1  r   r  )r  zCallable[[Any], Any]r   r/  )r|  r   r   r/  )r  z$DeferredLineBase | LineContext | strr   r1  )r   r   r   r7  r5  rc  rd  r9  rB  r>  rH  rK  rM  rO  r  rT  rS  r\  re  rh  rk  rv  r   rz  r}  r  r   r{   rW   r/  r/    s    H& ! !9(0(!4#!	"0
2'r{   r/  c                  (     e Zd Zd fdZddZ xZS )FakeIndentedBufferc                "    t         |           y r   )superr5  )r  	__class__s    rW   r5  zFakeIndentedBuffer.__init__Z  s    r{   c                V    |dk(  rt         j                  | |      S t        d| d      )Nr  zTried to call self.z on FakeIndentedBuffer. This bufferis currently used on TritonTemplateKernel to prevent actualwrites to the body without explicitly specifying the body with`TritonTemplateKernel.set_subgraph_body(name)`)object__getattribute__r   )r  r   s     rW   r  z#FakeIndentedBuffer.__getattribute__]  s;    ;**466!$ (= =
 	
r{   r  )r   r
  r   r   )r   r   r   r5  r  __classcell__r  s   @rW   r  r  Y  s    
r{   r  c               #     K   t         j                  t         j                  }} 	 d  | |ct         _        t         _        y # | |ct         _        t         _        w xY wwr   )r  stdoutstderr)initial_stdoutinitial_stderrs     rW   restore_stdout_stderrr  h  s@     %(ZZNN@!/
CJ
CJs   !AA  A AAc                  P    e Zd ZdZddZddZddZddZddZddZ	ddZ
dd	Zy
)r;  z.A line that can be 'unwritten' at a later timec                6    |j                         sd}|| _        y rV  )rX  r@  rY  s     rW   r5  zDeferredLineBase.__init__t  s    zz|D	r{   c                    t         )zJReturns either self.line or None to indicate the line has been 'unwritten'r  rD  s    rW   r  zDeferredLineBase.__call__y      !!r{   c                    t         )z3Returns a new deferred line with the same conditionr  rY  s     rW   	_new_linezDeferredLineBase._new_line}  r  r{   c                @    | j                  | | j                         S r   r  r@  )r  r  s     rW   rW  zDeferredLineBase.with_prefix  s    ~~455r{   c                T    | j                  | j                  j                               S r   )r  r@  r  rD  s    rW   r  zDeferredLineBase.lstrip  s    ~~dii..011r{   c                >    | j                  | j                  |         S r   r  )r  r  s     rW   r  zDeferredLineBase.__getitem__  s    ~~dii.//r{   c                ,    t        | j                        S r   )r1  r@  rD  s    rW   rO  zDeferredLineBase.__bool__  s    DIIr{   c                ,    t        | j                        S r   )rR   r@  rD  s    rW   __len__zDeferredLineBase.__len__  s    499~r{   N)r@  r
  r   
str | None)r@  r
  r   r   )r  r
  r   r   )r   r   )r  zint | slicer   r   r  r   r   )r   r   r   r   r5  r  r  rW  r  r  rO  r  r   r{   rW   r;  r;  q  s-    8
""620r{   r;  c                  4     e Zd ZdZd fdZddZddZ xZS )DelayReplaceLinez6At end of codegen call `line.replace(key, value_fn())`c                @    t         |   |       || _        || _        y r   )r  r5  r^  value_fn)r  r^  r  r@  r  s       rW   r5  zDelayReplaceLine.__init__  s     r{   c                j    | j                   j                  | j                  | j                               S r   )r@  replacer^  r  rD  s    rW   r  zDelayReplaceLine.__call__  s#    yy  4==?;;r{   c                D    t        | j                  | j                  |      S r   )r  r^  r  rY  s     rW   r  zDelayReplaceLine._new_line  s    $-->>r{   )r^  r
  r  zCallable[[], str]r@  r
  r  )r@  r
  r   r  )r   r   r   r   r5  r  r  r  r  s   @rW   r  r    s    @!
<?r{   r  c                   t        | t        j                        r| }nt        j                  t               |       }t	        j
                  |      }t        j                  j                  rC|j                  J |j                  dk  s|j                  dk(  rt        j                  d       yy|j                  dk(  rdnd}|j                  }||k  rt        j                  d	||d
       yy)N	   
   z6GPU arch does not support max_autotune_gemm mode usageFTrJ   rr   D   z,Not enough SMs to use max_autotune_gemm mode)min_sms	avail_sms)extra)r}   rP   r   rX   r   createrN  rq   majorr   r  r  multi_processor_count)index_or_devicer   propr  r  s        rW   
is_big_gpur    s    /5<<0 lno>""6*D }}zz%%%::>TZZ2-KKPQKK5(bbG**I7:%I> 	 	
 r{   c                     t         j                  j                         r(t         j                  j                         j                  S t         j
                  j                  d      j                  S )NrH   )rP   rJ   rQ   get_device_propertiesgpu_subslice_countrH   r  r   r{   rW   get_max_num_smsr    sF    yyyy..0CCC::++F3IIIr{   c                     t         j                  j                         syt         j                  j                  t         j                  j	                               } | j
                  dk(  S )zEReturns true if the device is a NVIDIA B200, otherwise returns false.Fr  )rP   rH   rQ   r  r  r  )device_propertiess    rW   
using_b200r    sJ     ::""$

889R9R9TU""b((r{   c                     t         j                  j                         r
t               S t         j                  j                         } t               | | z
  S dz
  S )zFHandle experimental carveout if set otherwise return hardware SM countr   )rP   rJ   rQ   r  r  _get_sm_carveout_experimental)carveouts    rW   get_num_smsr    sJ     yy  xx557HH,@HHaHHr{   c                    ddl m}m} |
t               }|j	                  d      }|| z  t
        z  } |||| |j                               S )zKBuilds and returns a WorkspaceArg for the device side TMA workspace buffer.r7   )r8   WorkspaceZeroModeF)r=  	zero_moder   
outer_name)codegen.commonr8   r  r  	from_boolTMA_DESCRIPTOR_SIZEunique_name)num_tma_descriptorsr   num_programsr8   r  r  r  s          rW   get_tma_workspace_argr    sZ     @"}!++E2I--0CCD+<++-	 r{   c                    t         j                  j                  sydt         j                  j	                  d      j
                  v r| dk  ryy)Nr   gfx942rr   r7   ry  )rP   rN  rq   rH   r  gcnArchName)block_ks    rW   get_default_kpackr    s<    ==5::33A6BBBwRT}r{   c                    | j                   |vr!t        j                  d| j                   |       t        | j                  j
                        xr% | j                   |v xr t        | j                        S )NzDNot using template since dtype %s is not in allowed layout dtypes %s)r   r   r   is_gpur   r  r  )r,  allowed_layout_dtypess     rW   _use_template_for_gpur    sf     ||00		RLL!	
 	v}}!!" 	&LL11	&v}}%r{   c                    | j                         t        j                  j                         j                  d      D cg c]  }|j	                          c}v S c c}w NrA  )r   ri   max_autotune_gemm_backendsr[  rX  backendrT   s     rW   _use_autotune_backendr    M    ==?!<<BBDJJ3O	      Ac                    | j                         t        j                  j                         j                  d      D cg c]  }|j	                          c}v S c c}w r  )r   ri   max_autotune_conv_backendsr[  rX  r  s     rW   _use_conv_autotune_backendr  	  r  r  )enable_int32enable_float8check_max_autotunec                  ddl m}m} t        j                  t        j
                  t        j                  g}|r>t        j                  t        j
                  t        j                  t        j                  g}|r/|j                  t        j                  t        j                  g       t        | j                  j                        xr t        | |      xs) | j                  j                  dk(  xr | j                  |v xrS t         j"                  xs t         j$                  xs | xr* t'        d      xr  || j                  |j(                        S )Nr7   )BackendFeaturehas_backend_featurer  TRITON)r  r  r  rP   r   rE  rG  rT  rO  r;  r<  r  r   r  r  r   ri   max_autotunemax_autotune_gemmr  TRITON_TEMPLATES)r,  r  r  r  r  r  layout_dtypess          rW   use_triton_templater    s    D]]ENNEMMBMu{{Se1153D3DEF v}}))* A)&-@O ""e+M0M
	P   VF$<$<VDV@V
	P "(+
	P  ~/N/NOr{   output_layout
add_guardsc                   	 ddl m} ddlm dfddfd}d	fd	 	 	 	 	 	 	 	 dfd	 	 	 	 	 	 	 	 dfd		 |       xr t	        fd
|D              xr  ||       S )u^  
    Return True iff *all* supplied tensors satisfy the CUDA TMA constraints
    that Triton relies on today.
    * https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html

    A tensor is accepted when:
      * 1 ≤ rank ≤ 5 (cuTensorMapEncodeTiled)
      * dtype in _TMA_SUPPORTED_DTYPES (CUtensorMapDataType enum)
      * Base pointer 16-byte aligned
      * Exactly one contiguous ("inner") dim with stride 1
      * All "outer" dims have 16-byte aligned strides
      * Inner dim size × itemsize is a multiple of 16
      * For 1-byte dtypes (e.g. FP8), inner dim ≥ 32
    r   )has_triton_tma_devicer7   rm  c                X    j                   j                  j                  | t              S r   )rq  rr  statically_known_multiple_ofTMA_ALIGNMENT)
expr_bytesrn  s    rW   _alignedzcan_use_tma.<locals>._alignedA  s     ww<<ZWWr{   c                    | y| j                   }| j                  }| j                  } | j                        sy |||      S )NTF)r  r4  r   r`  )r,  sizesstridesr   r  _is_tma_compatibles       rW   _is_tma_compatible_layoutz.can_use_tma.<locals>._is_tma_compatible_layoutD  sF    >-- &!%%88r{   c                   | j                         }| j                         }| j                         }| j                         j                  j
                  v ry| j                         x}|j                  dk(  r
 |||      S  |||      S )NFrJ   )get_size
get_stride	get_dtyper  rq  unaligned_buffers
get_devicer  )r  r  r  r   m_devicern  r  _is_tma_compatible_xpus        rW   _is_tma_compatible_matrixz.can_use_tma.<locals>._is_tma_compatible_matrixQ  s{    

,,. ::<177444&H38N)%%@@!%%88r{   c                R   t        |       }|j                  }|dk  s|dkD  ry|t        vryrKj                  j                  j                  |       }j                  j                  j                  |      }nd| D cg c]'  }j                  j                  j                  |      ) }}|D cg c]'  }j                  j                  j                  |      ) }}t        |      D 	cg c]-  \  }	}j                  j                  j                  |d      r|	/ }
}	}t        |
      dk7  ry|
d   }t        |      D ]  \  }	}|	|k(  r ||z        r y ||   } ||z        sy|dk(  r'j                  j                  j                  |d      syyc c}w c c}w c c}}	w )Nr7   r   Fr       T)
rR   itemsizert   rq  rr  guard_int_seq!replace_backed_symbols_with_hintsrE  statically_known_equalsstatically_known_geq)r  r  r   rankr  sizes_i	strides_ir   str   r  	inner_idx	inner_dimrn  r  r  s                rW   r  z'can_use_tma.<locals>._is_tma_compatible_  s   
 5z>>!8tax--gg&&44U;G((66w?I PUJK  BB1EG  RYKM  BB2FI  #9-
2ww77A> 
 

 u:?!H	 y) 	EArI~BM*		 I&		H,- q=!1!1!F!FyRT!UC

s   >,F0,F,2F#c                T   |d   }j                   j                  j                  |      }j                   j                  j                  |d      syd}| D ]O  }j                   j                  j                  |      }j                   j                  j	                  ||      sO y y)Nr   r7   Fl    T)rq  rr  r  r  statically_known_gt)	r  r  r   last_stridelast_stride_hint
MAX_UINT32r  	size_hintrn  s	           rW   r  z+can_use_tma.<locals>._is_tma_compatible_xpu  s     bk77++MM
 ww778H!L 
 	D((JJ4PIww33IzJ	
 r{   c              3  .   K   | ]  } |        y wr   r   )r   r  r	  s     rW   r   zcan_use_tma.<locals>.<genexpr>  s     ?)!,?   )r  int | sympy.Exprr   r1  )r,  Layout | Noner   r1  )r  r@   r   r1  )r  Sequence[sympy.Expr]r  zSequence[_IntLike]r   torch.dtyper   r1  )torch.utils._tritonr  rp  rn  r   )
r  r  matricesr  r   rn  r  r  r	  r  s
    `   @@@@@rW   can_use_tmar%  ,  s    " :X993#3#3 3 
	3j##  
	. 	 	5?h??	5%m4r{   c                   t        j                  t         j                        j                  }g }| D ]O  }t	        |t
        t        j                  f      r||kD  s) y|j                  t        j                  ||             Q |syddl
m} t        |      dk(  r|d   nt        j                  | }|r%|j                  j                  j!                  |      S |j                  j                  j#                  |      S )NFTr7   rm  r   )rP   iinforT  r   r}   r   r~   r   r  Lerp  rn  rR   Andrq  rr  guard_or_falsestatically_known_true)r  r  	int32_max
conditionsr  rn  	conditions          rW   _descriptor_shape_fits_in_int32r/    s     EKK(,,IJ 9dS%--01iehhtY789 !$ZA!5
1599j;QI  	
''	2 WW33I>r{   r  c                x   t         j                  j                  syt        d |D              syt        fd|D              syt         j                  j                  rt        | j                        syt        j                  j                  yt         j                  j                  r| nd }t        ||dS )NFc              3  T   K   | ]   }t        |j                               d k(   " yw)ry  N)rR   r  )r   r  s     rW   r   z*use_triton_tma_template.<locals>.<genexpr>  s      8!s1::< A%8s   &(c              3  T   K   | ]  }t        |j                                 ! yw)r0  N)r/  r  )r   r  r  s     rW   r   z*use_triton_tma_template.<locals>.<genexpr>  s)       	(

LLs   %(r0  Tr  )ri   r  enable_persistent_tma_matmulr   enable_template_tma_storer/  r  rP   rN  rq   r%  )r  r  r$  r,  s    `  rW   use_triton_tma_templater6    s     ==558x88   }}..7Vz8  }}$$mmEE]4F:NNr{   c                T    t        || |dsyddlm} ddlm}  |       xr  |       S )Nr  Fr   )%has_triton_tensor_descriptor_host_tmar7   is_datacenter_blackwell_arch)r6  r#  r8  codegen.cuda.cuda_envr:  )r  r  r$  r8  r:  s        rW   !use_triton_blackwell_tma_templater<    s2     #	: IC 12U7S7UUr{   c                    | |v xr ||v S r   r   )scale_option_ascale_option_bscaling_typess      rW   use_triton_scaling_templaterA    s    
 ]*N~/NNr{   )maxsizec                 d    	 t         j                  j                  d      duS # t        $ r Y yw xY w)zCheck if CuTeDSL is importable; cache the result for reuse.

    Call ensure_cute_available.cache_clear() after installing CuTeDSL
    in the same interpreter to retry the import.
    cutlassNF	importlibutil	find_specr	  r   r{   rW   ensure_cute_availablerI    s3    ~~''	2$>>     # 	//c                     	 t         j                  j                  d      du} | r
t	                | S # t        $ r Y yw xY w)zCheck if NVIDIA Universal GEMM (cutlass_api) is importable; cache the result for reuse.

    Call ensure_nv_universal_gemm_available.cache_clear() after installing cutlass_api
    in the same interpreter to retry the import.
    cutlass_apiNF)rF  rG  rH  r	  _ensure_fp4_dtype_registered)	availables    rW   "ensure_nv_universal_gemm_availablerO    sE    NN,,];4G	 $&	  s   !1 	==c                     ddl } 	 | j                  j                  t        j                         y# t
        t        f$ r4 ddl| j                  j                  fd}|| j                  _        Y yw xY w)a$  Patch cutlass_api to handle torch.float4_e2m1fn_x2 -> cutlass.Float4E2M1FN.

    NOTE: cutlass_api doesn't natively map this dtype. We patch the lookup function
    in-place so all callers (including TensorWrapper) pick up the change.
    Remove once cutlass_api adds native FP4 support.
    r   Nc                R    | t         j                  k(  rj                  S  |       S r   )rP   rC  Float4E2M1FN)r   _origrD  s    rW   _patchedz._ensure_fp4_dtype_registered.<locals>._patched'  s'    ...+++<r{   )cutlass_api.utilsr  cutlass_type_from_torch_typerP   rC  KeyErrorr  rD  )rL  rT  rS  rD  s     @@rW   rM  rM    sh     B66u7M7MNn% 
B!!>>	 
 :B6
Bs   )2 A A54A5c                 d    	 t         j                  j                  d      duS # t        $ r Y yw xY w)aG  Check if nvMatmulHeuristics is importable; cache the result for reuse.

    nvMatmulHeuristics provides performance model-based kernel selection
    for NVIDIA GEMM operations.

    Call ensure_nvmatmul_heuristics_available.cache_clear() after installing
    nvMatmulHeuristics in the same interpreter to retry the import.
    nvMatmulHeuristicsNFrE  r   r{   rW   $ensure_nvmatmul_heuristics_availablerZ  /  s4    ~~''(<=TII rJ  c                n   t               syt        d      syddlm} t	        |j
                  j                        sy |       syt        j                  g}	t        ||	      syt        j                  st        j                  syt        | ||      syt        d | |fD              ry|r|ry|y||yy)a  
    Returns True if we can use the blackwell kernel for grouped mm.
    Required conditions:
        1. CuTeDSL backend is enabled
        2. CuTeDSL is available
        3. We are on a blackwell arch
        4. The dtype is bf16
        5. Max autotune or max autotune gemm is enabled
        6. A, B, and the output are 16B aligned
        7. We are not using dynamic shapes
        8. A is 2d
        9. B is 3d
        10. Offsets are provided
        11. Bias and Scale are not provided
    FCUTEDSLr7   r9  )r  c              3  2   K   | ]  }t        |        y wr   )
is_dynamicr   rT   s     rW   r   z3use_blackwell_cutedsl_grouped_mm.<locals>.<genexpr>q  s     
1Q:a=
1r  T)rI  r  r;  r:  r  r   r  rP   rE  r  ri   r  r  r%  r|  )
mat_amat_br,  a_is_2db_is_2doffsbiasscale_resultr:  r  s
             rW    use_blackwell_cutedsl_grouped_mmrg  ?  s    2 !" +C&--$$%')^^$M 76#;#; ue6:

15%.
11g|<3r{   c                   ddl m} |j                  j                  r,|j                  j                  st        j                  d       y|j                  j                  j                  ||z  |z  d      }|dk  s|t        j                  j                  k  ryddlm} t        j                  j                   ryt        j"                  t        j$                  t        j&                  g}t)        | |      xr/ t        j*                  xs t        j,                  xr t/        d	      }|r6 |       s/t0        j3                  d
t        j                  j4                         y|S )Nr7   rm  zYCUTLASS backend is not supported with non-AOT cpp_wrapper mode. Skipping CUTLASS backend.Fr   fallbackr   )try_import_cutlassCUTLASSzFailed to import CUTLASS lib. Please check whether _inductor.config.cutlass.cutlass_dir %s is set correctly. Skipping CUTLASS backend for now.)rp  rn  rq  cpp_wrapperaot_modewarningswarnrr  optimization_hintri   rD  cutlass_backend_min_gemm_sizecodegen.cutlass.utilsrk  rP   rN  rq   r   rE  rT  r  r  r  r  r   r  cutlass_dir)	r,  r  r  krn  	gemm_sizerk  r  r   s	            rW   use_cutlass_templaterw    s    	ww177#3#3(	
   221q519r2JIA~V^^%Q%QQ9 }} ]]ENNEKK@Mfm4 	-  <F$<$<	-!),  !#KK4 **	 Jr{   _IntLikec                  
 ddl m t               syt               syt	        d      syddlm
 
j                  ry| j                  j                  dk7  st        j                  j                  ryt        j                  st        j                  sy|||g}||j!                  |       t#        fd|D              ry||g}	||	j!                  |       t#        
fd	|	D              ryy
)a3  
    Return True if we can use the NVIDIA Universal GEMM Template.

    Required conditions:
        1. NVGEMM backend is enabled
        2. cutlass_api is available
        3. We are on a NVIDIA GPU
        4. Max autotune or max autotune gemm is enabled
        5. Not in AOT Inductor mode (requires runtime JIT compilation)
        6. Base pointers are 16-byte aligned
        7. Shape dimensions are not unbacked symbols

    Note:
        - Shape and stride constraints are handled internally by
          cutlass_api.get_kernels() which filters incompatible kernels.
        - GroupedGemm currently only supports TN layout (column-major B).
          Any other layout will act as a noop and fall back to ATen.
        - Dynamic shapes are supported as long as they have hints
          (from example inputs).
    r   )has_free_unbacked_symbolsFNVGEMMr7   rm  rH   c              3  .   K   | ]  } |        y wr   r   )r   dimrz  s     rW   r   z1use_nv_universal_gemm_template.<locals>.<genexpr>  s     
Cc$S)
Cr  c              3  j   K   | ]*  }|j                         j                  j                  v  , y wr   )r  rq  r  )r   trn  s     rW   r   z1use_nv_universal_gemm_template.<locals>.<genexpr>  s&     
O1::<177444
Os   03T)r  rz  rI  rO  r  rp  rn  aot_compilationr   r  rP   rN  rq   ri   r  r  r  r|  )r,  r  r  ru  r`  ra  rd  r  dims_to_checktensors_to_checkrn  rz  s             @@rW   use_nv_universal_gemm_templater    s    < P "-/ *}}V#u}}'8'86#;#;
 1IM}Q

C]
CC u~%

O>N
OOr{   c                    t         j                  j                  j                         }|dk(  ry| j                         |j	                  d      D cg c]  }|j                          c}v S c c}w )z8Check if CUTLASS should be used for the given operation.ALLTrA  )ri   rD  cutlass_enabled_opsr   r[  rX  )op_nameenabled_opsrT   s      rW   _use_cutlass_for_opr    sU    ..44::<Ke==?+2C2CC2HIQqwwyIIIIs   A,r   c           
        ddl m} t        j                  j                  |z  }|j
                  j                  j                  t        j                  t        j                  ||| z        t        j                  |||z                    xrO |j
                  j                   xr6 |j
                  j                   xr t        j                  j                  dkD  S )Nr   rm  )torch._inductor.virtualizedrn  ri   r  decompose_k_thresholdrq  rr  r+  r~   r)  Gern  rm  num_decompose_k_splits)r  r  ru  threshold_multiplern  r  s         rW   use_decompose_k_choicer  	  s     ."MM??BTT 	
..II1A561A56	
 	5    	5 ###	5 MM0014
r{   c           
        t         j                  j                  }ddlm} t        t        j                  j                        xr |j                  j                  j                  t        j                  t        j                  ||| z        t        j                  |||z                    xr0 |j                  j                   xr |j                  j                    S )z
    Check if we should use the contiguous subgraph transform.
    This transform makes the second matrix contiguous before the matmul.
    r   rm  )ri   rocmcontiguous_thresholdr  rn  r1  rP   rN  rq   rq  rr  r+  r~   r)  r  rn  rm  )r  r  ru  r  rn  s        rW   use_contiguousr  	  s     ";;;; . 	U]] 	$GG22II01450145
	$    	$ ###
r{   c                   t         j                  j                  }g d}t        |t        j
                        r|j                  s|S |dk(  rg S t        | t        j
                        r| j                  r&t        |t        j
                        r|j                  sd}nt        || z  ||z        }d}t	        j                  |      }|D cg c]  }||k  r||k\  r| }}g g g }}
}	|D ]Z  }||z  }|dk  r||dz
  z  dk(  r|dk\  r|	j                  |       0|dz  dk(  r|
j                  |       J|j                  |       \ t         j                  dk(  r|	|
z   |z   S |	|
z   |z   }|d | S c c}w )	N)rr   r  ru   rs      r   r  ry  rs   r7   r  
EXHAUSTIVE)ri   r  r  r}   r~   r+  	is_numberro  divisorsr  max_autotune_gemm_search_space)r  r  ru  k_splits_limitdefault_k_splitsmax_k_splitmin_k_splitr  divisorpow_of_2_divisorsmul_of_32_divisorsrest_of_splitsdkPartbest_splitss                  rW   get_k_splitsr  0	  s    ]]99N .!UZZ 	1		1ejj!!++1ejj!!++!q&!q&)K~~a H  k!g&< 	H  =?B>) %Q 3; EAI!#$$Q'RZ1_%%a( !!!$%" ,,< #55FF#&88>IK''=s   
E,c                T    t         j                  j                  |       j                  S r   )rP   rH   r  r  r   s    rW   _rocm_native_device_arch_namer  i	  s    ::++F3???r{   c                     	 dd l } ddlm}m} ddlm} t        j                  j                  | j                        }||||fS # t        $ r dd}dd} G d d      }d }Y %w xY w)	Nr   )gen_ops_librarygen_ops_preselected)CKGemmOperationc                     g S r   r   r   r{   rW   r  z*try_import_ck_lib.<locals>.gen_ops_library	      Ir{   c                     g S r   r   r   r{   rW   r  z.try_import_ck_lib.<locals>.gen_ops_preselected	  r  r{   c                      e Zd Zy)*try_import_ck_lib.<locals>.CKGemmOperationN)r   r   r   r   r{   rW   r  r  	  s    r{   r  )r   r  )ck4inductor(ck4inductor.universal_gemm.gen_instancesr  r  ck4inductor.universal_gemm.opr  r  r  dirname__file__r	  )r  r  r  r  package_dirnames        rW   try_import_ck_libr  n	  sl    	
	
 ''//+*>*>? O-@/QQ  			 	 s   ;A A#"A#c                    t         j                  st         j                  syt        j                  j
                  sy| j                  j                  dk7  ryt        | j                        }t         j                  j                  D ci c]  }|j                  d      d   | c}xs |j                  d      d   |i}|j                         t         j                  j                  z  D cg c]  }||   	 }}|sy| j                  t        j                  t        j                   t        j"                  fvryt%               \  }}}}|st&        j)                  d       y|t         j                  _        yc c}w c c}w )NFrH   :r   z,Please pip install Composable Kernel packageT)ri   r  r  rP   rN  rq   r   r  r  r  archr[  rI  ck_supported_archr   r   rE  rG  r  r   r  ck_dir)r,  native_archru  requested_archsrequested_supported_archsck_package_dirnamer   s          rW   use_ck_templater  	  s<   6#;#;==}}V# 0>K39;;3C3CDaqwws|A)D #q!;IO
 !%%'&++*G*GG! 	! ! %||EMM5>>5==II"3"51aBC+FKK+ E!s   E6,E;c                    ddl m} t        d      xr= t        |       xr0 |j                  j
                  j                  ||z  |z  d      dkD  S )Nr7   rm  CKr   ri  r   rp  rn  r  r  rq  rr  rq  r,  r  r  ru  rn  s        rW   use_ck_gemm_templater  	  sR     	d# 	KF#	KGG..q1uqy2.FJr{   c                    ddl m} t        d      xr= t        |       xr0 |j                  j
                  j                  ||z  |z  d      dkD  S )Nr7   rm  CKTILEr   ri  r   r  r  s        rW   use_ck_tile_gemm_templater  	  sR     	h' 	KF#	KGG..q1uqy2.FJr{   c                2    t        d      xr t        |       S )Nr  )r  r  r,  s    rW   use_ck_conv_templater  	  s    %d+G0GGr{   c                |    t         j                  xs t         j                  xr | j                  j                  dk(  S r  )ri   r  r  r   r  r  s    rW   _use_template_for_cpur  	  s2    7v77&
--


%&r{   c                   ddl m} t        |j                  |      sJ |j                  j                  }|j                  j
                  }t        |       xrX |j                         t        j                  k(  xr5 t        |      dk(  xr% t        |      dk(  xr |d   |d   k(  xr |d   dk(  }t        | ||d      xr |j                  j                         xs |S )Nr7   )rA      ry  F)require_constant_mat2)r  rA   r}   r,  r  r4  r  r  rP   rG  rR   use_cpp_gemm_templateis_contiguous)r,  mat1mat2rA   	mat1_sizemat1_stridemat1_each_batch_is_contiguouss          rW   use_cpp_bmm_templater  	  s     dkk6***
   I++$$Kf% 	"NN-	"^q 	" "	" ^y|+		"
 ^q  " !t5Q !!#D'Dr{   c                ~   ddl m} ddlm} ddlm}	 ddlm}
 t        |       rt        d      syt        j                  j                  sy|j                         t        j                  t        j                   fv }t        j"                  t        j$                  t        j&                  t        j                  t        j                   g} |
|||r| j(                  nd ||      \  }}}} }}t+        ||f      ryt-        ||j.                        r|j1                         } |	|j                               \  }} |d	||||j                         |j                         |t3               | |

      }dd}| j(                  |v xr= |d uxr7  ||      xr- t-        ||j4                        xr |j7                         xs | S )Nr7   r  )create_micro_gemm)*get_gemm_template_output_and_compute_dtype)mm_argsCPPF)	out_dtypemat2_transposeduse_4x2_dim
micro_gemm)input_dtypeinput2_dtypeoutput_dtypenum_threadsuse_refq_group_sizec                N    | j                          | j                         d   dk(  S )Nr   r7   )freeze_layoutr  rT   s    rW   is_last_dim_stride1z2use_cpp_gemm_template.<locals>.is_last_dim_stride1
  s"    	||~b!Q&&r{   )rT   r@   r   r1  )r  r  codegen.cpp_micro_gemmr  codegen.cpp_utilsr  kernel.mm_commonr  r  r  ri   cppweight_prepackr  rP   rV  rK  rG  rE  halfr   has_free_symbolsr}   BaseViewunwrap_viewparallel_num_threadsr'  is_module_buffer)r,  r  r  r  r  is_woq_int4r  r  r  r  r  	int8_gemmr  r  r  ru  r  r   r  r  s                       rW   r  r  	  s    9M) (0Ee0L::$$ U[[%**$==I]]ENNEJJUZZXM")"+&,,'#Aq!VT4 A$$!@AQROL!"			NN$^^%!(*!J'
 	% 	Cd"	C%	C tR]]+	C ""$A,A(Ar{   c                 b    t         j                  xs t         j                   xs t        d      S )NATEN)ri   r  r  r  r   r{   rW   use_aten_gemm_kernelsr  (
  s-    7v77 '	v	&'r{   c                  T    e Zd ZU  ej                  d      Zded<   ddZddZd	dZ	y)
DebugDirManagerr   r
  prev_debug_namec                @    t        t        j                        | _        y r   )r  r   counterr   rD  s    rW   r5  zDebugDirManager.__init__2
  s    ../r{   c                    t         j                  j                  j                  | _        | j                   d| j
                   | _        | j                  t         j                  j                  _        y )N_tmp_)rP   _dynamori   debug_dir_rootr  r   new_namerD  s    rW   	__enter__zDebugDirManager.__enter__5
  sM    $}}33BB//0dggY?.2mm+r{   c                    t        j                  | j                         | j                  t        j
                  j                  _        y r   )r  r  r  r  rP   r  ri   r  )r  r   s     rW   __exit__zDebugDirManager.__exit__:
  s*    dmm$.2.B.B+r{   Nr  )r   r   r   r  )
r   r   r   rp  r=  r  r   r5  r	  r  r   r{   rW   r   r   .
  s(    iooa G0<
Cr{   r   c                   ddl m} t               dfd}t        j                  j                  |d|      5  t        j                  j                           | |i |}d d d        t              fS # 1 sw Y   xY w)Nr7   r<   c                (    j                  |        y r   )rK  codesource_codess    rW   save_output_codez*run_and_get_code.<locals>.save_output_codeH
  s    r{   r  r  r
  r   r  )
rq  r=   r!   r   patchr  rP   r  resetr]  )r   r   r  r=   r  r  r  s         @rW   run_and_get_coder  ?
  st    
 %$.LL 
		=*<>N	O %T$V$% 4%%%% %s   'A55A>c                   |j                  dd      }t        | g|i |\  }}g }|D ]  }t        j                  rQt        j                  j
                  dur5|j                  t        j                  d|t        j                               n4|j                  t        j                  d|t        j                               |s|D cg c]  }|dd 	 }} ||fS c c}w )Nremove_quoteFTzR"TRITON\((.*?)\)TRITON"z	'''.*?'''r  )
rS   r  ri   rm  r  autotune_at_compile_timerO  r   findallDOTALL)	r   r   r  r  r  r  kernelsr  r  s	            rW   run_and_get_kernelsr  Q
  s     ::ne4L+B@@@FLG ;&--"H"HPT"T NN2::&A4STNN2::lD"))DE29:va|:G:; 7? ;s   Cc                &     d fd}t        |      S )Nc                 R            } | j                         j                          | S r   )r   r  )r  r   s    rW   run_with_backwardz1run_fw_bw_and_get_code.<locals>.run_with_backwarde
  s!    

r{   )r   r   )r  )r   r   s   ` rW   run_fw_bw_and_get_coder!  d
  s    
 -..r{   c                X   ddl m} g dfdd	fd}t        j                  j	                  |d|      5  t        j                  j	                  |d      5  t
        j                  j                           | |i |}ddd       ddd       S # 1 sw Y   xY w# 1 sw Y   S xY w)
zLGet the inductor-generated code, but skip any actual compilation or running.r7   r<   c                (    j                  |        y r   r  r  s    rW   r  z"get_code.<locals>.save_output_codes
  s    D!r{   c                     G d d      }| j                   r| j                         n| j                         \  }} |j                         |r |j                          |       S )Nc                       e Zd ZdZddZddZy)@get_code.<locals>.patched_compile_to_module.<locals>.DummyModulez4This is empty to replace the generated triton modulec                     y r   r   rD  s    rW   r5  zIget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.__init__z
  s    r{   c                     y r   r   r  s      rW   callzEget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.call}
  s    r{   Nr  r   r   r  r   r   r  )r   r   r   r   r5  r*  r   r{   rW   DummyModuler'  w
  s    Fr{   r,  )rm  codegen_with_cpp_wrappercodegenr   )r  r,  wrapper_codekernel_coder  s       rW   patched_compile_to_modulez+get_code.<locals>.patched_compile_to_modulev
  s]    	 	 04/?/?D))+T\\^ 	"k 	++,[../}r{   compile_to_moduler  Nr  )r  r=   r   r   )rq  r=   r   r  r  rP   r  r  )r   r   r  r=   r1  r   r  r  s         @@rW   get_coder3  m
  s    $ L", 	

.0I	
  	

-);=MN	  	          s#   "B'BBB	BB)c                |    t        | g|i |}dt        |      cxk  rdk  sn J dt        |              |d   S Nr7   ry  z%expected one or two code outputs got r   )r3  rR   )r   r   r  r  s       rW   get_triton_coder6  
  sQ    B000LL!&Q& 
/L0A/BC& ?r{   c                    t        | g|i |\  }}dt        |      cxk  rdk  sn J dt        |              |d   S r5  )r  rR   )r   r   r  r   r  s        rW   run_and_get_triton_coder8  
  sW     'r;D;F;OA|L!&Q& 
/L0A/BC& ?r{   c                    ddl m ddlm} |j                  g dfd}t
        j                  j                  |d|      5   | |i |}d d d        |fS # 1 sw Y   fS xY w)Nr   r<   rD   c                 ^     | i | | d   }t        |      sJ j                  |       y )Nry  )r}   r  )r   r  rq  r=   graph_lowerings	real_inits      rW   	fake_initz-run_and_get_graph_lowering.<locals>.fake_init
  s7    4"6"Q%///u%r{   r5  r+  )torch._inductor.graphr=   torch._inductor.output_coderE   r5  r   r  r  )	r   r   r  rE   r=  r  r=   r;  r<  s	         @@@rW   run_and_get_graph_loweringr@  
  sq     4;((IO& 
		?J		B %T$V$% ?""% ?""s   	AA(c              #     K   ddl m} |j                  |    }	 t        j                  ||      |j                  | <   d ||j                  | <   y# ||j                  | <   w xY ww)z
    Override the lowering of aten_op with override_fn.
    The first argument of override_fn is the original lowering fn.
    r   )loweringN)torch._inductorrB  	loweringsr  partial)aten_opoverride_fnrB  orig_fns       rW   override_loweringrI  
  s`      )  )G.&/&7&7W&M7#&-7#g7#s   A$'A  A$A!!A$c                     ddl m} |j                  d fd}t        j                  j
                  j                  |d|      S )zr
    Add hook functions to be called at the beginning and end of Scheduler.__init__.
    Used for unit tests.
    r   )	Schedulerc                B     | |        | |      }r	 | |       |S r   r   )r  rF  outrH  post_fnpre_fns      rW   r  z(add_scheduler_init_hook.<locals>.wrapper
  s+    y% i'Iu%
r{   r5  )r  r   rF  r   r   r   )torch._inductor.schedulerrK  r5  unittestr   r  r  )rO  rN  rK  r  rH  s   ``  @rW   add_scheduler_init_hookrR  
  s9     4  G ==%%iWEEr{   c                z    t         j                  rt        j                  |        yt        j	                  |        y)z
    Warnings that will be actionable for PyTorch developers, but not
    end users.  Allows us to easily disable them in stable releases but
    keep them on for nightly builds.
    N)ri   developer_warningsr   r  info)msgs    rW   developer_warningrW  
  s$       Cr{   c                    	 t         j                  j                  d      } | dz   t        t         j                        k  rTt        t         j                  | dz            dkD  r2t         j                  | dz      d   dk7  rt         j                  | dz      S t         j                  D ]#  }|j                  d      s|t        d      d c S  y# t        $ r Y Bw xY w)a  
    An experimental API used only when config.benchmark_kernel is true.

    The benchmark name is only available at codegen time. So we can not
    directly call it in benchmark_all_kernels which is run after codegen.

    The function assumes the argument after --only is the benchmark name.
    It works for torchbench.py/hugginface.py/timm_models.py. But for ad-hoc
    scripts, this function may return None.

    There are 2 flavors of --only argument we need handle:
    1. --only model_name
    2. --only=model_name
    z--onlyr7   r   r  z--only=N)r  argvr  rR   
ValueErrorr  )rU  r  s     rW   get_benchmark_namer[  
  s    	hhnnX&!Gc#((m#CHHS1W%&*q!!$+88C!G$$ xx )>>)$s9~'(()   s   BC 	CCc                &    t        d | D              S )Nc              3  &   K   | ]	  }|d k(    ywr7   Nr   r_  s     rW   r   zis_ones.<locals>.<genexpr>       %!qAv%   r   rJ  s    rW   is_onesrc        %u%%%r{   c                &    t        d | D              S )Nc              3  &   K   | ]	  }|d k(    yw)r   Nr   r_  s     rW   r   zis_zeros.<locals>.<genexpr>  r_  r`  ra  rb  s    rW   is_zerosrg    rd  r{   c                &    t        d | D              S )Nc              3     K   | ]@  }t        |t        j                        r$|j                  t        j                  d       k(   B yw)r  N)r}   rP   r  r   )r   r   s     rW   r   z is_cpu_device.<locals>.<genexpr>   s8      dELL) 	u||E**s   AAra  )inputss    rW   is_cpu_devicerk    s       r{   c                    t        | t        j                        sJ d       | j                  rt        j
                  S t        j                  S )Nz8only support sympy.Expr as input to get_sympy_Expr_dtype)r}   r~   r+  r   rP   rU  rI  ri  s    rW   get_sympy_Expr_dtyperm  '  s=    c5::& B& ~~{{}}r{   c              /     K   | r-t        j                  j                  |i |5 }| d d d        y d  y # 1 sw Y   y xY wwr   )rP   r   r   )should_profiler   r  r   s       rW   maybe_profilerp  1  sE     ^^##T4V4 	G	 	 		 	s   "A7AA Ac                 l    t         j                  j                  } | dk  rt        j                         } | S Nr7   )ri   r  threadsrP   get_num_threads)rs  s    rW   r  r  :  s+    jj  G{'')Nr{   c                     ddl m}   |        }|j                  dt        j                  j
                  rd      S d      S )Nr7   )get_backend_options
num_stagesry  r  )runtime.triton_helpersrv  rH  rP   rN  rq   )rv  optionss     rW   get_backend_num_stagesrz  A  s2    ;!#G;;|%--*;*;QCCCCr{   c                   t        | t        j                  j                  j                  j
                  dk(        }||S ddlm}m} t        j                  j                         xr! t        j                  j                         dk\  }| t        j                  t        j                  t        j                  fv sJ t        j                  |      j                   j#                  d      rddlm}  |       }| t        j                  t        j                  fv r|r	 || |      S t        j                  j                  j                  j
                  dk(  r |t        j                  |      S  |t        j                  |      S | t        j                  t        j                  fv r
|r ||       S t        j                  j                  j                  j
                  dk(  r |t        j                        S  |t        j                        S )z
    We don't want to throw errors in this function. First check to see if the device is in device_info.py,
    then fall back to the inaccurate triton estimation.
    tf32)is_tf32r   )get_max_simd_tflopsget_max_tensorcore_tflops)rv   r   
clock_rate)max_clock_rate)r   rP   backendsrH   matmulfp32_precisiontriton.testingr~  r  rQ   get_device_capabilityr   rE  rG  inspect	signature
parametersrH  torch._utils_internalr  )r   ds_topsr~  r  SM80OrLaterr  sm_clocks          rW   get_device_tflopsr  I  s    u~~**11@@FJG M**))+ 

0P0P0R W 1K
 U]]ENNEMMBBBB,-88<<\J8!#U]]ENN33,UH==>>%%44>,U]]HEE&u}}h??U]]ENN33,U33>>%%44>,U]];;&u}}55r{   c                     ddl m}   |        S )Nr   get_dram_gbps)r  r  r  s    rW   get_gpu_dram_gbpsr  t  s    ,?r{   c                 x    ddl m}  | j                  j                  j	                  d      j                  dd      S )Nr   r  max_shared_mem)triton.runtimer  r  r  r  rH  r  s    rW   get_gpu_shared_memoryr  {  s.    %==44Q7;;<LaPPr{   c                     t         j                  j                         rUt         j                  j                         j                  } t         j                  j                         j
                  }|| z  S d} d}|| z  S )Nr  i   )rP   rH   rQ   r  	warp_sizemax_threads_per_block)r  r  s     rW   get_max_numwarpsr    sh    zz JJ446@@	 %

 @ @ B X X
 !I-- 	 $ I--r{   c                $    | j                  d      S )Nwelford)r  reduction_types    rW   is_welford_reductionr    s    $$Y//r{   c                (    t        |       ry| dk(  ryy)Nr  online_softmax_reducery  r7   )r  r  s    rW   reduction_num_outputsr    s    N+	2	2r{   c                 0    t        j                         dk(  S )NLinux)platformsystemr   r{   rW   is_linuxr    s    ??''r{   c                 (    t         j                  dk(  S )Nrk   )r  r  r   r{   rW   r  r    s    <<7""r{   c                &    t        d | D              S )Nc              3  n   K   | ]-  }t        |t        j                        xr |j                    / y wr   )r}   r~   r+  r  r_  s     rW   r   z#has_free_symbols.<locals>.<genexpr>  s)     Jz!UZZ(<_<Js   35r  )itrs    rW   r  r    s    JcJJJr{   c            	     x   ddl m} | D ]  }t        ||j                  |j                  |j
                  |j                  |j                  f      r=t        |j                         xs d      st        |j                         xs d      s yt        ||j                        st        dt        |              y)Nr7   r  r   Tzunexpected type for is_dynamic F)r  r  r}   r%  r'  r  rk  r>   r  maybe_get_sizemaybe_get_strider@   	TypeErrorr  )r   r  r  s      rW   r^  r^    s     IbmmR[[":K:KRYYW
   0 0 2 8b9=M""$*> Aryy)=d1gYGHHI r{   c                      e Zd ZdZdZy)PlaceholderKERNEL_NAMEDESCRIPTIVE_NAMEN)r   r   r   r  r  r   r{   rW   r  r    s      K *r{   r  c                ~   ddl m} t        j                  dd      5 }t	        j
                         }t	        j
                         } t        |t        |            j                  |  t        d|j                   |       t        |j                  |       t        j                         }t        ||      5   | |j                         d d d        t        j                         |z
  }	 ||j                         |j                  j                          |j                          t        d	|j                   |       t        |j                  |       |j!                         |j!                         k(  }
t"        j%                  d
||j&                  |
|	       d d d        y # 1 sw Y   xY w# 1 sw Y   y xY w)Nr7   )stable_topological_sortr_  zutf-8)modeencoding)r  	fake_modezBefore:
)filezAfter:
zZ%s, save before/after graph to %s, graph before/after are the same = %s, time elapsed = %s)pattern_matcherr  r  NamedTemporaryFileior   r^   rZ   	propagater  rq  r   nowr]   lint	recompiler>  r   rU  r   )r  r  inprV  r  r
  	before_ioafter_io
start_timetime_elapsedr  s              rW   pass_execution_and_saver    sU    9		$	$
 
 
KKM	;;=C	R#3C#89CCSI	"(($1-bhhY'\\^
#B, 	N	||~
2)


#!,bhhX& H$5$5$77hFF	
+
 
	 	
 
s%   BF3;F'CF3'F0	,F33F<c                ^    ddl m} t        | |j                        xr | j	                         S )zB
    Check if input buffer is a multi-outputs template buffer
    r7   r  )r  r  r}   rn  is_multi_outputs_template	input_bufr  s     rW   r  r    s/      	9b//0 	2//1r{   c                    ddl m} t        | |j                        xr2 t	        | j
                        dk(  xr t        | j
                  d         S )zL
    Check if input buffer is a output of multi-outputs template buffer
    r7   r  r   )r  r  r}   MultiOutputrR   rj  r  r  s     rW   #is_output_of_multi_outputs_templater    sL      	9bnn- 	;	  !Q&	;%i&6&6q&9:r{   c                &   | yddl m} t        | |j                        xr- t        | |j                         xr |d u xs | j
                  |u xsA t        |       |j                  u xr' t        t        j                  j                  d      xr; | j
                  t        j                  j                  j                  j                  k(  xs t        t        j                  j                  d      xr; | j
                  t        j                  j                  j                  j                  k(  xsa t        t        j                  j                  d      xr; | j
                  t        j                  j                  j                  j                  k(  S )NFr7   r  all_to_all_singleall_gather_into_tensorreduce_scatter_tensor)r  r  r}   _CollectiveKernel_WaitKernelop_overloadr  FallbackKernelr  rP   r  torchrecr  defaultr  r  rg  r  r  s      rW   is_collectiver    sG    | 	4--. 	3400	34Z14++r1 	T
b''' 	

 		**,?@ U$$		(:(:(L(L(T(TT
 		**,DE E$$99%%<<DDE 		**,CD Y$$		(:(:(P(P(X(XX/r{   c                <    ddl m} t        |       |j                  u S Nr7   r  )r  r  r  r  )rg  r  s     rW   is_waitr  (  s    :''r{   c                    ddl m} t        | |      rt        d | j                  D              S t        | j                        xr |d u xs  ||       S )Nr   GroupedSchedulerNodec              3  2   K   | ]  }t        |        y wr   )contains_collectiver_  s     rW   r   z&contains_collective.<locals>.<genexpr>5  s     @a&q)@r  )rP  r  r}   r|  snodesr  rg  )snode	filter_fnr  s      rW   r  r  .  sJ     ?%-.@5<<@@@$P)t*;*Oy?OPr{   c                    ddl m} t        | |      rt        d | j                  D              S t        | j                        S )Nr   r  c              3  2   K   | ]  }t        |        y wr   )contains_waitr_  s     rW   r   z contains_wait.<locals>.<genexpr>>  s     :=#:r  )rP  r  r}   r|  r  r  rg  )r  r  s     rW   r  r  :  s4    >%-.:U\\:::uzz""r{   c                    ddl m} t        |t        j                  j
                        r|g}t        | |j                        xr | j                  |v S r  )r  r  r}   rP   r  r  r  r  r  s      rW   is_fallback_opr  C  sE     "ejj++,TdB--.I43C3Cr3IIr{   c                B    |||    j                   j                            S r   )defining_opr  )buf_namename_to_bufname_to_fused_nodes      rW   buf_name_to_fused_snoder  N  s#     k(3??HHJKKr{   c                     yr  r   r  s    rW   r  r  Y  r  r{   c                     ||       ry |j                  |        | j                  D ].  }t        |j                  ||      }||v rt	        |||||       0 y )Ncriteria_cb)rK  unmet_dependenciesr  r   find_recursive_deps_of_node)r  collected_node_setr  r  r  depdefining_op_for_deps          rW   r  r  T  sn     55!'' 
5HHk#5
 "44##	

r{   c                     yr  r   r  s    rW   r  r  r  r  r{   c           	     z    ||       ry |j                  |        | j                         D ]  }|j                  D ]}  }|j                  J |j                  j	                         dk(  r/|j                  j	                         |vrL||j                  j	                            }||v rnt        |||||         y )NOUTPUTr  )rK  get_outputsr  rg  r  find_recursive_users_of_node)r  r  r  r  r  org  user_ops           rW   r  r  m  s     55!  GG 	D99(((yy!!#x/yy!!#+==(););)=>G,,(""'	r{   c                b    t         j                  j                  j                  rdnd}|| z
  |z
  S )zaComputes the number of inputs to the aot fw graph which have fixed addresses (params and buffers)ry  r   )rP   
_functorchri   functionalize_rng_ops)dynamo_gm_num_inputsaot_fw_gm_num_inputsnum_rng_seed_offset_inputss      rW   num_fw_fixed_argumentsr    s6     $$::   "669SSSr{   c                    dd}d}g }| j                   j                  D ]0  }|j                  dk(  s ||      r|j                  |       |dz  }2 |t	        t        t        |                  k(  sJ t        |      S )z>
    Infers which inputs are static for a backwards graph
    c                ~    d| j                   vxr. d| j                   vxr d| j                   vxr d| j                   vS )Ntangentsbwd_seedbwd_base_offsetbwd_rng_stater  r  s    rW   is_saved_tensorz'count_tangents.<locals>.is_saved_tensor  sH    aff$ .!&&(.!/.  qvv-		
r{   r   r  r7   )rT   r5   r   r1  )rq  rF  r  r  r]  r   rR   )fx_gr  	arg_countstatic_arg_idxsr  s        rW   count_tangentsr    s    

 IOZZ 44= q!&&y1NI	 d5_)=#>????r{   c                    g }t        | j                  j                        D ]D  \  }}|j                  dk7  r |S |j                  j                  d      s4|j                  |       F |S )a  
    Returns indices of backward graph inputs that are always at fixed
    addresses: primals (parameters/buffers/user inputs saved for backward).
    Excludes saved activations which may not be at fixed addresses when
    the forward is partitioned for CUDA graphs.
    r  primals_)rE  rq  rF  r  r   r  r  )r  static_idxsrU  r  s       rW   get_static_bw_input_idxsr    sl     KDJJ,,- $Q44=   66Z(s#	$
 r{   c                  2    e Zd ZU ded<   ddZedd       Zy)	BoxedBoolr1  r   c                    | j                   S r   )r   rD  s    rW   rO  zBoxedBool.__bool__  s    zzr{   c                6    t        | t              r	d| _        | S yr  )r}   r  r   r  s    rW   disablezBoxedBool.disable  s    c9%CIJr{   Nr  )r  r   r   zBoxedBool | bool)r   r   r   r   rO  r  r  r   r{   rW   r  r    s     K  r{   r  c              #      K   ddl m} |j                  	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 d fd}t        j                  j                  |d|      5  d  d d d        y # 1 sw Y   y xY ww)Nr7   r9   c                @    j                  |        | |||||      S r   r$  )r  kernel_namer0  rX  gpucpp_definitionkernel_listorig_define_kernels         rW   define_kernelz.collect_defined_kernels.<locals>.define_kernel  s-     	;'!+{Hc>
 	
r{   r$  )NTN)r  r:   r  r
  r0  r
  rX  r  r   r1  r!  r  r   r   )codegen.wrapperr:   r$  r   r  r  )r"  r:   r$  r#  s   `  @rW   collect_defined_kernelsr&    s     5-;;  $%)
"

 
 	

 
 #
 

 
		/-	P   s   AA*A	A*A'#A*c                    | dz   S )N__original__r   r  s    rW    get_cloned_parameter_buffer_namer)    s    .  r{   c                    | t         v S r   )rN   r  s    rW   r  r    s    Yr{   c                 :    t         j                  j                  duS )z,Check if we're running on ROCm/HIP platform.N)rP   rN  rq   r   r{   rW   is_rocmr,    s    ==D((r{   c                &    | dk7  xr t        |       S )NrI   )r  r  s    rW   device_need_guardr.    s    U?-vf~-r{   c                N   | t         j                  k(  r?t         j                  j                         r!t         j                  j	                         dk  S | t         j                  k(  rt         j
                  j                         ry| t         j                  t         j                  fv S )N)r  r   T)rP   rE  rH   rQ   r  rJ   rU  r1  r$  s    rW   ,needs_fallback_due_to_atomic_add_limitationsr0    sk    5::#:#:#<zz//1F::	%..	 UYY%;%;%=ejj111r{   c                   | j                   t        j                  j                  j                  t        j                  j                  j
                  fv r|y| j                   t        j                  j                  j                  k(  rdnd}|d |fvxs |xr t        |      xr t        |      xs | j                   t        j                  j                  j                  k(  xrW |dk(  xrP |xrL |dk(  xrE t        j                  j                  xr) t        j                  j                  xs t               dk7  xs? ||k(  xr" |t        j                  t        j                  fv xs t        j                          S )NFrK  r   r  r7   )overloadpacketrP   r  atenscatter_reduce_scatter_reducescatter_r  r0  ri   r  fallback_scatter_reduce_sumdynamic_threadsr  r1  rU  $are_deterministic_algorithms_enabled)r  r  
self_dtype	src_dtypesrc_device_typesrc_is_tensor	reduce_tys          rW   use_scatter_fallbackr?    sZ    	""IINN**EIINN,I,IJ	K" ++uyy~~/F/FFE 
 	tY// 	8 H'H<YG		8 &&%))..*H*HH L%'LL  5(L 

66	L
 ++J/C/E/J	8 i'SJ5::u{{:S,S	8 557!r{   c                   ddl m}m} ddlm} t        dt        |        d       t        |       D ]  \  }}t        d|dd       ||u rt        d	       '||u rt        d
       7t        ||      r|j                         }t        |rdnd d       |r:|j                  J t        d|j                  j                  j                          t        d       |j                  j                  D ]  }t        |        t        d       |j                  j                  D ]  }t        |        t!        dt#        |              y)z
    An API that can be used in pdb to dump a node_schedule.
    Right mainly dump the read/write dependencies but can add more as needed.
    r   )DisableReductionEnableReduction)SchedulerNodezNode schedule with z nodesr!  3r  zenable reductionzdisable reductionredpwz scheduler nodeNzoriginal reduction hint zReadDep:z	WriteDep:zUnrecognized node type: )torch._inductor.codegen.simdrA  rB  rP  rC  r  rR   rE  r}   is_reductionrg  r&  reduction_hintr9  r:  r;  r   r  )r  rA  rB  rC  rU  rg  is_redr  s           rW   dump_node_schedulerK  %  s=   
 O7	M 236
:;}- H	T#al?"$%%%%&m,&&(FfU$/?@yy,,,01N1N0OPQ*''-- c
+''.. c
 !9$t*FGG+Hr{   c                z    ddl m}  || j                         t        | j                        z  t
        z  dk(        S )Nr   )r+  )r  r+  storage_offsetr&  r   GPU_ALIGN_BYTES)r   r+  s     rW   tensor_is_alignedrO  F  s:     L 				 >&,,#?	??RVWW r{   c                |    t        | j                  j                        syt        j                  xs t        |       S r  )r  r   r  ri   assume_aligned_inputsrO  )example_inputs    rW   should_assume_input_alignedrS  T  s2     -&&++,''K+<]+KKr{   c                 6   t         j                  j                  j                         } | st	        j
                         S | j                  r| j                  j                  st	        j
                         S | j                  j                  }|j                         S r   )	rP   _guardsTracingContexttry_getrc  nullcontextr  rs  suppress_guards)tracing_contextrs  s     rW   #maybe_get_suppress_shape_guards_ctxr[  ]  sv    
 mm22::<O%%'' $$O,E,E,O,O%%''))33I$$&&r{   c                   t         j                  j                  j                  t        dd      5  t
        j                  j                          dd l}dd l	} |j                         } |j                  |      }ddlm} |j                  |       |j                  }|j!                  |j"                          | |i |}	|j%                         }
|j!                  |       |j'                  |       d d d        |	|
fS # 1 sw Y   	
fS xY w)Nr   Tr   )output_code_log)rQ  r   r  r  ri   rP   r  r  r  loggingr   StreamHandlertorch._inductor.codecacher]  
addHandlerlevelsetLevelDEBUGr>  removeHandler)r   r   r  r  r^  log_capture_stringchr]  
prev_levelr  r   s              rW   run_and_get_cpp_coderi  m  s     
			#	#FGT	: *(R[[]"W""#56=""2&$**
  /T$V$'')  ,%%b)*  19!*  19s   CC>>D
c                   t        |       }||j                  S | D ]  }t        |t        j                        r|j
                  j                  c S t        |t        j                        sP|j                         D ]6  }t        |t        j                        s|j
                  j                  c c S  |j                         D ]6  }t        |t        j                        s|j
                  j                  c c S   y r   )	rZ   rs  r}   rP   r2   rg  r  r  r4  )rj  r  inputr  r4  s        rW   shape_env_from_inputsrl    s     (I """  1eU\\*::''' eU\\*

 /dELL199.../  ,,. 1fell3!;;00011 r{   c                <     t              dk(  r S d fd}|S )Nr   c                z    t        |       \  }} |       }t        |      rt        j                  ||       |S r   )copy_misaligned_inputsrR   rP   _foreach_copy_)
new_inputsold_tensorsnew_tensorsrM  inputs_to_checkr  mutated_input_idxss       rW   r  z)align_inputs_from_check_idxs.<locals>.run  sE    #9);$
 [ J {  k:
r{   )rq  list[InputType]r   r   )rR   )r  rt  ru  r  s   ``` rW   align_inputs_from_check_idxsrw    s#    
 ?q  Jr{   c                T   d| j                         v rd}n;t        d t        | j                         | j                               D              dz   }t	        j
                  | |fd      j                         }t	        j
                  || j                         | j                               S )Nr   c              3  2   K   | ]  \  }}|d z
  |z    ywr^  r   )r   r0  r4  s      rW   r   z)clone_preserve_strides.<locals>.<genexpr>  s     Tf$Tr  r7   r   )r  r   r   r4  rP   
as_stridedclone)rT   needed_sizer*  s      rW   clone_preserve_stridesr}    s    AFFH} T#affh
:STTWXX 	 a+6<<>FFAFFHahhj99r{   c                2   g }g }|du}|D ]  }| |   }t        |t        j                        sJ dt        |              |j	                         t
        z  sMt        |      | |<   |s^||v sc|j                  |       |j                  | |           ||fS )z
    Clones misaligned tensors which we inferred were aligned. Returns a tuple of [old_tensors], [new_tensors] for every
    cloned tensor which is in `return_pair_idxs`.
    Nz Expected tensors only, but got: )r}   rP   r  r  data_ptr	ALIGNMENTr}  r  )rq  check_inputs_idxsreturn_pair_idxsrr  rs  ret_pair_definedr   _inps           rW   ro  ro    s     ')K&(K (t3 
2!}$- 	
.tDzl;	
- ==?Y&248JqMA)9$9""4("":a=1
2 ##r{   c                    g }|D ]N  }| |   }t        |t        j                        s#|j                         t        z  dk(  s>|j                  |       P t        |      t        |      k7  r|S |S )z[
    We require all inputs to be aligned, so introduce a copy for any
    that aren't.
    r   )r}   rP   r  r  r  r  rR   )rj  static_input_idxsaligned_static_input_idxsrU  rk  s        rW   remove_unaligned_input_idxsr    st     !#  2seU\\*0@90LQR/R%,,S12 $%->)??((r{   c                P   ddl m} t        j                  t        j                        j
                  }|j                  j                  j                  }|j                  j                  j                  j                  }t        j                  r'|j                  j                  j                  | |       y|j                  j                  j                  | |k        ry|j                  r)|j                  j                  j                  | dk        ry ||       xr  ||       |k  S )zCheck if an expression fits within 32-bit integer range.

    NOTE: This function intentionally does not install guards. Callers are
    responsible for guarding (e.g. via check_leq) when they decide to use
    32-bit indexing based on this result.
    r7   rm  Tg@xDF)rp  rn  rP   r'  rT  r   rq  rr  guarding_hint_or_throwrs  has_guarding_hintri   assume_32bit_indexing	check_leqr+  r  )r   rn  int_maxr  r  s        rW   expr_fits_within_32bitr    s     kk%++&**GWW--DD((22DD##	""1g. 	ww--a7l; 	 7711!d(;  QH$:1$=$HHr{   c                   t         j                  j                  j                         }||j                  t        |j                        dk(  sJ t        |       |j                  J |j                  D ]  }||j                  j                  d        !dt         j                  j                  j                         x}r|j                  dfd|j                  j                  t        fd|D                      y y y )Nr   Fc                f    t        |       S rj                  |       S j                  |       S r   )r   deserialize_symexprevaluate_symexpr)r   fakify_first_callrs  s    rW   map_exprz4set_tracing_context_output_strides.<locals>.map_expr5  s7     ("1v((<<Q??$55a88r{   c              3  .   K   | ]  } |        y wr   r   )r   r   r  s     rW   r   z5set_tracing_context_output_strides.<locals>.<genexpr>=  s     5!(1+5r  )r   r   r   z)float | int | SymInt | SymFloat | SymBool)
rP   rU  rV  rW  output_stridesrR   rl  r  r  r  )r  compiled_graphr)  r!  r  r  r  rs  s        @@@rW   "set_tracing_context_output_stridesr  $  s     mm**224Gw55A7))*a///).9	,,888#22 	E}&&--d3$)!--66>>@@3@(+(=(=%9 &&--5u55		  Br{   c                    t         j                  t         j                  S t        j                         syt        j                  j                         ry	 ddlm}  | t        j                  j                  d      k\  S # t        $ r Y yw xY w)NFr   REMOTE_CACHE_VERSIONz.pytorch/remote_cache:fx_graph_memcache_version)
ri   fx_graph_remote_cache	is_fbcoderP   _utils_internalis_fb_unit_testtorch._inductor.fb.remote_cacher  ModuleNotFoundErrorjustknobs_getval_intr  s    rW    should_use_remote_fx_graph_cacher  A  s    ##/+++,,.H  5#8#8#M#M8$    s   A> >	B
	B
c                0    t        j                  dd|       S )Nz[^a-zA-Z0-9_]r   )r   subr  s    rW   normalize_namer  T  s    66"C..r{   ztl.int1ztl.float8e4nvztl.float8e5ztl.float8e4b8ztl.float8e5b16ztl.uint8)ztl.boolztl.float8_e4m3fnztl.float8_e5m2ztl.float8_e4m3fnuzztl.float8_e5m2fnuzztl.float8_e8m0fnuztl.float4_e2m1fn_x2z^.*[.]c                l    t         j                  dt        |             }t        j	                  ||      S )z"Convert torch.dtype to triton typetl.)_triton_type_rer  r
  _triton_type_mappingrH  )r   triton_type_names     rW   triton_typer  j  s.    &**5#e*=##$46FGGr{   c                    t         j                  | |       }|j                  dd      }t        t        |      }t        |t        j                        sJ |S )Nr  r  )_torch_triton_mappingrH  r  rO   rP   r}   r   )r   adjusted_type	type_namer  s       rW   triton_type_to_torchr  p  sL    )--eU;M%%eR0Iy)Ii---r{   c                   | j                    xr | j                         |j                         k(  xr | j                         |j                         k(  xr | j                  |j                  k(  xr{ | j                  |j                  k(  xr` | j                         j                         |j                         j                         k(  xr! | j                         |j                         k(  S r   )	is_mkldnnr  r4  r   r   untyped_storager  rM  r&  r   s     rW   is_same_tensorr  x  s    NN 	<IIK5::<'	<KKMU\\^+	< JJ%++%	< KK5<<'		<
   "++-1F1F1H1Q1Q1SS	< !U%9%9%;;r{   c                v   | j                   xr | j                         |j                         k(  xr | j                  |j                  k(  xrn | j                  |j                  k(  xrS t        j
                  j                  j                  |       t        j
                  j                  j                  |      k(  S r   )r  r  r   r   rP   r  mkldnnr  r  s     rW   is_same_mkldnn_tensorr    s     	PIIK5::<'	PJJ%++%	P KK5<<'	P II%%d+uyy/?/?/H/H/OOr{   c                      y)N)rq  isnanlogical_notlogical_andsignbitand_leltgegteqner  xorr   r   r{   rW   boolean_opsr    s    r{   c                  "    e Zd ZU ded<   ded<   y)OpDtypeRuler3   type_promotion_kindtorch.dtype | Noneoverride_return_dtypeNr*  r   r{   rW   r  r    s    88--r{   r  zdict[str, OpDtypeRule]op_dtype_propagation_rulesc                *    t        ||      t        | <   y r   )r  r  )r   r  r  s      rW   #register_op_dtype_propagation_rulesr    s    
 (32(t$r{   zOrderedSet[str]op_requires_libdevice_fp64c                .    t         j                  |        y r   )r  rK  r  s    rW   #register_op_requires_libdevice_fp64r    s    ""4(r{   c                   ddl m} | s$|j                  j                         j                  } | dk(  rt
        j                  S | dk(  ry| dk(  rt
        j                  S | dk(  rt
        j                  S t
        j                  S )Nr   rm  r  rI   rJ   tpu)
r  rn  rq  get_current_device_or_throwr  ri   cpu_backendxpu_backendtpu_backendcuda_backend)r   rn  s     rW   get_current_backendr    ss    -gg99;@@e!!!				!!!		!!!"""r{   c                    | t         j                  t         j                  fv r7t        j                  j
                  rt               dk(  rt         j                  S | S )z"Maybe upcast [b]float16 to float32r  )rP   r   rE  ri   r  codegen_upcast_to_fp32r  rG  r$  s    rW   upcast_compute_typer    s@     	%--00MM00!X-}}Lr{   KeyTypeValTypec                  Z    e Zd ZdZddZddZddZddZdddZddZ	dd	Z
dd
ZddZy)
ScopedDictz
    A dictionary-like object that allows for scoped updates. It maintains
    an original dictionary and a set of new items that can override
    the original items within the scope.  The original dictionary is
    unmodified.
    c                     || _         i | _        y r   original_dict	new_items)r  r  s     rW   r5  zScopedDict.__init__  s    *13r{   c                Z    || j                   v r| j                   |   S | j                  |   S r   r  r  r  s     rW   r  zScopedDict.__getitem__  s.    $.. >>#&&!!#&&r{   c                "    || j                   |<   y r   )r  )r  r^  r   s      rW   __setitem__zScopedDict.__setitem__  s    #sr{   c                >    || j                   v xs || j                  v S r   r  r  s     rW   __contains__zScopedDict.__contains__  s!    dnn$At/A/A(AAr{   Nc                t    || j                   v r| j                   |   S | j                  j                  ||      S r   )r  r  rH  )r  r^  r  s      rW   rH  zScopedDict.get  s6    $.. >>#&&!!%%c733r{   c                z    t        | j                        }| j                  D ]  }|| j                  vs|dz  } |S rr  )rR   r  r  )r  r  ru  s      rW   r  zScopedDict.__len__  sC    ""# 	A***Q	 r{   c              #     K   | j                   E d {    | j                  D ]  }|| j                   vs|  y 7 )wr   r  )r  ru  s     rW   __iter__zScopedDict.__iter__  s@     %%%% 	A***	 	&s   ><!>>c                H    t        | j                  xs | j                        S r   )r1  r  r  rD  s    rW   rO  zScopedDict.__bool__  s    D&&8$..99r{   c                    t         r   r  r  s     rW   __delitem__zScopedDict.__delitem__  s    !!r{   )r  Mapping[KeyType, ValType])r^  r  r   r  )r^  r  r   r  r   r  )r^  r  r   r1  r   )r^  r  r  ValType | Noner   r  r  )r   zIterator[KeyType]r  )r^  r  r   r  )r   r   r   r   r5  r  r  r  rH  r  r  rO  r  r   r{   rW   r  r    s5    4'
$B4
:"r{   r  )frozen_defaultc              (    dfd}| |S  ||       S )Nc                4    t        j                  | d      S )NT)kw_onlyr   )dataclasses	dataclass)r   r   s    rW   wrapzir_dataclass.<locals>.wrap  s    $$S$vFFr{   )r   rl   r   rl   r   )r   r   r  s    ` rW   ir_dataclassr    s    G {9r{   c                     t         j                  j                  j                         } | "| j                  r| j                  j
                  S y r   )rP   rU  rV  rW  fw_metadatabw_donated_idxs)rZ  s    rW   get_donated_idxsr    s=    mm22::<O"'B'B**:::r{   c                       e Zd ZdZdZdZdZdZy)TritonAttrsDescriptorVersionr   r7   ry  r  r0  N)r   r   r   V0_NO_TRITONV1_COMPILERV2_BACKENDSV3_BACKENDS_TUPLEV4_DICTr   r{   rW   r  r    s     LKK	  Gr{   r  c                 P   t         j                  j                  d      t        j                  S dd l} dd l} t        | j                  j                  d      rt        j                  S t        | j                  j                  d      rt        j                  S t        j                  S )Nr  r   AttrsDescriptor)rF  rG  rH  r  r  triton.backends.compilertriton.compiler.compilerr  r  compilerr  r  r  )r  s    rW   #get_triton_attrs_descriptor_versionr  '  s{    ~~)1+888##v''):; ,777	))+<	=+777 ,333r{   c                 8    t               t        j                  k(  S r   )r  r  r  r   r{   rW   triton_version_uses_attrs_dictr  A  s    .04P4X4XXXr{   c                    | j                         }t        | t        j                  j                        r| d| j
                   n|}||fS )Nrp   )r   r}   rP   r  r  _overloadname)r  op_overload_packet_nameop_overload_names      rW   get_op_namesr  E  sR    #%779 b%**//0 #
#1R%5%5$67$ 
 #$444r{   c                j   ddl m} | j                  }t        |t        j
                  j                        sy|t        j                  j                  j                  j                  t        j                  j                  j                  j                  t        j                  j                  j                  j                  fv rq ||| j                  | j                  d      }|O|\  }}|d   }|D ]@  }||j                  d   j                   t        j"                  t        j$                  fv s@ y y)a  
    Check if an FX node is cudagraph-unsafe based on its input arguments.

    Some ops are only cudagraph-unsafe depending on their inputs (e.g., index_put
    with boolean indices triggers .nonzero() during capture, but integer indices
    are safe).
    r   )normalize_functionFT)normalize_to_only_use_kwargsindicesrj  )torch.fx.operator_schemasr  r  r}   rP   r  r  r  r3  	index_putr  
index_put__unsafe_index_putr   r  r  r   r1  rV  )r  r  r  
normalizedr   r  r  rU  s           rW   ,_fx_node_is_input_dependent_cudagraph_unsafer  O  s     =^^Ffejj334 		  ((		!!))		((00 
 (GLL'..t

 !"IAvY'G  ?sxx'<'<JJKKA (    r{   c                   | j                   }t        |      t        v ryt        |t        j
                  j                        r1t        j                  j                  j                  |j                  v ryt        |       ry| j                  j                  d      x}Kt        |t        t        f      s|gn|}|D ]+  }t        |t        j                         s|j"                  s+ y y)a   
    Check if an FX node is cudagraph-unsafe.

    This includes:
    - Ops in FORBIDDEN_CUDAGRAPH_OPS (CPU sync, dynamic alloc, etc.)
    - Ops with the cudagraph_unsafe tag
    - Input-dependent unsafe ops (e.g., index_put with boolean indices)
    - Ops with sparse tensor outputs
    Trj  F)r  r
  FORBIDDEN_CUDAGRAPH_OPSr}   rP   r  r  r  r  cudagraph_unsafer  r  r  rH  r]  r  r  	is_sparse)r  r  rj  valsr   s        rW   r  r  s  s     ^^F 6{-- 	65::001HHLL))V[[8 4G< ||&&3&sT5M:u 	A!U\\*q{{	 r{   c                    ddl m} t        | |j                  |j                  f      ryt        | |j
                  |j                  f      syt        | dd      }|t        |      ryy)ah  
    Returns True if the node is an op that is not cudagraphable.
    This includes:
    - Ops in FORBIDDEN_CUDAGRAPH_OPS (CPU sync, dynamic alloc, etc.)
    - Ops with the cudagraph_unsafe tag
    - index_put_ with boolean indices (triggers .nonzero() during capture)
    - Control flow nodes (Conditional, WhileLoop)
    - Ops with sparse tensor outputs
    r7   r  TFr  N)	r  r  r}   Conditional	WhileLoopr  r?   rO   r  )rg  r  r  s      rW   is_cudagraph_unsafe_opr(    sa      $67dR..@AdIt,G:7Cr{   c                    t         j                  j                  dd      } t        j                         rUddlm}  |       }|rFt         j                  j                  |dd      }| r!t         j                  j                  || g      n|} | S )NLD_LIBRARY_PATHr  r   )get_runtime_pathr  lib)
r  r  rH  ri   r  libfb.py.parutilr+  r  r  pathsep)r  r+  runtime_pathlib_paths       rW   get_ld_library_pathr1    sg    ::>>+R0D5')ww||L)UCH8<2::??Hd#34(DKr{   c                F    ddl m} t        | |      xr | j                  d uS )Nr   )SubgraphPythonWrapperCodegen)torch._inductor.codegen.wrapperr3  r}   partition_signatures)r  r3  s     rW   #is_codegen_graph_partition_subgraphr6    s*    L 	789 	5((4r{   c                     t         j                  j                  j                  j                  xs t
        j                  d uxr$ t         j                  j                  j                  S r   )rP   r  ri   r  
cudagraphs&_unstable_customized_partition_wrapperr  graph_partitionr   r{   rW   is_using_cudagraph_partitionr;    sN    %%00 	F199E1 //
 
 
0
01r{   c                    ddl m} |j                  j                  j	                  | d      r6|j                  j                  j                  | d      rt        j                  S t        j                  S )Nr7   rm  l        i   )	rp  rn  rq  rr  statically_known_ltr  rP   rT  rU  )r  rn  s     rW   dtype_from_sizer>    sP    ww++e
''


/
/h
?{{{{r{   )r  rJ   c                h    | dk(  r(t         j                  j                  j                         S d| v ryy)z;
    Returns True if the device supports MKL-DNN BF16.
    r  rJ   TF)rP   r  r  _is_mkldnn_bf16_supportedr   s    rW   is_mkldnn_bf16_supportedrB    3     eyy99;;	+	r{   c                h    | dk(  r(t         j                  j                  j                         S d| v ryy)z;
    Returns True if the device supports MKL-DNN FP16.
    r  rJ   TF)rP   r  r  _is_mkldnn_fp16_supportedrA  s    rW   is_mkldnn_fp16_supportedrF    rC  r{   c           
     n   |D cg c]  }t        t        |             }}| D ]R  }t        |      t        |      k(  sJ t        |      D ])  \  }}t        ||   t        t        |                  ||<   + T g }|j	                  dj                  d t        ||      D                     t        |      t        |      dz  z   t        |      dz
  z   }|j	                  d|z         | D ]3  }|j	                  dj                  d t        ||      D                     5 dj                  |      S c c}w )N|c              3  6   K   | ]  \  }}d || dd   ywr!  r  Nr   )r   hr_  s      rW   r   ztabulate_2d.<locals>.<genexpr>  s$     H41aAa0tWA,H   ry  r7   r  c              3  6   K   | ]  \  }}d || dd   ywrJ  r   )r   r   r_  s      rW   r   ztabulate_2d.<locals>.<genexpr>  s$     Htq!!QCp4lHrL  rB  )rR   r
  rE  r   r  r  r   r   )elementsheadersr   widthsrowr   r[  total_widths           rW   tabulate_2drS    s   #*+ac#a&k+F+ 43x3w<'''cN 	4DAqF1Is3q6{3F1I	44 E	LLH3w3GHHIf+Vq1S[1_EK	LL{"# JSXXHs37GHHIJ99U ,s   D2c              #     K   t        | j                               t        |j                               z  }|D ]3  }| j                  |      }|j                  |      }|||n|||n|f 5 yw)a  
    Zip two dictionaries together, replacing missing keys with default values.

    Args:
        dict1 (dict): The first dictionary.
        dict2 (dict): The second dictionary.
        d1_default (Any): the default value for the first dictionary
        d2_default (Any): the default value for the second dictionary

    Yields:
        tuple: A tuple containing the key, the value from dict1 (or d1_default if missing),
               and the value from dict2 (or d2_default if missing).
    N)r!   rI  rH  )dict1dict2
d1_default
d2_defaultall_keysr^  value1value2s           rW   	zip_dictsr\    sv     ( %**,'*UZZ\*BBH  	
33 (Fj(Fj
 	
	
s   A-A/c                T   	 	 	 	 	 	 	 	 dd}	 	 	 	 	 	 	 	 dd}| j                  dt        j                  j                        }| j	                         } |rm || dd        || dd        || dt
        j                  j                           || dd	        || d
t        j                  j                          || dd       | j                  dt        j                  j                        }| j                  dt        j                  j                        }|dk(  r|rt        d      | S )a6  
    Ensures the configuration is internally consistent for standalone AOTInductor.

    If `aot_inductor_mode.compile_standalone` is set to True in the provided
    `config_patches` (or falls back to the global config), this function ensures
    that the following configs are also enabled:
        - `aot_inductor.package_cpp_only`

    Args:
        config_patches (dict[str, Any]): A dictionary of user-provided config
            overrides for AOTInductor compilation.

    Returns:
        dict[str, Any]: The possibly-updated `config_patches` dictionary.
    c                    | j                  |t        t        |            }||| |<   y |s||k7  rt        d| d| d      y y )NzInvalid config: =z3 when aot_inductor_mode.compile_standalone is True.)rH  rO   ri   r   config_patchesconfig_nameconfig_valuer   s       rW   patch_configz2maybe_aoti_standalone_config.<locals>.patch_config9  s]     "";0LM=*6N;'5L0";-q>qr  1r{   c                    | j                  |t        t        |            }||k7  rt        j	                  d||       || |<   y )NzDOverriding: %s=%s when aot_inductor_mode.compile_standalone is True.)rH  rO   ri   r   r  r`  s       rW   force_patch_configz8maybe_aoti_standalone_config.<locals>.force_patch_configD  sF     "";0LML KKV
 '3{#r{   z$aot_inductor_mode.compile_standalonezaot_inductor.package_cpp_onlyTz aot_inductor.embed_kernel_binaryz#aot_inductor.emit_multi_arch_kernelz+aot_inductor.model_name_for_generated_files
aoti_modelzaot_inductor.link_libtorchzaot_inductor.dynamic_linkageFz"aot_inductor.cross_target_platformz$aot_inductor.package_constants_in_sowindowszconfig.aot_inductor.package_constants_in_so is not supported for windows cross-compilation. Please use config.aot_inductor.package_constants_on_disk_format = binary_blob.)ra  dict[str, Any]rb  r
  rc  r   r   r  )rH  ri   aot_inductor_modecompile_standalonecopyrP   rN  rq   test_configsuse_libtorchaot_inductorcross_target_platformpackage_constants_in_sor   )ra  rd  rf  rk  rp  rq  s         rW   maybe_aoti_standalone_configrr  (  sk   "	&	58	HK			
3&
358
3HK
3	
3 (++.  33
 $((*N^%DdK^%GNAu}}GXGXCX	
 	I<	
 	(,,	

 	>+I5Q*..,11
 -00.33
 	).E]
 	

 r{   c                   t         j                  j                  r(t         j                  j                  dk(  rt	        d      t         j                  j                  r0t         j                  j
                  dk(  rt	        d      d}d}||fS t         j                  j                  dk(  rd}d}||fS | dk  ryd}t        j                          }||fS )	a  
    Decide whether we should mmap weights, and whether to store the weights with .so.

    If force_mmap_weights or package_constants_on_disk_format == "binary_blob" configs are set, respect the config.

    Returns tuple (use_external_weights, use_mmap_weights).
    binary_blobzconfig.aot_inductor.package_constants_on_disk_format = binary_blob and config.aot_inductor.force_mmap_weights cannot both be True.rh  zKwhen cross_target_platform is windows, use_mmap_weights should not be true.TFi 5w)FF)ri   ro  force_mmap_weights package_constants_on_disk_formatr   rp  r  )consts_sizeuse_mmap_weightsuse_external_weightss      rW   determine_aoti_mmap_flagsrz  }  s     	..@@MQJ
 	

 --44	A]   $#%555;;}L# #%555m# !++--!111r{   c                     ddl m}  | j                  j                  }|yt	        |t
              st        d      |dk(  ryt        j                  d|      st        d      y)zL
    Validates if a model name is suitable for use in code generation.

    r   rh   Tz4Invalid AOTI model name: Model name must be a stringr  z^[a-zA-Z_][a-zA-Z0-9_]*$zVInvalid AOTI model name: Model name can only contain letters, numbers, and underscores)	rC  ri   ro  model_name_for_generated_filesr}   r
  rZ  r   r   )ri   
model_names     rW   is_valid_aoti_model_namer~    sh    
 '$$CCJj#&OPPR 88/<d
 	
 r{   c                2    |rt        |       S t        |       S r   )r)   r(   )rT   unbacked_onlys     rW   get_free_symbolsr    s    $Q''Ar{   c                    i t         j                  dt         j                  j                  dt         j                  j	                  t
        j                              i} t        j                         rt        j                  d      | d<   | S )zA
    Get a base environment for running Python subprocesses.
    
PYTHONPATHTORCH_CUSTOM_PYTHONPATHr&  
PYTHONHOME)r  r  rH  r.  r  r  r  ri   r  	sysconfigget_path)envs    rW   python_subprocess_envr    sl    

** 	bjjnn%rzzsxx'@
	C  %..v6LJr{   c                  &    e Zd ZU dZded<   ded<   y)CUDAGraphWrapperMetadataz
    Metadata for Customized CUDAGraphWrapper.

    Currently assumes there is 1 dynamo graph and will extend to
    multiple graphs in the future.
    r   num_partitionspartition_indexNr   r   r{   rW   r  r    s      r{   r  .c                      e Zd ZU dZded<   y)CUDAGraphWrapperNzCUDAGraphWrapperType | Noner  )r   r   r   r  r   r   r{   rW   r  r    s    +/G(/r{   r  c                    | t         _        y r   )r9  r  )r  s    rW   !set_customized_partition_wrappersr    s    5<*2r{   c                8   | j                   j                  }| j                   j                  g || j                   j                  | j                   j                        }| j                   j                  }t        j                  ||f      \  }}dd}|D cg c]7  } ||      r+t        j                  j                  j                  |d      n|9 }}dddfd}|D cg c]
  } ||       }}t        j                  ||      \  }}||fS c c}w c c}w )	Nc                   t        | t        j                  j                  j                        xrS t        | t        j                  j                  j
                  t        j                  j                  j                  f       S r   )r}   rP   r  r  r@   GeneratorStateOpaqueObjectStater  s    rW   _is_tensor_irz(snode_args_kwargs.<locals>._is_tensor_ir  s^    !U__//667 

__..0B0B0T0TUA
 =
 	
r{   T)replace_symbols_with_hintsc                2    t        j                  | ||      S )Nr   )rP   r   )r  r   r   s      rW   _tensorz"snode_args_kwargs.<locals>._tensor&  s    {{4uV<<r{   c                    t        | t        j                        s| S  | j                         | j                  | j
                        }|S r   )r}   rP   r  r  r   r   )r   rM  r  s     rW   to_real_tensorz)snode_args_kwargs.<locals>.to_real_tensor)  s7    !U\\*Haffh2
r{   r  )r   r  )r   r   r   r   )rg  rj  fill_non_provided_argsconstant_argsr  pytreer"   rP   r  r  ir_node_to_tensortree_unflatten)	r  r   r  	flat_argsflat_args_pytree_specr  r   r  r  s	           @rW   snode_args_kwargsr    s   ::D::,,*$*))*

D ZZF'-':':D&>'J$I$
 	   	,,Q4,P	I = -66q"6I6((4IJLD&<%  7s   <D$Dc                    ddl m} | j                  }|j                  j                  r(|j	                  |j                  j                  dz         }|j                  d      S )Nr7   rm  r   )r  r  fwd_rng_stater  r  )rp  rn  r   rq  removeprefixr  )r  rn  dep_names      rW   is_nonfreeable_buffersr  4  sN    xxH 	ww||(();<I r{   c                p    t        ||  dz        5 }|j                         cddd       S # 1 sw Y   yxY w)z,Load a template file and return its content.z	.py.jinjaN)openread)r   template_dirr
  s      rW   load_templater  B  s6    	lvY//	0 Avvx  s   ,5c                v   | j                   }t        |t        j                  j                  t        j                  j
                  f      sJ dt        |              t        j                  syt        t        j                  j                  j                  j                  t        j                  j                  j                  j                  g      }||v ryt        t        j                  j                  j                   g      }t        |t        j                  j
                        r||v S t#        |        S )zLDecide whether fallback for a node. This is only used in inductor lite mode.z6Expected OpOverload or HigherOrderOperator, but found F)r  r}   rP   r  r  r  r  ri   fallback_by_defaultr!   r  r3  _assert_scalarr  lift_fresh_copyhigher_order triton_kernel_wrapper_functionalr   )rg  r  "skip_fallback_due_to_dynamic_shapefallback_hopss       rW   should_fallback_by_defaultr  H  s    [[F&&

(F(FG O	?V~NO  %% *4IINN))11IINN**22	
*& 33 				@	@AM &%**889&&&t,,,r{   )z-torch.ops._c10d_functional.all_reduce.defaultz.torch.ops._c10d_functional.all_reduce_.defaultz9torch.ops._c10d_functional.all_gather_into_tensor.defaultz8torch.ops._c10d_functional.reduce_scatter_tensor.defaultz4torch.ops._c10d_functional.all_to_all_single.defaultz6torch.ops._c10d_functional_autograd.all_reduce.defaultzBtorch.ops._c10d_functional_autograd.all_gather_into_tensor.defaultzAtorch.ops._c10d_functional_autograd.reduce_scatter_tensor.defaultz=torch.ops._c10d_functional_autograd.all_to_all_single.defaultz(torch.ops._c10d_functional.isend.defaultz(torch.ops._c10d_functional.irecv.defaultz0torch.ops._c10d_functional.batch_p2p_ops.defaultc                    | t         v S )z0Check if an operation is a collective operation.)COLLECTIVE_OPS)r  s    rW   is_collective_opr    s    n$$r{   c                 b    t        j                         r		 ddlm}  | S g S # t        $ r g cY S w xY w)Nr   tlx_only_cuda_options)ri   r  )torch._inductor.fb.tlx_templates.registryr  r	  r  s    rW   r  r    s9    	W(( 		  	I	s     ..c                    | |z   dz
  |z  |z  S )z(Round x up to the nearest multiple of y.r7   r   )rT   ys     rW   	_round_upr    s    UQY1!!r{   c                   ddl m}m}  ||d      r|j                  |j                  fS t        |      dk\  r  ||d   | d         r ||d   d      s ||d   d      r' ||d   | d         r|j                  |j                  fS  ||d   | d         r ||d   t        | d   d            s( ||d   | d         r1 ||d   t        | d   d            r|j                  |j                  fS  ||d   t        | d   d            r1 ||d   t        | d   d            r|j                  |j                  fS |t        j                  k(  rdnd}|t        j                  k(  r|t        j                  k(  rt        | d   d      t        t        || d   z  d      d      z  }	t        | d   d      t        t        || d   z  d      d      z  }
 |||	      s	 |||
      r|j                  |j                  fS |t        j                   k(  rt        j"                  j$                  st        | d   d      t        t        || d   z  d      d      z  }	t        | d   d      t        t        || d   z  d      d      z  }
 |||	      s	 |||
      r|j&                  |j                  fS y	t        | d   d      |z  | d   z  }	t        || d   z  d      | d   z  }
 |||	      s	 |||
      r|j&                  |j                  fS y	)
z:
    Core implementation for scale/swizzle inference.
    r   )r6   SwizzleTyper7   ry  rs   rr   r0  r  NN)torch.nn.functionalr6   r  
TensorWise
NO_SWIZZLErR   RowWiserj   BlockWise1x128BlockWise128x128rP   rC  r;  r  BlockWise1x16SWIZZLE_32_4_4rA  rN  rq   BlockWise1x32)mat_size
scale_sizescale_numel	mat_dtypescale_dtypeeq_fnr6   r  K_multiplierexpected_numel_aexpected_numel_bs              rW   _infer_scale_swizzle_implr    sD    = [!%%{'='=== :!*Q-!-%
1q2I*Q-#jmXa[(I&&(>(>>> *Q-!-jmWXa[#%>?*Q-!-jmWXa[#%>?--{/E/EEE AS 9:uqM78A;4@
 //1G1GGG "U%;%;;1L E***{e>Q>Q/Q$Xa[#6L8A;.3Q:
 
 %Xa[#6L8A;.3Q:
 
 ./5FV3W,,k.H.HHH e***}}  (!c:Yx{2B7>    )!c:Yx{2B7>   ["23u-8 #00+2L2LLL   'x{B7,FRST&|hqk'A2FRST["23u-8 #00+2H2HHHr{   c                    t        | j                  d   | j                  d   ft        |j                        |j                         | j                  |j                  d       S )a  
    Infer the scaling type and swizzle mode from matrix and scale tensor shapes/dtypes.

    This function determines how scale factors are laid out relative to the matrix:
    - TensorWise: Single scale for entire tensor
    - RowWise: One scale per row
    - BlockWise1x128/128x128: Block-scaled with float32 scales
    - BlockWise1x32: MXFP8 with float8_e8m0fnu scales (swizzled on NVIDIA)
    - BlockWise1x16: NVFP4 with float8_e4m3fn scales (swizzled)

    Args:
        mat: The matrix tensor (FP8 or FP4)
        scale: The scale factor tensor

    Returns:
        Tuple of (ScalingType, SwizzleType) or (None, None) if unrecognized
    r   r7   c                    | |k(  S r   r   r  s     rW   r  z%infer_scale_swizzle.<locals>.<lambda>  s
    16 r{   r  r  r  r  r  r  )r  r0  r  numelr   )matscales     rW   infer_scale_swizzler    sO    ( %))A,		!-%KKM))KK! r{   c                \   ddl m | j                         }|j                         }|r
|d   |d   f}|r%t        j                  t
        j                  |d      nd}dfd}t        t        |      dk\  r
|d   |d   fn|d   dft        |      || j                  |j                  |      S )z
    Infer the scaling type and swizzle mode for IR nodes (used during graph lowering).

    This is the IR-compatible version of infer_scale_swizzle, using symbolic
    size comparisons via V.graph.sizevars.statically_known_equals.
    r   rm  r7   c                P    j                   j                  j                  | |      S )z5Compare values using symbolic equality when possible.)rq  rr  r  )r   r!  rn  s     rW   symbolic_eqz+infer_scale_swizzle_ir.<locals>.symbolic_eq  s     ww771==r{   ry  r  )r   r   r!  r   r   r1  )r  rn  r  r  r  r  r  r  rR   r  r   )r  r  	transposer  r  r  r  rn  s          @rW   infer_scale_swizzle_irr    s     .||~H!J QK!- DN)""8<<Q?STK> %/28}/A(1+x{+QRUVGW$))KK r{   r  )ry   r   r   r   )r   r   r   r1  )   d   )r   Callable[[], Any]r   r   r   r   r   rn  )r  r  F)
r   r  r   r   r   r   r   r1  r   rn  r  )r   ztorch.device | None | strr   torch.device)r  zIterable[sympy.Expr]r   r   )r#  r!  r$  r!  r   r   )r  zIterable[_T]r   zValuesView[_T])r.  r  r/  r  r   r  )r^  r  r   r
  )rd  zIterable[int | torch.SymInt]r   zlist[sympy.Expr])rj  int | torch.SymIntr   r  )r   r  r   r  )rd  zIterable[int | sympy.Expr]r   zlist[int | torch.SymInt])r  torch._ops.OpOverloadr   r1  )r  r5   r  z'Callable[[torch._ops.OpOverload], bool]r   r1  )r  r   r   r  r  ri  r   z&tuple[GraphModule, list[torch.Tensor]])rH   )r   r
  r   r  )r7   rH   )
r  Callable[..., Any]r  Sequence[Any]r   r   r   r
  r   rn  )r   r  r  g      ?rH   )r  r  r  r  r   r   r  r   r  rn  r   r
  r   rn  )r  r   r  r
  r   r  )r  r   r  r   r   r  )r   r   r!  r   r   r   )rT   zint | Sequence[int]r  r   r   Sequence[int])rT   ztuple[_T, ...]r   zlist[_T])r   z!Callable[Concatenate[Any, P], RV]r   zCachedMethod[P, RV])r  r
  r   z*Callable[[FN_TYPE[P, RV]], FN_TYPE[P, RV]])r  *Sequence[BaseSchedulerNode] | ExternKernelr   zOrderedSet[Node])r  Sequence[BaseSchedulerNode]r  z8Literal[True, 'torch', 'original_aten', 'inductor_node']r   r
  )r  r  r  r:   r   tuple[str, str]r   )rd  zIterable[torch.fx.Node]re  zCallable[[Any], bool] | Noner   OrderedSet[torch.fx.Node])r   zSequence[IRNode]r  zdict[str, IRNode]r   r  r  )r  r   r   zValueRanges[Any])r  r
  r   r1  )r  re   rU  r   r   sympy.Symbol)r  r1  r   r1  )r   r
  r   r  )rh  r   r  zdict[sympy.Expr, Any]r   r   )r   r   r   z&TypeGuard[torch.SymInt | torch.Tensor])r   r   r   r1  )r  torch.fx.GraphModuler   ztorch.fx.Node | None)r  r  r   r5   )r  r  r   zOrderedSet[torch.device]r  )r  r   r   r   )r^  r
  r   r
  r   rb  )NNT)r  zdict[str, Any] | Noner  r  r  r1  r   rb  )r  r  r  r1  r   	list[int])rs  r+   r  z)Sequence[int | torch.SymInt | sympy.Expr]r  r1  r   r  )r   r"  r   r   ra  r  )r  zint | torch.devicer   r1  r  )r  r   r   r  r  
int | Noner   r8   )rr   )r  r   r   r   )r,  rA   r  zlist[torch.dtype]r   r1  )r  r
  r   r1  )
r,  rA   r  r1  r  r1  r  r1  r   r1  )r$  r@   r  r   r  r1  r   r1  r  )r  r!  r  r1  r   r1  )r$  r@   r  rA   r  r1  r   r1  )r>  r6   r?  r6   r@  zlist[ScalingType]r   r1  )r`  r   ra  r   r,  rA   rb  r1  rc  r1  rd  
Any | Nonere  r  rf  r  r   r1  )
r,  rA   r  r   r  r   ru  r   r   r1  r  )r,  rA   r  rx  r  rx  ru  rx  r`  r@   ra  r@   rd  zIRNode | Noner  z_IntLike | Noner   r1  )r  r
  r   r1  r   )
r  rx  r  rx  ru  rx  r  r   r   r1  )r  rx  r  rx  ru  rx  r   r1  )r  rx  r  rx  ru  rx  r   r  )r   r
  r   r
  )r   zNtuple[str | None, Callable[[], list[Any]], Callable[[], list[Any]], type[Any]])r,  rA   r   r1  )r,  rA   r  zReinterpretView | Bufferr  r@   r   r1  )FTFN)r,  rA   r  r@   r  r@   r  r1  r  r1  r  r1  r  r  r   r1  )r   Callable[P, _T]r   r  r  r  r   ztuple[_T, list[str]])r   r  r   ztuple[Any, list[str]])r   r  r   r  r  r  r   r   )r   r  r   r  r  r  r   r
  )r   r  r   r  r  r  r   ztuple[Any, list[GraphLowering]])rF  r  rG  r  r   rb  )rO  r  rN  zCallable[..., Any] | Noner   r   )rV  r
  r   r  r  )rJ  r  r   r1  )rj  zSequence[torch.Tensor]r   r1  )rj  r   r   r"  )ro  r1  r   r   r  r   r   zIterator[Any])r   r"  r   rn  )r  r
  r   r1  )r  r
  r   r   )r  zIterable[Any]r   r1  )
r  r  r  r4   r  r  rV  r
  r   r  )r  zBuffer | Operation | Noner   r1  )rg  zNode | Operation | Noner  ztorch._ops.OperatorBase | Noner   r1  )rg  zIRNode | Operation | Noner   r1  )r  rF   r  z*Callable[[BaseSchedulerNode], bool] | Noner   r1  )r  rF   r   r1  )rg  zOperation | Noner  z9torch._ops.OpOverload | Collection[torch._ops.OpOverload]r   r1  )r  r
  r  ri  r  ri  r   r   )r  rF   r  zMutableSet[BaseSchedulerNode]r  zdict[str, SchedulerBuffer]r  zdict[str, BaseSchedulerNode]r  zCallable[[Any], bool]r   r  )r  r   r  r   r   r   )r  r  r   r   )r  r  r   r  )r"  r   r   rb  )r   r
  r   r
  )r   r  r   r1  )r   r
  r   r1  )r   r"  r   r1  )r  r  r  r  r:  r"  r;  r"  r<  r
  r=  r1  r   r1  )r  r  r   r  )r   r  r   r1  )rR  r  r   r1  )r   r  )r   r  r   r  r  r  r   ztuple[_T, str])rj  Sequence[InputType]r   zShapeEnv | None)r  Callable[[list[InputType]], _T]rt  r  ru  zOrderedSet[int]r   r  )rT   r  r   r  )rq  rv  r  r  r  zOrderedSet[int] | Noner   z-tuple[list[torch.Tensor], list[torch.Tensor]])rj  r  r  r  r   r  )r   r   r   r1  )r  r  r  rE   r   r  )r   r"  r   r
  )r   r
  r   r"  )r&  r  r   r  r   r1  )r   ztuple[str, ...])r   r
  r  r3   r  r  r   r  )r   r
  r   r  )r   r  r   r
  )r   r"  r   r"  )r   ztype[Any] | Noner   r1  r   r   )r   zlist[int] | None)r   r  )r  ztorch._ops.OperatorBaser   r  )r  torch.fx.Noder   r1  )rg  rB   r   r1  )r  r:   r   r1  )r  r   r   r"  )r   r
  r   r1  )rN  zSequence[Sequence[T]]rO  zSequence[T]r   r
  )
rU  r  rV  r  rW  r  rX  r  r   zEGenerator[tuple[KeyType, ValType | None, ValType | None], None, None])ra  ri  r   ri  )rw  r   r   ztuple[bool, bool])rT   r*   r  r1  r   zOrderedSet[sympy.Symbol])r   zdict[str, str])r  CUDAGraphWrapperTyper   r  )r  rF   r   z tuple[list[Any], dict[str, Any]])r  r;   r   r1  )r   r
  r  r/   r   r
  )rg  r  r   r1  )r   r   )rT   r   r  r   r   r   )r  ztuple[Any, Any]r  ztuple[Any, ...]r  r   r  r"  r  r"  r  zCallable[[Any, Any], bool]r   tuple[Any | None, Any | None])r  r  r  r  r   r  )r  r>   r  r>   r  r1  r   r  (  
__future__r   rC  rc  r  enumr  rF  r  r  rp  r^  rp  r  r  r  r   r  r   r  r  r  rr  r  rQ  ro  collections.abcr   r   r   r   r   r	   r
   r   r   r   typingr   r   r   r   r   r   r   r   r   r   r   typing_extensionsr   r   r   r   r~   rP   torch.utils._pytreer  _pytreer  $torch._inductor.analysis.device_infor   torch._inductor.runtime.hintsr   !torch.fx.passes.regional_inductorr   torch.utils._dtype_abbrsr    torch.utils._ordered_setr!   r"   r#   OPTIMUS_EXCLUDE_POST_GRAD#torch.fx.experimental._size_hintingr'   r  r(   r)   r*   r+   r,   r-   r.   pathlibr/   r0   r1   r2   torch._prims_commonr3   torch.fxr4   torch.fx.noder5   r  r6   r  r8   r%  r:   dependenciesr;   rq  r=   r  r>   r?   r@   rA   rB   rC   output_coderE   r  rF   rG   rN   rL   r   rX   torch._dynamo.device_interfacerY   torch._dynamo.utilsrZ   torch.autogradr[   torch.autograd.profiler_utilr\   (torch.fx.passes.graph_transform_observerr]   torch.fx.passes.shape_propr^   torch.utils._sympy.functionsr_   r`   ra   rb   rc   torch.utils._sympy.symbolrd   re   torch.utils._sympy.value_rangesrf   rg   r  ri   runtime.runtime_utilsrj   r-  _IS_WINDOWS	getLoggerr   r   rl   r  r+  	VarRangesr  r   	InputTypegetenvXPU_KERNEL_FORMATGPU_KERNEL_BIN_EXTSrN  r  r  r  rV  rK  rW  rM  rX  rT  rU  r   rE  rG  rI  r;  r<  r=  r?  rt   r   rx   rz   r   Functionr   r  r   r   r   r   r  r  r  r%  r(  ra  re  rk  ru  rw  r  r  r  r   r  r  r  r  r  r  r  r  r  FN_TYPEr  r  r  r  r  r  rb  rh  rv  r  r  r  r  r  r  r  r  r  	frozensetr!  r  r  r  r  r  r  r  r  rd  r  r  clear_on_fresh_inductor_cacheclear_inductor_cachesfresh_inductor_cacher  r"  r&  r(  r,  r/  r  r  r;  r  r  r  r  r  r  r  r  r  r  r  r%  r/  r6  r<  rA  rI  rO  rM  rZ  rg  rw  r  r  rx  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r!  r3  r6  r8  r@  rI  rR  rW  r[  rc  rg  rk  rm  rp  r  rz  r  r  r  r  r  r  r  r  r  r^  Enumr  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r&  r)  r  r,  r.  r0  r?  rK  rO  rS  r[  ri  rl  rw  r}  ro  r  r  r  r  r  r  rJ  r  compiler  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r(  r1  r6  r;  r>  SUPPORTED_MKLDNN_DEVICESrB  rF  rS  r\  rr  rz  r~  r  r  r  PartitionFnTyper  r  r9  r  r  r  r  r  r  r  r  r  r  r  r  )ru  r   s   00rW   <module>r*     s   "        	     	  	   
               C B    $ $ ? : E 0 / ; ($ 
 <  >>//C$"/,5!$TT,= +	CL
   D 0 % 2 K 0  8 D  = llg%g! T]UZZ'(	LL3-4	 Eibii(I7S 
  !"  	 
 2<

2 . ( {Q'A-+2B XDX XB5
LENN  d#  $"GX #(	 
 !	
 4 #(	[[[ 
[ !	[
 [|  ;@
+	*%AP+	%++	"/	#//G @OI	I<I 
I0 *8+0' 	!  	
 ( %'!  	
    )'#$  cNTT"
;sAv&*
+E8WQU^ E:)++/+\=,4).4)O4) 	4)n_2=_2!_2 _2H 15*- (G
G$5GG:,^%	DU	>+- $ $'& 
< !# I "	 " "( +/7(7	7 7 	7 7v !5 $ "  49 ( 	$$	2$ 	$
 $N Q7 7*  , , ,
R' R'j
 
 @ @ @?' ?  8 J J ) )I  $  	(+<	  #  	
  
< 7;uBB&3BHLB	BL 5:-1	4 BGOO&,O:>O	O2 BGVV&,V:>V	V OOO %O 
	O Q	  	 Q  B0 Q  >>> > 	>
 > > > > 
>B-n EEE E 	E
 E E E E 
EPJ EJJ&) & EF!)?B	 (  . 5( 5(p @ @ R R:"JH&2:@	: ""&#==
= = 	=
  = = = 
=@'C C"&&& & 	&$ &2:&/(V		 &	2:		## &#2:#$#* ...@.. .$ FJFF)BFF*	B&&   D D '6 '6T  Q	.0(#K(*$)) * 

 
"- 
4A 
HK 
	 
F	(	" *.#
!#&# 
#L( =A	Q	Q9	Q 
	Q#J
JAJ 
JLL .LDRLL *=

5
 ,
 5	

 '
 

< *=5 , 5	
 ' 
:T 2      ,!)
.2$&$$ $ 	$
 $ $ 
$NHBL'  &2:2:*" ( %	0	: 04$$$$ -$ 3	$<$ $*IZ!3B	:&/ '#)* $%
  +?*D*D*FG$!QAG  "**Y'H	  & . . .
 68 2 7
8 . 
	 /9l O :)# #" )

)
-" 01 -"` D)D  *499  4 42Y5!H"J4
1 * 		& "&!%	 
$ 
$ 
  
 	 

 K 
FRj&2R66 d#  $ 38$./@ 0 0 *:); &=!H
"-L $%
  "
MMM M 	M
 M &M #M`	*"B "	"" " #	"G Hs   w"