
    9jZ                       d dl mZ d dlZd dlZd dlZd dlZd dlmZ d dlm	Z	m
Z
 d dlZd dlmZ d dlZd dlmZ d dlmZ d dlmZmZ d d	lmZ d
dlmZmZmZ d
dlmZmZm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' ddl(m)Z)m*Z*m+Z+ e
rd
dl,m-Z-m.Z. d
dl/m0Z0m1Z1 ddl!m2Z2  ejf                  e4      Z5ejl                  dejn                  dejp                  dejr                  dejt                  dejv                  dejx                  dejz                  dej|                  di	Z?d&dZ@ G d de      ZA G d d e&      ZBeBj                  d!       eBj                           G d" d#e*      ZE G d$ d%e+      ZFy)'    )annotationsN)Path)AnyTYPE_CHECKING)
PRECEDENCE)_embed_headers)
OrderedSet)
CppPrinterExprPrinter)ValueRanges   )ceildivget_bounds_index_exprget_kernel_metadata)ops
OpsWrapperV   )CSEVariableDeferredLineDTYPE_TO_COMPUTATION_DTYPEIndentedBufferOpOverridesPythonPrinter)IterationRangesEntry
SIMDKernelSIMDScheduling)ReductionType	StoreMode)	SchedulerSchedulerNode)OpVarTboolcharshortintlongucharfloathalfbfloatc                    t        | t              r:| t        j                  k(  ry| t        j                   k(  ry| | k7  ryt	        |       S t        | t
              r| rdS dS t	        |       S )N	HUGE_VALFz
-HUGE_VALFNANtruefalse)
isinstancer)   torchinfstrr#   )vals    [/media/conek/DATA/Code/OCR/venv/lib/python3.12/site-packages/torch/_inductor/codegen/mps.pyvalue_to_metalr7   6   s_    #u%))UYYJCZ3x	C	v)')s8O    c                      e Zd ZdZddZddZddZddZddZddZ	ddZ
dd	Zdd
ZddZddZddZeZddZddZddZy)MetalExprPrinterz/Converts sympy expression to Metal code snippetc                    |j                   \  }}| j                  |      }| j                  |      }|j                  r	d| d| dS d| d| dS )Nc10::metal::floor_divide(, )metal::floor() / (argsdoprint
is_integer)selfexprxdivs       r6   _print_FloorDivz MetalExprPrinter._print_FloorDivG   s[    3LLOll3??.qcC5::qcse1--r8   c                   |j                   \  }}}|dk(  xr ||dz
  z  dk7  }| j                  |      }|dk7  r0| j                  |      }|j                  r
d| d| d}n	d| d| d}| j                  |      }|r	d| d	| dS d| d
| dS )Ni   r   r   (r@   r>   r?   zc10::metal::safe_mod(r=   z) % (rA   )rE   rF   rG   rH   moduse_safe_mods         r6   _print_ModularIndexingz'MetalExprPrinter._print_ModularIndexingO   s    ii3e|>aQ(>LLO!8,,s#Cs%uA&#A3eC52ll3*1#RuA661#U3%q!!r8   c                    t        |j                        dk7  rt        d      t        | j                  |j                        \  }}d| d| d| d}d| d| d| d}d| d| dS )	Nr   z$metal::min only supported for 2 argsstatic_cast<decltype(+)>(r>   zmetal::min(r=   lenrB   RuntimeErrormap_printrE   rF   ab
typecast_a
typecast_bs         r6   
_print_MinzMetalExprPrinter._print_Min`       tyy>QEFF4;;		*1,QCq3qc;
,QCq3qc;
ZL:,a88r8   c                    t        |j                        dk7  rt        d      t        | j                  |j                        \  }}d| d| d| d}d| d| d| d}d| d| dS )	Nr   z$metal::max only supported for 2 argsrP   rQ   rR   r>   zmetal::max(r=   rS   rX   s         r6   
_print_MaxzMetalExprPrinter._print_Maxi   r^   r8   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr   metal::abs(r   r>   rT   rB   rW   rE   rF   s     r6   
_print_AbszMetalExprPrinter._print_Absr   s9    499~"""T[[167q99r8   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr   zstatic_cast<long>(metal::rint(r   ))rc   rd   s     r6   _print_RoundToIntz"MetalExprPrinter._print_RoundToIntw   s9    499~"""/DIIaL0I/J"MMr8   c                    t        |j                        dk(  sJ |j                  \  }}|j                  r|dk  sJ t        d| d      | j	                  |t
        d         }d| d| d|  d	S )
Nr   r   zOFor integer inputs, only non-negative ndigits are currently supported, but got .Mulz!static_cast<float>(metal::rint(1e * z) * 1er>   )rT   rB   rD   
ValueErrorparenthesizer   )rE   rF   numberndigits
number_strs        r6   _print_RoundDecimalz$MetalExprPrinter._print_RoundDecimal|   s    499~"""))Q;;abiajjkl  &&vz%/@A
27)3zl&RYQYPZZ[\\r8   c                n    |j                   \  }}d| j                  |       d| j                  |       dS )Nstatic_cast<float>(z) / static_cast<float>(r>   )rB   rW   )rE   rF   lhsrhss       r6   _print_IntTrueDivz"MetalExprPrinter._print_IntTrueDiv   s=    99S %T[[%5$66MdkkZ]N^M__`aar8   c                    t        |j                        dk(  sJ t        | j                  |j                        \  }}d| d| dS )Nr   z'metal::precise::pow(static_cast<float>(z), static_cast<float>(rg   )rT   rB   rV   rC   )rE   rF   rG   ys       r6   _print_PowByNaturalz$MetalExprPrinter._print_PowByNatural   sI    499~"""4<<+18;QRSQTTVWWr8   c                ~    t        |j                        dk(  sJ | j                  |j                  d         }d| dS )Nr   r   rt   r>   rT   rB   rC   rE   rF   rG   s      r6   _print_ToFloatzMetalExprPrinter._print_ToFloat   s=    499~"""LL1&$QCq))r8   c                X    |j                   rt        t        |            S t        |      S N)rD   r4   r&   rd   s     r6   _print_FloatzMetalExprPrinter._print_Float   s#    ?? s4y>!t9r8   c                ~    t        |j                        dk(  sJ | j                  |j                  d         }d| dS )Nr   r   z1static_cast<int>(metal::floor(static_cast<float>(z)))r|   r}   s      r6   _print_FloorToIntz"MetalExprPrinter._print_FloorToInt   s=    499~"""LL1&B1#SIIr8   c                ~    t        |j                        dk(  sJ | j                  |j                  d         }d| dS )Nr   r   zstatic_cast<int>(metal::trunc(rg   r|   r}   s      r6   _print_TruncToIntz"MetalExprPrinter._print_TruncToInt   s=    499~"""LL1&/s"55r8   c                ~    t        |j                        dk(  sJ | j                  |j                  d         }d| dS )Nr   r   zmetal::precise::log2(r>   r|   r}   s      r6   _print_OpaqueUnaryFn_log2z*MetalExprPrinter._print_OpaqueUnaryFn_log2   s=    499~"""LL1&&qc++r8   c                J      fd|j                   D        \  }}}| d| d| S )Nc              3  V   K   | ]   }j                  |t        d    dz
         " yw)Atomg      ?N)rn   r   ).0argrE   s     r6   	<genexpr>z0MetalExprPrinter._print_Where.<locals>.<genexpr>   s-      
ADDc:f#5#;<
s   &) ? z : )rB   )rE   rF   cpqs   `    r6   _print_WherezMetalExprPrinter._print_Where   s5    
HL		
1a Cs#aS!!r8   N)rF   
sympy.Exprreturnr4   )__name__
__module____qualname____doc__rI   rN   r]   r`   re   rh   rr   rw   rz   r~   r   r   _print_floorr   r   r    r8   r6   r:   r:   D   s\    9.""99:
N

]bX
*
J
 %L6
,
"r8   r:   c                  b   e Zd ZdZe	 	 d0	 	 	 	 	 	 	 	 	 d1d       Ze	 	 	 	 	 	 	 	 d2d       Zed3d       Zed4d       Zed5d       Z	ed6d       Z
ed7d	       Zed8d
       Zed8d       Zed8d       Zed8d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed:d       Zed9d       Zed9d       Zed9d       Z ed9d       Z!ed9d        Z"ed8d!       Z#ed9d"       Z$ed9d#       Z%ed8d$       Z&ed9d%       Z'ed8d&       Z(ed9d'       Z)ed;d(       Z*ed;d)       Z+e	 	 	 	 	 	 	 	 	 	 d<d*       Z,ed9d+       Z-ed8d,       Z.d=d-Z/d>d.Z0e1d?d/       Z2y)@MetalOverrideszXImplements Metal-specific overrides for ops. Base class emits Python-friendly overrides.Nc                ~    |t         j                  k(  rt        j                  d       d|  dS dt        |    d|  dS )Nz>float64 cast requested, probably from tensorify_python_scalarsrt   r>   static_cast<>()r2   doublelogwarningDTYPE_TO_METAL)rG   dtype	src_dtypeuse_compute_typess       r6   to_dtypezMetalOverrides.to_dtype   sK     ELL KKP )1--nU34Bqc;;r8   c                6    dt         |    dt         |    d|  dS )Nzas_type<z>(static_cast<r   rg   r   )rG   r   r   s      r6   to_dtype_bitcastzMetalOverrides.to_dtype_bitcast   s/     ./0~i?X>YY[\][^^`aar8   c                    t        |       S r   r7   )r5   r   s     r6   constantzMetalOverrides.constant   s    c""r8   c                @   t         j                  j                  t         j                  j                  |             }t         j                  j                  j                  t         j                  j                  |t        |             }t        j                  ||      S )N)bounds)
r   kernelindex_to_strprepare_indexingcsegeneratecomputer   r   r   )rF   r   idx_strvars       r6   
index_exprzMetalOverrides.index_expr   sl    ((''(A(A$(GHhhll##HHg.CD.I $ 
 ||C''r8   c           	        t        |      }t               }t        j                  j	                  |      5  |j                         5  t        j                         t        j                  j                  _	        t        j                  j                  xj                  dz  c_
         |       }d d d        d d d        |  d|j                          d| }t        j                  j                  j                  |      }|sct        j                  j                  j                  j                        }t        j                  j                  j                  ||       t        j                  j                   j#                  t$        |j                      d| dd|  dg       t        j                  j                   j                         5  t        j                  j                   j'                  |       t        j                  j                   j)                  | d| d	| d
       d d d        t        j                  j                   j)                  d| d| d	| d
       |S # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   XxY w)N_scoped:r    ;if () {z = static_cast<decltype(rR   );z} else )r7   r   r   r   swap_buffersindent	itertoolscountr   iter_buffer_idsname_prefixgetvaluetry_getnewvarr   putr   
writelinesr   splice	writeline)maskbodyother	other_strscoped_bodyrc	cache_keyr   s           r6   maskedzMetalOverrides.masked   s   
 #5)	$&XX"";/ 	1C1C1E 	 ,5??+<AHHLL(HHLL$$	1$B	 	 fAk2245QykB	hhll""9-((,,%%BHH%5CHHLLY,HH''"288,-Qse15dV47HI !!((*   ''4  **e3C5B4rB
 HH&&3%7uC	{"M 
5	 	 	 	$ s1   I2A&I%,I2AI?%I/	*I22I<?Jc           	     0    |  d| d| dt        |       dS )Nr   z : static_cast<decltype(rR   r>   r   )rY   rZ   r   s      r6   wherezMetalOverrides.where   s*     Cs21#S9J8K1MMr8   c                    d|  d| dS )Nzc10::metal::remainder(r=   r>   r   rY   rZ   s     r6   	remainderzMetalOverrides.remainder  s    's"QCq11r8   c                D    d|  d| d|  d}d|  d| d| d}d| d| dS )NrP   rQ   rR   r>   zc10::metal::max(r=   r   rY   rZ   r[   r\   s       r6   maximumzMetalOverrides.maximum  M     -QCq3qc;
,QCq3qc;
!*R
|1==r8   c                D    d|  d| d|  d}d|  d| d| d}d| d| dS )NrP   rQ   rR   r>   zc10::metal::min(r=   r   r   s       r6   minimumzMetalOverrides.minimum  r   r8   c                    |  d| S )Nz || r   r   s     r6   
logical_orzMetalOverrides.logical_or       D}r8   c                    |  d| S )Nz && r   r   s     r6   logical_andzMetalOverrides.logical_and  r   r8   c                    d|  dS )Nzmetal::isnan(r>   r   rG   s    r6   isnanzMetalOverrides.isnan        qc##r8   c                    d|  dS )Nzmetal::isinf(r>   r   r   s    r6   isinfzMetalOverrides.isinf%  r   r8   c                    d|  dS )Nzmetal::precise::log(r>   r   r   s    r6   r   zMetalOverrides.log*       &aS**r8   c                    d|  dS )Nzmetal::precise::exp(r>   r   r   s    r6   expzMetalOverrides.exp/  r   r8   c                    d|  dS )Nrb   r>   r   r   s    r6   abszMetalOverrides.abs4  s     QCq!!r8   c                    d|  dS )Nzmetal::signbit(r>   r   r   s    r6   signbitzMetalOverrides.signbit9  s     !1%%r8   c                    d|  dS )Nzmetal::precise::sin(r>   r   r   s    r6   sinzMetalOverrides.sin>  r   r8   c                    d|  dS )Nzc10::metal::sinc(r>   r   r   s    r6   sinczMetalOverrides.sincC  s    "1#Q''r8   c                    d|  dS )Nzmetal::precise::cos(r>   r   r   s    r6   coszMetalOverrides.cosG  r   r8   c                    d|  dS )Nzmetal::precise::tan(r>   r   r   s    r6   tanzMetalOverrides.tanL  r   r8   c                    d|  dS )Nzmetal::precise::asin(r>   r   r   s    r6   asinzMetalOverrides.asinQ       'qc++r8   c                    d|  dS )Nzmetal::precise::acos(r>   r   r   s    r6   acoszMetalOverrides.acosV  r   r8   c                    d|  dS )Nzmetal::precise::atan(r>   r   r   s    r6   atanzMetalOverrides.atan[  r   r8   c                    d|  d| dS )Nz::metal::precise::atan2(r=   r>   r   )rG   ry   s     r6   atan2zMetalOverrides.atan2`  s     *!Bqc33r8   c                    d|  dS )Nzmetal::precise::sqrt(r>   r   r   s    r6   sqrtzMetalOverrides.sqrte  r   r8   c                    d|  d|  dS )NrP   z)>(-r>   r   r   s    r6   negzMetalOverrides.negj  s     'qcaS22r8   c                    d|  dS )Nzmetal::precise::rsqrt(r>   r   r   s    r6   rsqrtzMetalOverrides.rsqrtp       (s!,,r8   c                    d|  dS )Nzmetal::precise::tanh(r>   r   r   s    r6   tanhzMetalOverrides.tanhu  r   r8   c                    d|  dS )Nzmetal::precise::atanh(r>   r   r   s    r6   atanhzMetalOverrides.atanhz  r  r8   c                    d|  d| dS )Nr<   r=   r>   r   r   s     r6   floordivzMetalOverrides.floordiv  s     +1#Rs!44r8   c                    d|  dS )Nr?   r>   r   r   s    r6   floorzMetalOverrides.floor  r   r8   c                    d|  dS )Nzmetal::sign(r>   r   r   s    r6   signzMetalOverrides.sign       aS""r8   c                D    d|  d| d|  d}d|  d| d| d}d| d| dS )NrP   rQ   rR   r>   zmetal::fmod(r=   r   r   s       r6   fmodzMetalOverrides.fmod  sM     -QCq3qc;
,QCq3qc;
j\J<q99r8   c                    d|  dS )Nmetal::trunc(r>   r   r   s    r6   trunczMetalOverrides.trunc  r   r8   c                    |  d| }| j                   | j                   j                  s"|j                   |j                   j                  rd| dS |S )Nz / r  r>   )r   is_floating_point)rY   rZ   quots      r6   truncdivzMetalOverrides.truncdiv  sS     Cs|GGAGG$=$=GGAGG$=$="4&**r8   c                    d|  dS )Nzmetal::ceil(r>   r   r   s    r6   ceilzMetalOverrides.ceil  r  r8   c                f    t         j                  j                  j                  d       d|  d| dS )Nrandomzc10::metal::rand(r=   r>   r   r   headersaddseedoffsets     r6   randzMetalOverrides.rand  s/    	X&"4&6(!44r8   c                f    t         j                  j                  j                  d       d|  d| dS )Nr$  zc10::metal::randn(r=   r>   r%  r(  s     r6   randnzMetalOverrides.randn  s/    	X&#D6F8155r8   c           	     r    t         j                  j                  j                  d       d|  d| d| d| d	S )Nr$  zc10::metal::randint64(r=   r>   r%  )r)  r*  lowhighs       r6   	randint64zMetalOverrides.randint64  s=     	
X&'vRxr#baHHr8   c                    d|  dS )Nzmetal::rint(r>   r   r   s    r6   roundzMetalOverrides.round  r  r8   c                D    d|  d| d|  d}d|  d| d| d}d| d| dS )NrP   rQ   rR   r>   zmetal::pow(r=   r   )rY   rZ   cast_acast_bs       r6   powzMetalOverrides.pow  sK    (1QCs1#Q7(1QCs1#Q7VHBvha00r8   c                f    t         j                  j                  j                  d       d| d| dS )Nspecial_mathc10::metal::rK   r>   r%  )rE   rY   names      r6   _special_unaryzMetalOverrides._special_unary  s/    	^,dV1QCq))r8   c                l    t         j                  j                  j                  d       d| d| d| dS )Nr9  r:  rK   r=   r>   r%  )rE   rY   rZ   r;  s       r6   _special_binaryzMetalOverrides._special_binary  s5    	^,dV1QCr!A..r8   c           
        dD ].  }t        | |t        j                  | j                  |             0 t        j                  | j                  d      | _        dD ]1  }t        | |t        j                  | j                  |dz                3 dD ].  }t        | |t        j                  | j
                  |             0 dD ]1  }t        | |t        j                  | j
                  |dz                3 y )N)erferfinvi0i0ei1i1edigammaspherical_bessel_j0)r;  	log_gamma)
	bessel_j0	bessel_j1	bessel_y0	bessel_y1modified_bessel_i0modified_bessel_i1modified_bessel_k0modified_bessel_k1scaled_modified_bessel_k0scaled_modified_bessel_k1_forward)	polygammaigammaigammaczeta)
chebyshev_polynomial_tchebyshev_polynomial_uchebyshev_polynomial_vchebyshev_polynomial_whermite_polynomial_hhermite_polynomial_heshifted_chebyshev_polynomial_tshifted_chebyshev_polynomial_ushifted_chebyshev_polynomial_vshifted_chebyshev_polynomial_w)setattr	functoolspartialmethodr<  lgammar>  )clsr;  s     r6   _initialize_special_opsz&MetalOverrides._initialize_special_ops  s    	
 
	WD Cy66s7I7IPTUV
	W ,,S-?-?kR

 	D ''(:(:
ARS	&
 	XD Cy66s7J7JQUVW	X
 	D ''(;(;$BST	r8   NT)
rG   r   r   torch.dtyper   ztorch.dtype | Noner   r#   r   r4   )rG   r   r   ri  r   ri  r   r4   )r5   zbool | float | intr   ri  r   r4   )rF   r   r   ri  r   r4   )r   r   r   r   r   r   r   r4   )rY   r"   rZ   r"   r   r"   r   r4   )rY   r"   rZ   r"   r   r4   )rY   r   rZ   r   r   r4   )rG   r   r   r4   )rG   r   ry   r   r   r4   )r)  r   r*  r   r   r4   )
r)  r   r*  r   r/  r   r0  r   r   r4   )rY   r   r;  r4   r   r4   )rY   r   rZ   r   r;  r4   r   r4   r   None)3r   r   r   r   staticmethodr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r	  r  r  r  r  r  r  r  r  r   r"  r+  r-  r1  r3  r7  r<  r>  classmethodrg  r   r8   r6   r   r      s   b )-"&	<<< &<  	<
 
< < bb*b7Bb	b b
 # # ( (    D N N 2 2 > >
 > >
     $ $ $ $ + + + + " " & & + + ( ( + + + + , , , , , , 4 4 , , 3 3
 - - , , - - 5 5 $ $ # # : :
 $ $   # # 5 5 6 6 II#.I5@IHSI	I I # # 1 1
*/ = =r8   r   mpsc                      e Zd ZU dZeZdZdZdZdZ	 e
       j                  Z e       j                  Z e       j                  ZeZ edg      Zded<   g Zd	ed
<   	 	 	 	 	 	 d fdZddZddZ	 d	 	 	 	 	 	 	 	 	 ddZd dZddd ej8                         f	 	 	 	 	 	 	 	 	 	 	 d!dZ	 	 	 	 	 	 	 	 	 	 d"dZ	 	 	 	 	 	 	 	 	 	 d"dZd#dZ d$dZ!d%d&dZ"	 d'	 	 	 	 	 	 	 d(dZ#d)dZ$	 	 	 	 	 	 	 	 	 	 d*dZ% xZ&S )+MetalKernelz;Implement Metal codegen based on the SIMDKernel abstractionr   auto i       utilszOrderedSet[str]r&  zlist[IterationRangesEntry]multistage_reduction_entryc                X    t        |   |fi | t        j                         | _        y r   )super__init__r   r   acc_var_ids)rE   tilingkwargs	__class__s      r6   rw  zMetalKernel.__init__!  s&    
 	*6*$??,r8   c                    t         |   S r   r   )rE   r   s     r6   dtype_to_strzMetalKernel.dtype_to_str)  s    e$$r8   c                   | j                   j                  |      }| j                  |      }t        j                  j                  |      }| d| j                  |       d}|t        j                  t        j                  fv rd| d}t        j                  }| j                  j                  | j                  ||      S )z"Codegen a load from an InputBuffer[]rt   r>   r   )rB   inputr   r   graph	get_dtyper   r2   float16bfloat16float32r   r   loads)rE   r;  indexr   r   lines         r6   loadzMetalKernel.load,  s    iiood#%%e,!!$'a))%013U]]ENN33 )a0DMMExx  T ??r8   Nc                h   | j                   j                  |      }| j                  |      }| j                  t        j
                  j                  |            }d| d| d}|| d| j                  |       d| d}n[|dk(  rH| j                  j                  d       d	| d
}	d|	 d| d}
|	 d|
 d| j                  |       d| d}nt        d|       | j                  r&| j                  j                  t        ||             y | j                  j                  t        ||             y )Nr   r   r>   r  ] = r   
atomic_addatomiczc10::metal::AtomicType<>zreinterpret_cast<device z
::type *>(z::atomic_add(r=   r   zUnimplemented store mode )rB   outputr   r}  r   r  r  r   r&  r'  rU   inside_reductionr   r   r   stores)rE   r;  r  valuemoder   	dtype_strcast_valr  atomic_typecast_vars              r6   storezMetalKernel.store:  s<    iit$%%e,%%agg&7&7&=>	!)BugQ7<U!D--e45T(1ED\!LLX&3I;a@K1+jQOH!]-zD<M<Me<T;UUWX`WaacdD!:4&ABB  LL""<d#;<KK!!,tT":;r8   c                   | j                   j                  |      }| j                  |      }| j                  t        j
                  j                  |            }t        d | j                  D              }| d| j                  |       d| d| d}d|j                   d| }| j                  j                  t        ||             y )Nc              3  :   K   | ]  }|j                   s|  y wr   is_reductionr   ts     r6   r   z.MetalKernel.store_reduction.<locals>.<genexpr>T  s     K1ANNQK   r  z] = static_cast<r   r   r   z == 0) )rB   r  r   r}  r   r  r  nextrange_treesr   r;  r  r   r   )rE   r;  r  r  r   r  reduction_dimr  s           r6   store_reductionzMetalKernel.store_reductionO  s    iit$%%e,%%agg&7&7&=>	K(8(8KKa))%011A)BugUWXm(()7l467r8   Tc                   t        |t        j                        r| j                  |      }dt	        | j
                         }t        j                  j                  |||      }|rdnd}|| d| z  }|r|d| j                  |       dz  }||rJ d       |d| z  }| j                  j                  || j                  z          |S )	Ntmp_acc_zthreadgroup  r   r  r  z+Thread group var can not have default value = )r1   r2   r   r}  r  rx  r   r   create_cse_varsexprindexing_coder   suffix)	rE   r   
elem_countdefault_valueis_threadgroupr   var_namer   var_defs	            r6   _new_idxvarzMetalKernel._new_idxvarZ  s     eU[[)%%e,Ed4#3#3456hh%%h>$2.eWAhZ((4::j12!44G$%T'TT%]O,,G$$Wt{{%:;
r8   c                    |||f}|| j                   j                  v r| j                   j                  |   S | j                  ||||      }|| j                   j                  |<   |S )z)Caching wrapper around _reduction_nocache)r   reduction_cache_reduction_nocache)rE   r   r   reduction_typer  r   results          r6   	reductionzMetalKernel.reductionp  sf     6	00088++I66((	>5Q.4  +r8   c                P   | j                   sJ | j                  rJ dAd}d}d}| j                  D ]  }|j                  s|r|dz  }||j                   d| z  }t        |j                  t        j                        r||j                  z  }_|t        j                  |j                   ddd      z  } t        j                  || j                        }| j                  |      }	t        |t        j                        rt        || j                        n| j                  }
|d	k(  r| j!                  |      }| j"                  j%                  | d
       | j"                  j%                  d       | j&                  j)                  d| d| d       | j*                  j%                  d       |S | j,                  j/                  d       |dv rt0        |   }| j!                  ||
      }| j2                  s|}nD|dk(  rdnd\  }}| j!                  ||d      }| j&                  j)                  | d| d| d       | j4                  j7                  | j*                  d| d| d| d| d|	 dt0        |         S |dv r| j!                  ||
      }t8        |   }d| d | d}| j2                  s|}nY|j;                  d!      rd"nd!}d#| d$| d%}| j!                  ||d      }| j&                  j)                  | d&| d| d| d'       | j4                  j7                  | j*                  d| d| d| d| d|	 dt0        |         S |d(v r]| j!                  ||
      }| j!                  ||
      }t8        |   }d| d | d}| j2                  s|}dt8        |    d | d}n|j;                  d!      rd"nd!}d#| d$| d%}| j!                  ||d      }| j!                  |d)d      }t=        d* | j>                  jA                         D              }|d+k(  rd,nd-}|jB                  rd.| d/nd}| j&                  j)                  d| d| d| | d| d0| d1| d0|j                   d2       | j4                  j7                  | j*                  d| d| d| d| d| d| d|	 d|      S |d3k(  r+| j2                  s~| j!                  ||      }| j&                  j)                  | d4| d5| d       | j4                  j7                  | j&                  d| d| d|	 dtD        jF                        } ||      S | j!                  d6|      }| d4| d7}| j"                  j)                  | d8       | j&                  j%                  | d9| d:| d;       | j4                  j7                  | j*                  d<| d| dtD        jF                        } ||      S |d=k(  r4t        |tH              sJ d>       | j!                  d6|      }| d4| d7}d?|d)    d|d    d|d@    d}| j"                  j)                  | d8       | j2                  rC| j"                  j)                  | d8       | j&                  j%                  | d9| d| d'       n!| j&                  j%                  | d0| d       | j4                  j7                  | j2                  r| j*                  n| j&                  d| d| d|	 dtD        jF                        } ||      S tK        |      )BzeCodegen a reduction operation.
        Only sum and prod operations are somewhat reasonable optimizedc           
         t        j                  dD cg c](  }t        |  d| | j                  | j                        * c}      S c c}w )Nxyzrj   )r   _unwrapr   r   r   )res3r  s     r6   _unwrap_helperz6MetalKernel._reduction_nocache.<locals>._unwrap_helper  sA    %%NSTvQqc]DKKDT Ts   -Ar  r    + rl   numelTintegerpositiveanyz	 = false;z7threadgroup_barrier(metal::mem_flags::mem_threadgroup);z
                if (z) {
                    z' = true;
                }
            reduction_utils)prodsumr  )r   rQ   )r   *F)r  r  r   z= r   zc10::metal::threadgroup_rK   r=   r>   r   )maxminr   r   r  lowestz::metal::numeric_limits<z>::z()z = ::c10::metal::r   )argminargmaxr   c              3  :   K   | ]  }|j                   s|  y wr   r  r  s     r6   r   z1MetalKernel._reduction_nocache.<locals>.<genexpr>  s      Ar  r  r  <z || ::metal::isnan(z) r  z;
                    z$;
                }
                welford_reducer  r  float3r  z = 0.0;z! = ::c10::metal::welford_combine(z	, float3(z, 0.0, 1.0));z(c10::metal::threadgroup_welford_combine(welford_combinez&Input to welford combine must be tuplezfloat3(r   )r  r   r   ztuple[CSEVariable, ...])&r  
_load_maskr  r  r;  r1   r  sympyIntegerSymbolprefixMinmax_threadgroup_sizer  r   simd_group_sizer  r  r   r   r   r  r&  r'  r   rt  r   r   r   endswithr  range_tree_nodesvaluesr  r2   r  tupleNotImplementedError)rE   r   r   r  r  r  reduction_idxacc_buf_sizerdacc_buf_size_strshmem_buf_sizeacc	acc_dtypeacc_bufr5   default_valreduction_opsrc_metal_type
cast_valuelim_fn	limit_valdata_acc_bufidx_acc_bufidx_validx_varcmp_op
nan_suffixwf_resacc_thread_var	inp_values                                 r6   r  zMetalKernel._reduction_nocache  s^    $$$$??""	 "" 	B??&yL>::M"((EMM2(yyk'! 	 yyt/H/HI::l3 ,6 L$"6"67%% 	 U"""5)C((C5	):;((I LLG E  KK!!I J*+_,29=I&&y.AG22 !/% 7HX *\ &&[ '  ##se1\N"UG1$EF88$$*>*:!G9Bse2m_\^_o^ppqr07 %  
 ^+&&y.AG+I6N''7r%BJ22 %3%<%<U%C6~6Fc&QST	&&Yu '  ##e,^,<AcU"ZLPRS 88$$*>*:!G9Bse2m_\^_o^ppqr07 %  
 11++I~FL**5.AK+I6N''7r%BJ22 ()>(?r-PQR%3%<%<U%C6~6Fc&QST	&&Yu '  **5RW*X #44;;=  !/( : !22 *%3 
 ## )G1VHAcU:, 7EUG $IS /%  88$$*>*:!L>K=XZ%r'"]O26F5GqJ	 %   --22**9lC##wiqtE7!$LM**LL.~.>ayK[J\\]^-- + 
 &f--&&x>G 'y-:N%%(8&@ALL""!""CNCSS\]b\ccpq XX&&:7)2l^STUmm ' F
 "&))..eU+U-UU+&&x>G 'y-:N!%(2eAhZr%(1EI%%(8&@A..""))^,<G*DE&&%&&GGWWYZcYddfg &&.)9YKq'IJXX&&#>>DLL*>*:!G9BGWFXXYZmm ' F
 "&))!.11r8   c                   | j                  j                        }| j                  |      }j                  rQt	        j
                  j                  t        j                        r\j
                  j                  | j                  k  r9| j                  j                  | j                   dj                   d| d       y t	        j
                  j                  t        j                        rj
                  j                  n.t        j                  j
                  j                   ddd      }t!        fd| j"                  D              }j
                  j                   d}|s\| j"                  j%                         |t'        | j                  d	z
        z   t'        | j                        z  }| j                  |      }j
                  j                  }	| j(                  j                  d
j
                  j                   dj
                  j                   d| dj
                  j                   d	       | j(                  j+                         5  t	        |t        j                        rS| j(                  j                  | j                   d| d| j                   dj
                  j                   d|	 d
       nH| j(                  j                  | j                   d| d| d|	 dj
                  j                   d
       t	        |t        j                        s|| j                  z  |k7  r"| j(                  j                  d| d| d       |j-                  j
                  j                  |      }
| j(                  j                  | j                   dj                   d|
 d       d d d        y | j(                  j+                         5  |j-                  j
                  j                  |      }
| j(                  j                  | j                   dj                   d|
 d       d d d        y # 1 sw Y   y xY w# 1 sw Y   y xY w)Nr   r  r   r  Tr  c              3  N   K   | ]  }|j                   j                   u   y wr   )root)r   eentrys     r6   r   z=MetalKernel.codegen_iteration_ranges_entry.<locals>.<genexpr>I  s#      %
%&AFFejj %
s   "%_linear_idxr   z	for(auto z
_cnt = 0; z_cnt < z; ++z_cnt) {rl   z_cnt + r  z_cnt;r    >= z) break;)rename_indexingrF   r  r  r1   r  r  r  r  r  r  r   index_dtyper;  r  r  r  rt  appendr)   r   r   replace)rE   r  r   	index_stracc_sizeroot_already_processedlinear_idx_name	loop_sizeloop_size_str	root_namesub_index_strs    `         r6   codegen_iteration_ranges_entryz*MetalKernel.codegen_iteration_ranges_entry5  s   ))%**5
JJz*	!!uzz''7

  D$=$==((##$Aejj\YKqA  %****EMM: JJ!2!2 3594RVW 	 "% %
*.*I*I%
 "
 #ZZ../{;%++2259 "E$*C*Ca*G$HHU))N I !JJy1M

IIIEJJ--.j9J9J8K7S`Raaefkfpfpfwfwex  yA  B !!# h5II''++,Ao->c445S9J9J8K7S\R]]^`
 II''++,Ao->c-PST]S^^abgblblbsbsattyz x6 4#<#<<HII''$.?tH:X(VW !* 1 1%**//? S		##''(%**SqI) 2 !!#  ) 1 1%**//? S		##''(%**SqI 3 2 s   "E"Q'AQQQ$c                   | j                   r-| j                  j                         5  | j                  j                  | j                         | j                  j                  | j
                         ddd       | j                  j                  dt        | j                         z         | j                  j                  t        d | j                  j                  j                         D                     | j                   r| j                   j                         j                          | j                   r5nJ| j                  j                  | j                         | j                  j                  | j
                         | j                  j                  | j                         | j                  j!                          | j
                  j!                          | j                  j!                          y# 1 sw Y   xY w)a  
        Concat output code from index_code, loads, compute, stores,
        suffix into self.body.

        For pointwise kernels, this is called just once at the end.

        For reduction kernels, this generates a loop over the reduction
        axis.
        N}c              3  T   K   | ]   }t        |t              r|n|fD ]  }|  " y wr   )r1   r  )r   itemvs      r6   r   z+MetalKernel.codegen_body.<locals>.<genexpr>  s:      &0u&=dD7  s   &()rt  r   r   r   r  r   r   rT   r   
invalidater	   r  r  popcache_clearr  clear)rE   s    r6   codegen_bodyzMetalKernel.codegen_body  sc    **!!# /		  ,		  ./ IIc$*I*I&J JK
 HH  $ 8 8 ? ? A  11//335AAC 11 IITZZ(IIT\\*		%

1/ /s   AG99Hc                t   | j                          t               }|}t        j                  j                  r|j                  d       n|j                  d       | j                         }|j                         5  t        j                  j                  s'| j                  D ]  }|j                  d| d        nr| j                  D cg c]  }d| d
 }}t        |t        t              j                  j                  j                  dz  gt                     }|j                  |       | j                  rwt        j                   d | j"                  D              }t%        |t&        j(                        rt+        || j,                        n| j,                  }	|j                  d|	 d       |j                  d	| d
       |j                         5  | j.                  j0                  j3                         D ]Z  \  }
}|
| j4                  v r| j7                  t        j                  j9                  |
            }|j                  d| d| d       \ | j.                  j:                  j3                         D ]  \  }
}t        j                  j9                  |
      }|t<        j>                  k(  rBt        j                  jA                  |
      }||jC                         g k7  rtE        d      d}n| j7                  |      }|j                  d| d| d        | j.                  jF                  jI                         D ]  }|j                  d| d        |D ]F  }t%        |jJ                  t&        j(                        r(|j                  d|jL                   d       H d| j                  v r|j                  d       tO        |      dk  sJ d       tO        |      dkD  rdtO        |       nd}tO        |      dk(  r|d   jP                  nd}| j                  rdnd}|j                  | d| d|        | j                  r|j                  | d       ddd       |j                  d        |j                         5  tO        |      dkD  rAtS        |      D ]3  \  }}|j                  d!|jP                   d"tU        d#|z          d$       5 |jW                  | jX                         |jW                  | jZ                         ddd       |j                  d%       ddd       t        j                  j                  r!|j                  d&       |j]                         S |j                  d'       |j]                         S c c}w # 1 sw Y   3xY w# 1 sw Y   xY w# 1 sw Y   xY w)(z3Called at the end to generate a final kernel stringz(R"MTL(compile_mps_shader('''z#include <c10/metal/z.h>includec              3  N   K   | ]  }|j                   s|j                    y wr   )r  r  r  s     r6   r   z-MetalKernel.codegen_kernel.<locals>.<genexpr>  s      1 !AGG1s   %%z$[[max_total_threads_per_threadgroup(z)]]kernel void rK   zdevice z* ,Nzfloat64 is not supported by MPSr)   z	constant zconstant long& znumel,errorz,device c10::metal::ErrorMessages* error_buf,   z%Up to 3 index variables are supportedr   uintr   
thread_posr  r   z [[thread_position_in_grid]]z- group_pos [[thread_position_in_threadgroup]]r   rq  z = thread_pos.x   r   r  z)MTL");'''))/r  r   r   r  cpp_wrapperr   active_range_treesr   r&  r   r   __file__parentr	   r  mathr  r  r1   r  r  r  r  rB   output_buffersitemsremoved_buffersr}  r  input_buffersr2   float64try_get_bufferget_sizerU   sizevarsr  r  r  rT   r;  	enumeratechrr   r  r   r   )rE   r;  codefn_nameidx_varsheaderr&  header_contentstotal_reduction_sizethreadgroup_sizeouterinnerr  r   	outer_bufr  thread_pos_dtypethread_pos_var_namethread_pos_suffixidxr   s                        r6   codegen_kernelzMetalKernel.codegen_kernel  s/   77NN9%NN34**,[[] U	 77&&"ll GFNN%9&#EFG FJ\\;A*6(#6  #1(^**11889DEL#
 /$$'+yy 1%)%5%51 ($ ""6F ,d.G.GH22 !
 :;K:LCP NN\'!45 -$(II$<$<$B$B$D DLE5 4 44  $ 1 1!''2C2CE2J KINNWYKr%#BC	D
 %)II$;$;$A$A$C 
FLE5GG--e4E-$%GG$:$:5$A	$,	0B0B0D0J"./P"QQ$+	$($5$5e$<	NNYykE7!#DE
F "YY//668 ?ENN_UG1#=>?  ( QG!'--?8H'OP	Q dll*NN#QR8}q(Q*QQ(.1(ma.?d3x=/*V ! ),H(:HQK$$ $ ,0+@+@Cb!'(*=)>>Z[lZmn ((NN+,,YZW-\ NN5! 'x=1$$-h$7 S#CHH:^Cc	N;K1M D../DII&' NN3kU	 n 77NN9% }} NN6"}}o2- -^' '[U	  U	 sR   :AV.
VC?V.JV$)V.BV"V.V.V	V."V+	'V..V7c           
     ,	   t         j                  j                  }| j                  j                  D ]  }|j                  |        | j                  j                         \  }}}}t        ||      D 	
ci c]  \  }	}
t        |	      |
 }}	}
g | j                  j                  j                         | j                  j                  j                         }|D cg c]  }|| j                  vs| }}|| j                  j                  D cg c]  }t        |       c}z  }|D cg c]  }||   	 }}| j                  D ]  }t        |j                  t         j"                  t$        f      r.t        |j                  t         j&                        r|j                  }n4t         j                  j                  j)                  ||      j*                  }|j,                  r| j.                  s|j1                  t        |             |j1                  t$                t         j                  j2                  r| j4                  n| j6                  }dd}t9        | j;                               dkD  r| j;                         D cg c]J  } ||j,                  r*t!        j<                  |j                  | j>                        n|j                        L }}|j1                   ||d             |j1                  t@               n%t         j                  j2                  rtC        d      | j.                  r| j;                         D cg c]@  }|j,                  r0 |t!        j<                  |j                  | j>                              ndB }}|j1                   ||d             |j1                  t@               n1t         j                  j2                  r|dgz  }|j1                  d       d| jD                  v rMt         j                  j2                  s3|j1                  d	t9        |D cg c]  }|d
|vs| c}              |jG                  ||tI        jJ                  d      d|       yc c}
}	w c c}w c c}w c c}w c c}w c c}w c c}w )z0
        Codegens a call to this kernel
        threadsc                    t         j                  j                  r(| D cg c]  }d| d
 } }ddj                  |        dS | ddj                  |        dS c c}w )Nzstatic_cast<uint64_t>(r>   {r=   r  z=[r  )r   r  r!  join)r@  kwargr  s      r6   format_threadsz/MetalKernel.call_kernel.<locals>.format_threads3  sh    ww""BIJQ3A3a8JJDIIg./r22499W#5"6a88 Ks   Ar   zWe should always have threads?1
group_sizeNr  zerror_buf_idx==rn  F)devicetriton	arg_types)r@  z	list[str]rD  r4   r   r4   )&r   r  wrapper_coderB   r-  ensure_size_computedpython_argdefszipr4   r&  keysr)  r(  r  r1   r  r  r  r&   r  generate_numel_exprr8  r  r  r   r!  cexprpexprrT   r"  r  r  listrU   r&  generate_kernel_callr2   rI  )rE   r;  nodedeallocate_wswrapperr  _	call_argsrK  call_argarg_typearg_name_to_typerB   r   treerF   expr_printerrE  r@  s                      r6   call_kernelzMetalKernel.call_kernel  s    ''&&## 	,A((+	, &*YY%=%=%?"9a>A)Y>W
(:(CM8#
 
 S))..0R4993J3J3O3O3QR#Gs$2F2F'FGG!3!34AQ446:;s%c*;	; $$ 	&D$**u}}c&:;DJJ5zzww++??dKQQ$$(=(=CI&  %	& &'WW%8%8tzzdjj	9 t&&()A- 002  ~~ IIaggt'@'@AG  KKw	:;T"ww"""#CDD  
 002	  >> UYYqww0I0IJKG  KKw=>T"ww""   & dll"177+>+>KK T%`cS_QT\_Q_c%`!a bc 	$$<<& 	% 	
S

 H4;8 & &as>   5Q2Q80Q8Q=*RARAR*R2R7Rc                (   t         j                  j                  r-| j                  j	                  | j
                  d| dd       y | j                  j                  d       | j
                  j                  d| ddt        |       dd	d
g       y )Nzif (!) returnF
assignmentr  r   z"    TORCH_REPORT_ERROR(error_buf, r       return;r  )
r   r  r!  r   r   r   r&  r'  r   repr)rE   condmsgs      r6   device_assert_asynczMetalKernel.device_assert_asynck  s    77HHdlleD6,BuULLW%LL##D6&8c2F!	r8   c                   |s|sy | j                  |      }| j                  |      }|r|rd| d| d| d}n|r| d}n| d| }t        j                  j                  r-| j                  j                  | j                  d| dd	       y | j                  j                  d
       | j                  j                  d| dd| d| dddg       y )NrK   z < 0 || r  r>   z < 0r   rb  Frc  r  r   z,    TORCH_REPORT_ERROR(error_buf, "Index ", z, " out of range [0, ", z, ")");re  r  )
r   r   r  r!  r   r   r   r&  r'  r   )rE   rF   sizelowerupperexpr_strsize_str	conditions           r6   check_boundszMetalKernel.check_boundsy  s     $$T*$$T* UH:XhZtH:QGI#*D)I#*D
3I 77HHYKx8U  
 LLW%LL##9+T*B8*Ldemdnnuv!	r8   )ry  zdict[str, sympy.Expr]rz  r   r   rk  )r   ri  r   r4   )r;  r4   r  r   r   r   r   )
r;  r4   r  r   r  r   r  r   r   rk  )r;  r4   r  r   r  r   r   rk  )r   zstr | torch.dtyper  z
int | Noner  z
Any | Noner  r#   r   zValueRanges[Any]r   r   )
r   ri  r   ri  r  r   r  %CSEVariable | tuple[CSEVariable, ...]r   rr  )r  r   r   rk  rj  )generated_kernel)r;  r4   r   r4   rh  )r;  r4   rV  r   rW  r#   r   rk  )rg  r   rh  r4   r   rk  )
rF   r   rk  r   rl  r#   rm  r#   r   rk  )'r   r   r   r   r   	overridesr  newvar_prefixr  r  r   rC   rS  r
   rR  r:   r  kexprr	   r&  __annotations__rt  rw  r}  r  r  r  r   unknownr  r  r  r
  r  r>  r`  ri  rq  __classcell__r{  s   @r6   rp  rp    s   EIFMOO##EL  E&&EE)7)4G_4=? :?-%- - 
	-%@ SW<< *<3><FO<	<*	8 "&$(##6;#6#6#8   "	
  ! 
,  &	
 5 
/ s2s2 s2 &	s2
 5s2 
/s2jHT#JhV BF[
[
"[
:>[
	[
z""&0"9="FJ"	"r8   rp  c                  L     e Zd ZU eZdZded<   d fdZ	 	 	 	 	 	 	 	 ddZ xZ	S )MetalSchedulingr   r&   _kernel_fn_counterc                $    t         |   |       y r   )rv  rw  )rE   	schedulerr{  s     r6   rw  zMetalScheduling.__init__  s    #r8   c           	        t         j                  j                  }||j                  v r|j                  |   S t         j                  j                  rXd|j                          }|}||j                  |<   d| d|z   }t        ||      \  }}|j                  ||| d| d       |S d| j                   }	| xj                  dz  c_        |	|j                  |<   d	}
|j                  |
      t        |
      z   }|j                  d
      }||| }g }|j                  d      D ]G  }|j                         j                  d      r#|j                  |j!                  dd|	 d             I t#        t%        |j&                              }|j(                  j+                  |	 d|	d       |D ]  }|j(                  j+                  |        |j(                  j+                  d| d       |	S )Nmps_lib_zconst char* z
_source = 
F)gpugenerated_kernel_r   r  r   z#includezkernel void generated_kernel(r  rK   z = async_compile.metal(z, '''z''', r>   )r   r  rL  src_to_kernelr!  next_kernel_suffixr   define_kernelr}  r  rT   rindexsplitstrip
startswithr   r  rf  sortedr&  r3  r   )rE   src_codenode_scheduler   rX  mps_lib_namekernel_nameoriginsdetailed_originsr1  metal_src_startstartend	metal_src
body_linesr  headers_reprs                    r6   r  zMetalScheduling.define_kernel  s    ''&&w,,,((2277%g&@&@&B%CDL&K.9G!!(+%l^:>IH(;M7(S%G%!!h7)26F5G(He "   &d&=&=%>?1$*1h' 3/#o2FFoof%U3'	 
OOD) 	Dzz|&&z2<WIUV>WX	 F6>>23  G9,CG;e!TU 	+DNN$$T*	+  5a!89r8   )r  zScheduler | Noner   rk  )r  r4   r  zlist[SchedulerNode]r   rp  r   r4   )
r   r   r   rp  kernel_typer}  rw  rw  r  ry  rz  s   @r6   r|  r|    s=    K$..,?.IT.	.r8   r|  )r5   z&float | int | bool | str | CSEVariabler   r4   )G
__future__r   rc  r   loggingr%  pathlibr   typingr   r   r  sympy.printing.precedencer   r2   torch.utils._cpp_embed_headersr   torch.utils._ordered_setr	   torch.utils._sympy.printersr
   r   ExprPrinter_torch.utils._sympy.value_rangesr   rs  r   r   r   virtualizedr   r   r   commonr   r   r   r   r   r   simdr   r   r   ops_handlerr   r   r  r    r!   r"   	getLoggerr   r   r#   int8int16int32int64uint8r)   r*   r  r   r7   r:   r   _initialize_pointwise_overridesrg  rp  r|  r   r8   r6   <module>r     s$   #      %  0  9 / O 7 G G , ,  C B 64g! 
JJ	JJ	KK	KK	KK	KK	KK	JJ	NNH
r"| r"jR[ Rj
  . .u 5  & & (I
* I
X5n 5r8   