
    9j                      d dl mZ d dlZd dlZd dlZd dlZd dlZd dlmZm	Z	m
Z
 d dlZd dlZd dlmZ d dlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZm Z m!Z! ddl"m#Z#m$Z$m%Z%  G d de!      Z& e&       jN                  Z(e
rd dl)m*Z*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1 dZ2ejf                  ji                  e5d      Z6 G d d      Z7 G d de8      Z9 G d de       Z:ejv                   G d d             Z<ejv                   G d d              Z=ejv                   G d! d"             Z>ejv                   G d# d$             Z? G d% d&e$      Z@ G d' d(e%      ZAy))    )annotationsN)AnycastTYPE_CHECKING)
OrderedSet)ModularIndexing   )configtorch_dtype_to_jax)get_fused_kernel_nameget_kernel_metadata)V   )BlockPatternMatcher)BackendFeatureCSEVariableIndentedBufferOpOverridesPythonPrinter)IterationRangesEntry
SIMDKernelSIMDSchedulingc                  (    e Zd ZdZddZddZddZy)PallasPrinterzO
    Custom sympy printer for Pallas that handles JAX-specific constructs.
    c                    | j                  |j                  d         }| j                  |j                  d         }| j                  |j                  d         }d| d| d| dS )z!Convert sympy Where to jnp.where.r   r   r	   
jnp.where(, ))doprintargs)selfexprcpqs        ^/media/conek/DATA/Code/OCR/venv/lib/python3.12/site-packages/torch/_inductor/codegen/pallas.py_print_WherezPallasPrinter._print_Where$   s_    LL1&LL1&LL1&A3b2aS**    c                    |j                   D cg c]  }| j                  |       }}|d   }|dd D ]  }d| d| d} |S c c}w )z7Convert sympy Min to jnp.minimum for JAX compatibility.r   r   Njnp.minimum(r   r   r!   r    r"   r#   argr!   results        r'   
_print_MinzPallasPrinter._print_Min+   ^    -1YY7cS!77a8 	5C#F82cU!4F	5	 8   Ac                    |j                   D cg c]  }| j                  |       }}|d   }|dd D ]  }d| d| d} |S c c}w )z7Convert sympy Max to jnp.maximum for JAX compatibility.r   r   Njnp.maximum(r   r   r,   r-   s        r'   
_print_MaxzPallasPrinter._print_Max3   r1   r2   N)r#   
sympy.Exprreturnstr)__name__
__module____qualname____doc__r(   r0   r5    r)   r'   r   r      s    +r)   r   )CallableSequence)IRNode)ReductionType)BaseSchedulerNodemainkernel_codec                  &    e Zd ZdZdddZdddZy)PallasKernelWrapperz6Wrapper to provide .run() interface for Pallas kernelsNc                L    || _         || _        t        j                  d|       y )NzPallas kernel path: %s)	kernel_fnkernel_pathkernel_code_loginfo)r"   rH   rI   s      r'   __init__zPallasKernelWrapper.__init__R   s"    "&5{Cr)   )streamc               *     | j                   |d|i|S )a5  
        Execute the Pallas kernel.

        Args:
            *args: Arguments to pass to the kernel function
            stream: CUDA stream to pass to the kernel function
            **kwargs: Additional keyword arguments for the kernel

        Returns:
            Result of the kernel execution
        rM   )rH   )r"   rM   r!   kwargss       r'   runzPallasKernelWrapper.runW   s     t~~t=F=f==r)   N)rH   zCallable[..., Any]rI   
str | None)r9   r:   r;   r<   rL   rP   r=   r)   r'   rF   rF   O   s    @D
 !% >r)   rF   c                      e Zd ZdZy)UnsupportedzJException raised when an operation is not supported by the Pallas backend.N)r9   r:   r;   r<   r=   r)   r'   rT   rT   f   s    Tr)   rT   c                     e Zd ZdZedsd       Zedsd       Zedsd       Zedsd       Zedsd       Z	edsd       Z
edsd       Zedsd	       Zedsd
       Zedsd       Zedsd       Zedsd       Zedsd       Zedsd       Zedsd       Zedsd       Zedsd       Zedsd       Zedsd       Zedsd       Zedsd       Zedsd       Zedsd       Zedsd       Zedsd       Zedsd       Zedtd       Zedtd       Z edtd       Z!edud       Z"edvd        Z#e	 	 dw	 	 	 	 	 	 	 	 	 dxd"       Z$edyd#       Z%edzd$       Z&ed{d%       Z'edsd&       Z(edsd'       Z)edsd(       Z*edsd)       Z+edsd*       Z,edsd+       Z-edtd,       Z.edtd-       Z/edtd.       Z0edtd/       Z1edtd0       Z2edsd1       Z3edsd2       Z4edsd3       Z5edtd4       Z6edtd5       Z7edtd6       Z8edsd7       Z9edtd8       Z:edtd9       Z;edtd:       Z<edtd;       Z=edtd<       Z>edtd=       Z?edtd>       Z@ed|d?       ZAeAZBedsd@       ZCedsdA       ZDedsdB       ZEedsdC       ZFedsdD       ZGedsdE       ZHedsdF       ZIedsdG       ZJedsdH       ZKedsdI       ZLedsdJ       ZMedsdK       ZNeLZOedsdL       ZPeMZQedsdM       ZRed}dN       ZSed}dO       ZTeSZUeTZVed}dP       ZWedsdQ       ZXed}dR       ZYed}dS       ZZed}dT       Z[ed~dU       Z\ed~dV       Z]ed~dW       Z^ed~dX       Z_ed~dY       Z`ed~dZ       Zaed~d[       Zbed~d\       Zced~d]       Zded~d^       Zeed~d_       Zfed~d`       Zgedsda       Zhedsdb       Zieddc       Zjedtdd       Zkedtde       Zledtdf       Zmedsdg       Znedsdh       Zoedtdi       Zpedtdj       Zqedtdk       Zredsdl       Zsedtdm       Ztedtdn       Zueddo       Zveddp       Zweddq       Zxeddr       Zyy!)PallasKernelOverridesz
    Map element-wise ops to JAX/Pallas operations.

    For now, we use the default Python operators which are compatible
    with JAX numpy broadcasting semantics.
    c                    d|  dS )Nzjnp.sin(r   r=   xs    r'   sinzPallasKernelOverrides.sinr        !Ar)   c                    d|  dS )Nzjnp.cos(r   r=   rX   s    r'   coszPallasKernelOverrides.cosw   r[   r)   c                    d|  dS )Nzjnp.tan(r   r=   rX   s    r'   tanzPallasKernelOverrides.tan|   r[   r)   c                    d|  dS )Nz	jnp.sinh(r   r=   rX   s    r'   sinhzPallasKernelOverrides.sinh        1#Qr)   c                    d|  dS )Nz	jnp.cosh(r   r=   rX   s    r'   coshzPallasKernelOverrides.cosh   rb   r)   c                    d|  dS )Nz	jnp.tanh(r   r=   rX   s    r'   tanhzPallasKernelOverrides.tanh   rb   r)   c                    d|  dS )Nzjnp.arcsin(r   r=   rX   s    r'   asinzPallasKernelOverrides.asin        QCq!!r)   c                    d|  dS )Nzjnp.arccos(r   r=   rX   s    r'   acoszPallasKernelOverrides.acos   ri   r)   c                    d|  dS )Nzjnp.arctan(r   r=   rX   s    r'   atanzPallasKernelOverrides.atan   ri   r)   c                    d|  dS )Nzjnp.exp(r   r=   rX   s    r'   expzPallasKernelOverrides.exp   r[   r)   c                    d|  dS )Nz	jnp.exp2(r   r=   rX   s    r'   exp2zPallasKernelOverrides.exp2       1#Qr)   c                    d|  dS )Nz
jnp.expm1(r   r=   rX   s    r'   expm1zPallasKernelOverrides.expm1       A3a  r)   c                    d|  dS )Nzjnp.log(r   r=   rX   s    r'   logzPallasKernelOverrides.log   r[   r)   c                    d|  dS )Nz
jnp.log10(r   r=   rX   s    r'   log10zPallasKernelOverrides.log10   ru   r)   c                    d|  dS )Nz	jnp.log2(r   r=   rX   s    r'   log2zPallasKernelOverrides.log2   rr   r)   c                    d|  dS )Nz
jnp.log1p(r   r=   rX   s    r'   log1pzPallasKernelOverrides.log1p   ru   r)   c                    d|  dS )Nz	jnp.sqrt(r   r=   rX   s    r'   sqrtzPallasKernelOverrides.sqrt   rb   r)   c                    d|  dS )Nzjax.lax.rsqrt(r   r=   rX   s    r'   rsqrtzPallasKernelOverrides.rsqrt   s      s!$$r)   c                    d|  dS )Nzjnp.abs(r   r=   rX   s    r'   abszPallasKernelOverrides.abs   r[   r)   c                    d|  dS )Nz(-r   r=   rX   s    r'   negzPallasKernelOverrides.neg   s    A3ayr)   c                    d|  dS )Nz
jnp.floor(r   r=   rX   s    r'   floorzPallasKernelOverrides.floor        A3a  r)   c                    d|  dS )Nz	jnp.ceil(r   r=   rX   s    r'   ceilzPallasKernelOverrides.ceil   rb   r)   c                    d|  dS )Nz
jnp.trunc(r   r=   rX   s    r'   trunczPallasKernelOverrides.trunc   r   r)   c                    d|  dS )Nz
jnp.round(r   r=   rX   s    r'   roundzPallasKernelOverrides.round   r   r)   c                    d|  dS )Nzjax.nn.sigmoid(r   r=   rX   s    r'   sigmoidzPallasKernelOverrides.sigmoid        1%%r)   c                    d|  dS )Nr4   z, 0)r=   rX   s    r'   reluzPallasKernelOverrides.relu   s    aS%%r)   c                    d|  d| dS )Nz
jnp.power(r   r   r=   abs     r'   powzPallasKernelOverrides.pow   s    A3b1%%r)   c                    d|  d| dS )Nr4   r   r   r=   r   s     r'   maximumzPallasKernelOverrides.maximum        aS1#Q''r)   c                    d|  d| dS )Nr+   r   r   r=   r   s     r'   minimumzPallasKernelOverrides.minimum   r   r)   c                    d|  d| d| dS )Nr   r   r   r=   )condr   r   s      r'   wherezPallasKernelOverrides.where   s     D6A3b1--r)   c                     |       }t        |t              rCt        j                  |      rd}n6t        j                  |      r
|dkD  rdnd}nt        |      }nt        |      }d|  d| d| dS )z
        Computes body, but only uses the result where mask is true.
        Where mask is false, uses the 'other' value instead.
        jnp.nanr   jnp.inf-jnp.infr   r   r   )
isinstancefloatmathisnanisinfrepr)maskbodyotherr/   	other_strs        r'   maskedzPallasKernelOverrides.masked   sl     eU#zz% %	E").I
	 K	UID6F82i[::r)   Nc                    |t         j                  k(  r;t        j                  j	                         j
                  dk(  rt         j                  }t        |      }d|  d| dS )Ntpuzjnp.asarray(	).astype(r   )torchint64r   graphget_current_device_or_throwtypeint32r   )rY   dtype	src_dtypeuse_compute_types	jax_dtypes        r'   to_dtypezPallasKernelOverrides.to_dtype  sS     EKKAGG$G$G$I$N$NRW$WKKE&u-	aS	)A66r)   c                F    t        |      }t        |      }d|  d| d| dS )z=Bitcast a value from one dtype to another with the same size.z)jax.lax.bitcast_convert_type(jnp.asarray(r   z), r   r   )rY   r   r   r   jax_src_dtypes        r'   to_dtype_bitcastz&PallasKernelOverrides.to_dtype_bitcast!  s7     'u-	*95:1#Y}oUXYbXccdeer)   c                   ddl m} t        j                  j                  j                  t        j                  j                  |              t        j                  j                  |       }t        j                  j                  |      }t        j                  j                  |      }t        j                  j                  j                  t        j                  j                  | ||             }t        j                  ||      S )z>Convert a sympy expression to a JAX array indexing expression.r	   )get_bounds_index_expr)bounds)utilsr   r   kernelused_iter_varsupdate_get_used_iter_varsprepare_indexingrename_indexingkexprcsegeneratecomputerV   r   )r#   r   r   preparedrenamedidx_strvars          r'   
index_exprz PallasKernelOverrides.index_expr)  s     	2 	
&&qxx'C'CD'IJ 88,,T2((**84((..)hhll##HHg.CD.I $ 
 %--c599r)   c                    t        |      }|t        j                  k(  r| rdS dS t        | t              r4t        j                  |       ryt        j                  |       r	| dkD  rdS dS d|  d| d	S )
z/Convert a constant value to JAX representation.TrueFalser   r   r   r   z
jnp.array(z, dtype=r   )r   r   boolr   r   r   r   r   )valr   r   s      r'   constantzPallasKernelOverrides.constant:  sp     'u-	EJJ 6-g-c5!zz# zz#$'!Gy;;C5155r)   c                    d|  dS )Nz	jnp.real(r   r=   rX   s    r'   realzPallasKernelOverrides.realH  rr   r)   c                    d|  dS )Nz	jnp.imag(r   r=   rX   s    r'   imagzPallasKernelOverrides.imagL  rr   r)   c                    d|  dS )Nz	jnp.conj(r   r=   rX   s    r'   conjzPallasKernelOverrides.conjP  rr   r)   c                    d|  dS )Nz
jnp.angle(r   r=   rX   s    r'   anglezPallasKernelOverrides.angleT  ru   r)   c                    d|  d|  dS )z8View complex tensor as real tensor with extra dimension.zjnp.stack([jnp.real(z), jnp.imag(z)], axis=-1)r=   rX   s    r'   view_as_realz"PallasKernelOverrides.view_as_realX  s     &aSQC|DDr)   c                    d|  d|  dS )z#View real tensor as complex tensor.(z[..., 0] + 1j * z	[..., 1])r=   rX   s    r'   view_as_complexz%PallasKernelOverrides.view_as_complex]  s     1#%aS	22r)   c                    d|  d| dS )Nr   z == r   r=   r   s     r'   eqzPallasKernelOverrides.eqc      1#T!Ar)   c                    d|  d| dS )Nr   z != r   r=   r   s     r'   nezPallasKernelOverrides.neg  r   r)   c                    d|  d| dS )Nr   z < r   r=   r   s     r'   ltzPallasKernelOverrides.ltk      1#S1~r)   c                    d|  d| dS )Nr   z <= r   r=   r   s     r'   lezPallasKernelOverrides.leo  r   r)   c                    d|  d| dS )Nr   z > r   r=   r   s     r'   gtzPallasKernelOverrides.gts  r   r)   c                    d|  dS )Nz
jnp.isnan(r   r=   rX   s    r'   r   zPallasKernelOverrides.isnanw  r   r)   c                    d|  dS )Nz
jnp.isinf(r   r=   rX   s    r'   r   zPallasKernelOverrides.isinf|  r   r)   c                    d|  dS )Nzjnp.isfinite(r   r=   rX   s    r'   isfinitezPallasKernelOverrides.isfinite  s    qc##r)   c                    d|  d| dS )Nr   z >= r   r=   r   s     r'   gezPallasKernelOverrides.ge  r   r)   c                    d|  d| dS )Nzjnp.logical_and(r   r   r=   r   s     r'   logical_andz!PallasKernelOverrides.logical_and       "!Bqc++r)   c                    d|  d| dS )Nzjnp.logical_or(r   r   r=   r   s     r'   
logical_orz PallasKernelOverrides.logical_or  s     !2aS**r)   c                    d|  dS )Nzjnp.logical_not(r   r=   rX   s    r'   logical_notz!PallasKernelOverrides.logical_not      !!A&&r)   c                    d|  d| dS )Nzjnp.logical_xor(r   r   r=   r   s     r'   logical_xorz!PallasKernelOverrides.logical_xor  r   r)   c                    d|  d| dS )Nzjnp.arctan2(r   r   r=   r   s     r'   atan2zPallasKernelOverrides.atan2  r   r)   c                    d|  d| dS )Nz
jnp.hypot(r   r   r=   r   s     r'   hypotzPallasKernelOverrides.hypot       A3b1%%r)   c                    d|  d| dS )Nz	jnp.fmod(r   r   r=   r   s     r'   fmodzPallasKernelOverrides.fmod  s     1#Rs!$$r)   c                    d|  d| dS )Nzjnp.remainder(r   r   r=   r   s     r'   	remainderzPallasKernelOverrides.remainder  s    s"QCq))r)   c                &    d|  d| d|  d| d|  dS )Nz
(jnp.sign(z) * jnp.sign(z) * (jnp.abs(z) // jnp.abs(z))).astype(.dtype)r=   r   s     r'   truncdivzPallasKernelOverrides.truncdiv  s.    
 A3mA3mA3mA3kZ[Y\\cddr)   c                    d|  d| dS )Nr   z // r   r=   r   s     r'   floordivzPallasKernelOverrides.floordiv  r   r)   c                    d|  d| d| dS )Nz	jnp.clip(r   r   r=   )rY   min_valmax_vals      r'   clampzPallasKernelOverrides.clamp  s    1#Ry7)155r)   c                    d|  d|  dS )Nzjnp.where(jnp.isnan(z), 0.0, jnp.sign())r=   rX   s    r'   signzPallasKernelOverrides.sign  s     &aS(9!B??r)   c                    d|  dS )Nzjnp.signbit(r   r=   rX   s    r'   signbitzPallasKernelOverrides.signbit  s     aS""r)   c                    d|  dS )Nzjax.scipy.special.erf(r   r=   rX   s    r'   erfzPallasKernelOverrides.erf  s     (s!,,r)   c                    d|  dS )Nzjax.scipy.special.erfc(r   r=   rX   s    r'   erfczPallasKernelOverrides.erfc  s    (1--r)   c                    d|  dS )Nzjax.scipy.special.erfinv(r   r=   rX   s    r'   erfinvzPallasKernelOverrides.erfinv  s     +1#Q//r)   c                    d|  dS )Nzjax.scipy.special.gammaln(r   r=   rX   s    r'   lgammazPallasKernelOverrides.lgamma  s     ,A3a00r)   c                    d|  dS )Nzjax.scipy.special.digamma(r   r=   rX   s    r'   digammazPallasKernelOverrides.digamma  s    +A3a00r)   c                    d|  d|  d|  dS )Nr   z>.astype(jnp.float64) == 0.0, 1.0, jax.scipy.special.bessel_jn(z&.astype(jnp.float64), v=0)[0]).astype(r  r=   rX   s    r'   	bessel_j0zPallasKernelOverrides.bessel_j0  +      ++,# .c"	
r)   c                    d|  d|  d|  dS )Nr   z>.astype(jnp.float64) == 0.0, 0.0, jax.scipy.special.bessel_jn(z&.astype(jnp.float64), v=1)[1]).astype(r  r=   rX   s    r'   	bessel_j1zPallasKernelOverrides.bessel_j1  r&  r)   c                    d|  d|  dS )Njax.lax.bessel_i0e() * jnp.exp(jnp.abs(r  r=   rX   s    r'   modified_bessel_i0z(PallasKernelOverrides.modified_bessel_i0       %QC';A3bAAr)   c                    d|  d|  dS )Njax.lax.bessel_i1e(r+  r  r=   rX   s    r'   modified_bessel_i1z(PallasKernelOverrides.modified_bessel_i1  r-  r)   c                    d|  d|  d|  dS )Nr   z == 0.0, 1.0, jnp.sin(z) / r   r=   rX   s    r'   spherical_bessel_j0z)PallasKernelOverrides.spherical_bessel_j0
  s      A34QCtA3a@@r)   c                    d|  dS )Nr*  r   r=   rX   s    r'   i0ezPallasKernelOverrides.i0e       %QCq))r)   c                    d|  dS )Nr/  r   r=   rX   s    r'   i1ezPallasKernelOverrides.i1e  r5  r)   c                    d|  d| dS )Nzjax.scipy.special.gammainc(r   r   r=   rY   ys     r'   gammainczPallasKernelOverrides.gammainc  s     -QCr!A66r)   c                    d|  d| dS )Nzjax.scipy.special.gammaincc(r   r   r=   r9  s     r'   	gammaincczPallasKernelOverrides.gammaincc$  s     .aS1#Q77r)   c                    d|  d| dS )Nzjax.scipy.special.polygamma(z.astype(jnp.int32), r   r=   r9  s     r'   	polygammazPallasKernelOverrides.polygamma-  s     .aS0DQCqIIr)   c                    d|  dS )Nzjax.scipy.special.ndtri(r   r=   rX   s    r'   ndtrizPallasKernelOverrides.ndtri3  s     *!A..r)   c                    d|  d| dS )Nzjax.scipy.special.zeta(r   r   r=   r9  s     r'   zetazPallasKernelOverrides.zeta8  s     )2aS22r)   c                    d|  d| dS )Nzjax.scipy.special.xlogy(r   r   r=   r9  s     r'   xlogyzPallasKernelOverrides.xlogy=  s     *!Bqc33r)   c                    d|  d| dS )Nzjax.scipy.special.xlog1py(r   r   r=   r9  s     r'   xlog1pyzPallasKernelOverrides.xlog1pyB  s     ,A3b155r)   c                >    d|  d| d|  d|  d| d|  d| d| d	|  d
S )Njnp.where(jnp.abs(z) <= 1, jnp.cos(z * jnp.arccos(jnp.clip(z, -1, 1))), jnp.where(z > 1, jnp.cosh(z * jnp.arccosh(jnp.maximum(z, 1.0))), ((-1.0) ** z) * jnp.cosh(z * jnp.arccosh(jnp.maximum(-z
, 1.0)))))r=   rY   ns     r'   chebyshev_polynomial_tz,PallasKernelOverrides.chebyshev_polynomial_tG  s[     ! $c0 4 s5aS 9M!,H:	W	
r)   c                    dj                  g d|  d| d|  d|  d|  d|  d| d	| d
|  d|  d|  d| d| d| d| d|  d|  d      S )N rI  z) < 1, jnp.sin((z + 1) * jnp.arccos(jnp.clip(z&, -1, 1))) / jnp.sqrt(jnp.maximum(1 - z**2, 1e-10)), jnp.where(z >= 1, jnp.where( == 1, z + 1.0, jnp.sinh((z  + 1) * jnp.arccosh(jnp.maximum(z , 1.0))) / jnp.sqrt(jnp.maximum(z**2 - 1, 1e-10))), jnp.where(z == -1, ((-1.0) ** ) * (z + 1.0), ((-1.0) ** z) * jnp.sinh((z! + 1) * jnp.arccosh(jnp.maximum(-z**2 - 1, 1e-10)))))joinrJ  s     r'   chebyshev_polynomial_uz,PallasKernelOverrides.chebyshev_polynomial_uV  s   	; 	;  	; 	; $ 	;s	;6	;78c	;:(	;()s	;+	; 	;	; 		; #		; $%#		;&		;
 	;
 <	;
 =>3	;
?$	; %&3	;'	; 	; /	; 01c	; 27	; 89c	;:	; 	; +	; ,-#	; .O	; PQc	;R$	; %&3	; ':	;	
r)   c                   dj                  g d| d|  d| d|  d| d|  d|  d| d	|  d
|  d|  d| d|  d|  d|  d|  d| d|  d|  d|  d|  d|  d|  d      S )NrN  r    == 0, jnp.ones_like(), jnp.where(	 == 1, 2* - 1, jnp.where(	 == 2, 4*z**2 - 2*	 == 3, 8*z**3 - 4***2 - 4* + 1, jnp.where(
 == 4, 16*z**4 - 8*	**3 - 12*z**2 + 4*
 == 5, 32*z	**5 - 16*	**4 - 32*z	**3 + 12***2 + 6*z - 1, jnp.zeros_like()))))))rQ  rJ  s     r'   chebyshev_polynomial_vz,PallasKernelOverrides.chebyshev_polynomial_vl  
   ) )j ) )0 ) ) 4 ))$)%&C)())$)%&C)'/)01s)3) ) %) &'C) (0) 12s) 3;) <=#)>) 	) &	) '(S	) )1	) 23	) 4=	) >?C	) @H	) IJs	)K	)
 )
 &)
 '(S)
 )2)
 34)
 5>)
 ?@S)
 AJ)
 KL)
 MU)
 VWTW)
X)  S) !()	
r)   c                   dj                  g d| d|  d| d|  d| d|  d|  d	| d
|  d|  d|  d	| d|  d|  d|  d|  d| d|  d|  d|  d|  d|  d|  d      S )NrN  r   rU  rV  rW  r\  rY  z**2 + 2*rX  rZ  z**3 + 4*r[  r]  z**4 + 8*r^  r_  z	**5 + 16*r`  ra  z + 1, jnp.zeros_like(rb  rQ  rJ  s     r'   chebyshev_polynomial_wz,PallasKernelOverrides.chebyshev_polynomial_w}  rd  r)   c                6    t         j                  d|  d|      S Nz(2 * z - 1))rV   rL  rJ  s     r'   shifted_chebyshev_polynomial_tz4PallasKernelOverrides.shifted_chebyshev_polynomial_t        %;;eA3e<LaPPr)   c                6    t         j                  d|  d|      S rh  )rV   rS  rJ  s     r'   shifted_chebyshev_polynomial_uz4PallasKernelOverrides.shifted_chebyshev_polynomial_u  rj  r)   c                6    t         j                  d|  d|      S rh  )rV   rc  rJ  s     r'   shifted_chebyshev_polynomial_vz4PallasKernelOverrides.shifted_chebyshev_polynomial_v  rj  r)   c                6    t         j                  d|  d|      S rh  )rV   rf  rJ  s     r'   shifted_chebyshev_polynomial_wz4PallasKernelOverrides.shifted_chebyshev_polynomial_w  rj  r)   c                    dj                  g d| d|  d| d|  d| d|  d| d	|  d
|  d| d|  d|  d| d|  d|  d|  d|  d      S )NrN  r   rU  rV  z == 1, 2 * , jnp.where(z == 2, 4 * z**2 - 2, jnp.where(z == 3, 8 * z**3 - 12 * z == 4, 16 * z**4 - 48 * z**2 + 12, jnp.where(z == 5, 32 * z**5 - 160 * z**3 + 120 * , jnp.zeros_like(rb  rQ  rJ  s     r'   hermite_polynomial_hz*PallasKernelOverrides.hermite_polynomial_h  s~   ) )j ) )0 ) ) 4 ))&)'(c)*))&)'(c)*) ) ') ()c) *5) 67C)8) 	) (	) )*s	) +6	) 78S	)9	)
 )
 ()
 )*s)
 +7)
 89c)
 :F)
 GHS)
I)  S) !()	
r)   c                    dj                  g d| d|  d| d|  d| d|  d| d	|  d
|  d| d|  d|  d| d|  d|  d|  d|  d      S )NrN  r   rU  rV  rO  rr  z == 2, z**2 - 1, jnp.where(z == 3, 
**3 - 3 * z == 4, z
**4 - 6 * z**2 + 3, jnp.where(z == 5, z**5 - 10 * **3 + 15 * rs  rb  rQ  rJ  s     r'   hermite_polynomial_hez+PallasKernelOverrides.hermite_polynomial_he  s}   ) )j ) )0 ) ) 4 ))")#$#)&))")#$#)&) ) #) $%#) &0) 12s)3) 	) #	) $%#	) &0	) 12s	)3	)
 )
 #)
 $%#)
 &1)
 23)
 4?)
 @Ac)
B)  S) !()	
r)   c                   dj                  g d| d|  d| d|  d| d|  d|  d	| d
|  d|  d|  d| d|  d|  d|  d|  d| d|  d|  d|  d|  d|  d|  d      S )NrN  r   rU  rV  z == 1, 1 - rr  z == 2, (r[  z + 2) / 2, jnp.where(z	 == 3, (-z**3 + 9*z	**2 - 18*z + 6) / 6, jnp.where(z == 4, (z	**4 - 16*z	**3 + 72*z	**2 - 96*z + 24) / 24, jnp.where(z	 == 5, (-z	**5 + 25*z
**4 - 200*z
**3 + 600*z
**2 - 600*z + 120) / 120, jnp.zeros_like(rb  rQ  rJ  s     r'   laguerre_polynomial_lz+PallasKernelOverrides.laguerre_polynomial_l  s	   ) )j ) )0 ) ) 4 ))&)'(c)*))#)$%3)&.)/0c)2) ) %) &'C) (0) 12s) 3<) =>3)?) 	) $	) %&3	) '0	) 12s	) 3<	) =>3	) ?H	) IJs	)K	)
 )
 %)
 &'C)
 (1)
 23)
 4>)
 ?@S)
 AK)
 LM#)
 NX)
 YZWZ)
[)  S) !()	
r)   c                    dj                  g d| d|  d| d|  d| d|  d| d	|  d
|  d| d|  d|  d| d|  d|  d|  d|  d      S )NrN  r   rU  rV  rO  rr  z == 2, (3 * z**2 - 1) / 2, jnp.where(z == 3, (5 * rv  z) / 2, jnp.where(z == 4, (35 * z**4 - 30 * z**2 + 3) / 8, jnp.where(z == 5, (63 * z**5 - 70 * rw  z) / 8, jnp.zeros_like(rb  rQ  rJ  s     r'   legendre_polynomial_pz+PallasKernelOverrides.legendre_polynomial_p  s~   ) )j ) )0 ) ) 4 ))")#$#)&))')()s)+) ) () )*s) +5) 67C)8) 	) )	) *+	) ,7	) 89c	):	)
 )
 ))
 *+)
 ,7)
 89c)
 :E)
 FGC)
H)  S) !()	
r)   c                    d|  dS )Nzjnp.reciprocal(r   r=   rX   s    r'   
reciprocalz PallasKernelOverrides.reciprocal  r   r)   c                    d|  dS )Nzjnp.square(r   r=   rX   s    r'   squarezPallasKernelOverrides.square  s    QCq!!r)   c                    d|  d| d| dS )zFused multiply-add: a * b + c

        JAX doesn't have jnp.fma, so we use the unfused version.
        The compiler may still fuse this on supported hardware.
        z((rP  z) + (r  r=   )r   r   r$   s      r'   fmazPallasKernelOverrides.fma  s     A3eA3eA3b))r)   c                    d|  d| dS )Nzjnp.copysign(r   r   r=   r   s     r'   copysignzPallasKernelOverrides.copysign  s     qcA3a((r)   c                    d|  d| dS )Nzjnp.nextafter(r   r   r=   r   s     r'   	nextafterzPallasKernelOverrides.nextafter  s      s"QCq))r)   c                    d|  d| dS )Nz
jnp.ldexp(r   r   r=   r   s     r'   ldexpzPallasKernelOverrides.ldexp  r  r)   c                    d|  dS )Nz
jnp.frexp(r   r=   rX   s    r'   frexpzPallasKernelOverrides.frexp  r   r)   c                    d|  dS )Nz	jnp.modf(r   r=   rX   s    r'   modfzPallasKernelOverrides.modf	  rr   r)   c                    d|  d| dS )Nzjnp.bitwise_and(r   r   r=   r   s     r'   bitwise_andz!PallasKernelOverrides.bitwise_and      !!Bqc++r)   c                    d|  d| dS )Nzjnp.bitwise_or(r   r   r=   r   s     r'   
bitwise_orz PallasKernelOverrides.bitwise_or       2aS**r)   c                    d|  d| dS )Nzjnp.bitwise_xor(r   r   r=   r   s     r'   bitwise_xorz!PallasKernelOverrides.bitwise_xor  r  r)   c                    d|  dS )Nzjnp.bitwise_not(r   r=   rX   s    r'   bitwise_notz!PallasKernelOverrides.bitwise_not  r   r)   c                    d|  d| dS )Nzjnp.left_shift(r   r   r=   r   s     r'   
left_shiftz PallasKernelOverrides.left_shift  r  r)   c                    d|  d| dS )Nzjnp.right_shift(r   r   r=   r   s     r'   right_shiftz!PallasKernelOverrides.right_shift"  r  r)   c                    t         j                  j                  j                  d|      }dt         j                  j                  j	                  |        d| dS )z)Load the random seed value from a buffer.load_seed_offsetr   z[0] + r   )r   r   r!   seed_offsetinput)nameoffsetr  s      r'   	load_seedzPallasKernelOverrides.load_seed'  sH     hhmm//0BFK188==&&t,-VK=BBr)   c                    d|  d| d| dS )zGenerate uniform random numbers in [0, 1).

        Uses JAX's threefry2x32 PRNG directly for vectorized random generation.
        The seed provides the base key, offset provides per-element uniqueness.
        zWjax.vmap(lambda o: jax.random.uniform(jax.random.fold_in(jax.random.PRNGKey(jnp.uint32(8)), jnp.uint32(o)), (), dtype=jnp.float32))(jnp.asarray(!).flatten()).reshape(jnp.asarray().shape)r=   seedr  s     r'   randzPallasKernelOverrides.rand.  s,    @@Dv F"8#DVHHV	
r)   c                    d|  d| d| dS )zGenerate standard normal random numbers.

        Uses JAX's threefry2x32 PRNG directly for vectorized random generation.
        The seed provides the base key, offset provides per-element uniqueness.
        zVjax.vmap(lambda o: jax.random.normal(jax.random.fold_in(jax.random.PRNGKey(jnp.uint32(r  r  r  r=   r  s     r'   randnzPallasKernelOverrides.randn>  s,    @@Dv F"8#DVHHV	
r)   c                &    d|  d| d| d| d| dS )z,Generate random int64 values in [low, high).zWjax.vmap(lambda o: jax.random.randint(jax.random.fold_in(jax.random.PRNGKey(jnp.uint32(z)), jnp.uint32(o)), (), r   z , dtype=jnp.int64))(jnp.asarray(r  r  r=   )r  r  lowhighs       r'   	randint64zPallasKernelOverrides.randint64M  s>    
@@DvE]^a]bbdeidj k"8#DVHHV	
r)   )rY   r8   r7   r8   )r   r8   r   r8   r7   r8   )r   r8   r   r8   r   r8   r7   r8   )r   r8   r   zCallable[[], str]r   r   r7   r8   )NT)
rY   r8   r   torch.dtyper   ztorch.dtype | Noner   r   r7   r8   )rY   r8   r   r  r   r  r7   r8   )r#   r6   r   r  r7   r8   )r   r  r7   r8   )rY   r8   r  r8   r  r8   r7   r8   )rY   r8   r:  r8   r7   r8   )rY   r8   rK  r8   r7   r8   )r   r8   r   r8   r$   r8   r7   r8   )r  r8   r  r8   r7   r8   )r  r8   r  r8   r7   r8   )
r  r8   r  r8   r  r8   r  r8   r7   r8   )zr9   r:   r;   r<   staticmethodrZ   r]   r_   ra   rd   rf   rh   rk   rm   ro   rq   rt   rw   ry   r{   r}   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r	  r  r  r  r  clipr  r  r  r  r  r!  r#  r%  r(  r,  r0  r2  i0r4  i1r7  r;  r=  igammaigammacr?  rA  rC  rE  rG  rL  rS  rc  rf  ri  rl  rn  rp  rt  rx  rz  r|  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r=   r)   r'   rV   rV   j   s2	                      " " " " " "       ! !   ! !     ! !     % %     ! !     ! ! ! ! & & & & & & ( ( ( ( . . ; ;&  )-"&	777 &7  	7
 
7 7 f f : :  6 6             ! ! E E 3 3
           ! ! ! ! $ $   , , + + ' ' , , ( ( & & % % * * e e
   6 6 D @ @ # # - - . . 0 0 1 1 1 1 	
 	
 	
 	
 B B
 B B
 A A
 
B* * 
B* * 7 7
 8 8 FGJ J
 / / 3 3 4 4 6 6 
 
 
 
( 
 
 
 
 Q Q Q Q Q Q Q Q 
 
" 
 
 
 
 
 
 & & " " * * ) ) * * & & ! !     , , + + , , ' ' + + , , C C 
 
 
 
 
 
r)   rV   c                  N    e Zd ZU dZded<   ded<   ded<   ded<   ded	<   ded
<   y)_IndirectAccessInfoz=Describes a detected indirect (data-dependent) buffer access.r8   table_paramtable_buf_nametupletable_shapeintindirect_dimindirect_varindices_paramNr9   r:   r;   r<   __annotations__r=   r)   r'   r  r  X  s)    Gr)   r  c                  &    e Zd ZU dZded<   ded<   y)_BufferIndexingzHEncapsulates index string and flattening requirements for buffer access.r8   	index_strr   needs_flattenNr  r=   r)   r'   r  r  d  s    RNr)   r  c                  :    e Zd ZU dZded<   ded<   ded<   ded	<   y
)_BroadcastedIterVarzFEncapsulates information needed to codegen a broadcasted iteration varr  idxsympy.Symbolvar_symr   entry
int | None
length_valNr  r=   r)   r'   r  r  l  s    P 
Hr)   r  c                      e Zd ZU dZded<   ded<   ded<   ded<   ded	<   d
ed<   d
ed<   d
ed<   d
ed<   ded<   ded<   d
ed<   d
ed<   d
ed<   d
ed<   ded<   ded<   ded<   y)_CodegenContextz@Bundles local state shared across codegen_kernel helper methods.r   coder8   kernel_namer   is_tpuinterpret_is_cpuinterpret_literal	list[str]kernel_paramspure_out_paramsoutput_paramssize_var_paramszdict[str, str]output_buffer_lookupdict[str, bool]aliasable_flagsalias_paramspointer_tailkernel_input_paramsfull_kernel_paramszOrderedSet[str]non_alias_out_set	list[int]copy_output_indiceslist[tuple[int, int]]alias_pairsNr  r=   r)   r'   r  r  w  sp    J
L(($$""!!&&""&&r)   r  c                      e Zd ZU dZeZeZded<    fdZ		 	 	 	 	 	 	 	 	 	 dQdZ
dRdZdRdZdRdZdSd	ZdTd
ZdUdZdVdZdUdZ	 	 	 	 	 	 dWdZedXd       Z	 	 	 	 	 	 dYdZedZd       Zd[dZ	 	 	 	 	 	 d\dZ	 	 	 	 	 	 d]dZed^d       Zd_dZed`d       Zedad       Zedbd       ZdcdZ dddZ!dedZ"dfdZ#dgdZ$dhdZ%	 	 	 	 did Z&	 	 	 	 	 	 djd!Z'	 	 	 	 	 	 	 	 	 	 dkd"Z(	 	 	 	 	 	 	 	 dld#Z)	 	 	 	 	 	 	 	 dld$Z*	 	 	 	 	 	 	 	 dld%Z+edmd&       Z,dnd'Z-	 	 	 	 	 	 	 	 dod(Z.dpd)Z/	 	 	 	 	 	 	 	 	 	 dqd*Z0drd+Z1	 	 	 	 	 	 	 	 dsd,Z2	 	 	 	 	 	 dtd-Z3	 	 	 	 	 	 	 	 dud.Z4dvd/Z5	 	 	 	 	 	 	 	 dwd0Z6	 dx	 	 	 	 	 	 	 	 	 	 	 	 	 dyd1Z7	 	 	 	 	 	 	 	 	 	 	 	 dzd2Z8e9jt                  d{d3       Z;dRd4Z<e9jt                  	 dx	 	 	 	 	 	 	 	 	 d|d5       Z=e	 d}	 	 	 	 	 	 	 d~d6       Z>	 d	 	 	 	 	 dd7Z?	 	 	 	 	 	 	 	 dd8Z@	 	 	 	 	 	 	 	 dd9ZA	 	 	 	 	 	 	 	 	 	 dd:ZBedd;       ZCdgd<ZDdxdd=ZEe	 	 	 	 	 	 dd>       ZF	 	 	 	 	 	 	 	 dd?ZG	 	 	 	 	 	 	 	 dd@ZHddAZIddBZJ	 	 ddCZK	 	 	 	 	 	 ddDZL	 	 	 	 	 	 ddEZMe	 	 	 	 	 	 	 	 ddF       ZNddGZO	 	 	 	 	 	 ddHZPddIZQddJZR	 	 	 	 	 	 	 	 	 	 ddKZSddLZT	 	 	 	 	 	 ddMZU	 	 	 	 	 	 ddNZVe	 	 	 	 	 	 	 	 	 	 ddO       ZWdxddPZX xZYS )PallasKernelzPallas kernel codegen for TPU and GPU (Mosaic backend).

    Generates Python code that defines a Pallas kernel and a host entrypoint,
    compiled and loaded via async_compile.pallas.
    zCallable[[sympy.Expr], str]r   c                   t        |   |i | t        j                  j	                         }|j
                  dk(  | _        |j
                  dk(  | _        | j                  | _        g | _	        i | _
        t               | _        i | _        i | _        d | _        d | _        g | _        | j$                  j'                         D ]B  }|j(                  j*                  D ]'  }| j"                  j-                  |j.                         ) D t               | _        t               | _        d| _        i | _        t               | _        d | _        i | _        i | _        |j
                  dk(  | _        y )Ncudar   F) superrL   r   r   r   r   is_gpur  use_emit_pipelinestore_with_outputload_index_exprsr   outputs_need_readpermuted_input_bufferscollapsed_reshape_inputscollapsed_output_shape_cpu_max_grid_product_output_buffer_namesfeaturesscheduler_nodesread_writeswritesappendr  r   tile_relative_iter_varshas_flatten_indexingstrided_input_buffersflatten_indexed_buffersindirect_access_cse_to_param_param_to_graph_name)r"   r!   rO   devicesnodedep	__class__s         r'   rL   zPallasKernel.__init__  sL   $)&)446kkV+kkU* "&8:792<,BD#DF%>B#15" 02!]]224 	;E((// ;))00:;	; 9C BL$$)!
 MO" 9C$;?-/46!kkU*r)   c                     y)z)Check array bounds for indirect indexing.Nr=   )r"   r#   sizeloweruppers        r'   check_boundszPallasKernel.check_bounds      r)   c                    | j                  |      }t        |t        j                        ry|j                  rt        |      S | j                  |      S )a  
        Convert an index expression to a string suitable for Pallas indexing.

        Pallas operates on full arrays, so we need to convert index expressions
        to JAX array slicing. For example:
        - x0 -> "..." (contiguous access, full array)
        - 2*x0 -> "::2" (strided access with stride 2)
        - 2*x0 + 1 -> "1::2" (strided access with offset 1, stride 2)

        Args:
            index: The indexing expression to convert

        Returns:
            The indexing string to use in generated code
        ...)r   r   sympySymbol
is_Integerr8   _convert_to_jax_slice)r"   indexprepared_indexs      r'   _get_index_strzPallasKernel._get_index_str  sN    " ..u5 nell3&&~&& --n==r)   c                   | j                   sy| j                  |      }|j                  t              r;| j                  j                  | j                  |             | j                  |      S t        j                  j                  j                  |      }| j                  |      }| j                  j                  |       t        |      dk(  rt        |      S t        |      dk(  rt        t        |            }t!        j"                  ||      }t!        j$                  ||      }|||z
  }t        j                  j                  j                  |      }|dk  r| j                  |      S |dk(  ry|dk7  r| j                  |      S 	 t'        |      }|dk  r| j                  |      S 	 | j                  |       dS ||z
  }t        j                  j                  j                  |      }|dk(  r||k(  ryyt        |      dkD  rBd}|D ]7  }t!        j"                  ||      }t!        j$                  ||      }|dk7  s5d} n |ryyy# t(        t*        f$ r | j                  |      cY S w xY w)aD  
        Convert a sympy index expression to JAX slice notation.

        Handles common patterns like:
        - stride*var -> ::stride
        - stride*var + offset -> offset::stride

        For more complex patterns, falls back to explicit indexing.
        Uses BlockPatternMatcher for robust pattern matching.
        r  r   r   z::1TF)range_treesr   hasr   r   r   r   r   r   r   sizevarssimplifylenr8   nextiterr   get_subexpr_involving_symbolmatch_affine_block_exprr  	TypeError
ValueError)	r"   r  	used_varsr   var_exprstrider  
offset_valall_unit_strides	            r'   r  z"PallasKernel._convert_to_jax_slice  si     $$U+ 99_%&&t'?'?'FG ::e$$   ))%0,,U3	 	""9-y>Qu:^q tI'C +GGsSH )@@3OF!)))226:A:::e,,Q;  Q;::e,,-!$VJ!A~#zz%00 &
 **V,-S11 )))226:Q;8s? , + ^a #O  .KKESVW,DDXsSQ;&+O   A ":. -::e,,-s    I  I32I3c                    |j                   }| j                         }||z  }||k7  rt        d|       | j                  j	                  |       | j                  |      }|S )a  
        Generate JAX code to compute an index array for strided/complex indexing patterns.

        For expressions like `2 * x3 + 32 * x2 + 256 * x1 + 1024 * x0`, we generate
        code that computes the flattened index array using broadcasting.

        The iteration variables (x0, x1, x2, x3) are already defined as jnp.arange arrays
        in the kernel. We just need to convert the sympy expression to JAX code.
        z9Pallas backend does not yet support mixed index pattern: )free_symbols_get_iter_varsrT   r   r   r   )r"   r  r&  	iter_varsr   r  s         r'   _generate_strided_indexz$PallasKernel._generate_strided_indexW  sv     ))'')	 !9,	$KE7S 
 	""9- JJu%	 r)   c                H    t        | j                  j                               S )z*Get the set of iteration variable symbols.)r   range_tree_nodeskeys)r"   s    r'   r'  zPallasKernel._get_iter_varsu  s    $//44677r)   c                <    |j                   | j                         z  S )z4Get iteration variables used in an index expression.)r&  r'  r"   r  s     r'   r   z PallasKernel._get_used_iter_varsy  s    !!D$7$7$999r)   c                6    t        | j                  |            S )z7Check if index expression contains iteration variables.)r   r   r.  s     r'   _has_iteration_varsz PallasKernel._has_iteration_vars}  s    D,,U344r)   c                v    |j                   D cg c]  }t        |      j                  d      s|! c}S c c}w )zDGet list of indirect variable symbols (tmp*) in an index expression.tmp)r&  r8   
startswith)r"   r  ss      r'   _get_indirect_varszPallasKernel._get_indirect_vars  s-     --JaQ1B1B51IJJJs   66c                <    t        | j                  |            dkD  S )z6Check if index expression contains indirect variables.r   )r  r5  r.  s     r'   _has_indirect_varszPallasKernel._has_indirect_vars  s    4**512Q66r)   c                |   | j                  |      s|j                  t              ry|| j                  v ry| j	                  |      }|y|\  }}}}}|D cg c]  }| j                  |       }}t        d |D              ryt        t        t           |      }t        |      }	|	dk(  ry| j                  |      }
| j                  |      }|syt        |	      D cg c]  }ddg }}i }t        j                  j                   j#                  |      }|D ]  }t%        j&                  ||      }t%        j(                  ||      }| y| j                  |      }||dk  r yd}t        |	      D ](  }|
|   dk(  r||
|   z  dk(  s||
|   z  }|dk\  s&|} n | y||
|   z  }|dk  s||   |z  dk7  r y|||   d<   |||<   t        j                  j                   j#                  ||z
        } | j                  |      }|y|dk  ryt        |	      D ]!  }|
|   dkD  s||
|   z  ||   d<   ||
|   z  }# |dk7  ryt+        d |D              ryg }d}t        |	      D ]?  }||   \  }}||z  }||z  }||   |z  }||k\  r y|||z
  z  }|j-                  |||f       A | j/                  |      \  }}||k7  ry|j1                         D ]\  \  }}|| j2                  vr y| j                  | j2                  |   j4                        }| y||   \  } }!}"||   | z  |"z
  }#||#k7  s\ y |S c c}w c c}w )a'  Decompose a flat index into per-dimension (stride, offset, skip) triples.

        Given flat index like ``64*x0 + 2*x1 + 5`` and buffer shape ``(32, 64)``
        with C-contiguous strides ``[64, 1]``:
          - x0 coefficient 64 / buffer_stride[0]=64 -> dim 0: stride=1
          - x1 coefficient 2 / buffer_stride[1]=1  -> dim 1: stride=2
          - constant 5: dim 0 gets 5//64=0, dim 1 gets 5//1=5
          - dim 1 offset 5 with stride 2: skip=5//2=2, offset=5%2=1

        Returns per-dim ``[(stride, offset, skip), ...]`` where:
          - stride: access stride on this dim (1 = contiguous)
          - offset: static index into the stride dim (0 <= offset < stride)
          - skip: number of stride-blocks to skip at the start of this dim
        Returns None if decomposition fails.
        Nc              3  2   K   | ]  }|d u xs |dk    y wNr   r=   .0r4  s     r'   	<genexpr>z9PallasKernel._decompose_strided_access.<locals>.<genexpr>  s      >qqDy"AF">   r   r   c              3  ,   K   | ]  \  }}|d k(    ywr   Nr=   )r<  r4  _s      r'   r=  z9PallasKernel._decompose_strided_access.<locals>.<genexpr>  s     )$!QqAv)s   )r7  r  r   r  _get_buffer_info	_safe_intanyr   listr  r  _c_contiguous_stridesr   ranger   r   r  r  r   r  r  allr   _compute_output_numel_from_indexitemsr+  length)$r"   r  r  rK   rA  buf_sizer4  buf_shape_or_none	buf_shapendim	c_stridesr   r/   
var_to_dim	remainingr   r!  coeff	coeff_intdimdper_dim_strider#  
decomposedoutput_numel_expectedr"  
raw_offsetr  skipn_blocksoutput_numel	var_rangestride_d	_offset_dskip_deffective_sizes$                                       r'   _decompose_strided_accessz&PallasKernel._decompose_strided_access  s   $ ""5)UYY-G 4///$$T*<#8Q18@A1T^^A.AA>,=>>#DI/@A	9~19..y9	 ,,U3	 49;"?aAq6"?"?.0
GG$$--e4	 	HC*GG	SVWH'??#NE}u-I IN C4[ Q<1$y|+q0%.)A,%>N%* {&)C.8N!Ys^n%D%I+F3KN!JsO((11)h2FGI9	H@ ^^I.
>t 	7A|a)Yq\9q	!')A,6
	7 ?
 )&))
 24
 !t 	6A!'FJ&(F'D |v-Hx!X_4!vvt45	6 ??Fa00 #((* 		HC$///t'<'<S'A'H'HII *4S/'Hi&s^x7&@NN*		 U B  #@s   L4L9c                    g }|D ]H  \  }}}|dk(  r|j                  d       |j                  d       |j                  t        |             J |  ddj                  |       dS )z?Build ``buf[:, :, offset]`` for strided dims, ``:`` for others.r   :[r   ])r  r8   rR  )bufdecomppartsr"  r  _skips         r'   _strided_load_exprzPallasKernel._strided_load_expr  sm     %+ 	*!FFE{S!S!S[)	* a		%()++r)   c           
     ,   |D ]  }| j                  |      }||| j                  vr&| j                  |   }| j                  |      }|I|\  }}}}}g }	t        |      D ]x  \  }
\  }}}| j	                  ||
         }| |dkD  r8|	j                  t        ||z               |	j                  t        |             _|	j                  t        |             z |j                  | d| ddj                  |	       d       t        d |D              sg }|D ]I  \  }}}|dk(  r|j                  d       |j                  |d	kD  r| dnd       |j                  d       K |j                  | d| d
dj                  |       d        y)a	  Emit reshape + optional slice for strided input parameters.

        For each strided param, reshapes ``(M, N)`` to ``(M, N/stride, stride)``
        and, when ``skip > 0``, slices off leading blocks so the remaining
        elements align with the output.
        Nr    = 	.reshape(r   r   c              3  .   K   | ]  \  }}}|d kD    ywr   Nr=   )r<  rA  r[  s      r'   r=  z9PallasKernel._codegen_strided_reshapes.<locals>.<genexpr>@  s     :JAq$tax:s   re  r   rf  rg  )
_param_to_buf_namer  rB  	enumeraterC  r  r8   	writelinerR  rD  )r"   r  paramsparambuf_namestridesrK   rA  rL  new_shape_partsrV  r"  _offsetrk  rU  slice_partsr[  s                    r'   _codegen_strided_reshapesz&PallasKernel._codegen_strided_reshapes   s     	SE..u5H843M3M#M00:G((2D|#' AxAq)+O/8/A S++FGUnnXa[1;A:#**3sf}+=>#**3v;7#**3s84S gSy?1K0LAN :'::-/K18 4-!Q;'..s3'..TAX$qz3O'..s34 NNeWCwa		+8N7Oq#QR?	Sr)   c                    t        | dd              }|yt        |dd      }|t        |      |k7  ryg }|D ]D  }t        |t        t        j
                  f      rt        |      nd}| y|j                  |       F |S )z5Extract actual output buffer strides from its layout.
get_layoutc                      y rQ   r=   r=   r)   r'   <lambda>z6PallasKernel._get_actual_out_strides.<locals>.<lambda>M  r
  r)   Nr"  )getattrr  r   r  r  Integerr  )out_bufrK  layout
stride_rawrx  r4  vs          r'   _get_actual_out_stridesz$PallasKernel._get_actual_out_stridesJ  s     >,=?>VXt4
ZA!5 	A$Qemm(<=A4AyNN1		
 r)   c                |   i }g }|D ]K  }| j                   |   }|j                  }||vrg ||<   |j                  |       ||   j                  |       M g }t        |      D ]  }|j	                  ||           i }d}	|D ]8  }| j                  | j                   |   j                        }
|
 y|	||<   |	|
z  }	: |S )a  Compute store-side linearization coefficients from range tree nesting.

        The tree structure encodes the output iteration order: later
        trees (prefix ``x``) are innermost, earlier trees (``y``, ``z``)
        are outer.  Within a tree, dict order goes inner-to-outer.
        The innermost variable gets coefficient 1; each successive
        variable (moving outward) multiplies by the previous range.

        Returns ``{sympy.Symbol: int}`` mapping each RT var to its store
        coefficient, or ``None`` on failure.
        r   N)r+  prefixr  reversedextendrC  rK  )r"   orderedprefix_groupsprefix_orderr  noder%   inner_to_outercoeffsrS  szs              r'   _compute_store_coeffsz"PallasKernel._compute_store_coeffs[  s     *,"$ 	'A((+DA%#%a ##A&!##A&	'  ",' 	4A!!-"23	4 	A 5 5a 8 ? ?@BzF1IRKE	 r)   c                   | j                  |      }|sy|\  }}}}}|D cg c]  }| j                  |       }}t        |      dk  sd|v ryt        t        t
           |      }	|sy| j                  |      }
| j                  j                         D cg c]  \  }}||
v r|j                  s| }}}t        |      t        |	      k7  rGt        |	      }t        |      dk(  r,| j                  | j                  |d      j                        t        j                  |	      k(  r| j                  |	      }| j                  D ]  }t        j                   j#                  |      }|%|j%                         D cg c]  }| j                  |       }}t'        d |D              st        |      |k7  rr| j)                  ||      }| y| j+                  ||      }| yt	        |      t	        t-        |            k(  r yt/        |      c S  y|D cg c];  }| j1                  t        j                   j2                  j5                  |      |      = }}t7        d |D              syt        t        t
           |      }t        |      }| j                  |	      }| j9                  |      }|(| j                  D ]  }t        j                   j#                  |      }|&|j%                         D cg c]  }| j                  |       }}t'        d |D              st        |      |k7  rs| j)                  ||      }|| j+                  |D cg c]  }||   	 c}|      }| j+                  t	        |      |      }|R|Pdg|z  }t-        |      D ]  }||   |||   <    t	        |      t	        t-        |            k(  r yt/        |      c S  y yc c}w c c}}w c c}w c c}w c c}w c c}w )a  Return permutation for a full-array load, or None.

        Computes the permutation by mapping each range-tree variable to
        both an output dimension (via store coefficients + actual output
        strides) and an input dimension (via load coefficients + input
        C-contiguous strides).  The permutation is then:

            perm[out_dim] = in_dim   for each RT variable

        Using actual output strides (not C-contiguous) is critical: the
        scheduler may choose a non-standard output layout (e.g. column-
        major) to optimise for transposed inputs.

        When all dimensions collapse to a single flat RT variable (e.g.
        (2,2,2,2,2) with all dims size 2), infers the permutation
        directly from output strides vs input C-contiguous strides.
        Nr	   r   r   c              3  $   K   | ]  }|d u  
 y wrQ   r=   r;  s     r'   r=  z:PallasKernel._get_full_load_permutation.<locals>.<genexpr>  s     8198   c              3  J   K   | ]  }t        |t              xr |d kD    ywrq  r   r  r<  r$   s     r'   r=  z:PallasKernel._get_full_load_permutation.<locals>.<genexpr>  s#     DA:a%/!a%/D   !#c              3  $   K   | ]  }|d u  
 y wrQ   r=   r;  s     r'   r=  z:PallasKernel._get_full_load_permutation.<locals>.<genexpr>  s     4QqDy4r  )rB  rC  r  r   rE  r  r   r+  rJ  is_reductionrK  r   prodrF  r  r   r   
get_bufferget_sizerD  r  _map_coeffs_to_dimsrG  r  _get_index_coefficientr  r  rH  r  )r"   r  r  rK   rA  rL  is_contiguousr4  in_shape_rawin_shape	iter_useder  rK  
in_stridesout_namer  	out_shapeactualpermr  
coeffs_rawr  store_coeffs	rt_to_outrt_to_inks                              r'   _get_full_load_permutationz'PallasKernel._get_full_load_permutation}  s   ( $$T*+/(8Q=3;<aq)<<|q DL$8"49l; ,,U3	 --335
1I~ann 
 

 w<3x=(
 HA7|q T^^%%gaj188&8$&% "77A
 $ 9 9 'Hgg00:G <C<L<L<N Oq!2 OI O8i88C	Na<O !99'1EF~   33FJGD|  DzT%(^3# ;&#'$  
 ''(8(8(A(A%(H!L

 
 DDD cJ7L//9
11': # 55 '',,X6?8?8H8H8JK1T^^A.K	K4)44I!8K55gqA% $ 8 829:Qa:F!I  $77VjQH ,1E !sQw!&q =A19!D1.=:eAh7#'$T{*/. a =
& !P
$ L ;s$   OO"<O(A O-8O2O7
c                p   | j                  |      }|sy|\  }}}}}|D cg c]  }| j                  |       }}t        |      dk  sd|v ryt        t        t
           |      }	|sy| j                  |      }
| j                  j                         D cg c]  \  }}||
v r|j                  s| }}}t        |      }|dk  s|t        |	      k\  ry|D cg c]*  }| j                  | j                  |   j                        , }}d|v ryt        t        t
           |      }t        j                  |      t        j                  |	      k7  ry| j                  |	|      }|ydg|z  }d}t        |dz
  dd      D ]  }|||<   |||   z  } t        j                   j"                  j%                  |      }|D cg c]  }| j'                  ||       }}t)        d |D              syt        t        t
           |      }t+        |      D ci c]  \  }}||
 }}}g }|D ](  }|j-                  |      }| y|j/                  |       * | j1                  |      }|y| j2                  D ]  }t        j                   j5                  |      }|&|j7                         D cg c]  }| j                  |       }}t9        d |D              st        |      dk  rst        t        t
           |      } t        j                  |       t        j                  |	      k7  r| j                  | t	        |            }!|!dg|z  }"d}t        |dz
  dd      D ]  }||"|<   ||!|   z  } t+        |"      D #ci c]  \  }#}||#
 }$}#}g }%|D ],  }|$j-                  ||         }#|#  y|%j/                  |#       . dg|z  }&t        |      D ]  }'||'   |&|%|'   <    |&t	        t        |            k(  r yt;        |      t;        |&      fc S  yc c}w c c}}w c c}w c c}w c c}}w c c}w c c}}#w )a  Handle permutation when range tree has collapsed dimensions.

        When simplify_and_reorder merges contiguous dims, the range tree
        has fewer variables than the buffer's rank.  This method detects
        the permutation in the collapsed space and returns
        (collapsed_input_shape, perm) so the caller can generate:
            jnp.permute_dims(load.reshape(collapsed_shape), perm)

        Uses index coefficients on both sides: load-index coefficients
        map vars to collapsed input dims, and store-side coefficients
        (derived from the range tree nesting) map vars to collapsed
        output dims.  Both sets of strides are always unique, so
        matching is unambiguous even with duplicate group sizes.
        Nr	   r   r   c              3  J   K   | ]  }t        |t              xr |d kD    ywrq  r  r  s     r'   r=  z?PallasKernel._get_collapsed_load_permutation.<locals>.<genexpr>$  s#     GA:a%/!a%/Gr  c              3  $   K   | ]  }|d u  
 y wrQ   r=   r;  s     r'   r=  z?PallasKernel._get_collapsed_load_permutation.<locals>.<genexpr>:  s     4194r  )rB  rC  r  r   rE  r  r   r+  rJ  r  rK  r   r  _group_dims_to_rangesrG  r   r   r  r  r  rH  rs  getr  r  r  r  r  rD  r  )(r"   r  r  rK   rA  rL  r  r4  r  r  r  r  r  rK  r  
ranges_rawranges	in_groupscollapsed_in_stridesr"  i
simplifiedin_coeffs_raw	in_coeffsin_stride_to_dimvar_to_in_dimrS  rU  r  r  r  out_shape_rawr  
out_groupscollapsed_out_stridesjout_stride_to_dimvar_to_out_dimr  r  s(                                           r'   _get_collapsed_load_permutationz,PallasKernel._get_collapsed_load_permutation  sv   " $$T*+/(8Q=3;<aq)<<|q DL$8"49l;,,U3	 --335
1I~ann 
 

 Lq5AX&OVW!dnnT%:%:1%=%D%DEW
W: cJ799V		( 33 ..x@	 !"sQwq1ub"% 	#A&, #il"F	# WW%%..u5
MTU44ZCUUGGG#DI}=	-67K-LMTQAqDMM 	&E"&&u-C{  %		& 11': 11 #	3Hgg((2G8?8H8H8JK1T^^A.KMK4m44M8JQ8N#'S	=#AIyy#tyy'::33ItIOJ! &'C!G!F1q5"b) (+1%a(*Q-'( 3<<Q2R S$!QA S SN )%)),q/:9%%a(	) 37D1X ;*7*:^A&';tE!H~%)$eDk22G#	3H { =
 X. V
 N" L" !Ts)   PP/P=P"P''P-;P2c                *   t        |      }g }d}t        t        |       dz
  dd      D ]@  }|| |   z  }	 |j                  |      }|j                  |       |j                  |       d}B |dk7  s|ry|j                          |S # t        $ r Y hw xY w)zGroup consecutive dims (right-to-left) to match range values.

        Returns collapsed shape (left-to-right) or None if no valid grouping.
        r   r  N)rE  rG  r  r  r  r  popreverse)dimsr  	availablegroupsproductr  r  s          r'   r  z"PallasKernel._group_dims_to_ranges[  s     L	s4y1}b"- 	AtAwGoog. MM'"MM#G	 a<9  s   B	BBc                   | j                  |      }| j                  |      }|r|rt        | j                  |      d      S |rt        | j	                  |      d      S | j                  |      }|j                  t              xr |dk7  }|s*|dk7  r%d|v s!|j                  d      j                         sd}t        ||      S )z@Get the index expression string and whether it needs flattening.Tr  r  Fr  ::-)
r7  r0  r  _handle_mixed_indexingr   r  r  r   lstripisdigit)r"   r  has_indirecthas_iter_varsr  r  s         r'   _get_index_exprzPallasKernel._get_index_exprr  s    ..u5007M"55e<D  "TZZ->eTT++E2I "IIo6M9;MM !Y%%7	)Y-=-=c-B-J-J-L$(M"YmTTr)   c                D    	 t        |       S # t        t        f$ r Y yw xY w)z0Convert value to int, returning None on failure.N)r  r  r  )r   s    r'   rC  zPallasKernel._safe_int  s'    	s8O:& 		s   
 c                z    t        |       }dg|z  }t        |dz
  dd      D ]  }||dz      | |dz      z  ||<    |S )z0Return C-contiguous strides for the given shape.r   r	   r  )r  rG  )shaperK  rx  r  s       r'   rF  z"PallasKernel._c_contiguous_strides  sW     J#'q1ub"% 	7A Q%A,6GAJ	7r)   c                    i }t        |      D ]  \  }}||v r y|||<    g }| D ](  }|j                  |      }| y|j                  |       * t        t	        |            t        |       k7  ry|S )zMap coefficient values to dimension indices via stride matching.

        Returns a list where entry k is the dimension whose stride equals
        coeffs[k], or None if the mapping is ambiguous or incomplete.
        N)rs  r  r  r  r   )r  rx  stride_to_dimrV  r4  mappingr$   s          r'   r  z PallasKernel._map_coeffs_to_dims  s     )+g& 	!DAqM! M!	!   	A!!!$AyNN1		
 z'"#s6{2r)   c                   d}|j                   j                         D ]^  }t        j                  j	                  |      }|d}'|j                         D ]%  }| j                  |      }|dk(  rd|fc c S |$d}' ` d|fS )z9Return whether an output has a zero or unknown dimension.FTr   )r  valuesr   r   try_get_bufferr  rC  )r"   ctxhas_unknown_dimrw  rh  rU  dim_ints          r'   _zero_dim_output_flagsz#PallasKernel._zero_dim_output_flags  s    00779 
	+H''((2C{"&||~ +..-a<00?&*O+
	+ o%%r)   c                P   | j                   sy| j                  j                         D cg c]  \  }}|j                  s| }}}| j                  j                         D cg c]  \  }}|j                  r| }}}|r|sy| j	                         }|r|dk  ry| j                   j                         D ]D  \  }}| j                  |      }||\  }	}
}	}}	t        |
      dk  r4|D cg c]  }| j                  |       }}t        d |D              ret        t        t           |      |}|D ]  }|j                  |d      } t               t        j                  j!                  |      D ]M  }|j"                  r|j%                         \  }}	| j                  |      }|7|dkD  s=j'                  |       O st)        fdt+              D              }|s6t              dkD  r=t              t        |      k(  r$t        |      dkD  rt-        fd|D              c S t/        t1                    |dz
  z  t3        fdt+        dz
        D              }|rF|d	   }|}|dkD  r!|dz
     kD  r|dz  }|dkD  r|dz
     kD  rt        t+        ||dz               }n,t)        fd
t+              D              }|st        |      }t-        fd|D              c S  t/        t1        | j                   j5                                     }|j7                  |d         }|dk7  r| j                  |      ndd|j7                  |d         }|dk7  r| j                  |      nd}|d}|kD  ryyc c}}w c c}}w c c}w )ah  Determine which axes of the loaded array are reduction axes.

        Finds the innermost reduction stride from the load index
        expression, then walks outward through the buffer's dims
        using stride ratios until the accumulated product reaches
        red_numel.  Falls back to stride-direction analysis for
        gather/flatten loads.
        )r  r   r	   c              3  $   K   | ]  }|d u  
 y wrQ   r=   r;  s     r'   r=  z3PallasKernel._get_reduction_axes.<locals>.<genexpr>  s     6196r  r   c              3  2   K   | ]  }|   v s|  y wrQ   r=   )r<  r  r_coeffsrx  s     r'   r=  z3PallasKernel._get_reduction_axes.<locals>.<genexpr>  s     @qx)?@s   c              3  (   K   | ]	  }|z
    y wrQ   r=   r<  r  nds     r'   r=  z3PallasKernel._get_reduction_axes.<locals>.<genexpr>  s      9AR 9   c              3  :   K   | ]  }|   |d z      kD    ywr@  r=   )r<  r  rx  s     r'   r=  z3PallasKernel._get_reduction_axes.<locals>.<genexpr>  s"     S
WQU^ ;S   r  c              3  H   K   | ]  }|   k  r|   z   k  r|  y wrQ   r=   )r<  r  r_stridespanrx  s     r'   r=  z3PallasKernel._get_reduction_axes.<locals>.<genexpr>  s4      71:-'!*th2N    "c              3  (   K   | ]	  }|z
    y wrQ   r=   r  s     r'   r=  z3PallasKernel._get_reduction_axes.<locals>.<genexpr>  s     .AR.r  r   )r  r+  rJ  r  _compute_reduction_numelrB  r  rC  rD  r   rE  r  subsr   r  Add	make_args	is_numberas_coeff_MuladdsortedrG  r  r  r  rH  r  rS  ) r"   r  r  r_varspw_vars	red_numelrw  
load_indexrK   rA  rL  actual_stridesr4  strides_or_noner_onlypvtermrS  r$   matchedr  innerstartaxesr_coeffpw_coeff	pw_strider  r  r  r  rx  s                               @@@@@r'   _get_reduction_axesz PallasKernel._get_reduction_axes  s    $$ $ 5 5 ; ; =P1!PP!%!6!6!<!<!>UAann1UUW113	IN$($9$9$?$?$A A	/ Hj((2D|04-AxNAXBAv:HIQt~~a0IOI6o66!%d3i!AG  F ,R+,(2H		++F3 $>>,,.qNN5)=QULLO$  @E"I@G 
 8}q x=CL0S[1_  9 999 DN+HMX-DSU2PQ6]SSMaiD7519+=$=QJE aiD7519+=$=E%34  "2Y 
 =D....CA	/H $t44;;=>?
""6!9-.5l4>>'*H##GAJ/08ADNN8,1	Iiq QU  Js   NN)N>N4N#c                    d}|D ]8  }|| j                   v s| j                  | j                   |         }| y||z  }: |S )zBCompute total numel for given prefixes (e.g., pointwise prefixes).r   N)numelsrC  )r"   prefixesr/   r%   numels        r'   _compute_prefix_numelz"PallasKernel._compute_prefix_numel'  sP     	 ADKKt{{1~6=%	  r)   c                    d}| j                   D ]3  }|j                  s| j                  |j                        }| y||z  }5 |S )zCompute total reduction numel.r   N)r  r  rC  r  )r"   r/   treer  s       r'   r  z%PallasKernel._compute_reduction_numel2  sO    $$ 	 D  tzz2=%	  r)   c                d     j                   ry j                         }||dkD  ryg } j                  j                  D ]x  } j	                  |      }| y|\  }}}}}	|	s y t        |dd              }
|
ddl}|
|j                  k(  r yt         fd|D              }d|v r y|j                  |       z |rt        t        |            dkD  ry|r[d}|d   D ]  }||z  }	 d} j                  D ]3  }|j                  r j                  |j                        }| y||z  }5 ||k7  ryy)	a  
        Check if TMA (Tensor Memory Accelerator) approach can be used.
        TMA works for simple element-wise ops but not for:
        - Reductions (need different accumulation patterns)
          TODO: TMA supports float64 for loading but not for reductions
        - Broadcasting (inputs have different shapes or output differs)
        - Non-contiguous tensors (strided, transposed)
        FNr   	get_dtypec                      y rQ   r=   r=   r)   r'   r  z4PallasKernel._can_use_tma_approach.<locals>.<lambda>]  r
  r)   r   c              3  @   K   | ]  }j                  |        y wrQ   rC  r<  r4  r"   s     r'   r=  z5PallasKernel._can_use_tma_approach.<locals>.<genexpr>e  s     Daq 1Ds   T)r  r  r!   input_buffersrB  r  r   float64r  r  r  r   r  r  rC  r  )r"   reduction_numelinput_shapesr  rK   buf_objrL  	buf_numelr  r  	buf_dtyper   shape_tupleinput_numelr4  r]  r  r  s   `                 r'   _can_use_tma_approachz"PallasKernel._can_use_tma_approach=  sq    && 779&?Q+> %'II++ 	-D((.D|JNGGXy.- 
 DlCEI$-   D8DDK{",-	-2 C
< 89A= K!!_ !q ! L(( *(( NN4::6E}$ E)L* {*r)   c                v   t         j                  j                  |      }|y|j                         }d}|D ]  }| j	                  |      }|||n|z  } g }d} t        |dd              }	|	rt        |	dd      nd}
|
t        t        |            D ]'  }| j	                  |
|         }|j                  |       ) t        |      dk(  r|d   c|d   dk7  r[d}nXt        |      dkD  rJd}t        t        |      dz
  d	d	      D ],  }||   }|||k7  rd}| j	                  ||         }|(||z  }. |||||fS )
zGet buffer metadata (buf_obj, buf_size, buf_numel, actual_strides, is_contiguous).

        Returns None if the buffer doesn't exist.
        Nr   Tr~  c                      y rQ   r=   r=   r)   r'   r  z/PallasKernel._get_buffer_info.<locals>.<lambda>  r
  r)   r"  r   Fr  )	r   r   r  r  rC  r  rG  r  r  )r"   r  r!  rL  r"  r4  svalr  r  r  
buf_strider  actual_strideexpected_stridedim_sizes                  r'   rB  zPallasKernel._get_buffer_info  s}   
 ''$$T*?##%	 	9A>>!$D!1q8I	9
  "=,=?8>WVXt4D
!3x=) 5 $z!} =%%m45
 8}!!!$0^A5F!5K$)MX""#s8}q0"b9 4A$21$5M$,0P(-#~~hqk:H+'834 )^]JJr)   c                    | j                  |      }g }|D ]O  }|| j                  v s| j                  |   }| j                  |j                        }|?|j	                  |       Q d}|D ]  }||z  }	 ||fS )zNCompute expected output numel and used vars from iteration variables in index.r   )r   r+  rC  rK  r  )	r"   r  r   used_range_lengthsr   r  r  r]  ls	            r'   rI  z-PallasKernel._compute_output_numel_from_index  s     ,,U3	 	:Cd+++--c2!^^ELL9
)&--j9	: # 	AAL	 Y&&r)   c                    t               }|D ]X  }t        j                  ||      }t        j                  ||      }|d}| j	                  |      }|j                  ||n|       Z |S )zT
        Extract coefficients of iteration variables from index expression.
        r   )r   r   r  r  rC  r  )r"   r  r   coefficientsr   r!  r"  coefs           r'   _get_index_coefficientsz$PallasKernel._get_index_coefficients  su     $.< 	CC*GGsSH(@@3OF~>>&)DT%5T6B	C r)   c                :   dg}t        |      dkD  rPd}g }t        t        |      dz
  dd      D ]0  }|j                  d|       | j                  ||         }|,||z  }2 |rt	        |      }	|D ]  }
|
|	vs y yt	        d |D              }|D ]  }
|
|vs y y)zR
        Check if access pattern requires gather (non-standard striding).
        r   r  r   Tc              3  &   K   | ]	  }||  y wrQ   r=   r;  s     r'   r=  z5PallasKernel._check_gather_pattern.<locals>.<genexpr>  s     *V1*Vs   F)r  rG  insertrC  r   )r"   rL  r  r  r2  expected_stridesr,  r  r-  expected_stride_setr3  actual_stride_sets               r'   _check_gather_patternz"PallasKernel._check_gather_pattern  s     3x=1O!3x=1,b"5 0 ''?;>>(1+6'#x/O	0 ",-=">$  22   !+*Vn*V V$  00  r)   c                    |j                   dk7  s|j                  r|S  j                  |      }||S |\  }}}}}	 j                  |      \  }
} j	                         } j                  ||      } j                  |||	|      }t         fd|D              }t        |      t        |      k  xr. t        |      dkD  xr |dkD  xr t        |      t        |      kD  }t        j                  j                         j                  dk(  }|	 xr t        d |D              }t        d |D              }|xr
 | xr ||
k(  }|
dkD  r7||
k7  s|s|r.t        |      dkD  r |s|st         j!                  |      d	      S |S )
zVCheck if buffer access needs strided indexing due to size mismatch or gather patterns.r  c              3  L   K   | ]  }j                  |      d k7  sd   ywr@  r  r  s     r'   r=  z7PallasKernel._needs_strided_indexing.<locals>.<genexpr>  s!      Oqq8IQ8N Os   $$r   r   r   c              3  $   K   | ]  }|d u 
 y wrQ   r=   r;  s     r'   r=  z7PallasKernel._needs_strided_indexing.<locals>.<genexpr>  s      <
ATM<
r  c              3  L   K   | ]  }t        |t        t        z           y wrQ   )r   r  r   r  s     r'   r=  z7PallasKernel._needs_strided_indexing.<locals>.<genexpr>  s     U1Jq#+$> >Us   "$Tr  )r  r  rB  rI  r'  r4  r;  sumr  r   r   r   r   rH  rD  r  r)  )r"   r  r  indexingrK   r!  rL  r"  r  r  r]  r   all_iter_varsr2  has_non_unit_stridesbuf_effective_dimsnot_all_vars_usedr  is_known_non_contiguoushas_symbolic_coefskip_for_non_contiguouss   `                    r'   _needs_strided_indexingz$PallasKernel._needs_strided_indexing  s    &(*@*@O$$T*<OFJC9nm"&"G"G"Ni++-33E9E  $99nm\ 

 ! OH OO	NS// /I"/"Q&/ IX.	 	 446;;uD&3"3 #
 <
#1<
 9
  UUU#PF
PyL7P 	  1l*.?CWI"+%"66u=T  r)   c                    |j                   s|j                  dk(  r|S t        j                  j	                  |      }||S |j                         }t        |      dk(  rt        d|j                         S t        |      dkD  rT| j                  |      }|st        |j                  d      S d|j                  v rt        | j                  |      d      S | j                  r*d|j                  v rt        | j                  |      d      S |S )z`
        Adjust index expression based on buffer shape (0-dim scalar, multi-dim, etc.).
        r  r   r  r   Tr  )r  r  r   r   r  r  r  r  r0  r)  r  )r"   r  r  rA  r!  rL  r  s          r'   _adjust_index_for_buffer_shapez+PallasKernel._adjust_index_for_buffer_shape/  s    !!X%7%75%@O''$$T*?O##% x=A"x/E/E 
 x=1 44U;M &&00  +++&"::5AQU 
 ;;48#5#55"66u=T  r)   c                   |j                   s|S t        j                  j                  |      }||S |j	                         }t        |      }|dk  r|S | j                  |      }t        |      dk7  r|S t        t        |            }t        j                  ||      }	| j                  t        j                  |	|            }
|
|
dk  r|S t        j                  j                  j                  ||	z
        }	 t        |      }|dk  s||
k\  r|S | j                  |d         }|||
z  dk7  r|S | j$                  j'                  |      }||S | j                  |j(                        }d}|D ]  }| j                  |      }||c S ||z  }  |||
z  |k7  r|S d|dz
  z  }|dk(  r| d|
 }n	| | d|
 }t+        |d      S # t         t"        f$ r |cY S w xY w)	z
        Try to emit multi-dim slice notation instead of flatten + gather.

        For a buffer with shape (d0, ..., dk) and index `stride * var + offset`,
        emit `buf[:, ..., :, offset::stride]` when stride divides dk.
        r	   r   r   r  z:, r  Fr  )r  r   r   r  r  r  r   r  r  r   r  rC  r  r  r  r  r  r  r+  r  rK  r  )r"   r  r  rA  r!  rL  rO  r   r   r!  r"  r  r#  last_dimr  
var_lengthr"  r4  rV  r  	slice_strs                        r'   _try_multidim_slicez PallasKernel._try_multidim_slice\  s    %%O''$$T*?O##%8}!8O ,,U3	y>QO4	?#&CCE3O77#F
 >Vq[O!!**58+;<	VJ >Z61O>>(2,/x&0A5O %%))#.=O^^ELL1
	 	Aq!AyNI		
 f!4	!AO$(#?!("VH-I!(:,b9I%HHA :& 	O	s   :G GGc                    d|  d| dS )a  Generate gather-based permutation instead of jnp.permute_dims.

        Avoids a Mosaic compiler bug where jnp.permute_dims produces
        corrupted output tensors on TPU for 3D+ arrays.  Uses
        pallas_permute which flattens to 1D and does a 1D gather.
        zpallas_permute(r   r   r=   )	load_exprr  s     r'   _gather_permute_exprz!PallasKernel._gather_permute_expr  s     !2dV155r)   c                ^   || j                   v r| j                   |   S | j                  j                  D ]w  }t        |      j	                         }|j                  | d      s1t        j                  d|j                  dd      d         D ]  }| j                  |      }||c c S  y y)zTrace a tmp variable back to its source buffer's kernel param.

        Follows CSE assignments backward through bounds-checking (where/clamp)
        until it finds a variable that was directly loaded from a buffer.
        rn  
\btmp\d+\br   N)
r  r   _linesr8   r  r3  refindallsplit_trace_to_load_source)r"   var_namelineline_strrefr/   s         r'   rZ  z"PallasKernel._trace_to_load_source  s     t)))%%h//LL'' 	"D4y'')H&&(3'78zz-q1I!1LM "33C8%!M"		" r)   c                     j                  |      }|y|\  }}}}}|D cg c]  } j                  |       }}t        |      dk  st        d |D              ryt	        t
        t           |      }	 j                  |      }
t        |
      dk7  ry|
d   } j                  ||      }|dk(  st        |t              sy j                  |	      } j                  |g|      }|y|d   }t        |	      }|t        d|dz
        k\  ryt        |      } j                  |      }|y j                  j!                  |      }| j                  |      }||\  }}}}}t        |      dk7  ry j                  |d         yt#        j$                   fd|D              }t#        j$                   fd j'                  |      D              }||k\  ryt)        ||t+        |	      |||      S c c}w )	a,  Detect a load with data-dependent indexing suitable for scalar prefetch.

        Matches exactly one indirect variable whose coefficient corresponds to
        a C-contiguous stride dimension.  Rejects 1-to-1 gather patterns where
        the indices buffer covers the full iteration space.
        Nr	   c              3  $   K   | ]  }|d u  
 y wrQ   r=   r;  s     r'   r=  z7PallasKernel._detect_indirect_access.<locals>.<genexpr>  s     'HaT	'Hr  r   r   c              3  J   K   | ]  }j                  |      x  y wrQ   r  )r<  r4  r"   r  s     r'   r=  z7PallasKernel._detect_indirect_access.<locals>.<genexpr>  s)      *T^^A5F0F/SA*s   ##c              3     K   | ]?  }|j                   v r/j                  j                   |   j                        x	  A y wrQ   )r+  rC  rK  )r<  r   rK  r"   s     r'   r=  z7PallasKernel._detect_indirect_access.<locals>.<genexpr>  sQ      )d333"&..1F1Fs1K1R1R"SS  )s   AA)r  r  r  r  r  r  )rB  rC  r  rD  r   rE  r  r5  r  r   rF  r  maxr8   rZ  r   r  r   r  r   r  r  )r"   rh  r  r  buf_inforA  rL  r4  buf_size_rawbuf_size_intsindirect_varsr  rS  rx  r  r  rO  indirect_var_namer  indices_graph_nameindices_infoindices_sizeindices_numeliter_productrK  r  s   `                       @@r'   _detect_indirect_accessz$PallasKernel._detect_indirect_access  s(    ((.'8Q13;<aq)<<|q C'H<'H$H#'S	<#@//6}"$Q'++E<@A:Zs3 ,,];**E7G<?qz=!3q$(++-223DE  "66::=I)001CDL'+7(<Aq|$)>>,q/2: $		 *+* !  $yy )#77>)   !L0"m,%*'
 	
k =s   G5c                L   t               }| j                  D ]?  \  }}t        j                  d|      D ]!  }|j	                  |j                                # A g }| j                  j                  D ]  }t        |      j                         }t        j                  d|t        j                        }|r3|j                  |j                  d      |j                  d      |f       v|j                  d||f        d}|rpd}t        |      D ]]  \  }	}
}|	s
|	|v st        j                  d|
      D ]6  }|j                         |vs|j	                  |j                                d}8 _ |rp|D 	cg c]  \  }	}}|	|	|v r| c}}}	| j                  _        yc c}}}	w )aj  Remove dead compute lines after scalar prefetch replaces indirect load.

        When the table load is simplified to buf[0] (scalar prefetch handles
        indexing), the indices load and all derived bounds-checking code become
        dead.  This performs backward liveness analysis from the store variables
        to identify and remove dead lines.
        rU  z^(tmp\d+)\s*=\s*(.*)r   r	   NTF)r   r  rW  finditerr  groupr   rV  r8   r  matchDOTALLr  r  )r"   	live_varsrA  
store_linemassignmentsr\  r]  changedr[  rhss              r'   _eliminate_dead_indirect_codez*PallasKernel._eliminate_dead_indirect_code	  s    &0\	!33 	)MAz[[
; )aggi()	)
 :<LL'' 	;D4y'')H0(BIIFA""AGGAJ
D#AB""D(D#9:	; G$,[$9 + #qI 5[[< +779I5%MM!'')4&*G++  &1
 
!!T8y#8 
 
s   7Fc                8   |j                   r| j                  |||      }|.| j                  || j                  k(  sJ d       || _        | dS d| _        | j                  j                  |       |j                  t        j                        xs |j                  t        j                        }| j                  rdnd}|rd|j                   d| dn|j                  }| d	| d
S | d|j                   d
}	|j                  dk(  r| j                  s| j                  ||      }
|
#| j                  |	|
      }	|
| j                  |<   |	S | j!                  ||      }|6|\  }d|	 d| d}	| j"                  |<   t%        fd|D              | _        |	S )zC
        Build the load expression based on indexing mode.
        z-only one indirect access per kernel supportedz[0]Tz	jnp.int32z	jnp.int64r   r   r   z[...].flatten()[rg  rf  r  zjnp.permute_dims(r   c              3  (   K   | ]	  }|     y wrQ   r=   )r<  r%   collapsed_shapes     r'   r=  z0PallasKernel._build_load_expr.<locals>.<genexpr>k	  s      <34OA.<r  )r  rn  r  r  r  r  r  r  MinMaxr  r  r  r  rS  r  r  r  r  r  )r"   rh  r  r  rA  indirect
has_minmax	idx_dtyper  rR  r  	collapsedcpermr}  s                @r'   _build_load_exprzPallasKernel._build_load_expr7	  s    !!33CuEH#''3 $t';';; G; (0$c{"(,D%((,,T2599-E5991EJ'+{{I  H&&'y1='' 
 U*3%q11 %q!3!3 4A6I!!U*4;;66tUC# $ 9 9)T JI8<D//5  !% D DT5 QI ,1:.&7	{"UG1$M	 ?N55d;6; <8=< 73 r)   c                   |j                  d      s|S t        fd| j                  j                  D              }|rMt        j
                  j                  |      ,j                         }t        |      dk(  r|d   dk(  rd| dS |S )z
        Squeeze (N,1) intermediate buffers when kernel has 1D graph inputs.

        This avoids wrong broadcasting: (N,) op (N,1) -> (N,N) instead of (N,)
        rh  c              3     K   | ]Y  }|j                  d        xrA t        j                  j                  |      xduxr t	        j                               dk(   [ yw)rh  Nr   )r3  r   r   r  r  r  )r<  rw  r!  s     r'   r=  zBPallasKernel._maybe_squeeze_intermediate_buffer.<locals>.<genexpr>{	  sj      
  ##E** -GG..x88E-G$$&'1,-
s   AA"r	   r  r   zjnp.squeeze(z
, axis=-1))	r3  rD  r!   r  r   r   r  r  r  )r"   r  rR  has_1d_inputrL  r!  s        @r'   "_maybe_squeeze_intermediate_bufferz/PallasKernel._maybe_squeeze_intermediate_bufferq	  s     u%  
 !II33	
 
 gg((.G""++-x=A%(2,!*;))J??r)   c                   t         j                  j                  |      }|t        |j	                               dk7  r|S |j                  d      r|S | j                  |j	                         d         }||S t         j                  j                  |      }||j                  s|S d}| j                  j                  D ]  }t         j                  j                  |      }	|	%t        |	j	                               dkD  sB|	j	                         D 
cg c]  }
| j                  |
       }}
t        d |D              r nd} |t        |      dk  r|S | j                  |      }t        |      dk7  r|S t        t        |            }|| j                  vr|S | j                  |   }| j                  |j                         |k7  r|S | j                  j#                         D cg c]1  \  }}| j                  |j                         |k(  r|j$                  s|3 }}}t        |      dk7  r|S d}t'        |      D 
cg c]  \  }}
|
|k(  s| }}}
t        |      dk(  r|d   }n?d}| j                  j#                         D ]   \  }}||k(  r|} n|j$                  r|dz  }" ||S |t        |      dz
  k(  r|S dgt        |      z  }d||<   | ddj)                  t+        t,        |             d	S c c}
w c c}}w c c}
}w )
a  Reshape 1D buffers for higher-dim broadcasting in reduction kernels.

        When a 1D buffer (e.g. a reduction result from a prior kernel, or a
        batch-norm parameter) is loaded into a kernel with 2+ iteration dims,
        JAX right-aligns it for broadcasting: (N,) becomes (1, N).  This is
        wrong when the buffer corresponds to a non-trailing axis; we reshape
        to (N, 1, ...) so broadcasting matches the correct axis.
        Nr   rh  r   c              3  $   K   | ]  }|d u 
 y wrQ   r=   r;  s     r'   r=  z:PallasKernel._maybe_broadcast_1d_buffer.<locals>.<genexpr>	  s     ;q};r  r  ro  r   r   )r   r   r  r  r  r3  rC  r  is_floating_pointr!   r  rH  r   r  r  r+  rK  rJ  r  rs  rR  mapr8   )r"   r  r  rR  r!  
buf_lengthr   ref_buf_sizerw  	other_bufr4  r   used_varr  r  r  matching_varsaxis_posr  matching_dimspw_idxsymreshape_dimss                          r'   _maybe_broadcast_1d_bufferz'PallasKernel._maybe_broadcast_1d_buffer	  s    ''$$T*?c'"2"2"45:
 ??5!^^G$4$4$6q$9:
!!$'U%<%< 		// 	$H**84I$Y-?-?-A)BQ)F;D;M;M;OPaq 1PP;l;;#	$ 3|#4#9 ,,U3	y>QY(4000 %%h/>>%,,':5 --335
1~~ahh':5ann 
 

 }" '0'>Rtq!!z/RR}"$Q'H F//557  Q(?%H~~aKF  s<(1,,sS..!#XIdiiC0F&G%HJJo  Q.
 Ss   K:)6K?LLc                   |j                   dk7  s|j                  r|S | j                  |      }| j                         }| j	                  |      }t        |d      r|j                  n	t               |z  }||z
  }|rt        |      dk  r|S d}| j                  j                         D ]w  \  }	}
| j	                  |
      }|s||k7  r | j                  |
      }t        |d      r|j                  n	t               |z  }||k7  s||k(  ra| j                  |	|
|      rud} n |rt        | j                  |      d      S |S )a
  
        Check for im2col-like patterns where store uses block variables but load doesn't.

        For cat/expand patterns, both load and store prepared indices share block vars.
        For im2col patterns, store compresses to block vars but load doesn't.
        r  r&  r   FTr  )r  r  r   r'  r   hasattrr&  r   r  r  rJ  _check_load_is_strided_inputr  r)  )r"   r  rA  r  r(  store_orig_varsstore_prep_varsnew_varshas_im2col_patternrw  r  load_orig_vars	prep_loadload_prep_varss                 r'   _check_im2col_patternz"PallasKernel._check_im2col_pattern	  s{    &(*@*@O..u5'')	2259 ~~6 ''	
 #_4 3/14O #$($9$9$?$?$A 	 Hj!55jAN! 0 --j9I 9n5 &&\	N /?o3U 44*n &*"5	8 "66~F" 
 r)   c                H   t         j                  j                  |      }|y t        |dd              }|yt        |dd      }|y|j	                         }g }|D ]W  }	t        j                  ||	      }
t        j                  |
|	      }|2| j                  |      }|j                  ||n|       Y t               }t        |      D ]G  \  }}| j                  ||         }||dkD  s"| j                  |      }|j                  ||n|       I t        |      |k(  S )z\
        Check if load coefficients match buffer strides (strided input vs im2col).
        NFr~  c                      y rQ   r=   r=   r)   r'   r  z;PallasKernel._check_load_is_strided_input.<locals>.<lambda>0
  r
  r)   r"  r   )r   r   r  r  r  r   r  r  rC  r  r   rs  r  )r"   rw  r  r  rh  r  buf_strides	buf_sizesload_coeffsr   r!  r3  int_coefbuf_stride_setr  r4  r-  int_ss                     r'   r  z)PallasKernel._check_load_is_strided_input&
  s9    gg  *;9lL9;>fh5LLN	 ! 	OC*GG
TWXH&>>xMD>>$/""x/C8N	O $k* 	FDAq~~il3H8a<q)""E,=51E		F +&.88r)   c                   | j                   ry| j                  |      }|y|\  }}}}}t        |      dk7  st        |      dk7  ry| j                  |d         }| j                  |d         }|d   }|d   }	||	||	k  r||
|dkD  r|dkD  sy| j                  j
                  D ]D  }
| j                  |
      }||\  }}}}}t        |      dk7  r.|d   }|d   }|;|>||k  sD y y)a  
        Check if output needs transpose for column-major storage.

        Transpose on store is needed when:
        - Output has column-major stride (s0 < s1)
        - But input(s) have row-major stride
        - And we haven't already transposed on load
        Fr	   r   r   T)r  rB  r  rC  r!   r  )r"   r  rK   rA  rL  r  size0size1s0s1inp_nameinp_infoinp_stridesinp_s0inp_s1s                  r'   _check_store_needs_transposez)PallasKernel._check_store_needs_transposeN
  s:    &&$$T*<,0)8Q~!#s8}'9x{+x{+AA NR!!		 		// 
	H,,X6H&.#Aq![!;1$ ^F ^F!f&8Vf_
	 r)   c                    d| dg}|r|j                  | d| d       |S |j                  | d| d| d| d| d
       |S )	z
        Build store expression for full array assignment.

        Handles scalar broadcast, shape matching, and optional transpose.
        Returns a list of lines to emit (variable assignment + store).
        _val = jnp.asarray(r   [...] = jnp.full(z8.shape, _val) if _val.ndim == 0 else jnp.transpose(_val)z3.shape, _val) if _val.ndim == 0 else (_val.reshape(z.shape) if _val.size == z".size else jnp.broadcast_to(_val, z.shape)))r  )r"   outvalueneeds_transposeliness        r'   _build_full_array_store_exprz)PallasKernel._build_full_array_store_expr
  s     'ugQ/0LL% 5 !+,  LL% 5 !&&)U*B3% H//2e8= r)   c                ,   |j                   dk(  r$| j                  |      }| j                  |||      S |j                  r/d| _        |dk(  rdnd}| d| d|j                    d| d	| d
| dgS | j                  |      }	t        j                  j                  |      }
|
B|
j                         }t        |      dkD  r$| j                  |      s| j                  ||d      S |	r|dk(  rdnd}d| dg}d|j                    d| d}|dk(  rO| j                  j                  |       | d}|j                  | d| d|j                    d| d| d| d       |S |j                  | d|j                    d|        |S | d|j                    d| gS )z
        Build the store expression based on indexing mode.
        mode can be None (set) or "atomic_add" (accumulate).
        Returns a list of lines to emit.
        r  T
atomic_addr  set[...] = z[...].flatten().at[(z).flatten()].z(jnp.asarray(z).flatten()).reshape(.shape)r   Fr  r   z
(jnp.full(z%.shape, _val) if _val.ndim == 0 else _aliasr   z.flatten()).reshape(rf  z] = )r  r  r  r  r  r7  r   r   r  r  r  r0  r  r  r  )r"   r  r  r  r  rA  moder  
scatter_opr  rh  rL  r  
value_expralias_params                  r'   _build_store_exprzPallasKernel._build_store_expr
  s    &"??EO44S%QQ!!(,D%"&,"6EJ%xu$89K9K8LMZdYe f$g%:3%wH  ..u5gg  &?||~H8}q )A)A%)H88eUKK"&,"6EJ*5'34E%h&8&8%99^_d^eefgJ|#&&**3/!$Vne8K=0DXEWEWDXXefpeqqr!l"6se7D L uAh&8&8%9j\JKL%q++,D899r)   c           
     .   |j                  dd      }| j                  j                  |       | d}|dk(  rdnd}|rt|d   }	|d   }
|d	   }g }t        t	        |            D ]*  }||
k(  r|j                  |	       |j                  d
       , dj                  |      }| d| d| d| d| d
S |d   }	|d   }|d   }t        j                  j                  |      }|t	        |j                               nd}t	        |      t	        |      z   }t	        | j                        }|dz
  }||k(  xr ||k(  }|rv|D cg c]  \  }}|	 }}}t	        |      }t	        |      }|dkD  r|dkD  rd|z  }d|z  }|	 d| d| d}n|	}|j                  |       |j                  d |D               n8|D cg c]  }d }}|j                  |	       |j                  d |D               dj                  |      }| d| d| d| d| d
S c c}}w c c}w )zBBuild store expression for scatter operations (indirect indexing).is_point_scatterFr  r  r  r  r  r  output_shape0r   r  z	[...].at[z].r   r   dims_before
dims_afterr   r   None, , Nonerf  r  rg  c              3  &   K   | ]	  \  }}|  y wrQ   r=   )r<  r[  r  s      r'   r=  z9PallasKernel._build_scatter_store_expr.<locals>.<genexpr>  s     INHdxIs   re  c              3      K   | ]  }d   yw)re  Nr=   )r<  rA  s     r'   r=  z9PallasKernel._build_scatter_store_expr.<locals>.<genexpr>  s     7qs7s   )r  r  r  rG  r  r  rR  r   r   r  r  r+  r  )r"   r  r  scatter_infor  r  r  r  r  r  r  r  index_partsrU  index_tupler  r  rh  output_ndimnum_iter_vars_in_storetotal_kernel_iter_varsremaining_dimsis_element_wiser[  r  	n_leading
n_trailingleading_onestrailing_nonesindirect_reshapedrA  s                                  r'   _build_scatter_store_exprz&PallasKernel._build_scatter_store_expr
  s    (++,>F 	""3'Vn #l2U
'7L'7L'7L KS./ ,,&&&|4&&s+	, ))K0KU(;-yR
|STUZT[[\]] $N3"=1!,/
 gg  &-0_c#,,.)!!$[!1C
O!C!$T%:%:!;$q #n4 A&*@@ 	
 :EF$8FKF K(IZJ1}a')3!)J!6'3nAl^3~FVVW$X!$0!01IjII )44134K4|,7J77ii,e8K=	+bAeWTUV	
+ G  5s   H6	Hc                   | j                   j                  |      }t        j                  j	                  |      }|| j
                  |<   | j                  |      }| j                  |||      }| j                  ||      }|"|| j                  |<   | j                  ||      }n:| j                  |||      }| j                  |||      }| j                  ||||      }|j                  s4|j                  dk(  r%| j!                  ||      }| j#                  |||      }| j$                  j'                  | j(                  ||      }| j                   j                  |      }	|	| j*                  t-        |      <   || j.                  |	<   |S )Nr  r   )r!   r  r   r   r  r  r  rI  rc  r  rl  rK  rP  r  r  r  r  r  r   r   r   r  r8   r   )
r"   r  r  rh  r   rA  ri  rR  cse_var	buf_params
             r'   loadzPallasKernel.load$  s   iiood#!!$' ',d# ''. //eXF
 //t</5D&&t,//V<I ::4QH //eXFH --c4II %%(*<*<*E??iPI77eYOI((##LL $ 
 IIOOD)	+43w<(/3!!),r)   c                	   $%  j                        } j                  j                  |       t        |      dk(  r j	                        S  fd}t        ||d      }|D cg c]
  } ||       }} j	                   j                              } j                        }|D 	cg c]  }	t        |	       }
}	|D ci c]  }t        |       ||       }}t        |      dk(  rt        |
      dk(  r|d   }t        |      }| j                  v xr  j                  |   j                  }|rc| j                  v rS j                  |   }|j                  } j                  |      }d j	                  |       d}|j                  ||      }|S d}t        |
      dkD  r! j                         |z
  }t        |      dk(  }|rdt        |      z   }|
D ]*  }d	t        |      z  }| d
| d}|j                  ||      }, t        |      D ]  \  }}t        |      }| j                  v s  j                  |   }|j                  } j                  |      }dg|z  } j	                  |      ||dz   <   dj                  |      }d j	                  |       d| d}|j                  ||      } |S g }|D ]  }|j!                   ||      d|f        |D ]  }	|j!                   ||	      d|	f        |j#                  d d       t        |      D ]  \  }}t        |      }| j                  v s  j                  |   }|j                  } j                  |      } ||      %d j	                  |       d}t%        %fd|D              }t%        %fd|j'                         D              }||z   }|dkD  rd|z  } | d|  d}|j                  ||      } |
D ]  }||   $t%        $fd|D              }!t%        $fd|D              }|!dkD  r|dkD  rd|!z  }"d|z  }#| d|" d|# d}n(|!dkD  rd|!z  }"| d|" d}n|dkD  rd|z  }#| d|# d}n|}|j                  ||      } |S c c}w c c}	w c c}w )a  
        Handle indexing with both indirect variables and iteration variables.

        For example, x[indices, :] generates index = i0 + stride * tmp0
        where tmp0 is loaded from indices and i0 is the iteration variable.

        We need to convert this to JAX advanced indexing with proper broadcasting.
        When there are multiple iteration variables, they need different shapes
        to form an outer product (grid) rather than broadcasting together.

        Special case: For gather operations where a single iteration variable
        and single indirect variable have the same extent, they should be
        element-wise aligned, not broadcast into an outer product.

        PyTorch advanced indexing semantics: When multiple indirect indices have
        the same shape, they are paired element-wise (not outer product), and
        the combined result dimension appears at the FRONT of the output.
        r   c                >    j                  | t        d            S )Ninf)default)r  r   )r   r  r"   s    r'   _coeffz3PallasKernel._handle_mixed_indexing.<locals>._coeffr  s    ..uc5<.PPr)   Tkeyr  r   jnp.arange(r   Fz, 1z.reshape(-11r   z
).reshape(r  r  c                    | d   S r:  r=   rX   s    r'   r  z5PallasKernel._handle_mixed_indexing.<locals>.<lambda>  s
    !A$ r)   c              3  .   K   | ]  }|k  s	d   ywr@  r=   r<  r$   	var_coeffs     r'   r=  z6PallasKernel._handle_mixed_indexing.<locals>.<genexpr>  s     %NAIa%N   
c              3  .   K   | ]  }|k  s	d   yw)r	   Nr=   r  s     r'   r=  z6PallasKernel._handle_mixed_indexing.<locals>.<genexpr>  s      *1y=A*r  r  z[:rg  c              3  .   K   | ]  }|kD  s	d   ywr@  r=   r<  r$   indirect_coeffs     r'   r=  z6PallasKernel._handle_mixed_indexing.<locals>.<genexpr>  s     I!a.6HAIr  c              3  .   K   | ]  }|k  s	d   ywr@  r=   r  s     r'   r=  z6PallasKernel._handle_mixed_indexing.<locals>.<genexpr>  s     J1q>7IQJr  r  rf  r  z...]z[...)r   r   r   r  r   r  r   r5  r8   r+  r  rK  replacer'  rs  rR  r  sortr@  r  )&r"   r  used_iter_vars_setr  r   r   iter_coeffsr  indirect_var_symsr  rg  r4  indirect_coeffsr[  is_reduction_varrange_entry
range_sizerenamed_sizearange_exprpaired_indirectunused_iter_varsn_output_dimsr  trailing_onesreshape_exprr  shape_parts	shape_strall_componentsn_trailing_itern_trailing_indirectr  trailing_dimsr  leading_nonesr  r  r  s&   ``                                  @@r'   r  z#PallasKernel._handle_mixed_indexingT  s[   & "55e< 	""#56!"a'::e$$
	Q   2M.<=svc{== JJt33E:;	 33E:->?cS?? 7HH3q66!9,HH ~!#M(:a(? #C3xHt,,,X1F1Fs1K1X1X   $///"&"7"7"<K!,!3!3J#'#7#7
#CL$/

<0H/I"KK ) 1 1(K HI    }!#2247II ""23q8O N 33M !. J %N(; ;".{=/K%--lLI	J $N3 I3s8$///"&"7"7"<K!,!3!3J#'#7#7
#CL $'%-"7K)-L)AKA& $		+ 6I%djj&>%?z)TUV   !* 1 1(K HI#I& 
 ! 	>C!!6#;"<=	>$ 	BC!!6#;
C"@A	B=  / 	EFAs3xHd+++"33C8(//
#33J?"3K	 +DJJ|,D+EQG #&%N%N"N&) *.557* '# -/BB
>$,z$9M%0MM?!"DK%--hD	5	E: * 	FL,\:N I{IIIJJJJ 1}a (9 4!)J!6".qs>BRRSTQ (9 4".qtDa!)J!6".tN3C1E+!)),EI+	F. } >
 @ Is   (Q;.R Rc                    ||dk7  rt        d| d      | j                  j                  |      }| j                  j	                  |       t
        j                  j                  |      }|d uxr t        |j                               dk(  }|rd| d| d| d| d	g}n| j                  | j                  ||d
      }n| j                  ||      }	|	A| j                  j                  | j                  |             | j!                  |||	||      g}n9| j#                  |      }
| j%                  ||
      }
| j'                  |||||
|      }|D ]:  }| j(                  j+                  |       | j,                  j/                  ||f       < y )Nr  zpallas store mode 'z' not supportedr   r  r   r  z2.shape, _val) if _val.ndim == 0 else _val.reshape(r  F)rT   r!   outputstore_buffer_namesr  r   r   r  r  r  r  r  _detect_scatter_patternr   r   r   r  r  r  r  storesrt  r  r  )r"   r  r  r  r  r  rh  	is_scalarstore_linesr  rA  r\  s               r'   storezPallasKernel.store  s   
  4 3D6IJJiit$##D) gg  &tO@CLLN(;q(@	%eWA.%(-_`c_ddklK **6"??UER  $;;E4H+''..t/G/G/NO66dD#K  $33E:H  $99%JH #'"8"8T5%4#K   	7DKK!!$'""))3+6	7r)   c                    | j                  |      }|dk(  rt        j                  | |      }	 t        |      S # t        t
        f$ r |cY S w xY w)z=Get integer coefficient of a variable in an index expression.r   )rS  r  diffr  r  r  )r  r   r  rS  s       r'   r  z#PallasKernel._get_index_coefficientN  sQ    
 C A:JJuc*E	u::& 	N	s   
9 AAc                   | j                  |      }t        |      dk7  ry|d   }t        |      }t        | j	                  ||            }|dk(  ry| j                  |      s| j                  |||      S | j                  |||      S )zDDetect scatter operation pattern. Returns scatter info dict or None.r   Nr   )r5  r  r8   r  r  r0  _detect_point_scatter_detect_iter_scatter)r"   r  output_nameindirect_symsindirect_symr  r  s          r'   r  z$PallasKernel._detect_scatter_pattern[  s     //6}"$Q'<(!$"="=e\"RSQ ''.--k<XX ((nMMr)   c                j   |sy	 t         j                  j                  |      }|j                         D cg c]  }t	        |       }}t        |      dk  ryd}t        |      dz
  }t        t        |      dz
  dd      D ]  }	||k(  r|	} n
|||	   z  } ||g g d|dS c c}w # t
        $ r Y yw xY w)z&Detect single-element scatter pattern.Nr	   r   r  Tr  r  r  r  r  r  )r   r   r  r  r  	Exceptionr  rG  )
r"   r  r  r  rh  r4  r  
cumulativer  rU  s
             r'   r  z"PallasKernel._detect_point_scatterp  s     	''$$[1C,/LLN;qCF;L; |q  
<(1,\*Q.B7 	,C+",s++J		, )( $(
 	
! < 		s"   1B& B!B& !B& &	B21B2c                   | j                  |      }g }|D ]{  }t        | j                  ||            }|dkD  s$|| j                  v s3| j	                  | j                  |   j
                        }| y|j                  t        |      ||f       } |j                  |df       |j                  d d       t        fdt        |      D        d      }	|	yd}
t        ||	dz   d       D ]  \  }}}||
k7  r y|
|z  }
 ||
k7  ry|	|d|	 D cg c]
  \  }}}||f c}}}||	dz   d D cg c]
  \  }}}||f c}}}d	dd
S c c}}}w c c}}}w )z0Detect scatter pattern with iteration variables.r   Nr  c                    | d   S )Nr   r=   rX   s    r'   r  z3PallasKernel._detect_iter_scatter.<locals>.<lambda>  s
    AaD r)   Tr  c              3  <   K   | ]  \  }\  }}}|k(  s|  y wrQ   r=   )r<  r  r  rA  r  s       r'   r=  z4PallasKernel._detect_iter_scatter.<locals>.<genexpr>  s"     R?1ltQT\=QQRs   r   Fr"  )r   r  r  r+  rC  rK  r  r8   r  r  rs  r  )r"   r  r  r  r   all_varsr   rS  rK  indirect_posexpectedrA  rK  r0  s     `           r'   r  z!PallasKernel._detect_iter_scatter  s    11%8 02! 	;C33E3?@EqySD$9$99(=(=c(B(I(IJ>S5& 9:	; 	~r:;.$7 Ri&9R
   (,2B2D)E F 	Auf H	 X% )(2:=L2IJJwq!QQFJ19,:J:L1MNNgaAAq6N % 
 	
 KNs   E6Ec           	          j                   sJ |dk(  r j                  ||      S t        |t              rt	        d      |||f}| j
                  j                  v r j
                  j                  |   S ddddddd	d
}t        g d      }t         fd|D              } j                  |      }	 j                         }
t        d  j                  j                         D              }|xr |	duxr |	dkD  xr	 |
xr |dkD  }|xr |dkD  xr |	du }|dk(  r8|r/ j                         }t        |      dk(  r|d   n|}d| d| d}nd| d}n|dv r/||   }|r j                         }| d| d|d    d}n| d| d}ny||v rL||   }|r1 j                         }t        |      dk(  r|d   n|}| d| d| d}n=|r	| d| d}n2| d| d}n)t	        d| dt!        |j#                                d       j
                  j%                   j&                  ||      }| j
                  j                  |<   |S )aJ  
        Generate code for reduction operations in JAX/Pallas.

        Reductions in Pallas work by:
        1. Loading the input data into the kernel
        2. Applying JAX reduction operations (jnp.sum, jnp.max, etc.)
        3. Storing the reduced result

        The reduction happens over the loaded block of data.
        welford_reducezHTuple reductions (e.g., welford_combine) not supported in Pallas backendzjnp.sumzjnp.prodzjnp.maxzjnp.minzjnp.anyz
jnp.argmaxz
jnp.argmin)r@  r  rc  minrD  argmaxargmin)rY   r:  zc              3  :   K   | ]  }|j                   v   y wrQ   )r  )r<  r%   r"   s     r'   r=  z)PallasKernel.reduction.<locals>.<genexpr>  s     IA,Ir  c              3  @   K   | ]  \  }}|j                   sd   ywr@  r  )r<  r   r  s      r'   r=  z)PallasKernel.reduction.<locals>.<genexpr>  s!      
#uASASA
   Nr   r   xor_sumzjnp.bitwise_xor.reduce(z, axis=r   )r.  r/  r   r  z, keepdims=True)z	, axis=0)zReduction type 'z8' not yet supported in Pallas backend. Supported types: z	, xor_sumr  )inside_reductionwelford_reduce_fallbackr   r  rT   r   reduction_cacher   rD  r  r  r@  r+  rJ  r  r  rE  r,  r   r   )r"   r   r   reduction_typer  	cache_keyreduction_opspointwise_prefixeshas_pointwisepointwise_numelr  n_reduction_dimsis_partial_reductionis_symbolic_partialr  	axis_exprreduction_exprreduction_opr/   s   `                  r'   	reductionzPallasKernel.reduction  s   " $$$$ --//u==eU#Z 
 6	00088++I66 ""
 (8I6HII&*&@&@AS&T&*&C&C&E 
 $ 5 5 ; ; =
 

  %t+%!#%  % !1$ 	 N.2N$7N 	 Y&#//1'*4yA~DG4	#:5'ST!U#:5'!C33(8L#//1$0>5'b
!!L$0>5'!;},(8L#//1'*4yA~DG4	#nAeWGI;>NO  % %1>5'!C$0>5'!;">"2 3$$(););)=$>#?yJ  ""LL # 
 /5  +r)   c                    t         j                  j                  |       }|j                         }|j	                         S rQ   )r   r   r  r~  r  )buffer_namerh  r  s      r'   _buffer_is_contiguousz"PallasKernel._buffer_is_contiguous-  s1    gg  -!##%%r)   c                	    | j                   ry| j                  ry| j                  ry| j                  r[dj	                  d | j
                  j                  D              }| j                  D ]   }|| j                  v rt        |      |v s  y t        | j                  j                  j                               }d}|D ]  }| j                  |      }||\  }}}}	}t        |	      dk\  s.t        |      dk\  s=|	d   }
|	d   }| j                  |d         }| j                  |d         }|
r|u|
|k  s{|~||dkD  s|dkD  sd} n t!        | j"                        xs || _        | j'                         }|duxr |dkD  }|rt)        d	 | j*                  D              nd| _        g  |D ]l  }| j                  |      }| y|\  }}}}}|D cg c]  }| j                  |       c}t/        d
 D              r yt              t               kD  sk n  sy| j0                  t        | j0                         t               }t        | j                  j2                        |z   }d}|D ]
  }| j                  |      }| y|\  }}}}}t        |      dk(  r0|| j4                  v rt        | j4                  |         nY| j0                  ||v rt        | j0                        n3|D cg c]  }| j                  |       c}t/        d D              r yt              }||k(  r|| j6                  v }d}t9        |      D ]#  }|    |   k(  s|   dk(  s
 |   dk(  s|r!d} n |rL|| j"                  v r>| j"                  |   t              |k(  rt;         fdt9        |      D              s y|r y|s t/         fdt9        |      D              sud}y||kD  rSd}t9        ||z
  dz         D ]5  }d}t9        |      D ]  } |   dk(  r||z       |   k7  sd} n |s3d} n |s yd}t=        t?              t?                     D ]  \  }}||k7  s|dk7  s|dk7  s  y  |syt@        jB                  jE                         jF                  dk(  }|sd| _$        yd| _$        yc c}w c c}w )a   Check if this kernel can use tiling on CPU/TPU.

        Tiling is compatible with reductions, transpositions, and multi-range-tree
        kernels as long as no flatten-based indexing is used (buf[...].flatten()[idx]).
        Flatten indexing requires global flat indices which don't decompose into
        per-tile local indices.

        Reject:
        - GPU (has its own TMA / padding path)
        - Flatten-based indexing
        - Scatter outputs (indirect indexing complicates tile boundaries)
        F
c              3  2   K   | ]  }t        |        y wrQ   r8   r<  r\  s     r'   r=  z1PallasKernel._can_tile_cpu_tpu.<locals>.<genexpr>L  s     $O4SY$Or>  Nr	   r   r   Tc              3  :   K   | ]  }|j                   sd   ywr@  r3  )r<  r  s     r'   r=  z1PallasKernel._can_tile_cpu_tpu.<locals>.<genexpr>x  s     Dd$2C2CDs   c              3  $   K   | ]  }|d u  
 y wrQ   r=   r;  s     r'   r=  z1PallasKernel._can_tile_cpu_tpu.<locals>.<genexpr>  s     /19/r  c              3  $   K   | ]  }|d u  
 y wrQ   r=   r;  s     r'   r=  z1PallasKernel._can_tile_cpu_tpu.<locals>.<genexpr>  s     3QqDy3r  c              3  h   K   | ])  }|      |   k(  xs |      d k(  xs |   d k(   + ywr@  r=   )r<  r  int_sizer  	ref_shapes     r'   r=  z1PallasKernel._can_tile_cpu_tpu.<locals>.<genexpr>  sY        !" %T!W-1= 1'Q0A51(|q01 s   /2c              3  H   K   | ]  }|   |   k(  xr |   d kD    ywr@  r=   )r<  r  rR  rS  s     r'   r=  z1PallasKernel._can_tile_cpu_tpu.<locals>.<genexpr>  s7      % QK9Q</DIaL14DD%r  r   @   )%r  r  r  r   rR  r   rV  r  r8   rE  r!   output_buffersr,  rB  r  rC  r   r  tile_has_transposer  r@  r  tile_skip_last_nrD  r  r  r  r  rG  rH  zipr  r   r   r   r   r  )!r"   compute_textr  out_bufshas_col_major_outrw  rK   rA  rL  r  r  r  d0d1r  has_reductionr4  ref_ndall_bufshas_tileablebuf_nd
is_stridedmismatchr  foundr  okr   r   r  rR  r  rS  s!                                 @@@r'   _can_tile_cpu_tpuzPallasKernel._can_tile_cpu_tpu3  sN    ;;$$!! 99$O4<<;N;N$OOL.. !d:::w<</ 	! 		005578 "  	H((2D|04-AxNA>"a'CMQ,>#A&#A&^^HQK0^^HQK0NRQQ(,%)	* #'t'B'B"C"XGX
 779't3K!8K  Dd..DD 	  "	  		%H((2D|#' AxAq3;<aq)<H/h//8}s9~-$			% 
 &&2T889IY		//08;  R	%H((2D|#' AxAq8}! 4888 = =h GH,,8X=Q ; ;<7?@!DNN1-@3(33 ]F &)C)CC
 v 	A y|3#A;!+$Q<1,% #H	 D,G,G G66x@DD	V+   &+6]	    %   %"6]% " $(L& v23 
AB"6] "$Q<1,$#AE?il:!&B!"  $
  #   2HY4GH %DAqAv!q&Q!V$%aR	%h  446;;uD)+D&  *.D&i =D As   S	<Sc           
        t               }| j                  j                         \  }}}}|D cg c]  }|j                   }}|D cg c]  }|j	                  d      s| }	}|D cg c]  }|j	                  d      s| }
}t        | j                  j                  j                               }|D cg c]	  }||v s| }}|
st        d      | j                  j                  j                         D ci c]  \  }}t        |t              r|| }}}|xs d}t        j                  j                         j                   dk(  }|rdnd}i }|	D ]  }d||<   	 |	D cg c]  }||   s	| d	 }}|D cg c]  }|j	                  d
      s| }}||z   }||z   }t        |j                         D cg c]
  \  }}|r	| c}}      }|rt#        t%        t'        |
                  }n0| j(                  rg }n!t+        |
      D cg c]  \  }}||v r| }}}t-        dNi d|d|d| j(                  d|d|d|d|	d|
d|d|d|d|d|d|d|d|d|dg }|| _        | j1                  |       t               }|j3                         5  | j5                  ||       | j6                  j8                  D ]  }|j;                  t        |              	 ddd       | j                  j                         \  }}}}|D cg c]  }|j                   }}t        | j                  j                  j                               }|D cg c]	  }||v s| c}|_        |D cg c]  }|j	                  d
      s| c}|_        ||j>                  z   |_         ||z   |_!        | jE                         | _#        d} | jH                  rd} | jK                  ||      |_&        tO        | jP                        }!|!r| jS                          t               }"|"j3                         5  | j6                  j8                  D ]  }|"j;                  t        |              	 ddd       | jU                  |||"       |jW                         S d | d!d"jY                  |jB                         |  d#}#|j;                  |#       |j3                         5  | j[                  |||       ddd       |j;                  d       | d$}$g }%d%t'        |j<                        z   }&t+        |j@                        D ]/  \  }}||v s|j	                  d&      s|%j]                  ||&z          1 |%rd'd"jY                  d( |%D              z   d)z   }'nd*}'t#        t%        d%t'        |j<                        z               }(d'd"jY                  d+ |(D              z   d)z   })|j;                  d,|) d-|' d.       d/d0g|j<                  z   |j@                  z   }*|j;                  d |$ d'd"jY                  |*       d#       d"jY                  d1 |jL                  D              }+| j_                  |      \  },}-d2}.|j3                         5  |,r|ja                  |.       n|-r;|j;                  d3       |j3                         5  |ja                  |.       ddd       |j;                  d4       | jb                  |j;                  d5| jb                   d)       |jL                  D ].  \  }/}0|j@                  |/   }|j;                  | d6| d7|0 d8       0 |j;                  d9       |j;                  d:       |j;                  d;       |j;                  d.       | jF                  r| je                  |       n!| jg                  ||j@                         |j@                  D ]O  }| ji                  |      }1|1r| jj                  jm                  |1      nd}2|26|j;                  | d6| d<|2 d.       Q |j;                  d=       |j;                  d>       |j;                  d;       |j;                  d.       |j;                  d?       |j;                  d@       |j;                  dAd"jY                  |j@                        z   dBz          |j;                  d.       | jH                  rQ| jF                  r#|j;                  dC       |j;                  dD       n"|j;                  dE       |j;                  dF       g }3|j<                  D ]  }4|3j]                  |4 dG|4         | jH                  r"|3j]                  dH       |3j]                  dI       |3rdJ| dKd"jY                  |3       dL}5n| dM}5| jn                  xr | jp                  xr | js                         }6|6r| ju                  ||5       n=| jn                  r| jw                  ||5       n| jy                  ||5|jL                  |+       ddd       | j{                  ||$       |jW                         S c c}w c c}w c c}w c c}w c c}}w c c}w c c}w c c}}w c c}}w # 1 sw Y   xY wc c}w c c}w c c}w # 1 sw Y   -xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w)Oa  
        Generate the complete Pallas kernel code as a Python string.

        This includes:
        - Import statements for JAX/Pallas
        - The kernel function that operates on refs
        - The main wrapper function that handles PyTorch<->JAX conversions via DLPack

        Args:
            name: Optional kernel name (will use placeholder if not provided)

        Returns:
            str: Complete Python source code for the Pallas kernel
        out_ptr)rj  
in_out_ptrz2Pallas backend requires at least one output buffer<KERNEL_NAME>cpur   r   Tr  )rk  in_ptrr  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  NrN  z&, _pallas_tile=None, _pallas_ax2g=Nonedef _kernel(r   ):_jit_wrapperr	   rk  r   c              3  2   K   | ]  }t        |        y wrQ   rL  r<  rY   s     r'   r=  z.PallasKernel.codegen_kernel.<locals>.<genexpr>  s     ,LSV,Lr>  ,)z()c              3  2   K   | ]  }t        |        y wrQ   rL  rt  s     r'   r=  z.PallasKernel.codegen_kernel.<locals>.<genexpr>       0PAQ0Pr>  +@functools.partial(jax.jit, static_argnums=z, donate_argnums=r   
out_shapes
out_dtypesc              3  0   K   | ]  \  }}| d |   yw: Nr=   r<  r  os      r'   r=  z.PallasKernel.codegen_kernel.<locals>.<genexpr>  s     %Pfq!2aSk%P   )zPresults = tuple(jnp.empty(s, dtype=dt) for s, dt in zip(out_shapes, out_dtypes))z2return results if len(results) > 1 else results[0]z+if any(0 in shape for shape in out_shapes):zI_pallas_out_shapes = tuple(s if len(s) > 0 else (1,) for s in out_shapes)z_pallas_out_shapes = (rn  z.reshape(_pallas_out_shapes[])zout_shapes_pallas = tuple(z&    jax.ShapeDtypeStruct(shape, dtype)z;    for shape, dtype in zip(_pallas_out_shapes, out_dtypes)ro  out_specs_pallas = tuple(z+    pallas_make_block_spec_non_tiled(shape)in_specs_pallas = tuple(z-    pallas_make_block_spec_non_tiled(i.shape)z    for i in [rg  z_pallas_tile = _tilez_pallas_ax2g = _ax2gz$_pallas_tile = _pallas_out_shapes[0]z_pallas_ax2g = {}=z_pallas_tile=_pallas_tilez_pallas_ax2g=_pallas_ax2gfunctools.partial(	_kernel, ),z_kernel,r=   )>r   r!   python_argdefsr  r3  r   r  r  RuntimeErrorrV  rJ  r   r8   r   r   r   r   rE  rG  r  r  rs  r  aliasable_out_ptrs_codegen_importsindent_codegen_iteration_varsr   rV  rt  r  r  r  r  rh  tile_cpu_tpur  _compute_alias_pairsr  r   r  rz   _codegen_scalar_prefetch_wrappergetvaluerR  _emit_kernel_bodyr  r  
writelinesr  _codegen_tiled_specsr|  rr  r  r  r  r  r&  _codegen_jit_wrapper_tma_codegen_jit_wrapper_legacy_gpu_codegen_jit_wrapper_cpu_tpu_codegen_main_entry)7r"   r  r  arg_defs	call_argsrA  r   r  r%   r  r  size_var_namesr  outerr	  r  r  r  r  r  rv  r  r  r  r  flagr  r  r  r  kernel_bodyr\  extra_kernel_paramsuse_scalar_prefetchkernel_body_spkernel_signaturejit_wrapper_namedonate_indicesbase_offsetdonate_literalstatic_argnumsstatic_argnums_literalwrapper_paramsalias_map_literalhas_zero_dimr  zero_dim_return	input_idxout_idxrw  cshapepartial_argssv_param
kernel_arguse_tmas7                                                          r'   codegen_kernelzPallasKernel.codegen_kernel  s
     %)II$<$<$>!)Q)12A22&3Oq||I7N1OO$
5N(OA
 
 $DII$6$6$=$=$?@&3KqN7J1KKSTT !%		 8 8 > > @ 
u%% 5L 
  
 -o77>>@EEN&6FG+-$ 	*E%)OE"	* +:
!&_U=SugV
 
 %
5M(NA
 
 +\9)M9&$3$9$9$;HjdD4TH
 "&uS-?'@"A[["$ "+=!9#C,, # #  

#
 ;;
 .	

 0
 (
 ,
 (
 ,
 "6
 ,
 &
 &
 !4
  2
  0!
" !4#
$ %
( #2c"$&! 	1((c:++ 1%%c$i01	1 %)II$<$<$>!)Q)12A22#DII$6$6$=$=$?@*7OQ1;NqO$
5M(NA
 #/1A1A"A!-!= !224 ''"J33CI"4#7#78..0+-N&&( 8 LL// 8D",,SY788 11
 ==?" "+htyyAWAW7X6YZmYnnpq'([[] 	;""4c:	; 	r)],7#c1122"3#:#:; 	9IC$)F%%cK&78	9  499,L^,L#LLtSN!NeAC,?,?(@$@AB!$tyy0P0P'P!PSW!W''=&>>Oa!	
 <(3+>+>>AXAXX 	 	./q>1J0K2NO II%P%PP(,(C(CC(H%o
 [[] Z	0"NN#PQ 989 E ..:NN01L1L0MRP +.// &Iw33I>ENN 'UG+GyPRS
 ;<GHQ s#$$--c22249P9PQ!$!8!8 S#'#:#:5#A  ( !99==hG!% 
 "- NNeWCwixq+QRS NN#>?NN#PQNNU NN3'NN#=>NN#RSNN(499S5L5L+MMPSS NN3'//(('=>'=>'MN':;  " # 3 3 BH ''8*AhZ(@AB // ''(CD ''(CD#5k])DIIVbLcKddf!gJ$/=!9J KK 5..5224 
 11#zB[[88jI55Z:KqZ	x 	  &67}}} 3O

 L 


 I#>	1 	1 3O
*8 8	; 	;T9 9Z	 Z	s   ii	i	(i?i6	i i:i#
i.i;i#i#:
i(
i(
i.Ai4j	j j0jj.6jj89j71j*Ej7Hj74i>jj'*j4	/j77k c                P   g }t        | j                        D ]  \  }}|j                  d      rG|j                  |d      s*| d}| j                  j                  |      }|j                  ||f       ^| j                  j                  |      }|j                  ||f        |S )Nrj  Fr  )rs  r  r3  r  r  r  r  )r  r  r  r  r  
alias_namer  s          r'   r  z!PallasKernel._compute_alias_pairs  s     .0&s'8'89 	9MGTy)"&&tU3$(6J # 7 7 = =j II&&	7';<3399$?	""Iw#78	9 r)   c                $   |j                   D ]M  }t        |t              r |j                  |j	                                3|j                   j                  |       O | j                  D ]%  \  }}||j                  v s|j                  |       ' y)z:Emit the kernel body lines and store operations into code.N)rV  r   r8   rt  r  r  r  r  )r"   r  r  r  r\  rj  ru  s          r'   r  zPallasKernel._emit_kernel_body  s}      && 	)D$$t{{}-""4(		)
 $(#9#9 	+GZ#000z*	+r)   c                0   | j                   J | j                   }|j                  }t        |j                        }|j                  D cg c]&  }||j
                  k7  r||j                  k7  r||vr|( }}|j
                  g|j                  gz   |z   t        |j                        z   |j                  z   }	|j                  d| ddj                  |	       d       |j                         5  | j                  |||       ddd       |j                  d       | d}
dd	g|j                  z   |j                  z   }t        t        d
t        |j                        z               }ddj                  d |D              z   dz   }|j                  d| d       |j                  d|
 ddj                  |       d       |j                         5  |j                  }|j
                  }|j                   }t        |j"                        }|j                  d       t        |      D ]   }||k7  s	|j                  d| d| d       " |j                  d| d       |dk(  r|j                  d| d| d       nA|gt        |      D cg c]
  }||k7  s	| c}}|j                  d| d| d| d| d	       g }|D ]  }d| d}|j                  d| d| d        |j                  d!| d"| d#       |j                  d$| d%       |j                  d!| d"| d&| d'       |j                  d(       |j                  d!| d"| d)       |j%                  |        g }|j                  D ]0  }d| d}|j                  | d"| d#       |j%                  |       2 |j                  D cg c]	  }| d*|  }}|rd+| d,dj                  |       d}n| d-}|j                  d.       d/t        |      z   }|j                  d0       |j                         5  |j                  d1       |D ]  }|j                  d2| d3| d4        |j                  D ]  }|j                  d5        	 ddd       |j                  d       t        |j                        }|j                  d6dj                  d7g|z        z   dz          g }t'        |j                        D ]#  \  }}d/|z   |z   } |j%                  |  d8|        % dj                  |      }!t        |      D cg c]  }d9| d:
 }"}d;dj                  |"      z   dz   }#|j                  d<       |j                         5  |j                  | d=       |j                  d>|# d=       |j                  d?       |j                         5  |j                  d@       |j                  dA       |j                  dB       |j                  dC       ddd       |j                  dD       |r|j                  dE|! dF       | j(                  s|j                  dG|j*                   d=       ddd       |gdHgz   |z   |z   }$|j                  dIdj                  |$       d       |j                  dJ       ddd       | j-                  ||
       yc c}w # 1 sw Y   xY wc c}w c c}w # 1 sw Y   hxY wc c}w # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   bxY w)Kz=Emit kernel, JIT wrapper, and main entry for scalar prefetch.Nro  rp  r   rq  rN  rr  ry  rz  r	   r   c              3  2   K   | ]  }t        |        y wrQ   rL  rt  s     r'   r=  z@PallasKernel._codegen_scalar_prefetch_wrapper.<locals>.<genexpr>V  rw  r>  ru  rx  r   z_D = 1z
_D = _D * z.shape[rg  z_seq = z	.shape[0]r   z_table_3d = ro  z.shape[0], 1, _D)z
.transposez	], 1, _D)rA  _3dzif z.ndim == 2 and z.shape[0] == _seq:    rn  z.reshape(_seq, 1, _D)zelif z.ndim == 1:z.reshape(1, 1, z
.shape[0])else:z.reshape(1, 1, -1)r  r  r  _kernelzR_ROW_SPEC = pl.BlockSpec((1, 1, _D), lambda i, _: (i, jnp.int32(0), jnp.int32(0)))r   z_in_specs = [zPpl.BlockSpec((1, 1, _D), lambda gi, idx: (idx[gi], jnp.int32(0), jnp.int32(0))),z_ROW_SPEC if z$.shape[0] == _seq else pl.BlockSpec(zA.shape, lambda i, _: (jnp.int32(0), jnp.int32(0), jnp.int32(0))),z
_ROW_SPEC,z_out_specs = [	_ROW_SPECr}  z/jax.ShapeDtypeStruct((_seq, 1, _D), out_dtypes[r  rf  _result = pl.pallas_call(,z
out_shape=z'grid_spec=pltpu.PrefetchScalarGridSpec(znum_scalar_prefetch=1,zgrid=(_seq,),zin_specs=_in_specs,zout_specs=_out_specs,r  zinput_output_aliases={  },z
interpret=	_table_3d)(z?return tuple(r.reshape(s) for r, s in zip(_result, out_shapes)))r  r  r   r  r  r  r  rE  r  rt  rR  r  r  r  rG  r  r  r  r  rs  r  r  r  )%r"   r  r  r  r  r  	alias_setr%   other_input_paramsprefetch_kernel_paramsr  r  r  r  tableindicesind_dimrO  r  rV  r  pallas_call_other_argsp3dpallas_call_alias_argssvr  
kernel_refnum_non_alias_in_specsrA  num_outputsalias_map_partsr  arg_idxr  out_shape_partsout_shape_exprall_pallas_argss%                                        r'   r  z-PallasKernel._codegen_scalar_prefetch_wrapper,  s    ##///''xxs//0	 ,,
H***X)))"	 
 
 ##$##$% ! 3##$% 	  	 	;-x		2H(I'J"M	
 [[] 	;""4c:	; 	r)],7<(3+>+>>AXAXX 	 eAC,?,?(@$@AB!$tyy0P0P'P!PSW!W9:P9QQRS	
 	./q>1J0K2NO[[] z	((E,,G++Gx++,DNN8$4[ D<NNZwgaS#BCD NNWWIY78!|eWIeWDUVWKeDk"JQ'\1"JK"5'D6gWWIY8 &("' 3!CjQCqc9KLMcU#aS0EFGqc56cU#aSs*MNw'cU#aS0BCD&--c23 &("%% 3!Cj#c!,ABC&--c23
 473F3FGRrd!B4LGLG( 6		,/03 
 !,}G4
 NNA
 &'-C)D%D"NN?+ 1O 2 CNN"e $)), .UV )) 1ANN<011 NN3c//0KNN 499k][-H#IICO !O'(8(89 @
44w>&&'"WI'>?@ !%		/ : {+ B!BGO  !499_#==CNNN67 J*Q/0N+;1=>HI[[] <NN#;<NN?3NN#89NN#:;	<
 t$"NN%=>O=PPT#UV{{NNZ0E0E/Fa#HIJ  	- () ))  NNR		/ :;1=>NNQqz	x 	  &67A
(	; 	;> #K: H"1 1:< <	J JCz	 z	s   
+[/[A\2A\
[[D\8[A"\(A["<B3\/[/<:\6A\ A[4A\ !A
\[
\"[,	'\4[=9\  \		\\c                    d}|j                   r|dz  }|dz  }|dz  }n|j                  s|dz  }| j                  r|j                   s
|dz  }|dz  }|j                  j	                  |d       y )	Na  
import functools
import math
import torch
import jax
import jax.numpy as jnp
from jax.experimental import pallas as pl
from torch.utils._ordered_set import OrderedSet
from torch._inductor.runtime.runtime_utils import (
    pallas_compute_tiling, pallas_make_block_spec, pallas_permute,
    pallas_gpu_align_output_specs, pallas_gpu_pad_inputs,
    pallas_gpu_unpad_results,
    pallas_ensure_nonzero_rank,
    pallas_make_block_spec_non_tiled,
    torch_dtype_to_jax_runtime,
)
z
import jax.exportz1
from jax.experimental.pallas import tpu as pltpuz8
from torch_tpu._internal.pallas import tpu_torch_pallasz8
from jax.experimental.pallas import mosaic_gpu as plgpuzA
import os as _os; _os.environ.setdefault('JAX_PLATFORMS', 'cpu')Tstrip)r  r  r  r  splice)r"   r  importss      r'   r  zPallasKernel._codegen_imports  s    " ::,,GKKGRRG%%RRG

TG KKGt,r)   c                
   d}d}t        d | j                  j                         D              }| j                  j                         D ]8  \  }}||k(  r|j                  s|c S ||z   c S |j                  r|dz  }4|dz  }: y)a  Map an iteration variable to its output tensor axis index.

        Non-reduction variables map to axes 0, 1, 2, ... in order.
        Reduction variables map to axes after all pointwise axes.
        Returns None if the mapping cannot be determined.
        r   c              3  @   K   | ]  \  }}|j                   rd   ywr@  r3  )r<  rA  r  s      r'   r=  z2PallasKernel._get_iter_var_axis.<locals>.<genexpr>  s     UAann1Ur4  r   N)r@  r+  rJ  r  )r"   r  r  r_idxn_pwr  r  s          r'   _get_iter_var_axiszPallasKernel._get_iter_var_axis  s     U!6!6!<!<!>UU//557 	JCg~%*%7%7vITE\I!!
!	 r)   c           
        t        | j                  j                         D cg c]A  }t        |j                  t
        t        j                  f      rt        |j                        C c}      fd}| j                  j                         }|j                  | j                  j                         d\  }}|D ]  } ||      }|d   s|\  }} ||fS  ||fS c c}w )Nc                    t         j                  j                  |       }|t        |j	                               dk  ryt        d |j	                         D              }t        j                  |      }|v r||fS dS )Nr   NNc              3  v   K   | ]1  }t        |t        t        j                  f      rt        |      n| 3 y wrQ   )r   r  r  r  r;  s     r'   r=  zePallasKernel._get_reshape_target_shape_and_numel.<locals>._get_nd_shape_if_matches.<locals>.<genexpr>  s3       %Qemm(<=A1Ds   79)r   r   r  r  r  r  r   r  )rw  rh  r  r  iter_lengthss       r'   _get_nd_shape_if_matcheszRPallasKernel._get_reshape_target_shape_and_numel.<locals>._get_nd_shape_if_matches  st    ''((2C{c#,,.1Q6!  E IIe$E%*l%:E5>LLr)   r  r   )r   r+  r  r   rK  r  r  r  r  copyr  r!   r  )	r"   r  r  candidate_buf_namesreshape_target_shapereshape_target_numelrw  r/   r  s	           @r'   #_get_reshape_target_shape_and_numelz0PallasKernel._get_reshape_target_shape_and_numel  s    
 " ..557ahhemm(<= AHH
		M #77<<>""499#:#:;5?22+ 	H-h7Fay=C:$&:#%999	 $%9999s   ACc                   ||   }|j                   j                  }| j                  |      }| j                  |      }t	        |      }| j                  |||      }dg|z  }	||	|<   dj                  |	      }
d| d}| d|
 d}|S )Nr  r   r  r   ro  )r  rK  r   r   r  _broadcast_axis_idxrR  )r"   broadcast_varsbroadcast_idxbvrK  renamed_length
length_strnum_broadcast_dimsaxis_idxr
  r  arangereshapeds                r'   $_make_broadcasted_iteration_var_exprz1PallasKernel._make_broadcasted_iteration_var_expr0  s     M*--f5ZZ/
 0++M+=
 e00 *HIIk*	zl!,XYyk3r)   c                X   | j                   r| j                  s| j                  sy |j                  d       | j	                         \  }}t        | j                   j                               }g }d }t        |      D ]J  \  \  }}	| j                  |	j                        }
|
|
|k(  r}.|j                  t        ||	|
             L t        |      }t        |      D ]G  \  \  }}	|| j                  vrt        |      }|	j                  }| j                  |      }| j                  |      }| j                  |      }
|
m|rS|dkD  rN|k7  rIt!        fdt        |      D        d       }|)| j#                  ||      }|j                  | d|        |j                  | d| d       |rMt        |      dkD  r?|
|k(  r:dj%                  d |D              }d	| d}|j                  | d| d
| d       !|dkD  rL|k7  rGt!        fdt        |      D              }| j#                  ||      }|j                  | d|        rdj%                  d | j&                  j(                  D              }||v }|xr: | j                   xr+ | j*                   xr | j,                   xr |	j.                   }|r| j1                  |      nd }|:|j                  | d| d| d| d       | j2                  j5                  |       1|j                  | d| d       J y )Nz*# Define iteration variables as JAX arraysr   c              3  H   K   | ]  \  }}|j                   k(  s|  y wrQ   r  r<  r  r  r  s      r'   r=  z7PallasKernel._codegen_iteration_vars.<locals>.<genexpr>p  s     Qtq!AEESLQ   ""rn  z = jnp.arange(r   r   c              3  2   K   | ]  }t        |        y wrQ   rL  r;  s     r'   r=  z7PallasKernel._codegen_iteration_vars.<locals>.<genexpr>  s     %Kc!f%Kr>  r  ro  c              3  H   K   | ]  \  }}|j                   k(  s|  y wrQ   r  r  s      r'   r=  z7PallasKernel._codegen_iteration_vars.<locals>.<genexpr>  s$      %!Qquu|A%r  rJ  c              3  2   K   | ]  }t        |        y wrQ   rL  rM  s     r'   r=  z7PallasKernel._codegen_iteration_vars.<locals>.<genexpr>  s     (StT(Sr>  z = jnp.arange(_pallas_tile[z$]) + pl.program_id(_pallas_ax2g.get(z, 0)) * _pallas_tile[rg  )r+  r  r   rt  r  rE  rJ  rs  rC  rK  r  r  r  r8   r   r   r  r  rR  r   rV  r  r  r  r  r  r  )r"   r  r  r  r  	var_itemsr  total_var_idxr  r  r  r  r[  rK  r  r  r  r#   r  r  rZ  var_in_computecan_tile_relativer  r  s                           @r'   r  z$PallasKernel._codegen_iteration_varsC  s   
 %%dkkd>Q>QJK 446 	32 ..4467	%.y%9 	!C!'55J%*8L*L #%%'WeZH	 !0%.y%9 H	T!C!'5d1117|H\\F!11&9NN3J/J!(*Q.},$(Qy'@Q%M %0#HH*M  $--
#dV.DE %%
.A&NO %,-1"66 II%K6J%KK	&zl!4%%
#fXYykQR&ST#a'C=,@ $ %"+N";% ! @@"M %%
#dV&<=  $yy(St||?R?R(SS!)\!9" / KK/ 222/ !555/ "... " 9JD++G4t  '))#*$?z J==EJ G++3*A7
 0044W=))XJnZLPQ*RSQH	Tr)   c                n    t        d | D              }t        d | D              }|xr |}|r|S |dz
  |z
  S )Nc              3  d   K   | ](  }t        |j                        j                  d        * ywrNr8   r  r3  r<  r  s     r'   r=  z3PallasKernel._broadcast_axis_idx.<locals>.<genexpr>  s(      !
02C

O&&s+!
s   .0c              3  f   K   | ])  }t        |j                        j                  d         + ywr  r  r  s     r'   r=  z3PallasKernel._broadcast_axis_idx.<locals>.<genexpr>  s+      !
46BJJ**3//!
s   /1r   )rD  )r  r  r  has_reduction_varshas_pointwise_varsis_mixeds         r'   r  z PallasKernel._broadcast_axis_idx  s\     ! !
6D!
 
 ! !
:H!
 
 &<*<  !A%55r)   c                   |j                   }|j                  }|j                  }|j                  d       |j                  d       |j                  d       |j                  d       |j                  d       |D ]  }|j                  d| d        |j                  d       |j                  d	       |j                  d
       |D cg c]  }| d	 }}|D cg c]  }| d	 }	}|D cg c]  }| d	 }
}|D cg c]  }| d	 }}|j                  d       |j                  d       ||	z   }|
|z   }t	        t        |            D cg c]  }d| 	 }}dj                  ||z         }|j                  ddj                  |       d| d       |j                         5  |j                  d       |j                  d       |j                         5  |j                  d       |j                  d       |j                  d       t        t        ||
            D ]#  \  }\  }}|j                  d| d| d| d       % |j                  d       |j                  d       t        |      D ]  \  }}|j                  d| d        |j                  d       |j                  d       |
|z   }|j                  d      j                         }|j                  | ddj                  |       d       |j                  d       |j                  d        |j                  d!       t        |	|      D ]  \  }}|j                  d"| d| d#        |j                  d$       |j                  d       |j                  d%       d d d        |j                  d       |j                  d&       |j                  d'       d d d        |j                  d       |j                  d(       |j                  d)       t        |
      D ]"  \  }}||   }|j                  d*| d+| d,       $ t        |      D ]  \  }}|j                  d*| d-| d.        |D ]  }|j                  d*| d/        |j                  d       |j                  d0       |j                  d1       |j                  d       |j                  d2       |j                  d3       |j                         5  |j                  d4       |j                  d5       |j                  d6       d d d        |j                  d7       |D ]  }|j                  d8| d9        |j                  d       |j                  d       |j                  d:       |j                  d;       y c c}w c c}w c c}w c c}w c c}w # 1 sw Y   ,xY w# 1 sw Y   xY w# 1 sw Y   xY w)<Nz6# Use lax.fori_loop with TMA for automatic OOB maskingzfrom jax import laxz"_tile_size = 128  # Warpgroup sizez_orig_out_shapes = out_shapesz_max_numel = 0z_max_numel = max(_max_numel, .size)for shape in out_shapes:z2    _max_numel = max(_max_numel, math.prod(shape))z8_num_tiles = (_max_numel + _tile_size - 1) // _tile_size_gmem_smemrN  z4# Wrapper kernel using lax.fori_loop with direct TMA	_barrier_r   zdef _tma_kernel(z, *, rq  zdef _tile_body(_tile_idx, _):z$_tile_start = _tile_idx * _tile_sizez5# TMA load inputs from GMEM to SMEM (OOB auto-masked)zplgpu.copy_gmem_to_smem(z%.at[pl.ds(_tile_start, _tile_size)], z, _barrier_r   z # Wait for TMA loads to completezplgpu.barrier_wait(_barrier_z# Compute on SMEM tilesr  r   z7# TMA store outputs from SMEM to GMEM (OOB auto-masked)zplgpu.commit_smem()zplgpu.copy_smem_to_gmem(z$.at[pl.ds(_tile_start, _tile_size)])zplgpu.wait_smem_to_gmem(0)zreturn Nonez# Iterate over all tilesz.lax.fori_loop(0, _num_tiles, _tile_body, None)zA# Build SMEM scratch shapes for inputs, outputs, and TMA barriersz_scratch_shapes = {}z_scratch_shapes['z'] = plgpu.SMEM((_tile_size,), r  z*'] = plgpu.SMEM((_tile_size,), out_dtypes[r  z"'] = plgpu.Barrier(num_arrivals=1)z4# Create flattened output specs aligned to tile sizezV_flat_out_specs, _ = pallas_gpu_align_output_specs(out_shapes, out_dtypes, _tile_size)z## Call plgpu.kernel with TMA kernelz_result = plgpu.kernel(z_tma_kernel,zout_shape=_flat_out_specs,zscratch_shapes=_scratch_shapes,r  r  z.flatten(),z$# Reshape results to original shapesz:return pallas_gpu_unpad_results(_result, _orig_out_shapes))r  r  r  rt  rG  r  rR  r  rs  rY  rstripr  )r"   r  r  r  r  r  rv  r%   gmem_input_paramsgmem_output_paramssmem_input_paramssmem_output_paramswrapper_kernel_paramsall_smem_paramsr  barrier_paramsscratch_paramsgmem_insmem_inrA  kernel_call_argsrH   gmem_outsmem_out
smem_param
orig_parambarrier_params                              r'   r  z%PallasKernel._codegen_jit_wrapper_tma  s   xx!55)) 	OP,-;<67'(( 	JENN:5'HI	J12KLQR2EFQs%[FF3@Aa5kAA2EFQs%[FF3@Aa5kAArMN 14F F+.@@38=P9Q3RSaIaS/SS?^#CDtyy)>?@nEUUWX	
 [[] )	MNN2NN:; ".EFr"VW-6)+<=. )A) NN27);`ah`iituvtwwxy r"AB%&78 HDAqNN%A!A#FGH r"89#47I#I &--c288:	)Adii8H.I-J!LMr"M 45*-.@BT*U &HhNN28*BxjHlm ;<r"}-E".H NN2NN56NNKLS)	MX 	rO	
 	-.&'89 	MAz,Q/JNN#J</NzlZab	
 ''9: 	MAzNN#J</YZ[Y\\^_	 , 	MNN#M?2TU	
 	rMNd	
 	r<=01[[] 	>NN>*NN78NN<=	> 	t( 	6ENNT%45	6sr=>STG GAFA T". ".)	M )	MP	> 	>sO   V'V,1V1V6V;,3WF7W ;W94W W
	WWW#c                   |j                   }|j                  }ddj                  |       d}|j                  d       |j                  d       |D ]  }|j                  d| d        |j                  d       |j                  d	       |j                  d
       |j                  d       |j                  d       |j                  d       |j                  d       |j                  d| d       |j                  d       |j                  d       |j                  d|z          |j                  d       |j                  d       |j                  d       |j                  d       |j                  d       |j                  d       |j                  d       |j                  d       |j                  d       |j                  d| d       |j                  d       |j                  d       |j                  d        |j                  d!       |j                  d"       |j                  d#|z          |j                  d$       |j                  d%       |j                  d&       |j                  d'       |j                  d(       |j                  d)       |j                  d*       |j                  d+|        |j                  d,       |j                  d-       |j                  d.       |j                  d"       |j                  d#|z          |j                  d$       |j                  d%       |j                  d/       y )0Nrf  r   rg  z7# Check if all tensors have same size (no broadcasting)z_all_sizes = []z_all_sizes.append(r  r  z'    _all_sizes.append(math.prod(shape))z&_unique_sizes = OrderedSet(_all_sizes)zH_can_pad = len(_unique_sizes) == 1 and all(s > 1 for s in _unique_sizes)rN  zif _can_pad:z5    # All tensors same size - safe to flatten and padz+    _padded_inputs = pallas_gpu_pad_inputs(r   zZ    _aligned_out_specs, _is_scalar = pallas_gpu_align_output_specs(out_shapes, out_dtypes)z    _result = plgpu.kernel(z        z%        out_shape=_aligned_out_specs,z    )(*_padded_inputs)zD    return pallas_gpu_unpad_results(_result, out_shapes, _is_scalar)r  zA    # Different sizes - check if it's a reduction (scalar output)z)    _out_numel = math.prod(out_shapes[0])r  z    if _out_numel <= 1:zG        # Scalar output (reduction) - pad inputs but keep scalar outputz/        _padded_inputs = pallas_gpu_pad_inputs(z#        _aligned_out_specs = tuple(z.            jax.ShapeDtypeStruct(shape, dtype)z;            for shape, dtype in zip(out_shapes, out_dtypes)z	        )z        _result = plgpu.kernel(z            z)            out_shape=_aligned_out_specs,z        )(*_padded_inputs)z        return _resultz	    else:zP        # Non-scalar output with broadcasting - broadcast inputs to output shapez%        _target_shape = out_shapes[0]z        _broadcasted = [z>            jnp.broadcast_to(_inp, _target_shape) for _inp in z	        ]z<        _padded_inputs = pallas_gpu_pad_inputs(_broadcasted)z^        _aligned_out_specs, _is_scalar = pallas_gpu_align_output_specs(out_shapes, out_dtypes)zH        return pallas_gpu_unpad_results(_result, out_shapes, _is_scalar))r  r  rR  rt  )r"   r  r  r  r  
input_listrv  s          r'   r  z,PallasKernel._codegen_jit_wrapper_legacy_gpu6  s    xx!55#678:

 	PQ()( 	?ENN/wf=>	?12@A?@V	
 	r~&NODZLPQRSh	
 	45zJ./>?/0R	
 	wO	
 	BCv01U	
 	HTUVW<=GHTU{#89~
23BC34/0{#^	
 	>?12LZLY	
 	{#UVl	
 	89~
23BC34V	
r)   c                n    | j                   j                  j                         D ]  \  }}||k(  s|c S  y)z:Map a kernel parameter name back to its graph buffer name.N)r!   r  rJ  )r"   rv  
graph_name
inner_names       r'   rr  zPallasKernel._param_to_buf_name  s<    &*ii&=&=&C&C&E 	""J
U"!!	" r)   c                   |j                   }| j                  }| j                  rdnd}|j                  rdnd}g }|j                  D ]A  }| j                  |      }|j                  |r| j                  j                  |      nd       C | j                  }	|	rd|	 nd}
t        d |D              rdt        |       nd}|j                  d| d	| d
| | |
 d	       |j                  d       |j                  d       |j                  d       |j                  d       |j                  d       |j                  d       | j                  ||j                         |j                  D ]O  }| j                  |      }|r| j                  j                  |      nd}|6|j                  | d| d| d       Q dj                  |j                        }t        d |D              redj                  d |D              }|j                  d| d       |j                  d       |j                  d| d       |j                  d       y|j                  d       |j                  d| d       |j                  d       y)uQ  Generate tiled BlockSpec and grid variables for CPU/TPU.

        Tiles the last 1–2 dimensions of each tensor, respecting TPU
        alignment constraints (last dim multiple of 128, second-to-last
        multiple of 8).  Lower-ndim inputs are right-aligned with the
        reference output shape per numpy broadcast rules.
        r   r   Nz, max_grid_product=rN  c              3  $   K   | ]  }|d u 
 y wrQ   r=   r<  r%   s     r'   r=  z4PallasKernel._codegen_tiled_specs.<locals>.<genexpr>  s     4Q1D=4r  z, permutations=zM_tile, _grid, _ax2g = pallas_compute_tiling(_pallas_out_shapes[0], transpose=z, skip_last_n=z4, exact_only=len(_pallas_out_shapes[0]) < 2, is_tpu=r   z_ng = len(_grid)z_ref = _pallas_out_shapes[0]r  zF    pallas_make_block_spec(s, _ref, _tile, _ax2g, _ng, is_output=True)z    for s in _pallas_out_shapesrn  ro  r   c              3  $   K   | ]  }|d u 
 y wrQ   r=   r+  s     r'   r=  z4PallasKernel._codegen_tiled_specs.<locals>.<genexpr>  s     0q}0r  c              3  2   K   | ]  }t        |        y wrQ   )r   r+  s     r'   r=  z4PallasKernel._codegen_tiled_specs.<locals>.<genexpr>  s     !=a$q'!=r>  z_perm_flags = [rg  r  z]    pallas_make_block_spec(i.shape, _ref, _tile, _ax2g, _ng, permutation=p) for i, p in zip([z], _perm_flags)zG    pallas_make_block_spec(i.shape, _ref, _tile, _ax2g, _ng) for i in [)r  rX  rW  r  r  rr  r  r  r  r  rD  r   rt  r|  r  rR  )r"   r  r  skip_nhas_transposeis_tpu_literal	all_permsr%   rw  mgpmgp_arg	perms_argrv  r  r%  	perm_lists                   r'   r  z!PallasKernel._codegen_tiled_specs  s    xx&&"&"9"9w#&::7 35	(( 	A..q1H=E++//94	 ((14'u-" 4)44 d9o./ 	
 	& (!( #$%k'!%	
 	)*5623T	
 	89s&&tS-D-DE ,, 	GE..u5HDLT2266x@RVF!%E7)F81EF		G YYs667
0i00		!=9!==INN_YKq9:NN56NN%%/LA NN3NN56NNYZdYeefg NN3r)   c                   |j                   }| j                  rdnd}|j                  d       |j                  d|z          |j                  d       |j                  d       |j                  d       |j                  d|j                   d	       |j                  d
| d	       |j                  |rd| dnd       |j                  d       |j                  rA|j                  D cg c]  }d| d
 }}|j                  ddj                  |       d	       |j                  d       |j                  d       |j                  d       |j                  d       |j                  d       |j                  d       y c c}w )N_gridz(1,)r  r  z     out_shape=out_shapes_pallas,z    out_specs=out_specs_pallas,z    in_specs=in_specs_pallas,z    interpret=r  z	    grid=z    input_output_aliases={ r  z    input_output_aliases={},r  zpallas_ensure_nonzero_rank(r   r   zif isinstance(_result, tuple):zF    _result = tuple(r.reshape(s) for r, s in zip(_result, out_shapes))r  z,    _result = _result.reshape(out_shapes[0])zreturn _result)r  r  rt  r  r  rR  )	r"   r  r  r  r  r  	grid_exprr%    kernel_input_params_nonzero_ranks	            r'   r  z)PallasKernel._codegen_jit_wrapper_cpu_tpu  sm    xx#00Gf	23v
*+9:8967(=(='>a@A9+Q/0 ++<*=TB/	

 	t""<?<S<S078-aS20, 0 NNT$)),L"M!NaPQs78T	
 	wEF'(0s   -Fc                f    |j                   r| j                  ||       y | j                  ||       y rQ   )r  _codegen_main_entry_tpu_codegen_main_entry_default)r"   r  r  s      r'   r  z PallasKernel._codegen_main_entry  s+    ::((.>?,,S2BCr)   c                   |j                   }|j                  d       |j                   d}|j                  }|j                  d| ddj                  |j                         d       |j                         5  |j                  d       |j                  d       t        |j                        t        |j                        z   }|D ]  }|j                  | d	| d
| d|          |j                  d       g }|j                  D ]0  }	|j                  |	 d|	 d|	 d       |j                  |	 d       2 |j                  D ]0  }
|j                  |
 d|
 d|
 d       |j                  |
 d       2 |j                  ddj                  |j                  D cg c]  }d| d
 c}      z   dz          g }|j                  D ]p  }|j                  j                  |      }|<t        j                  j                  |      }||j                  t!        |             \|j                  d| d       r |j                  ddj                  |      z   dz          ddg}|j#                  |j$                         |j#                  |       |j                  d| ddj                  |       d       g }|j                  D ]  }|j                  d| d        |rdj                  |      nd}g }|j&                  D ]  }|j                  d| d        |rdj                  |      nd}|j                  d | d!| d"|        |j                  d#| d$       |j                         5  |j                  d%       |j                         5  |j                  d&| d'       d d d        |j                  d(       |j                         5  |j                  d&| d)       d d d        d d d        t        |j                        t        |j                        z   }|j                  d*dj                  |       d+       |j                  d,       |j                         5  |j                  D ]q  }|j                  j                  |      }|:t        j                  j                  |      }||j                  d-| d.|d/       Z|j                  d-| d.| d0       s 	 d d d        |j                  d+       |j(                  r8dj                  d1 |j(                  D              }|j                  d2| d3       n|j                  d4       |j                  d%       |j                         5  |j                  d5| d6       d d d        |j                  d(       |j                         5  |j                  d7| d8       d d d        d d d        y c c}w # 1 sw Y   =xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   #xY w# 1 sw Y   xY w# 1 sw Y   WxY w# 1 sw Y   y xY w)9NrN  _mainro  r   r   , stream=None):z*jax.config.update('jax_enable_x64', False)jax.clear_caches()rn  z.to(torch.int32) if z.dtype == torch.int64 else z+# Build JAX placeholders for export tracingz$_placeholder = jax.ShapeDtypeStruct(z#.shape, torch_dtype_to_jax_runtime(z.dtype))_placeholderout_shapes = (tuple(r  ru  torch_dtype_to_jax_runtime(r  out_dtypes = (ry  rz  zexported = jax.export.export(z, platforms=['tpu'])(r   z'_'.join(str(s) for s in z	 + 'x' + z''zkernel_key = 'z	_out_' + z + '_in_' + z.if not tpu_torch_pallas.lookup_custom_kernel('z', kernel_key):ztry:z)tpu_torch_pallas.register_custom_kernel('zF', kernel_key, serialized_mlir_module=exported.mlir_module_serialized)zexcept TypeError:z/', kernel_key, exported.mlir_module_serialized)zinput_tensors = [rg  zoutput_shape_tensors = [ztorch.empty(z.shape, dtype=z, device='tpu'),z.dtype, device='tpu'),c              3  0   K   | ]  \  }}| d |   ywr|  r=   r~  s      r'   r=  z7PallasKernel._codegen_main_entry_tpu.<locals>.<genexpr>  s     )T&1aQCr!+)Tr  z_input_output_aliases = { z }z_input_output_aliases = {}z%tpu_torch_pallas.call_custom_kernel('zt', kernel_key, inputs=input_tensors, output_shapes=output_shape_tensors, input_output_aliases=_input_output_aliases)zJtpu_torch_pallas.call_custom_kernel(input_tensors, output_shape_tensors, 'z%', kernel_key, _input_output_aliases))r  rt  r  rR  r  r  rE  r  r  r  r  r  r  r   r   r  r   r  r  r  r  )r"   r  r  r  	main_namekernel_name_strall_input_params
param_nameall_jax_input_namesr  ptrr  dtype_exprsrw  r   wrapper_placeholder_argsshape_key_partsr%   output_key_exprinput_key_partsinput_key_exprinput_tensor_namesalias_map_strs                          r'   r;  z$PallasKernel._codegen_main_entry_tpu  s    xxr'u-	//9+Qtyy)?)?@AQ	
 [[] P	
 NNGHNN/0  $C$4$45S=M=M8NN. 
!l#j\ 2$%@N NNHI"$!.. H
!l"F!l"Ej\QY[ $**j\+FGH '' Ae?e>se8M $**cU,+?@A NN ))@Q@QRvdV73RST
 &(K)) P3377='GG--h7E(#**+=e+DE ""%@g#NOP NN+dii.DDtKL )5l'C$$++C,?,?@$++,?@NN/#$ %II678; !O&& O&&)B1#W'MNO 6E  1$  !O,, O&&)B1#W'MNO 6E  1$  NN  1"#~.0 NN@@QQ`a  v&[[] NN+, -RS 23[[] NN+, -;<" "&c&6&6!7$s?O?O:P!PNN.tyy9K/L.MQOP NN56 -- D"77;;DAH+ ! 1 1( ; , NN".tfN5)K[ \ %NN&tfN4&@VW NN3  $		)TCOO)T T!<]O3OP;<NN6" '( )BC NN./ '( )-.UP	 P	F Sf   , .  UP	 P	s   :D'X4!W#.GX45"XW(-)XW5,X4A8X4,BX.BX4=X)X4<X(X4#X4(W2-X5W?:XX	X4X	X4X%	!X4(X1	-X44X=c                   |j                   }|j                  d       |j                   d}|j                  d| ddj                  |j                         d       |j                         5  |j                  d       |j                  r|j                  d       |j                  d	       |j                  rK|j                  d
       |j                  D ]+  }| j                  |||j                  |j                         - |j                  d       |j                  D ]3  }|j                  d      s| j                  |||j                  d       5 |j                  d       |j                  D ]3  }|j                  d      s| j                  |||j                  d       5 |j                  d       |j                  ddj                  |j                  D cg c]  }d| d
 c}      z   dz          g }|j                  D ]p  }|j                  j                  |      }	|	<t        j                   j#                  |	      }
|
|j%                  t'        |
             \|j%                  d| d       r |j                  ddj                  |      z   dz          i |j                  D ]
  }| d|<    |j                  D ]
  }| d|<    ddg}|j)                  |j*                         |j)                  fd|j,                  D               |j                  d| ddj                  |       d       |j                  d        |j.                  rH|j                  d!       |j.                  D ](  }|j                  |   }|j                  | d"| d#       * d d d        y c c}w # 1 sw Y   y xY w)$NrN  r>  ro  r   r   r?  z)jax.config.update('jax_enable_x64', True)z>jax.config.update('jax_default_device', jax.devices('cpu')[0])r@  z*# Convert Torch -> JAX for donated outputs)
contiguousz+# Convert Torch -> JAX for in-place tensorsrk  Fz!# Convert Torch -> JAX for inputsrn  Tz-# Prepare output metadata from PyTorch tensorrB  rC  r  ru  rD  r  rE  _jaxry  rz  c              3  (   K   | ]	  }|     y wrQ   r=   )r<  r  arg_name_maps     r'   r=  z;PallasKernel._codegen_main_entry_default.<locals>.<genexpr>  s      %'+T"%r  zres = r   zjax.block_until_ready(res)z9result_values = res if isinstance(res, tuple) else (res,)z'.copy_(torch.from_dlpack(result_values[z])))r  rt  r  rR  r  r  r  r  _emit_torch_to_jaxr  r  r3  r  r  r  r   r   r  r  r   r  r  r  r  )r"   r  r  r  rG  r  rL  r  rM  rw  r   wrapper_call_argsr  r  rY  s                 @r'   r<  z(PallasKernel._codegen_main_entry_default  s    xxr'u-	9+Qtyy)?)?@AQ	
 [[] E	NNFG##T NN/0KL"%"2"2 J ++"

#&#7#7	 ,  NNHI'' U>>,/++D#szze+TU NN>?'' T>>(+++D#szzd+ST NNJKNN ))@Q@QRvdV73RST
 &(K)) P3377='GG--h7E(#**+=e+DE ""%@g#NOP NN+dii.DDtKL+-L!.. ?
.8\+>Z(?'' 1'*e4LS!1 ".| <$$S%8%89$$ %/2/F/F%  NNV$4#5QtyyAR7S6TTUVWNN78&&O 22 C"005HNN#*$KC5PSTCE	 E	F SGE	 E	s.   /CN78AN7AN7"N2/F:N72N77O c               B    |rdnd}| j                  | d| | d       y )Nz.detach().contiguous()z	.detach()z_jax = jax.dlpack.from_dlpack(r   )rt  )r  r[  r  rV  suffixs        r'   rZ  zPallasKernel._emit_torch_to_jax  s-     .8)[(#A(F8STUVr)   c                   t         j                  j                  }| j                  j	                         \  }}}}|D cg c]  }|j
                   }}|D 	cg c]  }	|	j                  d      s|	 }
}	t        t        t        |            }t        | di       }|
D 	cg c](  }	|j                  |	d      r||j                  |	         * }}	| ddj                  ||z          d}|j                  |       yc c}w c c}	w c c}	w )z7Generate the Python code that calls this Pallas kernel.rj  r  Fz.run(r   r   N)r   r   wrapper_coder!   r  r  r3  rE  r  r8   r  r  r  rR  rt  )r"   r  r  wrapperr  r  rA  r   kernel_param_namesr%   r  call_arg_strs	aliasablealias_call_argskernel_calls                  r'   call_kernelzPallasKernel.call_kernel  s    ''&&$(II$<$<$>!)Q.67aff77&8TALL<S1TTSi01D"6;	 %
}}Q& ,22156
 
 eDIIo.M$N#OqQ+& 8T
s   C9C>.C>-D)
r#   r6   r  r6   r  r   r  r   r7   None)r  r6   r7   r8   )r7   r   )r  r6   r7   r   )r  r6   r7   r   )r  r6   r7   zlist[sympy.Symbol])r  r6   r  r8   r7   z!list[tuple[int, int, int]] | None)rh  r8   ri  zlist[tuple[int, int, int]]r7   r8   )r  r   ru  r  r7   rg  )rK  r  r7   list[int] | None)r  rE  r7   zdict | None)r  r8   r  r6   r7   ztuple[int, ...] | None)r  r8   r  r6   r7   z.tuple[tuple[int, ...], tuple[int, ...]] | None)r  r  r  r  r7   rh  )r  r6   r7   r  )r   r   r7   r  )r  r  r7   r  )r  r  rx  r  r7   rh  )r  r  r7   ztuple[bool, bool])r7   tuple[int, ...])r  r   r7   r  )r7   r  )r7   r   )r  r8   r7   z'tuple[Any, Any, Any, list, bool] | None)r  r6   r7   ztuple[int, OrderedSet])r  r6   r   r   r7   r   )
rL  rE  r  rE  r  r   r2  r   r7   r   )r  r8   r  r6   rA  r  r7   r  )rR  r8   r  ri  r7   r8   )r[  r8   r7   rR   )rh  r8   r  r8   r  r6   r7   z_IndirectAccessInfo | None)r7   rg  )
rh  r8   r  r8   r  r6   rA  r  r7   r8   )r  r8   rR  r8   r7   r8   )r  r8   r  r6   rR  r8   r7   r8   )r  r6   rA  r  r7   r  )rw  r8   r  r6   r  r   r7   r   )r  r8   r7   r   )r  r8   r  r   r  r   r7   r  rQ   )r  r8   r  r8   r  r6   r  r   rA  r  r  r   r7   r  )r  r8   r  r   r  zdict[str, Any]r  r8   r  r   r7   r8   )r  r8   r  r6   r7   r   )
r  r8   r  r6   r  r   r  r   r7   rg  r  )r  r6   r   r  r  int | floatr7   rj  )rN  )r  r6   r  r8   r7   dict[str, Any] | None)r  r8   r  r8   r  r  r7   rk  )r  r6   r  r8   r  r  r7   rk  )
r   r  r   r  r9  rA   r  %CSEVariable | tuple[CSEVariable, ...]r7   rl  )rG  r8   r7   r   )r  rR   r7   r8   )r  r  r  r  r7   r  )r  r   r  r   r  r  r7   rg  )r  r  r  r8   r  r   r7   rg  )r  r  r7   rg  )r  r  r7   r  )r7   z)tuple[tuple[int, ...] | None, int | None])r  list[_BroadcastedIterVar]r  r  r7   r8   )r  r   r  r  r7   rg  )r  rm  r  r  r  r  r7   r  )r  r  r  r8   r7   rg  )rv  r8   r7   rR   )
r  r  r  r8   r  r  r  r8   r7   rg  )r  r  r  r8   r7   rg  )
r  r   r[  r8   r  r   rV  r   r7   rg  )r  r8   r  zIRNode | Noner7   rg  )Zr9   r:   r;   r<   rV   	overridespallas_pexprr   r  rL   r	  r  r  r)  r'  r   r0  r5  r7  rc  r  rl  r|  r  r  r  r  r  r  rC  rF  r  r  r  r  r  r&  rB  rI  r4  r;  rI  rK  rP  rS  rZ  rn  rz  r  r  r  r  r  r  r  r  r  typing_extensionsoverrider  r  r  r  r  r  r  rE  rH  rh  r  r  r  r  r  r  r  r  r  r  r  r  rr  r  r  r  r;  r<  rZ  rf  __classcell__r  s   @r'   r  r    s    &I)5E&5.+`88&089=8FJ8	8!>FbH<8:5K7II'*I	*IV 	, 	,(S"(S,5(S	(ST    Dhh *h	hTrr *r	7rh  ,U0      *& dL		CJ(KT''	'(,6	 ## # 	#
 !# 
#J88 8 "	8
 
8t++ + "	+
 
+ZHIHI HI "	HI
 
HIT 6 6$I
I
!I
*4I
	#I
V(
T88 8 	8
 "8 
8t4YKYK *YK7:YK	YKv>>+:>	>@&9&9)3&9EO&9	&9P2h*=A	@ 7:7: 7: 	7:
 7: "7: 7: 
7:rN
N
 N
 %	N

 N
 N
 
N
` -  -^@D LP5757 *573>57FI57	57  57n EF

 ,
7B
	
 
 57NN.1N	N*

.1
CF
	
B+
+
/2+
DG+
	+
Zmm m &	m
 5m 
/m^ & &
DLQf />	 ++ $+ 	+
 
+ l8l8 l8 $	l8
 
l8\->&":	2":H7HK	&eT)eT0?eT	eTN 6166  6 
	6 6&vUpL
"L
03L
	L
\G R#)#) #) +	#)
 #) 
#)JDZ"Z69Z	ZxN"N69N	N` WW(+W59WJNW	W W' 'r)   r  c                  T     e Zd ZeZedd       Z fdZeZeZ		 	 	 	 	 	 	 	 ddZ
 xZS )PallasSchedulingc                6    t        t        j                  g      S rQ   )r   r   REDUCE_TO_SINGLE_ELEMENT)clsr  s     r'   get_backend_featuresz%PallasScheduling.get_backend_features
  s     >BBCDDr)   c                   t         |   ||      sy|j                         r|j                         rddlm} i }|j
                  j                  D ](  }t        ||      s|j                  ||j                  <   * |j
                  j                  D ]<  }t        ||      s|j                  |v s||j                     |j                  k7  s< y y)NFr   )	MemoryDepT)
r  can_fuser  torch._inductor.dependenciesr{  r  readsr   r  r  )r"   node1node2r{  reads1r  r  s         r'   r|  zPallasScheduling.can_fuse  s    wu- E$6$6$8>F((.. 1c9-'*yyF388$1 ((.. %c9-#((f2Dchh'3994$% r)   c                   t         j                  j                  }||j                  v r|j                  |   S t        j
                  j                  r$t        |t        j
                  j                        nd}t        j                  |j                  d            j                         d d }|dk(  rd| }nd| d| }||j                  |<   |j                  d|      }t               }|j                  d|d	       |j                  |d
       |j                  d       t!        ||      \  }	}
|	 d|
 }|j#                  ||j%                         |       |S )NrN  zutf-8   fusedpallas_rA  rl  zasync_compile.pallas(z, r'''Tr  z''')rJ  )r   r   r_  src_to_kernelr
   tritondescriptive_namesr   hashlibsha256encode	hexdigestr  r   rt  r  r   define_kernelr  )r"   src_codenode_scheduler   r`  
fused_namekernel_hashr  compile_wrapperoriginsdetailed_originsmetadata_comments               r'   r  zPallasScheduling.define_kernel(  sZ    ''&&w,,,((22 }}.. "-1P1PQ 	
 nnX__W%=>HHJ2AN #K=1K#J<q>K*5h' ##O[A(*!!$9+"OPxt4!!&)$7w$O!!%Yb)9(:;k?+C+C+EGWXr)   )r  ztorch.devicer7   zOrderedSet[BackendFeature])r  r8   r  zSequence[BaseSchedulerNode]r   r  r7   r8   )r9   r:   r;   r  kernel_typeclassmethodry  r|  can_fuse_verticalcan_fuse_horizontalr  rr  rs  s   @r'   ru  ru    sZ    KE E
* !""" 3" 	"
 
"r)   ru  )B
__future__r   dataclassesr  r   rW  rp  typingr   r   r   r  r   torch.utils._ordered_setr   torch.utils._sympy.functionsr   rN  r
   runtime.runtime_utilsr   r   r   r   virtualizedr   block_analysisr   commonr   r   r   r   r   simdr   r   r   r   r    ro  collections.abcr>   r?   irr@   ops_handlerrA   	schedulerrB   MAIN_SUFFIX_logginggetArtifactLoggerr9   rJ   rF   r  rT   rV   	dataclassr  r  r  r  r  ru  r=   r)   r'   <module>r     sW   "    	  + +   / 8  6 >  /  C BM < && 2+-  ..228]K> >.U, Uk
K k
\          ' ' '.u=': u='p{C~ Cr)   