
    9j                       U d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
mZmZ d dlmZ d dlmZmZmZ d dlZd dlZd dlZd dlmZ d dlmZmZ d dlmZ d dlmZmZm Z  d d	l!m"Z"m#Z#m$Z$ d
dl%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8 ddl9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZE ddlFmGZGmHZHmIZImJZJ ddlKmLZLmMZMmNZNmOZOmPZPmQZQmRZRmSZSmTZTmUZUmVZVmWZW ddlXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZbmcZcmdZdmeZemfZfmgZg ej                  dk(  Ziej                  d        Zkej                  j                  end      Zo eg d      Zpddddddd d!d"d"d#
Zq eg d$      Zrd%d&d'd(d)d*d+d,d-d.d/
Zsd0d1d2Ztej                  ej                  gZwej                  ej                  ej                  ej                  ej                  ej                  ej                  ej                  ej                  ej                  ej                   gZeej                     ed3<   d4 Zd5 Z	 	 	 dyd6ej                  dz  fd7Zd8 Zd9eSd:ej                  d;ed<ej                  d=ej                  d>eMfd?Zd@eeOz  dAedBedCej                  dDeez  f
dEZdFeSdGedHefdIZdFeSfdJZej"                  d6ej                  dKej                  fdL       Zej"                  d6ej                  dKej                  dMefdN       Zej"                  	 dzd6ej                  dKej                  dMedz  fdO       Zej*                   G dP dQ             Z G dR dSe6      Z G dT dU      ZdV Z G dW dXeV      Zej7                  dY        G dZ d[e      Zej7                  d\       ej;                           G d] d^e      Z G d_ d`eT      Z G da dbe      Z G dc dde      Zdee0d>eej                  dz  ezf   fdfZ G dg dh      Z G di dje      Z G dk dle      Z G dm dne      Z G do dpe3      Z G dq dr      Z G ds dt      Zej*                   G du dv             Zej*                   G dw dx             Zy){    N)CallableSequence)Enum)AnycastOptional)dependencies)is_float_dtypeis_integer_dtype)
OrderedSet)CeilDivFloorDivModularIndexing)free_symbol_is_typesymbol_is_typeSymT   )counters   )configcpp_buildercpu_vec_isairmetrics)'set_kernel_post_grad_provenance_tracing)LoopBody)BaseSchedulerNodeBaseSchedulingExternKernelSchedulerNodeForeachKernelSchedulerNodeFusedSchedulerNode	SchedulerSchedulerNode)cache_on_selfget_bounds_index_exprget_fused_kernel_namehas_free_symbolsis_multi_outputs_templateis_welford_reductionparallel_num_threadsPlaceholdersympy_index_symbolsympy_index_symbol_with_prefixsympy_product
sympy_subs)NullKernelHandleropsOpsValueV   )BackendFeatureBracesBufferCSECSEVariableDataTypePropagationDeferredLineDTYPE_TO_COMPUTATION_DTYPEIndentedBufferKernel
KernelArgsOpOverridesOptimizationContext)_get_dtype_from_loopbodies_get_loop_bodycexprcexpr_indexcodegen_randCppCSEVariableDTYPE_TO_CPPget_promote_dtype
INDEX_TYPELocalBufferContextmay_unify_binary_op_mask_typepromote_args(template_fusion_with_epilogues_supportedunify_mask_base_typevalue_to_cppwin32c                      t         rdS dS )Nz__declspec(dllexport) _IS_WINDOWS     [/media/conek/DATA/Code/OCR/venv/lib/python3.12/site-packages/torch/_inductor/codegen/cpp.pyget_export_declarationrX   Z   s    &1"9r9rV   schedule)+*^||minmaxrZ   r[   r\   r^   r_   argminargmaxr]   welford)
sumprodxor_sumr^   r_   r`   ra   anywelford_reducewelford_combine)
r_   r^   rc   rd   re   rg   rh   r`   ra   rf   z
at::Tensorlongdoubleboolzstd::stringzc10::ScalarTypezat::MemoryFormatz
at::Layoutz
at::Devicez
at::Scalar)
Tensorintfloatrk   str
ScalarTypeMemoryFormatLayoutDevicenumberzstd::vectorzstd::optional)Listr   VECTORIZABLE_DTYPESc                    |t         v rt        j                  }| dv ry| dk(  ry| dv r|t        |   }|t        j                  k(  r| dv rt        t        j
                     }t        |      rd| dnd	| d
}t        |      rd	| dnd	| d}| dv r|n|}| dv r|S d| d| dS t        |       rdt        |    dS t        |       )N)re   rc   rf   r   rd   r4   )r_   ra   r^   r`   r`   ra   -std::numeric_limits<>::infinity()std::numeric_limits<>::min()>::max())r_   ra   )r_   r^   IndexValue<z>{0, }Welford<>())	DTYPE_LOWP_FPtorchfloat32rG   rk   rn   r
   r)   AssertionError)reduction_typedtypecdtypemin_varmax_varinit_vars         rW   reduction_initr      s    22;;e$EJJ>5I#I!%++.F e$ $F8=9'xx8 	 e$ #6(-8'xx8 	
 -0AA7w / 	
 vhfXJb9	

 N+,u-.c22

((rV   c                     t         t        |      }t        |       rd| dS | dv r0|t        j                  k(  rt         t        j
                     }d| dS |S )Nr   >rx   r~   )rG   r;   r)   r   rk   rn   )r   r   scalar_types      rW   reduction_acc_typer      sa    9%@AKN++a((--EJJ&u{{3K[M++rV   indexc           	      p   |t         j                  k(  }| dk(  r|r	d| d| dS |rdnd}| d| d| S | dk(  r| d	| S | d
k(  r| d| S | dk(  r| d| S | dv r|  d| d| dS | dk(  r|rd| d| d| dS d| d| dS | dk(  r6t        |t              r|\  }}	}
nt	        | |      \  }}	}
d| d| d|	 d|
 d	S | dv rkt        |d      rD|j                  t         j                  k(  r'|j                  s||  d| d| d| dS |  d| d| dS ||  d| d| d| dS |  d| d| dS t        |       )Nrc   cascade_sum_combine(, &)|rZ    rd    * re    ^ rf    || )r^   r_   z_propagate_nan(, rg   welford_combine(rh   , {})rx   r   z	_combine(z, static_cast<float>(), )))	r   rk   
isinstancetuplereduction_projecthasattrr   is_vecr   )r   var
next_value
helper_valr   	src_dtypeis_boolconjunctionmeanm2weights              rW   reduction_combiner      s    5::%G)*SAFF!(#cKU!K=*66c*&&"c*&&d:,''' !R
|1EE))%cU"ZLJ<qII%cU"ZL::**j%()D"f0LD"f!#d4&2$bDD--J(  EJJ.%% ()3%7LZLX[\a[bbcdd &&iu4I*UWX $%Yse2j\E7!LL$%Yse2j\CC

((rV   c                 J    t        |       r| d| d| dfS | dv r| dS |S )Nz.meanz.m2z.weightrx   z.index)r)   )r   accs     rW   r   r     sC    N+e}SkcU'?::	/	/f~JrV   codeiter_varnew_iter_var
loop_startloop_endreturnc                 b   t               }t        j                         5 }|j                  dt         d| dt        |       d| dt        |       d| dz          |j                  |j                                t        | j                        D ]  \  }}t        |t        t        f      sJ d}	t        |t              r|j                  }	|j                  }t        j                   d	| z   d	z   | |      }
|	rt        |	|
      }
|j                  |
        	 ddd       |S # 1 sw Y   |S xY w)
a  
    f(iter_var) is transformed to f(new_iter_var) under the inner loop
      \/
    for (new_iter_var = loop_start; new_iter_var < loop_end; new_iter_var++) {
        f(new_iter_var)
    }
    Please be careful while using this function,
    as the variable defined in f(iter_var) will be invalid outside the for loop.
    For example:
    auto tmp0 = in_ptr[x0]; ->
    for (new_x0 = start; new_x0 < end; new_x0++){
        auto tmp0 = in_ptr[new_x0];
    }
    The tmp0 is invalid outside the loop.
    zfor (r    = ; < ; ++)N\b)r6   
contextlib	ExitStack	writelinerI   rD   enter_contextindent	enumerate_linesr   ro   r:   namelineresub)r   r   r   r   r   transformed_codestack_r   deferred_namenew_lines              rW   move_code_under_inner_loopr     s?   , $~				 15""J<qc+j2I1J!Lc+h"7!8<.LM	
 	,3356 - 	1GAt    !M$- $		yyvve
3e;~PTUH'x@&&x0	11, -1, s   C:D$$D.acc_varacc_typer   r   lenc                     t               }t        j                         rd|  d| d| dn
| d|  d| d}|j                  |        |j	                  d| d	d
d|  d |||       ddg       |S )a  
    MSVC don't support dynamic array(VLA). So we use std::unique_ptr here.
    Ref: https://stackoverflow.com/questions/56555406/creating-dynamic-sized-array-using-msvc-c-compiler
    MSVC is the only one compiler without VLA. support. Since MSVC can't get good performance here.
    We just use unique_ptr make it works on MSVC.
    For other compilers, we continue to use VLA to get best performance.
    auto z_arr = std::make_unique<z[]>();r   _arr[];for (int i = 0; i < ; i++){    z
_arr[i] = r   r   )r<   r   
is_msvc_clr   
writelines)r   r   r   r   r   init_fncode_bufferacc_decls           rW   reduction_prefix_arrayr   =  s     !"K !!# y0
$se2Fz7)5R0 
 XJ("3%v.7):gne&D%EQG		
 rV   bufferr   new_namec                 D   t        | j                        D ]  \  }}t        |t        t        f      sJ t        |t              r/t        j                  d| z   dz   | |j                        |_        ]t        j                  d| z   dz   | |      | j                  |<    y )Nr   )r   r   r   ro   r:   r   r   r   )r   r   r   ir   s        rW   replace_acc_namer   ^  s    V]]+ V4
 	
 
 dL)u$058XJSDI!vve&7%&?H:PTUFMM!VrV   c           
         d}t        | j                        D ]  \  }}t        |t        t        f      sJ t        |t              r|j
                  n|}t        j                  ||      }|sU|j                         \  }}t        j                  || d| d| d|      }t        |t              r||_        || j                  |<    y)zT
    Replaces `acc = cascade_sum_combine(value, ...)` with `acc = acc + value;`
    z/(.*?)\s*=\s*cascade_sum_combine\(([^,]+),.*?\);r    + r   N)
r   r   r   ro   r:   r   r   searchgroupsr   )	r   patternr   r   contentmatchr   valuenew_contents	            rW   replace_cascade_sum_with_addr   m  s    
 AGV]]+ /4
 	
 
  *$=$))4		'7+JC&&SESEUG1*EwOK$-'	#.a !/rV   r   c                     | j                  |      st        j                  j                  S ||dz   i}t	        | |      }t        j
                  || z
        S Nr4   )hassympySZeror/   simplify)r   r   replacement	new_indexs       rW   	stride_atr     sJ    99S> ww||a.K5+.I>>)e+,,rV   
vec_lengthc                   	 dd	fd}	fd}| }t        j                  dd      }| j                  t              r| j	                  t        |      |      } t        j                  dd      }| j                  t
              r| j	                  t        ||      |      } t        j                  |       } | |k7  rt        |       S | S )a  
    Simplifies the index expression within the range of a vectorized loop.
    Given a vectorized loop variable `var` in the range of a loop with `vec_length`,
    this function transforms the `index` into an equivalent form. It handles
    simplifications for cases where `var` can be expressed as `vec_length * a + b`,
    where `b` ranges from 0 to `vec_length - 1`. The function reduces occurrences
    of `FloorDiv` and `ModularIndexing` in the `index` with best-effort optimizations.

    NOTE:
    The simplified index expression is intended for analysis purposes only, not
    for code generation. It replaces `FloorDiv` and `ModularIndexing` with free variables
    which are not dependent on the loop variable `var` in the vectorized range. Check
    https://github.com/pytorch/pytorch/pull/117221#discussion_r1449746217 for more details.

    Examples:
    1. If `var` is `x3` and `vec_length` is 16, and `x3 = 16*a + b`, then
       `FloorDiv(x3, div)` or `ModularIndexing(x3, div, mod)` becomes a free variable
       when `div` is divisible by 16.
    2. `ModularIndexing(x3, 1, mod)` can be simplified to `x3 + c` where `c` is a free
       variable when `mod` is divisible by 16.
    r   c                     t        |       }t        j                  |       k(  rt        j                   d       }dz  |S )N_div_cr4   )r   r   gcdSymbol)divisorresultdiv_freevar_idr   r   s     rW   visit_indexing_divz7simplify_index_in_vec_range.<locals>.visit_indexing_div  sK    #w'99Wj)Z7\\SE/?"@AFaNrV   c                    t        | |      }t        j                  |       k(  r!t        j                   d       }dz  |S | dk(  r;t        j                  |      k(  r"t        j                   d       z   }dz  |S )N_mod_cr4   )r   r   r   r   )r  modulusr  mod_freevar_idr   r   s      rW   visit_modular_indexingz;simplify_index_in_vec_range.<locals>.visit_modular_indexing  s     gw799Wj)Z7\\SE/?"@AFaN  \eii<
J5<<3%vn5E(FGGFaNrV   r  T)integerr  )r   Wildr   r   replacer   r   simplify_index_in_vec_range)
r   r   r   r  r	  original_indexdivmodr  r  s
    ``     @@rW   r  r    s    0 NN	 N
**Y
-CyyhsC02DE
**Y
-Cyy!oc3<>TUNN5!E*5#zBBLrV   c                 8    |rt        | ||      } t        | |      S N)r  r   )r   r   r   s      rW   stride_at_vec_ranger    s"     +E3
CUC  rV   c                   &    e Zd ZU dZeed<   eed<   y)ParallelDepthz{
    A class representing parallel depth.
    Includes the starting depth of parallelism and the depth of parallelism.
    parallel_depthstart_depthN)__name__
__module____qualname____doc__rm   __annotations__rU   rV   rW   r  r    s    
 rV   r  c                   `     e Zd Zededefd       Zdddeeez     f fdZ	d Z
d	 Zd
 Z xZS )OuterLoopFusedSchedulerNodenode1node2c                    |j                   |j                   u sJ t        d ||fD              sJ t        d ||fD              rt | |j                   t        |      t        u rt        |j                               n|gt        |      t        u r t        |j                               z   |      S |gz   |      S  | |j                   ||g|      S )Nc              3   T   K   | ]   }t        |      t        t        t        fv  " y wr  )typer  r#   r!   .0nodes     rW   	<genexpr>z3OuterLoopFusedSchedulerNode.fuse.<locals>.<genexpr>  s1      
  J+"
   &(c              3   >   K   | ]  }t        |      t        u   y wr  r#  r  r$  s     rW   r'  z3OuterLoopFusedSchedulerNode.fuse.<locals>.<genexpr>       TTtDz88T   )	schedulerallrf   r#  r  listget_outer_nodes)clsr  r   outer_loop_fusion_depths       rW   fusez OuterLoopFusedSchedulerNode.fuse  s     %//111 
 
 
 	
 
 TeU^TT E{&AA ..01  E{&AA ..01 (#   (# ( u8OPPrV   r-  r"   outer_fused_nodesc                     || _         || _        g }| j                   D ]B  }t        |t        t        f      sJ |j                  t        |j                                      D t        | %  ||       y r  )
r4  r2  r   r#   r!   extendr/  	get_nodessuper__init__)selfr-  r4  r2  flatten_snodes_node	__class__s         rW   r9  z$OuterLoopFusedSchedulerNode.__init__  su      	 (?$++ 	;Eem5G%HIII!!$u'8"9:	; 	N3rV   c                     | j                   S r  )r4  r:  s    rW   r0  z+OuterLoopFusedSchedulerNode.get_outer_nodes  s    %%%rV   c           
      8   dt         dt         dt        dt        dt        f
fdt        t	        |      dz
        D ]0  }||   j
                  }||dz      j
                  } |||d      r0 y	 |D ]  }t        j                  t        j                  |j                  d |       }t	        |j                        |kD  sMt        |t        j                        sht        |j                  |   t        j                        s|d
z  |j                  |   k  s y	 y)Nleft_loop_nestright_loop_nestloop_fusion_depthcurrent_checking_depthr   c                 ^   | j                   sJ |j                   sJ | j                   |   |j                   |   g d}t        fd|D              sy|dk\  sJ |dz
  x}dkD  rE|dz   }|t        | j                         k  sJ |t        |j                         k  sJ  | |||      syy)N)r   sizeoffsetstepsc              3   P   K   | ]  }t        |      t        |      k(    y wr  )getattr)r%  attr_compareleft_loop_levelright_loop_levels     rW   r'  zaOuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attr.<locals>._inner.<locals>.<genexpr>9  s2       % O\:/>?s   #&Fr4   r   T)loopsr.  r   )rA  rB  rC  rD  outer_loops_attr_compare_listrL  rM  _inners        @@rW   rP  zNOuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attr.<locals>._inner'  s     "''''"((((,223IJO.445KL-)   )F  $)))%6%::!a?)?!)C&-N4H4H0IIII-O4I4I0JJJJ"#%*	 !rV   r4   r   F,  T)LoopNestrm   rk   ranger   	loop_nest	functoolsreduceoperatormulrangesr   r   Integer)	r:  cpp_kernel_proxy_listr2  idxrA  rB  cpp_kernel_proxyouter_rangesrP  s	           @rW   "check_outer_fusion_loop_level_attrz>OuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attr  s<   (	$(	%(	  #(	 %(	(	
 (	T 23a78 		C237AAN3C!G<FFO'	 		 !6 	$++ ''(@)@AL $++,/FF|U]];$++,CDMM !3&"))*ABC )	, rV   c                 D   |d   j                   }t        |      }|D cg c]'  }|j                  j                  | j                        ) c}|_        |d   }||j                  _        |j                  j                  d | j                   |j                  _        |S c c}w Nr   )kernel_groupOuterLoopFusedKernelrT  from_loop_levelr2  innerkernelrN  )r:  r[  rb  outer_loop_fused_kernelproxyouter_fused_proxys         rW   merge_outer_fusion_kernelsz6OuterLoopFusedSchedulerNode.merge_outer_fusion_kernelst  s     -Q/<<"6|"D /)
 OO++D,H,HI)
% 2!4-D##*,=,G,G,M,M*d**-
##) ! )
s   ,B)r  r  r  classmethodr   r3  r/  r!   r#   r9  r0  r_  rj  __classcell__r=  s   @rW   r  r    s]    "Q%"Q.?"Q "QH44   2] BC4 &Tl!rV   r  c                   2    e Zd ZddefdZd Zd Zd Zd Zy)	RecordOptimizationContext	func_namec                 .    || _         d | _        d | _        y r  )rp  current_nodeopt_ctx)r:  rp  s     rW   r9  z"RecordOptimizationContext.__init__  s    "2637rV   c                    t         j                  sJ t         j                  j                  sJ t         j                  j                  | _        | j                  J t        j                  | j                  j
                  v r-| j                  j
                  t        j                     | _        nt               | _        | j                  J | j                  | j                  _        | S r  )	r3   interpreterrr  r@   keymetars  rp  ops_namer?  s    rW   	__enter__z#RecordOptimizationContext.__enter__  s    }}}}}))))MM66  ,,,""d&7&7&<&<<,,112E2I2IJDL.0DL||''' $rV   c                     | j                   sJ | j                  sJ | j                  | j                   j                  t        j                  <   y r  )rr  rs  rw  r@   rv  r:  exc_typeexc_valexc_tbs       rW   __exit__z"RecordOptimizationContext.__exit__  s>        |||:>,,2667rV   c                     | j                   S r  )rs  r?  s    rW   get_opt_ctxz%RecordOptimizationContext.get_opt_ctx  s    ||rV   c                 6    | j                   sJ | j                   S r  )rr  r?  s    rW   get_fx_nodez%RecordOptimizationContext.get_fx_node  s           rV   N)rR   )	r  r  r  ro   r9  ry  r  r  r  rU   rV   rW   ro  ro    s#    8# 8
G
!rV   ro  c                  r    t        d | D              rJ d       t        |       x}	t        |   S d| d    dS )Nc              3   X   K   | ]"  }t        |t              xr |j                   $ y wr  )r   rF   r   )r%  args     rW   r'  z$decltype_promoted.<locals>.<genexpr>  s#     Rc:c>2AszzARs   (*z*Promotion of vector types is not supported	decltype(r   r   )rf   rH   rG   )argsdts     rW   decltype_promotedr    sP    RTRR 4R  %%2B47)1%%rV   c                      e Zd ZdZed        Zed        Zed        ZedYd       ZedYd       Z	ed        Z
ed	        Zed
        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Z ed        Z!ed         Z"ed!        Z#ed"        Z$ed#        Z%ed$        Z&ed%        Z'ed&        Z(ed'        Z)ed(        Z*ed)        Z+ed*        Z,ed+        Z-ed,        Z.ed-        Z/ed.        Z0ed/        Z1ed0        Z2ed1        Z3ed2        Z4ed3        Z5ed4        Z6ed5        Z7ed6        Z8ed7        Z9ed8        Z:ed9        Z;ed:        Z<ed;        Z=ed<        Z>ed=        Z?ed>        Z@ed?        ZAed@        ZBedA        ZCedB        ZDedC        ZEedD        ZFedE        ZGedF        ZHedGeIj                  dHeIj                  fdI       ZKedGeIj                  dJeIj                  dKeIj                  dLeIj                  dMeIj                  f
dN       ZLedGeIj                  dHeIj                  fdO       ZMedGeIj                  dHeIj                  fdP       ZNedQ        ZOedR        ZPdSeQdTeQdUeRdVeSeQeTf   dWdf
dXZUy)ZCppOverrideszMap element-wise ops to C++c                 ,    t        | |       d|  d| dS )N(r   r   r  abs     rW   addzCppOverrides.add  #    #Aq)*!A3c!A66rV   c                 ,    t        | |       d|  d| dS )Nr   - r   r  r  s     rW   r   zCppOverrides.sub  r  rV   c                 ,    t        | |       d|  d| dS )Nr  r   r   r  r  s     rW   rX  zCppOverrides.mul  r  rV   Nc                    t        | t              sJ || j                  }t        j                  j                  | ||      }t        j                  j                  j                  t        j                  j                  |      }|j                  d| |fd|i       |t        v r6|t        j                  k(  r#	 t        j                  j                  | |||       |S )Nto_dtyper   r   rF   r   r3   rf  get_to_dtype_exprcsegeneratecomputeupdate_on_argsr   r   rn   cache_dtype_convertxr   r   use_compute_typesexprcsevars         rW   r  zCppOverrides.to_dtype  s    !^,,,Ixx))!UI>&&qxx'7'7>j1e*{I6NOM!i5;;&>> HH((IvuErV   c                    t        | t              sJ || j                  }t        j                  j                  | ||d      }t        j                  j                  j                  t        j                  j                  |      }|j                  d| |fd|i       |t        v r5|t        j                  k(  r"t        j                  j                  | |||       |S )NTroundinground_to_intr   r  r  s         rW   r  zCppOverrides.round_to_int  s     !^,,,Ixx))!UI)M&&qxx'7'7>nq%j;	:RSM!i5;;&>HH((IvuErV   c                 T    |t         v sJ | dt         d       dt         |    d|  dS )Nz missing from z.DTYPE_TO_CPPzc10::bit_cast<>(r   )rG   r  )r  r   r   s      rW   to_dtype_bitcastzCppOverrides.to_dtype_bitcast  s=    $U~hZ}&UU$U 34Bqc;;rV   c                     d|  dS )Nz	std::abs(r   rU   r  s    rW   abszCppOverrides.abs       1#QrV   c                     d|  dS )Nz	std::sin(r   rU   r  s    rW   sinzCppOverrides.sin  r  rV   c                     d|  dS )Nz	std::cos(r   rU   r  s    rW   coszCppOverrides.cos  r  rV   c                     d|  d|  dS )Nr  z)(-r   rU   r  s    rW   negzCppOverrides.neg      1#S1%%rV   c                     d|  dS )Nz	std::exp(r   rU   r  s    rW   expzCppOverrides.exp  s     1#QrV   c                     d|  dS )Nz
std::exp2(r   rU   r  s    rW   exp2zCppOverrides.exp2      A3a  rV   c                     d|  dS )Nzstd::expm1(r   rU   r  s    rW   expm1zCppOverrides.expm1      QCq!!rV   c                     d|  dS )Nz	std::erf(r   rU   r  s    rW   erfzCppOverrides.erf  r  rV   c                     d|  dS )Nz
std::erfc(r   rU   r  s    rW   erfczCppOverrides.erfc$  r  rV   c                     d|  dS )Nzcalc_erfinv(r   rU   r  s    rW   erfinvzCppOverrides.erfinv(       aS""rV   c                     d|  dS )Nz
std::sqrt(r   rU   r  s    rW   sqrtzCppOverrides.sqrt-       A3a  rV   c                     d|  dS )Nz1 / std::sqrt(r   rU   r  s    rW   rsqrtzCppOverrides.rsqrt2  s      s!$$rV   c                 |    t         j                  j                  }|dk(  r|  d|  dS |d|  dS t        d|      )Naccuracy + decltype()(1)zstd::log1p(r   8unrecognized config cpp.inject_log1p_bug_TESTING_ONLY = r   cppinject_log1p_bug_TESTING_ONLYr   r  bugs     rW   log1pzCppOverrides.log1p7  sW    jj66*SQCt,,[ 1%% J3'R rV   c                     d|  dS )Nz	std::tan(r   rU   r  s    rW   tanzCppOverrides.tanC  r  rV   c                     d|  dS )Nz
std::tanh(r   rU   r  s    rW   tanhzCppOverrides.tanhH  r  rV   c                 &    t         rd|  dS d|  dS )z
        On windows std::signbit only support float type.
        Ref: https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/signbit?view=msvc-170
        z std::signbit(static_cast<float>(r   zstd::signbit(r   rS   r  s    rW   signbitzCppOverrides.signbitM  s-      /qc4	
 !1%	
rV   c                     d|  d| dS )Nz	std::pow(r   r   rU   r  s     rW   powzCppOverrides.powZ  s    1#Rs!$$rV   c                     d|  dS )Nz	std::log(r   rU   r  s    rW   logzCppOverrides.log^  r  rV   c                     d|  dS )Nzstd::nearbyint(r   rU   r  s    rW   roundzCppOverrides.roundc  s     !1%%rV   c                     d|  dS )Nzstd::floor(r   rU   r  s    rW   floorzCppOverrides.floorh       QCq!!rV   c                 H    |  d| }|  d| }d|  d| d| d| d| d| d	S )
N /  % ((z
 < 0) != (z	 < 0) ? (z != 0 ? z - 1 : z) : r   rU   )r  r  quotrems       rW   floordivzCppOverrides.floordivm  sR     Cs|3qclA3j9SE$wtfDQUPVVWXXrV   c                     d|  dS )Nz
std::ceil(r   rU   r  s    rW   ceilzCppOverrides.ceilt  r  rV   c                     d|  dS )Nzstd::trunc(r   rU   r  s    rW   trunczCppOverrides.truncy  r  rV   c                     |  d| S Nr  rU   r  s     rW   truncdivzCppOverrides.truncdiv~  s     Cs|rV   c                     d|  d| dS )Nz
std::fmod(r   r   rU   r  s     rW   fmodzCppOverrides.fmod  s     A3b1%%rV   c                     d|  dS )Nzstd::isinf(r   rU   r  s    rW   isinfzCppOverrides.isinf  r  rV   c                     d|  dS )Nzstd::isnan(r   rU   r  s    rW   isnanzCppOverrides.isnan  r  rV   c                     d|  dS )Nzstd::lgamma(r   rU   r  s    rW   lgammazCppOverrides.lgamma  r  rV   c                     d|  dS )Nz
std::acos(r   rU   r  s    rW   acoszCppOverrides.acos  r  rV   c                     d|  dS )Nzstd::acosh(r   rU   r  s    rW   acoshzCppOverrides.acosh  r  rV   c                     d|  dS )Nz
std::cosh(r   rU   r  s    rW   coshzCppOverrides.cosh  r  rV   c                     d|  dS )Nz
std::sinh(r   rU   r  s    rW   sinhzCppOverrides.sinh  r  rV   c                     d|  dS )Nz
std::asin(r   rU   r  s    rW   asinzCppOverrides.asin  r  rV   c                     d|  dS )Nzstd::asinh(r   rU   r  s    rW   asinhzCppOverrides.asinh  r  rV   c                     d|  d| dS )Nzstd::atan2(r   r   rU   r  ys     rW   atan2zCppOverrides.atan2       QCr!A&&rV   c                     d|  dS )Nz
std::atan(r   rU   r  s    rW   atanzCppOverrides.atan  r  rV   c                     d|  dS )Nzstd::atanh(r   rU   r  s    rW   atanhzCppOverrides.atanh  r  rV   c                     d|  d| dS )Nzstd::copysign(r   r   rU   r	  s     rW   copysignzCppOverrides.copysign  s      s"QCq))rV   c           	         d|  dd|  df}t        d |D              rt        d |D              S t               }t        j                  j
                  j                  t        j                  | j                        }t        j                  j
                  j                  | j                  | j                        }|j                  d| d       |j                  d	| d
|  d| d       t        j                  j                  j                  |       ||f}t        ||      D ]/  \  }}t        j                  j
                  j                  ||       1 ||fS )Nfrexp()[0])[1]c              3   r   K   | ]/  }t         j                  j                  j                  |      d u 1 y wr  r3   rf  r  try_getr%  	cache_keys     rW   r'  z%CppOverrides.frexp.<locals>.<genexpr>  (     Wyqxx||##I.d:W   57c              3   n   K   | ]-  }t         j                  j                  j                  |       / y wr  r  r  s     rW   r'  z%CppOverrides.frexp.<locals>.<genexpr>  #     UY--i8U   35)r   shapezint32_t r   r   z = std::frexp(r   r   )r.  r   r6   r3   rf  r  newvarr   int32r!  r   r   r  splicezipput)r  
cache_keysr   exponentmantissacse_varsr  cse_vars           rW   frexpzCppOverrides.frexp  s&    aS%s$'77
WJWWU*UUU~88<<&&U[[&H88<<&&QWWAGG&D(1-.xjqcXJbIJ	%h'"%j("; 	1IwHHLLY0	1!!rV   c                     d|  d| dS )Nzstd::hypot(r   r   rU   r	  s     rW   hypotzCppOverrides.hypot  r  rV   c                     d|  dS )Nzstd::log10(r   rU   r  s    rW   log10zCppOverrides.log10  r  rV   c                     d|  dS )Nz
std::log2(r   rU   r  s    rW   log2zCppOverrides.log2  r  rV   c                     d|  d| dS )Nzstd::ldexp(r   r   rU   )r  ns     rW   ldexpzCppOverrides.ldexp  r  rV   c                     d|  d| dS )Nzstd::nextafter(r   r   rU   r	  s     rW   	nextafterzCppOverrides.nextafter  s     !2aS**rV   c                     t         j                  j                  }|dk(  ry|dk(  r|  dS |dk(  r|  d|  dS |	d|  d	|  d
S t        d|      )Ncompile_errorcompile error!runtime_error	; throw 1r  r  r  z	std::max(, decltype()(0))7unrecognized config cpp.inject_relu_bug_TESTING_ONLY = r   r  inject_relu_bug_TESTING_ONLYr   r  s     rW   reluzCppOverrides.relu  s|    jj55/!#O#S	?"JSQCt,,[qcQCu55 I#Q rV   c                     d|  d| dS )Nzmin_propagate_nan(r   r   rU   r  s     rW   minimumzCppOverrides.minimum       $A3b1--rV   c                     d|  d| dS )Nzmax_propagate_nan(r   r   rU   r  s     rW   maximumzCppOverrides.maximum  rE  rV   c                     |  d| d| S )N ?  : rU   )r  r  cs      rW   wherezCppOverrides.where  s     Cs#aS!!rV   c                     d|  d| dS )Nzmod(r   r   rU   r  s     rW   r  zCppOverrides.mod  s    aS1#QrV   c                 (    t        | t        |         S r  )rO   rG   )valr   s     rW   constantzCppOverrides.constant  s    Ce!455rV   c                    t        t        j                  j                  |             }t        j                  j                  j                  t        j                  j                  |t        |             }t        j                  ||      S )Nbounds)
rC   r3   rf  rename_indexingr  r  r  r%   r1   r  )r  r   idx_strr   s       rW   
index_exprzCppOverrides.index_expr  sb    0067hhll##HHg.CD.I $ 
 ||C''rV   c                 "   t               }t        j                  j                  j	                         }|j                  d| d       t        j                  j                  |      5  |j                         5   |       }|j                  d| d       d d d        d d d        |j                  d       t        j                  j                  j                  |       t        |d| d      }|  d| d| S # 1 sw Y   exY w# 1 sw Y   ixY w)	Nr    = [&]return r   r  z())rI  z() : )r6   r3   rf  r  r"  r   swap_buffersr   r  r$  rO   )maskbodyotherr   body_varr  
other_codes          rW   maskedzCppOverrides.masked"  s    ~ 88<<&&(xj/0XX""4( 	0$++- 	0VFNNWVHA./	0 	0 	s	% "%9XJc)BC
s8*E*66	0 	0 	0 	0s$   'D8C9D9D	>DDc                     |  d| S )N && rU   r  s     rW   logical_andzCppOverrides.logical_and3       D}rV   c                     d|  S )N!rU   r  s    rW   logical_notzCppOverrides.logical_not8      1#wrV   c                     |  d| S )Nr   rU   r  s     rW   
logical_orzCppOverrides.logical_or<  rd  rV   c                     |  d| S )N != rU   r  s     rW   logical_xorzCppOverrides.logical_xorA  rd  rV   c                     d|  d|  d| dS )Nr  )( & r   rU   r  s     rW   bitwise_andzCppOverrides.bitwise_andF      1#Rs#aS**rV   c                     d|  d|  dS )Nr  z)(~r   rU   rg  s    rW   bitwise_notzCppOverrides.bitwise_notJ  r  rV   c                     d|  d|  d| dS )Nr  rp   | r   rU   r  s     rW   
bitwise_orzCppOverrides.bitwise_orN  rs  rV   c                     d|  d|  d| dS )Nr  rp  r   r   rU   r  s     rW   bitwise_xorzCppOverrides.bitwise_xorR  rs  rV   c                    t               }|j                  d       |j                         5  t        | j                     }|j                  d| d| d       |j                  d| d| d| d       |j                         5  |j                  d	|  d
       d d d        |j                  d	|  d| d|  d| d	       d d d        |j                  d       |S # 1 sw Y   BxY w# 1 sw Y   (xY w)N[&]()constexpr decltype() max_shift = sizeof(z) * CHAR_BIT;$if ((static_cast<std::make_signed_t<>>() < 0) || ( >= max_shift))return decltype(z)(0);z#)(static_cast<std::make_unsigned_t<z) << r   ()r6   r   r   rG   r   r  r  r   scalar_ts       rW   bitwise_left_shiftzCppOverrides.bitwise_left_shiftV  s   ~w[[] 	#AGG,HNN%aS(=hZ}U NN6xjA3kRSQTTcd  <!1!E:;<NN"1#%H
RUVWUXX]^_]``bc	 	t< <	 	s$   AC&C&C&C#	C&&C/c           
         t               }|j                  d       |j                         5  t        | j                     }|j                  d| d| d| d       |j                  d| d| d| d	       |j                         5  |j                  d
|  d|  d       d d d        |j                  d
|  d|  d| d       d d d        |j                  d       |S # 1 sw Y   ?xY w# 1 sw Y   (xY w)Nr|  r}  r~  z ) * CHAR_BIT - std::is_signed_v<z>;r  r  r  r  r  rp  z >> max_shift); >> r   r  r  r  s       rW   bitwise_right_shiftz CppOverrides.bitwise_right_shiftj  s   ~w[[] 
	A#AGG,HNN%aS(=hZGghpgqqst NN6xjA3kRSQTTcd  K!1!BqcIJKNN-aS1#T!B?@
	A 	t	K K
	A 
	As$   AC)C#C)C&	"C))C2seedrG  c                     d|  d| dS Nznormalized_rand_cpu(r   r   rU   r  rG  s     rW   randzCppOverrides.rand|  s    %dV2fXQ77rV   base_offsetthreads_per_roundtidvecc                     d|  d| dS r  rU   )r  r  r  r  r  s        rW   
rand_eagerzCppOverrides.rand_eager  s     &dV2k]!<<rV   c                     d|  d| dS )Nz
randn_cpu(r   r   rU   r  s     rW   randnzCppOverrides.randn  s    D6F81--rV   c           	           d|  d| d| d| d	S )Nzrandint64_cpu(r   r   rU   )r  rG  lowhighs       rW   	randint64zCppOverrides.randint64  s#    vRxr#ba@@rV   c                     d|  d|  d|  dS )Nr  z)(1) / (decltype(z)(1) + std::exp(-r   rU   r  s    rW   sigmoidzCppOverrides.sigmoid  s    1#.qc1B1#RHHrV   c           
      N   t               }d|  d}d|  d}|j                  d       |j                         5  |j                  d|  d| d| d       |j                  d	|  d
| d| d       |j                  d       d d d        |j                  d       |S # 1 sw Y   xY w)Nr  )(0)r  r|  auto left = z > 0 ? rJ  r   auto right = z < 0 ? return left - right;r  r6   r   r   )r  r   scalar_zero
scalar_ones       rW   signzCppOverrides.sign  s     ~!!D) 4(
w[[] 	3NN\!GJ<s;-qQRNN]1#WZLK=PQRSNN12	3 	t	3 	3s   ABB$r   r   r   
extra_metar   c                     t         r  )NotImplementedError)r:  r   r   r   r  s        rW   partial_accumulatezCppOverrides.partial_accumulate  s
     "!rV   NT)Vr  r  r  r  staticmethodr  r   rX  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r,  r.  r0  r2  r5  r7  rB  rD  rG  rL  r  rP  rV  r`  rc  rh  rk  rn  rr  ru  rx  rz  r  r  r   Exprr  r  r  r  r  r  ro   r8   dictr   r  rU   rV   rW   r  r    s   %7 7 7 7 7 7 ( (T 	 	 < <             & &     ! ! " "     ! ! # # ! ! % % 	 	     ! ! 	
 	
 % %     & & " " Y Y ! ! " "   & & " " " " # # ! ! " " ! ! ! ! ! ! " " ' ' ! ! " " * * " "  ' ' " " ! ! ' ' + +   . . . . " "     6 6 ( ( 7 7          + + & & + + + +  &  " 85:: 8uzz 8 8 
=jj
=ZZ
= !::
= ZZ	
=
 ZZ
= 
= .EJJ .

 . . A

 AEJJ A A I I 
 
"" " 	"
 cN" 
"rV   r  r  c                   &    e Zd ZdZ fdZed        Zed        Zed        Zed        Z	ed        Z
ed        Zed	        Zed
        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Z ed        Z!ed        Z"ed         Z#ed!        Z$ed"        Z%ed#        Z&ed$        Z'ed%        Z(ed&        Z)ed'        Z*ed(        Z+ed)        Z,ed*        Z-ed+        Z.ed,        Z/ed-        Z0ed.        Z1ed/        Z2ed0        Z3ed1        Z4ed2        Z5ed3        Z6ed4        Z7ed5        Z8ed6        Z9ed7        Z:ed8        Z;ed9        Z<ed:        Z=ed;        Z>ed<        Z?ed=        Z@ed>        ZAed?        ZBed@        ZCedA        ZDedB        ZEedC        ZFedD        ZGedE        ZHedF        ZIedG        ZJedH        ZKedI        ZLedJ        ZMedSdK       ZNedSdL       ZOedM        ZPedN        ZQedO        ZRedP        ZSeTdQ        ZUeTdR        ZV xZWS )TCppVecOverridesz.Map element-wise ops to aten vectorization C++c                     t         |   |       fd}t        t              j	                         D ];  \  }}t        |dd       t        u s|dvst        | ||j                               = S )Nc                       fd}|S )Nc                     | D cg c]@  }t        |t        t        j                  f      st        |t              r|j
                  s|B }}| D cg c]   }t        |t              r|j
                  r|" }}t        |       }|r|rg }| D ]  }t        |t        t        j                  f      rt        |t        j                        r1|j                  s%t        j                  |t        j                        }n$t        j                  |t        j                        }t        |t              r|j                  n|}|j                  |        |r=t!        |      dk(  rt#        |      }n#
t$        j&                  u rt#        |dd        |dd  |r|rt        t(        j*                  t,              sJ |D cg c]p  }t        |t              r\|j
                  sP
t$        j.                  t$        j0                  t$        j2                  fvrt(        j*                  j5                  |      n|r }}|r 
|i |S t7        t$              }t9        |
j:                        }|J  || i |S c c}w c c}w c c}w )Nr   r4   )r   rm   r   r  rF   r   r/  	is_numberr1   rV  r   int64rP  r2   r   appendr   rL   r  rL  r3   rf  CppVecKernelr  r  r  	broadcastr8  rJ  r  )r  kwargsr  scalarsvectorsnew_argsnew_arg
scalar_opsscalar_funcr=  funcr:  s            rW   wrapperz6CppVecOverrides.__new__.<locals>.wrap.<locals>.wrapper  s8     $!#UZZ'89"37

    $!#~63::  
  :w!H# -%cC+<=)#uzz:3==&)nnS%++&F&)ll3&D/9#x/H#))cC ,-  8})#/#9!6!66'3HQRL'A w%ahh===  (0  $ !+7N C(/$($3$8$8$3$9$9$3$=$=("%"	 HH..w7 ")) H  $ 4V44 "'!=J")*dmm"DK&222&777@ s   AI(%I-5A5I2rU   )r  r  r=  r:  s   ` rW   wrapz%CppVecOverrides.__new__.<locals>.wrap  s    @8D NrV   r=  )r`  rV  )	r8  __new__varsr  itemsrJ  r  setattr__func__)r1  r  kargsr  r   methodr:  r=  s         @rW   r  zCppVecOverrides.__new__  sx    ws#O	b !1779 	;LD&v{D1\Ad S G dD$9:	; rV   c                     |  d| S )Nr   rU   r  s     rW   r  zCppVecOverrides.add      Cs|rV   c                     |  d| S )Nr  rU   r  s     rW   r   zCppVecOverrides.sub  r  rV   c                     |  d| S Nr   rU   r  s     rW   rX  zCppVecOverrides.mul  r  rV   c                     |  d| S r  rU   r  s     rW   truedivzCppVecOverrides.truediv   r  rV   c                     |  dS )Nz.abs()rU   r  s    rW   r  zCppVecOverrides.abs$      F|rV   c                     |  dS )Nz.sin()rU   r  s    rW   r  zCppVecOverrides.sin(  r  rV   c                     |  dS )Nz.cos()rU   r  s    rW   r  zCppVecOverrides.cos,  r  rV   c                     |  dS )Nz.exp()rU   r  s    rW   r  zCppVecOverrides.exp0  r  rV   c                     |  dS )Nz.exp2()rU   r  s    rW   r  zCppVecOverrides.exp24      G}rV   c                     d|  d}|  d| S )Nr  r  z	.exp() - rU   )r  vec_ones     rW   r  zCppVecOverrides.expm18  s#     aS%IgY''rV   c                     |  dS )Nz.erf()rU   r  s    rW   r  zCppVecOverrides.erf>  r  rV   c                     |  dS )Nz.erfc()rU   r  s    rW   r  zCppVecOverrides.erfcB  r  rV   c                     |  dS )Nz	.erfinv()rU   r  s    rW   r  zCppVecOverrides.erfinvF      IrV   c                     |  dS )Nz.sqrt()rU   r  s    rW   r  zCppVecOverrides.sqrtJ  r  rV   c                     t        t        j                  t              sJ t        | t              sJ | j
                  J t        j                  j                  | j
                         d|  d| dS )Nr   == r   r   r3   rf  r  rF   r   _get_mask_typer	  s     rW   eqzCppVecOverrides.eqN  c    !((L111!^,,,ww"""(())!''231QCtA3a@@rV   c                    t        t        j                  t              sJ t        | t              sJ | j
                  t        j                  k(  rO|j
                  t        j                  k(  sJ t        t        j                  j                  | |f      \  }}| d| S | j
                  J t        j                  j                  | j
                         d|  d| dS )Nrm  r  r   )r   r3   rf  r  rF   r   r   rk   rN   r  r  )r  r
  x_casty_casts       rW   nezCppVecOverrides.neU  s    !((L111!^,,,77ejj 77ejj(((1!((2B2BQFKNFFXT&**77&&&hh--agg67q4s!DDrV   c                     t        t        j                  t              sJ t        | t              sJ | j
                  J t        j                  j                  | j
                         d|  d| dS )Nr  r   r   r  r	  s     rW   ltzCppVecOverrides.lta  c    !((L111!^,,,ww"""(())!''231QCs1#Q??rV   c                     t        t        j                  t              sJ t        | t              sJ | j
                  J t        j                  j                  | j
                         d|  d| dS )Nr  z > r   r  r	  s     rW   gtzCppVecOverrides.gth  r  rV   c                     t        t        j                  t              sJ t        | t              sJ | j
                  J t        j                  j                  | j
                         d|  d| dS )Nr   <= r   r  r	  s     rW   lezCppVecOverrides.leo  r  rV   c                     t        t        j                  t              sJ t        | t              sJ | j
                  J t        j                  j                  | j
                         d|  d| dS )Nr   >= r   r  r	  s     rW   gezCppVecOverrides.gev  r  rV   c                     |  d| S Nrq  rU   r	  s     rW   and_zCppVecOverrides.and_}  r  rV   c                     |  dS )Nz.rsqrt()rU   r  s    rW   r  zCppVecOverrides.rsqrt      H~rV   c                     |  d| dS )Nz.pow(r   rU   r  s     rW   r  zCppVecOverrides.pow  s    E!ArV   c                     |  dS )Nz.log()rU   r  s    rW   r  zCppVecOverrides.log  r  rV   c                     |  dS )Nz.round()rU   r  s    rW   r  zCppVecOverrides.round  r  rV   c                     |  dS )Nz.floor()rU   r  s    rW   r  zCppVecOverrides.floor  r  rV   c                     |  dS )Nz.ceil()rU   r  s    rW   r  zCppVecOverrides.ceil  r  rV   c                     |  dS )Nz.trunc()rU   r  s    rW   r  zCppVecOverrides.trunc  r  rV   c                     |  d| dS )Nz.fmod(r   rU   r  s     rW   r  zCppVecOverrides.fmod  s    F1#QrV   c                     |  dS )Nz	.lgamma()rU   r  s    rW   r  zCppVecOverrides.lgamma  r  rV   c                 .    t        | |      \  } }|  d| S r  rK   r  s     rW   rc  zCppVecOverrides.logical_and  "    ,Q21Cs|rV   c                     d|  S N~rU   rg  s    rW   rh  zCppVecOverrides.logical_not  ri  rV   c                 .    t        | |      \  } }|  d| S Nrw  r  r  s     rW   rk  zCppVecOverrides.logical_or  r  rV   c                 .    t        | |      \  } }|  d| S Nr   r  r  s     rW   rn  zCppVecOverrides.logical_xor  r  rV   c                 .    t        | |      \  } }|  d| S r  r  r  s     rW   rr  zCppVecOverrides.bitwise_and  r  rV   c                     d|  S r  rU   rg  s    rW   ru  zCppVecOverrides.bitwise_not  ri  rV   c                 .    t        | |      \  } }|  d| S r	  r  r  s     rW   rx  zCppVecOverrides.bitwise_or  r  rV   c                 .    t        | |      \  } }|  d| S r  r  r  s     rW   rz  zCppVecOverrides.bitwise_xor  r  rV   c                     |  d| S )Nz << rU   r  s     rW   r  z"CppVecOverrides.bitwise_left_shift      D}rV   c                     |  d| S )Nr  rU   r  s     rW   r  z#CppVecOverrides.bitwise_right_shift  r  rV   c                     t        t        j                  t              sJ t        j                  j	                  | |       S r  )r   r3   rf  r  load)r   rG  s     rW   	load_seedzCppVecOverrides.load_seed  s.    !((L111((--f-./rV   c                 |    t        t        j                  t              sJ t	               }d|  d}t        |||      S )Nz)result[offset_idx] = normalized_rand_cpu(, offset[offset_idx]);r   r3   rf  r  r6   rE   r  rG  r   rand_functions       rW   r  zCppVecOverrides.rand  s@    !((L111~7v=ST 	 FD-88rV   c                 |    t        t        j                  t              sJ t	               }d|  d}t        |||      S )Nzresult[offset_idx] = randn_cpu(r  r  r  s       rW   r  zCppVecOverrides.randn  s;    !((L111~9$?UVFD-88rV   c                     t        t        j                  t              sJ t	               }d|  d| d| d}t        |||t        j                        S )Nz#result[offset_idx] = randint64_cpu(z, offset[offset_idx], r   r   )r   r3   rf  r  r6   rE   r   r  )r  rG  r  r  r   r  s         rW   r  zCppVecOverrides.randint64  sS    !((L111~=dVCYZ]Y^^`ae`ffhiFD-EErV   c                 ~    | j                   |j                   k(  sJ d       |  dt        j                  | |       d| S )Nz;remainder vec implementation expect the same inputs' dtype.z - (z) * )r   r  r  r  s     rW   	remainderzCppVecOverrides.remainder  sI    ww!''! 	
I	
! D11!Q78QC@@rV   c                     |  dS )Nz.tan()rU   rg  s    rW   r  zCppVecOverrides.tan  r  rV   c           	          t         j                  j                  r"d|  d}d|  d}d|  d}| d| d| d|  d| 	S |  d	S )
Nr  r  z)(2)z)(-2)z / (z + (r   z).exp()) - z.tanh())r   r  use_decompose_tanh)r  r  vec_twovec_minus_twos       rW   r  zCppVecOverrides.tanh  sl    ::((!!D)G!!D)G's%0M)4y]O3qcWIV S= rV   c                     |  dS )Nz.reciprocal()rU   rg  s    rW   
reciprocalzCppVecOverrides.reciprocal  s    M""rV   c                     |  dS )Nz.atan()rU   r  s    rW   r  zCppVecOverrides.atan
  r  rV   c                     |  dS )Nz.acos()rU   r  s    rW   r  zCppVecOverrides.acos  r  rV   c                     |  dS )Nz.asin()rU   r  s    rW   r  zCppVecOverrides.asin  r  rV   c                     |  dS )Nz.cosh()rU   r  s    rW   r  zCppVecOverrides.cosh  r  rV   c                     |  dS )Nz.sinh()rU   r  s    rW   r  zCppVecOverrides.sinh  r  rV   c                     |  dS )Nz.log10()rU   r  s    rW   r0  zCppVecOverrides.log10  r  rV   c                     |  dS )Nz.log2()rU   r  s    rW   r2  zCppVecOverrides.log2"  r  rV   c                     |  d| dS )Nz.nextafter(r   rU   r	  s     rW   r7  zCppVecOverrides.nextafter&  s    Ks!$$rV   c                     |  d| dS )Nz
.copysign(r   rU   r  s     rW   r  zCppVecOverrides.copysign*  s    Jqc##rV   c                     |  d| dS )Nz.atan2(r   rU   r  s     rW   r  zCppVecOverrides.atan2.      GA3a  rV   c                     |  d| dS )Nz.hypot(r   rU   r  s     rW   r.  zCppVecOverrides.hypot2  r0  rV   c           
      <    d|  d}d|  d}| d| d|  d| d|  d
S )	Nr  r  z)(0.5)z * ((r   z)/(r  z)).log()rU   )r  r  vec_one_halfs      rW   r  zCppVecOverrides.atanh6  sE     aS%"1#V,uWIS3wis1#XNNrV   c                     |  dS )Nz.asinh()rU   r  s    rW   r  zCppVecOverrides.asinh=  r  rV   c                     |  dS )Nz.acosh()rU   r  s    rW   r  zCppVecOverrides.acoshA  r  rV   c                     t         j                  j                  }|dk(  ry|dk(  r|  dS |dk(  r|  d|  dS |	d|  d	|  d
S t        d|      )Nr9  r:  r;  r<  r  r  r  zat::vec::clamp_min(r=  r>  r?  r@  r  s     rW   rB  zCppVecOverrides.reluE  s|    jj55/!#O#S	?"JSQCt,,[(;qc?? I#Q rV   c                     d|  d|  d|  dS )Nr  z)(1)/(decltype(z)(1) + z.neg().exp())rU   r  s    rW   r  zCppVecOverrides.sigmoidV  s    1#_QCwqcGGrV   c                     |  dS )Nz.neg()rU   r  s    rW   r  zCppVecOverrides.negZ  r  rV   c                    t        | j                        r)| j                  |j                  k(  sJ d       d|  d| dS t        d | |fD              sJ d|  d}t        j                  j                  |j                        dk  r,| ddt        j                  j                  z  dz
   d	| d
| d}|  d| }d|  d| d| d}d|  d| d| d| d	}| d| d| d| d
| d| dS )NzDdiv_floor_floating_vec implementation expect the same inputs' dtype.zdiv_floor_floating_vec(r   r   c              3   F   K   | ]  }t        |j                          y wr  )r   r   )r%  items     rW   r'  z+CppVecOverrides.floordiv.<locals>.<genexpr>f  s     G'

3G   !r  r4   ::blend<r  (1), r  r  r  rm  z(0))r  r   z	(0)) != (z(0)))z	::blendv(r  rq  )r
   r   r.  r3   rf  _get_raw_num_vectorstiling_factor)r  r  _tr  has_remis_negs         rW   r  zCppVecOverrides.floordiv^  s1   !''"77agg% V% -QCr!A66GAGGGGQCq!Bxx,,QWW59d(A)?)?$?1#D"ERt5QRPSSTUSA3<D!Cs$rd$/G!Ct9QCs2$e<FT4&4&B4uWISPQRRrV   c                     t         j                  j                  |j                        dk  r2d| d}| ddt         j                  j                  z  dz
   d| d| d}|  d| S )Nr4   r  r   r=  r  r>  r  )r3   rf  r?  r   r@  )r  r  rA  s      rW   r  zCppVecOverrides.truncdivq  sp     88((1A5QCq!B$hQXX%;%; ;q@AB4uQCqQACs|rV   c                     | j                   t        j                  k(  rO|j                   t        j                  k(  sJ t        t        j
                  j                  | |f      \  }}| d| S d|  d| dS )Nrq  at::vec::minimum(r   r   r   r   rk   rN   r3   rf  r  r  r  a_castb_casts       rW   rD  zCppVecOverrides.minimumz  l    77ejj 77ejj(((1!((2B2BQFKNFFXS))&qcA3a00rV   c                     | j                   t        j                  k(  rO|j                   t        j                  k(  sJ t        t        j
                  j                  | |f      \  }}| d| S d|  d| dS )Nrw  at::vec::maximum(r   r   rG  rH  s       rW   rG  zCppVecOverrides.maximum  rK  rV   c                     |  d|  S r  rU   rg  s    rW   squarezCppVecOverrides.square  r  rV   c                    t        t        j                  t              sJ |j                  t
        j                  k(  rY|j                  t
        j                  k(  sJ t        t        j                  j                  | ||f      \  }}}d| d| d| d| d	S d| d| d| dt        j                  j                  | |j                         d	S )Nr  
)::blendv(r   r   )
r   r3   rf  r  r   r   rk   rN   r  _get_mask_cast)r  r  rK  blendv_ablendv_bblendv_cs         rW   rL  zCppVecOverrides.where  s    !((L11177ejj 77ejj(((+?  1a),(Hh xj
8*Bxj8*TUVVqcA3b2ahh6M6MaQRQXQX6Y5ZZ[\\rV   c                 ~   t               }d|  d}d|  d}d|  d| d| d| d|  d}d|  d| d| d|  d| d}|j                  d       |j                         5  |j                  d	| d
       |j                  d| d
       |j                  d       d d d        |j                  d       |S # 1 sw Y   xY w)Nr  r  r  rQ  r   r   r   r|  r  r   r  r  r  r  )r  r   vec_zeror  blendv_lblendv_rs         rW   r  zCppVecOverrides.sign  s    ~qc&aS%qcH:Ry8*CPQsRSTqcH:Ry1#S
RSTw[[] 	3NN\(156NN]8*A67NN12	3 	t	3 	3s   <B33B<c                 "   |t         j                  t         j                  t         j                  t         j                  t         j
                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  fv sJ t         d|        t        | t              sJ | j                  }t         j"                  j%                  | ||      }t         j"                  j&                  j)                  t         j"                  j*                  |      }|j-                  d| |fd|i       |t.        v r5|t         j                  k(  r"t         j"                  j1                  | |||       |S )N does not support r  r   )r   rk   float64rn   bfloat16float16uint8int8r#  r  float8_e4m3fnfloat8_e5m2r  r   rF   r   r3   rf  r  r  r  r  r  r   r  r  s         rW   r  zCppVecOverrides.to_dtype  s"   JJMMKKNNMMKKJJKKKK
 
 	2 Z)%1	2 
 !^,,,GG	xx))!UI>&&qxx'7'7>j1e*{I6NOM!i5;;&>HH((IvuErV   c                 6   |t         j                  t         j                  t         j                  fv sJ t         d|        t        | t              sJ | j                  }t        j                  j                  | ||d      }t        j                  j                  j                  t        j                  j                  |      }|j                  d| |fd|i       |t        v r5|t         j                   k(  r"t        j                  j#                  | |||       |S )Nr[  Tr  r  r   )r   r_  r`  r#  r  r   rF   r   r3   rf  r  r  r  r  r  r   rn   r  r  s         rW   r  zCppVecOverrides.round_to_int  s    KKJJKK
 
 	2 Z)%1		2 

 !^,,,GG	xx))!UI)M&&qxx'7'7>nq%j;	:RSM!i5;;&>HH((IvuErV   c                 z    t         j                  j                  }|dk(  r|  d|  dS ||  dS t        d|      )Nr  r  r  z.log1p()r  r  r  s     rW   r  zCppVecOverrides.log1p  sT    jj66*SQCt,,[S>! J3'R rV   c                 	   t        t        j                  t              sJ t	               }t        j                  j
                  j                         }t        j                  j                  |       5 }|j                  d| d       t        j                  j                  |      5  |j                         5   |       }|j                  d| d       d d d        d d d        d d d        |j                  d       t        j                  j                  j                  |       j                  | d}fd}|j                  r|}	n ||      }	t        |t                  }
 ||
      }t        t"              sJ |       |j                  r6t	               }|j                  d       t        j                  j                  |      5  |j                         5  |j                  d| d	       |j                         5  |j                  d| d       d d d        |j                  d
       |j                         5  t        j                  j
                  j%                  t        j                  j                  |	      }t        j                  j
                  j%                  t        j                  j                  |      }t        |t"              sJ |       t        |t"              sJ |       |_        |_        t        j                  j&                  }|j                  d|j)                  |||       d       d d d        d d d        d d d        |j                  d       t        j                  j
                  j%                  t        j                  j                  |      }t        |t"              sJ d|_        n|j                  rKt        j                  j
                  j%                  t        j                  j                  |  d|	 d|       }nJt        j                  j
                  j%                  t        j                  j                  |  d| d|
       }|j+                  d| |||fi        |S # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   uxY w# 1 sw Y   zxY w# 1 sw Y   xY w)Nr   rX  rY  r   r  c                     t         j                  k(  r$t        j                  j	                          d|  dS t        j                  j                         d|  dS )N::from(r   r  )r   rk   r3   rf  r  _get_vec_type)r   r   s    rW   maskify_or_vecifyz1CppVecOverrides.masked.<locals>.maskify_or_vecify  s]     EJJ& 88**,-WTF!< ..u56avQ?rV   [&]if (z.all_zero())elseTrI  rJ  r`  )r   r3   rf  r  r6   r  r"  r`  r   rZ  r   r  r$  r   r   rO   rG   rF   r  	overridesrL  r  )r[  r\  r]  r   r   new_maskr  	body_coderi  body_code_vecr_  other_code_vecbody_vec_varother_vec_varrm  r  r   s                   @rW   r`  zCppVecOverrides.masked  s   !((L111~hhll!!#XX__T" 	4hNNU3%v./&&t, 4dkkm 4234 4	4
 	s	%e2J		 ==%M-i8M!%e)<=
*:6(N3=X=3??>DNN5!&&t, dkkm hZ|<=[[] @NNW^,<A#>?@v&[[] #$88<<#8#8((%$L %&HHLL$9$9((&%M &lNCQ\QC%m^DSmSD).L&*/M' **  NN!)//(L-"X!YYZ[% 4 NN4 XX\\**  F fn555 FM]]XX\\**  TF#m_C?O"PF XX\\**  TF#i[J<"HF
 	htUF(CRHO4 4 4 4	4 	4>@ @    s   35R(R9RRRS&SR+)SDR8SSR	RR	RR(+R50S8S=SS	
SSc                 X   t        t        j                  t              sJ t        j                  j	                  |       }t        j                  j
                  t        j                  j                     }t        j                  j                  ||      }|dk(  rt        j                  | |      S |t        j                  j                  j                  t        j                  j                  t        |      t        |             }t        j                   ||      }t        |t"              r|j$                  }t        j                  j'                  ||      }n:t        j                  j)                  d ||t        j                  j                        }|j+                  d| |fi        |S )Nr   rR  rV  )r   r3   rf  r  rT  itervars
tiling_idx_try_get_const_strider  rV  r  r  r  rC   r%   r1   r  r2   r   arange_load_or_store_non_contiguousr  )r  r   r   
tiling_varstrider\  r   r  s           rW   rV  zCppVecOverrides.index_expr/  s3   !((L111((.XX&&qxx':':;
//zBQ;**477((,,''  %,7LT7R ( C LLe,E%*XX__UF3FXX;;eUAHH$4$4F 	lT5M2>rV   c           
      x   d|  dd|  df}t        d |D              rt        d |D              S t        | j                     }t        j
                  j                  rt        j
                  j                  nt        j
                  j                  }t               }t        j
                  j                  j                  t        j                        }t        j
                  j                  j                  | j                        }|j                  d| fi        |j                  d| fi        t        j
                  j                  | j                        }|d	k(  rd
| dnd| d| d}|j                  |d	k(  rd| dnd| d| d       |j                  | d| d       |j                  d       |j!                         5  |j                  d| dt        j
                  j                   d       |j                  |  dt#        |       d       |j                  dt        j
                  j                   d       |j                  d| dt        j
                  j                   d       |j                  dt#        |       d       |j!                         5  |j                  d       d d d        |j                  |d	k(  r| dt#        |       dn| d| d t#        |       d       |j                  | d!| d"t#        |       d       d d d        |j                  d#       t        j
                  j$                  j'                  |       ||f}	t)        ||	      D ]/  \  }
}t        j
                  j                  j+                  |
|       1 ||fS # 1 sw Y   xY w# 1 sw Y   xY w)$Nr  r  r  c              3   r   K   | ]/  }t         j                  j                  j                  |      d u 1 y wr  r  r  s     rW   r'  z(CppVecOverrides.frexp.<locals>.<genexpr>J  r  r  c              3   n   K   | ]-  }t         j                  j                  j                  |       / y wr  r  r  s     rW   r'  z(CppVecOverrides.frexp.<locals>.<genexpr>K  r  r   r   r,  )r  r4   at::vec::Vectorized<r   at::vec::VectorizedN<r   zat::vec::Vectorized<int32_t> r   zat::vec::VectorizedN<int32_t, > r   r|  __at_align__ std::array<	> tmpbuf;.store(tmpbuf.data(), r   z!__at_align__ std::array<int32_t, z> tmpbuf_exponent;z> tmpbuf_mantissa;r   r   z@tmpbuf_mantissa[i] = std::frexp(tmpbuf[i], &tmpbuf_exponent[i]);z? = at::vec::Vectorized<int32_t>::loadu(tmpbuf_exponent.data(), z! = at::vec::VectorizedN<int32_t, z!>::loadu(tmpbuf_exponent.data(), r   z ::loadu(tmpbuf_mantissa.data(), z();)r.  r   rG   r   r3   rf  	tail_sizer@  r6   r  r"  r   r#  r  _get_num_vectorsr   r   rD   r  r$  r%  r&  )r  r'  r   rF  r   r(  r)  n_vec
mantissa_tr*  r  r+  s               rW   r,  zCppVecOverrides.frexpG  s{   aS%s$'77
WJWWU*UUUagg&%&XX%7%7qxx!!QXX=S=S~88<<&&U[[&988<<&&QWW&5!b9!b9))!''2 z #6(!,(5'; 	
 	z ,H:Q71%8*AF	

 	*Qxj23w[[] 	NN*6("QXX5K5K4LIV NNaS 6{47H6ILMNN3AHH4J4J3KK]^ NN*6("QXX5K5K4LL^_ NN1+d2C1DFKL V NNA: *[\ghl\m[nnpq z!B5'Ijkvw{k|j}}  A
 NN*C
|+KKX\L]K^^`a+	0 	u	%h'"%j("; 	1IwHHLLY0	1!!# 	 	s&   
CN0&N$8A$N0$N-	)N00N9c                     fd}|S )Nc                     |rJ t         j                  }t        |t              sJ t	               }|j                  d       | d   j                  }|j                  |      }|j                  r|j                  n|j                  }g }t        |   }j                  dv }	|	rdn|}
j                  dk(  rt        | d      n|
}
|j                         5  t        |       D ]  \  }}t        |t              r}|j                  sJ |j                  |k(  sJ |j                  d| d|j                   d	| d
       |j                  | d| dt!        |       d       |j#                  d| d       |j#                  |        |j                  d|
 d|j                   d        | }|j                  dt!        |       d       |j                         5  |j                  d| d
       d d d        dt!        |       }|	r
d| d| d}n|dk(  rd|
 d}n	d|
 d| d}|j                  d| d| d       d d d        |j                  d       |S # 1 sw Y   kxY w# 1 sw Y   (xY w)Nr|  r   )r  r  r  rk   r  r  r   z> tmpbufr   z.store(tmpbufz	.data(), r   tmpbufz[i]z> tmpbuf_out;r   r   ztmpbuf_out[i] = ztmpbuf_out.data(), at::vec::VecMask<,z>::fromr4   r  z>::loaduz at::vec::VectorizedN<rY  r  r  )r3   rf  r   r  r6   r   r   r  r  r@  rG   r  r   r   rF   r   rD   r  )r  r  rf  r   	vec_dtyper  rF  scalar_argsr   output_maskoctypeargidxr  res	load_argsload_fnr  s                   rW   re  z)CppVecOverrides._scalarize.<locals>.inner  s   :XXFfl333>DNN7#QI++I6E'-'7'76##V=Q=QDK!),F%.. 3 K
  +VF  ((,>> T"X& 
  B#,T? 0KFC!#~6"zz)z"yyI5556vhbAUAU@VV^_e^ffgh "e=	+dBSATTVW $**VF83+?@#**3/0 .vhb9M9M8Nm\ ";/!5k$6G5HOP[[] >NN%5cU!#<=>1+d2C1DE	 1&5'IGaZ 4VHHEG 6vhbxPG	9+R@A9B: NN4 K> >'B Bs&   DI&I0AI&I#	I&&I/rU   )r1  r  re  s    ` rW   
_scalarizezCppVecOverrides._scalarize  s    4	l rV   c                    t        t              }t        t              j                         D ]S  \  }}t	        |t
              s||vs| j                  |j                        }||_        t        | |t        |             U y r  )
r  r  r  r  r   r  r  r  r  r  )r1  vec_varsr   r  r  s        rW   _initialize_scalarizez%CppVecOverrides._initialize_scalarize  sh    ( .446 	7LD&&,/D4H~~foo6 $T<#56		7rV   r  )Xr  r  r  r  r  r  r  r   rX  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rc  rh  rk  rn  rr  ru  rx  rz  r  r  r  r  r  r  r  r  r  r%  r  r  r  r  r  r0  r2  r7  r  r  r.  r  r  r  rB  r  r  r  r  rD  rG  rO  rL  r  r  r  r  r`  rV  r,  rk  r  r  rl  rm  s   @rW   r  r    sj   8[z                   ( (
         A A 	E 	E @ @ @ @ A A A A                                           0 0 9 9 9 9 F F A A   	! 	! # #               % % $ $ ! ! ! ! O O        H H   S S$   1 1 1 1   	] 	]    .   	 	 M M^  . 6" 6"p 7 7r 7 7rV   r  cppvecc                       e Zd Zed        Zy)CppTile2DOverridesc                     t        t        j                  t              sJ t        j                  j	                  |       } t
        j                  | |      S r  )r   r3   rf  CppTile2DKerneltransform_indexingr  rV  )r  r   s     rW   rV  zCppTile2DOverrides.index_expr  s=    !((O444xx**40))$66rV   N)r  r  r  r  rV  rU   rV   rW   r  r    s    7 7rV   r  c                       e Zd ZdZeZeZdZdZ	 fdZ
eefdZd Zd4dedz  fd	Zej$                  d
        Z	 d5dej*                  fdZdej*                  defdZdej*                  dej2                  fdZdej*                  dej2                  fdZd Zdej*                  dej*                  dedefdZdedej*                  fdZd4dZ d Z!de"ez  dedede#jH                  fd Z%d4de&dz  fd!Z'd" Z(	 	 d6d#Z)	 d7d$Z*d% Z+d& Z,d' Z-d( Z.d) Z/d* Z0e1defd+       Z2d, Z3ej$                  d-        Z4d. Z5d7d/Z6d0 Z7	 	 d8ded1edz  d2ej2                  dz  fd3Z8 xZ9S )9	CppKernela%  
    Base class for C++ kernel code generation in PyTorch Inductor.
    This class is responsible for generating C++ code from the intermediate representation.

    Args:
        args: Kernel arguments used for code generation
        num_threads: Number of threads for parallel execution
    r   r   c                    t         |   |       i | _        g | _        d | _        g | _        g | _        d | _        t               | _	        g | _
        t               | _        t               | _        t               | _        t               | _        t               | _        d| _        t               | _        t               | _        t'        | j(                  | j*                  d      | _        t'        | j(                  | j*                  d      | _        t'        | j(                  | j*                  d      | _        t               | _        t               | _        || _        i | _        g | _        y )NFtmp_acc)name_prefixwelford_helpercascade_helper)r8  r9  active_rangesinner_itervarscall_rangesrY  ru  reduction_depthr<   reduction_prefixreduction_prefix_generatorsreduction_suffixparallel_reduction_prefixparallel_reduction_suffixlocal_reduction_initlocal_reduction_storesis_reductionnon_parallel_reduction_prefixnon_parallel_reduction_suffixr7   newvar_prefixsuffixreduction_csewelford_helper_csecascade_helper_csepreloads
poststoresnum_threadsreduction_omp_decreduction_var_names)r:  r  r  r=  s      rW   r9  zCppKernel.__init__  s7    HJ 35:>(*,.# . 0 <>( . 0)7)9&)7)9&$2$4!&4&6#!-;-=*-;-=* !3!3T[[iX"%9I#
 #&9I#
 '((*&=?.0 rV   c                 8   t         j                  j                  r'| j                  s| j                  j	                  d       | d}t         j                  j                  rdn	t               }| d}	| j                  j	                  | d| d |||       d       | j                  j                  t        ||||||             | j                  j	                  |	 d| d       | j                  j                  d| d	d
d| d ||||	|       ddg       y )Nz(int max_threads = omp_get_max_threads();_localmax_threadsz	_arr[tid]r   r   r   zfor (int tid = 0; tid < z; tid++)r   r   r   r   )r   r  dynamic_threadsr  r   r*   r  r$  r   r  r  r   )
r:  r   r   r   r   reduction_combine_fnreduction_init_fn	acc_localr  acc_local_in_arrays
             rW   _gen_parallel_reduction_buffersz)CppKernel._gen_parallel_reduction_buffers  s=    ::%%d.L.L**44: e6N	#ZZ77M=Q=S 	 !$uI.!!++j)C(9.%(P'QQRS	
 	&&--"!		
 	##--1C0DC	{RS.TU&&11*;-x@se33NCI[glmnnop		
rV   c                 Z    | j                   D ]  }t        | j                  || d        y )Nr  )r  r   stores)r:  var_names     rW   %update_stores_with_parallel_reductionz/CppKernel.update_stores_with_parallel_reduction1  s0    00 	IHT[[(xj4GH	IrV   Nr   c                    |J t               }t        j                         5 }t        | d      rK|j	                  | j
                         | j                  |       |j                  |j                                |j	                  | j                         |j	                  | j                         |j	                  | j                         d d d        t        | d      r|j	                  | j                         | j                  r5| j                  D ]&  }| j                  |   \  }}t        ||| d||      }( |S # 1 sw Y   sxY w)Ncodegen_inner_loops_tail)r6   r   r   r   r$  r  r  r   r   loadsr  r  r  r  r  r   )r:  r   r   r\  startends         rW   gen_bodyzCppKernel.gen_body5  s   ||~!!# 	%ut23DMM*((.##DKKM2KK

#KK%KK$	% 4./KK(** X!//4
s1$uE]ESVWX 	% 	%s   B)D>>Ec              #     K   | j                   }|rYt        j                  ||      }t        |t              r3|j
                  }t        |t              sJ t        j                  |_	        || _         	 | || _         y# || _         w xY ww)z>Context manager to add an additional mask to loads and stores.N)

_load_maskr1   r  r   r2   r   rF   r   rk   r   )r:  r[  priors      rW   r`  zCppKernel.maskedI  ss      88D%(D$)zz!$777 #ZZ
	$J#DOeDOs   A/B
2A> 6B
>	BB
r   c                 P    | j                   |   }|||z  |z   i}t        ||      }|S r  )ru  r/   )r:  r   scaleitervar_idxrG  r   r   r   s           rW   scale_index_with_offsetz!CppKernel.scale_index_with_offset\  s7     mmK(C%K&01uk2	rV   r   c                 6    t        | j                  |            S )z
        Convert an index expr to a string that can be used in cpp code.
        e.g. a sympy expression "s2" may actually appear as "ks1" in the cpp kernel.
        )rC   rT  r:  r   s     rW   index_to_strzCppKernel.index_to_strd  s    
 T))%011rV   itervarc                 D     t         fd|j                  D              S )z]
        Check if an index has free symbol CppCSEVariable that depends on `itervar`.
        c              3   (  K   | ]  }|j                   j                  j                  v ret        j                  j                  |j                      t              r4j                  j                  |j                      j                          y wr  )r   r  varname_mapr   rF   
depends_on)r%  sr  r:  s     rW   r'  z6CppKernel.index_indirect_depends_on.<locals>.<genexpr>o  sj      
vv---488//7H HH  (33G<
s   BB)rf   free_symbolsr:  r   r  s   ` `rW   index_indirect_depends_onz#CppKernel.index_indirect_depends_onk  s%      
''
 
 	
rV   c                 F    ||j                   v xs | j                  ||      S r  )r  r  r  s      rW   index_depends_onzCppKernel.index_depends_onv  s,    %,,, 
0N0N71
 	
rV   c                 T    t        t        | j                  | j                              S r  )r  r%  ru  rY  r?  s    rW   
var_rangeszCppKernel.var_ranges{  s    Ct{{344rV   r  rF  lowerupperc                    |s|sy t        |t        j                        }|rIt        j                  |t
        j                        j                  }t        j                  j                  }nt        j                  j                  }	 | j                  t        j                  _
        t        j                  |t
        j                        j                  }|t        j                  _
        | j                  }|r.t        j                  j                  | j                  |            nd }	| j                  ||rdnd |	| j                        }
| j                   j#                  ||
d       y # |t        j                  _
        w xY w)N0F)
assignment)r   r   TMPr1   rV  r   r  r   r3   rf  r  r  sexprrT  indirect_assertr  r  r  )r:  r  rF  r  r  indirectr  r   prior_computesize_strr   s              rW   check_boundszCppKernel.check_bounds~  s
    &tTXX6^^D%++6<<FXX%%F HH,,M1#':: ekk:@@#0 ZZFAF188>>$"6"6t"<=D##5CdHdoo
 	&$59 $1 s   AE' 'E>r   c                 6   | j                   j                  |      }| j                  |      }| dt        |       d}| j                  j                  | j                  |t        j                  j                  |            }|j                  d| ||fi        |S )N[]r  r  )r  inputrT  rD   r  r  r  r3   graph	get_dtyper  )r:  r   r   r   r   r  s         rW   r  zCppKernel.load  s    iiood#$$U+aE*+1-""4::t177;L;LT;R"SftT5&92>rV   c                    d|v sJ | j                   j                  |      }| j                  |      }|| dt        |       d| d}n|dk(  rt        j
                  j                  s$| j                  dk(  r| dt        |       d| d}nSt        j                  j                  |      }dt        |    d	| d
}d| dt        |       d| d}nt        d|       | j                  j                  t        ||             y )Nbufr  ] = r   
atomic_addr4   z] += zstatic_cast<r  r   zatomic_add(&z], r   store mode=)r  outputrT  rD   r   r  r  r  r3   r  r  rG   r  r  r   r:   )r:  r   r   r   moder   r   r   s           rW   storezCppKernel.store  s   }}iit$$$U+<U!K./tE7!<D\!::--$2B2Ba2GaE 235qA))$/&|E':&;2eWAF%cU!K,>+?s5'L%D6&:;;l467rV   c                 H    | j                   j                  d| d| d       y )Nr  z" ? 0 : (throw std::runtime_error("z"), 0));)r  r   )r:  condmsgs      rW   device_assert_asynczCppKernel.device_assert_async  s'    v7uHE	
rV   r   r   rtyper   c                 8    ddt         d z  ffd}|S )NrF  c                 L    |  d d        dS t        |       S )Nr   r   r   )r   )rF  r   r   r   r   r  s    rW   re  z.CppKernel._gen_reduction_prefix.<locals>.inner  sH    |"1SEWUE-B,C1EE- rV   r  )rm   )r:  r   r   r  r   r   re  s    ````` rW   _gen_reduction_prefixzCppKernel._gen_reduction_prefix  s    	d
 	 	 rV   c                 h    | j                   D ]#  }| j                  j                   ||             % y r  )r  r  r$  )r:  rF  gen_fns      rW   finalize_reduction_prefixz#CppKernel.finalize_reduction_prefix  s/    66 	7F!!((6	7rV   c                 P   |dk(  ry|dk(  r|t         j                  k(  r| j                  J t        j                  t
        j                  | j                  | j                  d        }d}t        j                  j                  j                  t        j                  ||            S y)Nrg   Trc      F)r   rn   r  rU  rV  rW  rX  r  r3   r  sizevarsguard_or_falser   Gt)r:  r   r   
use_scalarreduction_size
chunk_sizes         rW   need_use_acc_helperzCppKernel.need_use_acc_helper  s     -- U"u';##///&--d..t/C/C/EFN
 J77##22588NJ3WXXrV   c           
      R   |rt        ||      n|}t        |      }|dv sJ d}	t        ||	      }
|dk(  rdnd}|r
t        |   }n&t        | d      r| j	                  |      nt        |   }| d| d|	 d	| d
| d
}|dk(  r|S t        |
t        j                        r
|
dk  rd| S |S )N)rg   rc   r  rg   WelfordHelperCascadeSumHelperrh  <r   r  r  r   rc   r4   zstatic )r   rD   rG   r   rh  r   r   rZ  )r:  r   r   helper_ranger   r  r  num_range_threadnum_range_thread_exprr  
num_chunkshelper_typeh_typehelper_init_lines                 rW   _acc_helper_initzCppKernel._acc_helper_init  s     3>GL+.< 	 !,,< =!::::
-z:
 !11 # 	
 !%(F 41 ""5)!%(  m1VHBzl"ZL$% 	 U"##j%--0Z1_ -.//##rV   c           
      >   t         j                  j                  rdn	t               }| j                  j                  | j                  ||||d |             | j                  j                  | j                  ||||||             |r|n| d}|dk(  rI| j                  j                  | d| d| d       | j                  j                  | d| d| d       y | j                  j                  | d	| d       | j                  j                  | d
| d       y )Nr  _vecrg   z = welford_combine(r   r   z_local = welford_combine(z	_local, &z = cascade_sum_final(&z_local = cascade_sum_final(&)
r   r  r  r*   r  r   r&  r  r  r  )	r:  r   r   r   r  r   r  r  r  s	            rW   _use_acc_helperzCppKernel._use_acc_helper 	  sA    $ZZ77M=Q=S 	 	**44!!
L%z	

 	!!++!!
L%j	

 #3%t--..88(-fXSBG ''11(3F89ZLPRS ..88(0B? ''11(6zl"ErV   c           
      H   |dv }|||f}|| j                   j                  v r| j                   j                  |   S | j                   j                  | j                  d| d      }| j                  j                  |        d| _        |r|n|}t        ||      }	| j                  j                  | j                  ||	||t                     | j                  ||d      rt        j                  t        j                  | j                   | j"                  d        }
|dk(  r,| j$                  j                  | j&                  d| d      }n+| j(                  j                  | j&                  d| d      }d| }| j+                  ||||
|d       | j,                  j/                  | d	t1        ||||       d
       n| j"                  J | j2                  | j"                     }t5        | j"                  dz   t7        | j2                              D ]$  }|| j                   |   z  | j2                  |   z   }& | j,                  j/                  | d	t1        ||||       d
       | j9                  ||	||       t;        ||      }|| j                   j                  |<   |S )Nra   r`   
reduction FwriteTrg   scalar_r  r   r   r4   )r   )r  reduction_cacher  r  r  r  r  r   r  r  r   r  rU  rV  rW  rX  rY  r  r  r  r  r)  r  r   r   ru  rS  r   r  r   )r:  r   r   r   r   argmax_or_argminreduction_keyr   
init_dtyper   r  r   scalar_helper_valr   r   r  s                   rW   	reductionzCppKernel.reduction@	  s   )-AA!>58D..>>>%%55mDD  ))JJ*]O4E * 
 	  ''3%1 "2Y
%njA((//&&X~z>	
 ##NE4@&--dkk$*>*>*@AN !11!44==LLJ}o">e > 
 "44==LLJ}o">e > 
 #** 6  ! !  KK!!%s,^S%IZ[\\]^ ''333MM$"6"67E4//!3S5GH BA.q1AABKK!!%s,^S%uUVVWX 	,,S(NJW">37<B**=9rV   c                     | j                  |      }| j                  j                  |      }| j                  j	                  t        || dt        |       d| d             y )Nr  r  r   )rT  r  r  r  r   r:   rD   )r:  r   r   r   r   s        rW   store_reductionzCppKernel.store_reduction|	  s]    $$U+iit$''#aE(:';4waHI	
rV   c                    | j                   ri| j                   t        |      t        |      z   k(  s+J | j                    dt        |       dt        |              | j                  t        |      k(  sJ t        |      t        |      z   | _         | j                   D cg c]  }| j	                  |       c}| _        t        t        | j
                              D cg c]  }t        t        j                  |       c}| _
        t        |      | _        | j                  d | j                   | j                  | j                  d  fS c c}w c c}w )Nr  r   )r  r   r  r   rT  rY  rS  r-   r   XBLOCKru  )r:  lengthsreduction_lengthsr  r4  s        rW   
set_rangeszCppKernel.set_ranges	  s2   ##uW~>O8P'PP ##$Dw(8EBS<T;UVP ''3w<777$W~6G0HHD<@<L<LMq4//2MDK s4;;/0 /t{{A>DM
 $'w<D MM0D001MM$..01
 	
 Ns    E !Ec                     | j                   J t        | j                         }t        j                  j                  j                  |      S r  )r  r.   r3   r  r  optimization_hint)r:  r  s     rW   	size_hintzCppKernel.size_hint	  s?    +++T--.ww11$77rV   c                 ^   t        | t              sJ t               | j                  J t        |j                  t
              r+|j                  j                  |j                               n | j                  |j                               |j                  d uxr# |j                  j                     j                  t        j                         5 }j                  r6rj                          nj                         |j!                         n4dkD  r/j#                         r|j%                  j'                                dt(        ffddd	 ddt(        dt*        ffdddt(        dt*        ffd	 	 ddt(        dt*        dt,        ffd	|j%                  j'                                t        |j                  t
              r t        t.        j0                  t2              rt.        j0                  j4                  rt.        j0                  j4                  }|j7                         D ]  }t9        |j;                         j<                  D cg c]  }| j?                  |       c}      }t@        |j;                         jB                     }	d
|	 dtE        |       d}
|jG                         }jI                  d|	 d| d|
 d       jI                  |	 d| d| d         |       d d d        y c c}w # 1 sw Y   y xY w)Nr4   
_loop_nestc                      fd} j                         }t        |t              r|j                  D ]
             y t        |t              sJ  j
                   |       r|j                          t        j                         5 }|j                  j                                |j                         d d d        y # 1 sw Y   y xY w)Nc                      j                   sJ j                   j                     } | j                  xr | j                  S r  )rN  r  r  parallel)rootrB  	par_depths    rW   is_parallel_reductionzOCppKernel.codegen_loops_impl.<locals>.gen_kernel.<locals>.is_parallel_reduction	  s=    %++++%++I,A,ABD,,>>rV   )
get_kernelr   rc  re  CppKernelProxyrN  r  r   r   r   r   r  )rB  rH  rf  r   r   gen_loop_nestrG  s   `   rW   
gen_kernelz0CppKernel.codegen_loops_impl.<locals>.gen_kernel	  s    ?
 $..0f&:;&,ll 2
%j12 &fn===!''38M8ODDF#--/ .5++DKKM:-. . .s   1CCc                     |r0| j                   }|r| j                  |z   }|S | j                  |z   }|S | j                  }|r|| j                  z   }|S || j
                  z   }|S r  )r  r  r  r  r  r  )rf  rE  	is_suffixr  prefixs        rW   get_reduction_prefix_suffixzACppKernel.codegen_loops_impl.<locals>.get_reduction_prefix_suffix	  s}    #44F!'!A!AF!J "M "(!E!E!N!M#44F!'&*J*J!J "M "(&*N*N!N!MrV   depthc                    | j                         }| j                  sJ | j                  |   }t        j                         5 }|j                  rI|sG 	||j
                  d      }|r|j                  j                                j                  |       
rR|j
                  rFj                         |j                  r)|j                  sJ j                  |j                          | |       
rC|j
                  r7|j                  rj                  |j                         j                          |j                  r&|s$j                   	||j
                  d             d d d        y # 1 sw Y   y xY w)NF)rN  T)rI  rN  r   r   r  rE  r   r   r$  r  r  close)rB  rQ  in_reductionrf  loopstack_outerr  r   gen_loop_atrP  is_reduction_loopthreadsworksharings          rW   gen_loop_with_reductionz=CppKernel.codegen_loops_impl.<locals>.gen_loop_with_reduction	  sA    $..0!''''!''.))+ {((+F"DMMU,( ,'55dkkmD$45(T]]#,,W5!66#)#@#@@#@ KK(C(CD
E2(T]]!88 KK(E(EF#))+((7 &+  s   D*E66E?c                 X   t        j                         5 }| j                  sJ | j                  |   }|j                         }|
	 d d d        y j	                  |       |j                  j                                 | |dz   |j                         d d d        y # 1 sw Y   y xY wr   )r   r   rN  linesr   r   r   r  )rB  rQ  r   rU  
loop_linesr   rK  s        rW   rW  z1CppKernel.codegen_loops_impl.<locals>.gen_loop_at	  s    ))+ Lu%++++%++E2D!%J!)L L OOJ/''6!*eai9J9JKL L Ls   1B AB  B)rT  c                 t    | j                   |t        | j                         k(  r	 |        y  | ||       y r  )rN  r   )rB  rQ  rT  rL  r[  s      rW   rK  z3CppKernel.codegen_loops_impl.<locals>.gen_loop_nest
  s7    
 ##+uJ<L<L8M/Mz*+J|LrV   zstd::make_unique<z []>(r   zstd::unique_ptr<z	 []> buf_r   r   z* z = buf_z.get();)FF)r   F)r   )%r   rJ  r*   r  rf  rc  decide_parallel_depthmax_parallel_depthrN  r  r  r   r   r  rS  rE  mark_parallelsingler   r   rR  rm   rk   r3   local_buffer_contextrJ   local_buffersvaluesr.   
get_layoutrF  rT  rG   r   rC   get_namer$  )r:  rT  r   rZ  r   re  local_buffersize_vallocal_buf_sizelocal_buf_dtypeallocatelocal_buffer_namerL  rW  rK  r[  rP  rX  rG  rY  s     ``        @@@@@@@@rW   codegen_loops_implzCppKernel.codegen_loops_impl	  s   $///&(+++i&&(<=!((>>,,.I 22,,.I
 OO4' D	 5 56CC 	 !!# ~	%u''$%%'((1''	21%%'''6.x .$"" DI$-0 B	L 	L 	L %*M$MM #M . 9++-ABq557IJ**88 !" 6 6 D D$1$8$8$: L%2 -9,C,C,E,J,J ( !00:&N '3<3J3J3L3R3R&SO!2?2C5~I^H__`aH(4(=(=(?%KK*?*;9EVDWWZ[cZddef KK*+2.?-@HYGZZab" )$}~	% ~	%`a~	% ~	%s    !FL#4LB	L#L##L,c                 T    t         j                  |       }| j                  |||       y r  )rR  buildro  )r:  r   rZ  rT  s       rW   codegen_loopszCppKernel.codegen_loops-
  s"    NN4(		4=rV   c                 :    t         j                  j                  ryy)NAOTI_TORCH_CHECKTORCH_CHECK)r3   r  aot_moder?  s    rW   assert_functionzCppKernel.assert_function1
  s    77% rV   c                    | j                   J | j                   |j                  |j                  |j                  z    }| j                         }d}d}|D ]m  }t        j
                  j                  j                  |d      }|d|z  k\  s||k(  r n3||z  t        j                  j                  k  r n|dz  }||z  }||z  }o t        j                  j                  r|dk(  rt        |      dkD  rd}t        ||j                        S )Nr4   r   i    fallbackr   r  r  )r  r  r  r@  r3   r  r  r?  r   r  min_chunk_sizer  r   r  )	r:  ra  rY  rY  seqparrQ  r  hints	            rW   r`  zCppKernel.decide_parallel_depth8
  s   +++!!**"..1C1R1RR

 nn 		D77##55dT5JDa'k!SG^g~

 9 99QJE4KC4KC		 ::%%%1*VqE .@.L.L
 	
rV   c              #     K   | j                   | j                  | j                  | j                  f}t	               | _         t	               | _        t	               | _        | j                  j                         | _        d  | j                  j                  | j                          | j                  j                  | j                         | j                  j                  | j                         |\  | _         | _        | _        | _        y wr  )r  r  r  r  r<   cloner  r$  )r:  r  s     rW   write_to_suffixzCppKernel.write_to_suffixU
  s     T\\4;;A#%
%'$&88>>#$$TZZ0$$T\\2$$T[[1<A9T\4;s   D
Dc                     t        |i |S r  )rF   )r:  r  r  s      rW   create_cse_varzCppKernel.create_cse_varb
  s    t.v..rV   c                     |rT|t         j                  t         j                  fv r2|t         j                  t         j                  fv rdt
        |    d| dS dt
        |    d| dS )Nzc10::convert<z>(std::round(r   r  r   )r   r`  r_  rn   rj   rG   )r:  srcr   r   r  s        rW   r  zCppKernel.get_to_dtype_expre
  se    %**ekk22ekk5<<88"<#6"7}SELL|E232cU!<<rV   c                 b    | j                  |||      }| j                  j                  ||       y r  )r  r  r&  )r:  dst	dst_dtyper  r   r  s         rW   r  zCppKernel.cache_dtype_convertn
  s(    %%c9i@T3rV   rO  r   c                 V   
 |d} j                   syg 

 fd}|-| j                   v sJ  j                   |   \  }} ||||      s4y j                   j                         D ]  \  }}|\  }} ||||      r y dj                  
      }	|	r|j                  d| d|	 d       yy)	NrR   Tc                 6   | |k(  ryd }t        j                        D ]  \  }}||k(  s|} n t              t        u r|r| dk(  r|j                  |   k(  rd}j                  | dt        |               j                  | dt        |              y)NFr   r4   r  r   T)r   ru  r#  r  rY  r  rD   )r  r  r   var_idr   _var
conditionsr:  s         rW   genz)CppKernel.codegen_conditions.<locals>.gen~
  s    |F$T]]3 4$;F
 T
i'QJ4;;v..T+e*<)=>?SS)9(:;<rV   Frb  zif(r  r   )r  r  joinr   )r:  r   rO  r   r  r  r  r  _rangejoined_conditionsr  s   `         @rW   codegen_conditionszCppKernel.codegen_conditionsr
  s     >F!!
	* ?$,,,,,++C0JE3uc3' $ 2 2 8 8 : !f#
s5#t, ! #KK
3NNS*;)<B?@rV   r  )r4   r   NFFNN):r  r  r  r  r  rm  rC   r  r  r  r9  r   r   r  r  r6   r  r   contextmanagerr`  r   r  r  ro   r  r   r  r  r  rk   r  r  r  r
  r8   r   r   r  rm   r  r  r&  r)  r6  r8  r=  r@  ro  rr  propertyrw  r`  r  r  r  r  r  rl  rm  s   @rW   r  r    s(    IEMF'1^ /('
RI\D0 ( $ $& BCZZ2%** 2 2	
uzz 	
ELL 	

ejj 
5<< 

5:jj: jj: 	:
 :@ UZZ 8$

3  	
 {{:7cDj 78 *$Z PU@:x

(8
O%b> ! ! !
: 
B 
B/=  "#'	00 d
0 \\D 	0rV   r  c                   <    e Zd ZeZ	 d' fd	Zdej                  dej                  fdZ	de
j                  defdZde
j                  defd	Zde
j                  defd
Ze
j                  fde
j                  defdZdede
j                  defdZ	 d'dedej                  de
j                  dedz  fdZ	 	 	 d(dedz  dej                  de
j                  dedz  deez  dz  dededz  fdZdedej                  f fdZ	 d)deez  dedej                  de
j                  def
dZd'dZd Zd ZdedefdZdedej                  defdZd  Z d! Z!ddde
jD                  fdej                  dz  d"edz  d#e
j                  dz  fd$Z#d' fd%	Z$d) fd&	Z% xZ&S )*r  Nc                     t         |   ||       t        j                         | _        | j                  sJ |dkD  sJ d       || _        || _        || _        |r|| _        y || _        y )Nr   z0Expect pass in Non-Zero tiling_factor explicitly)	r8  r9  r   pick_vec_isavec_isar@  rv  r  	num_elems)r:  r  r  r@  rv  r  r=  s         rW   r9  zCppVecKernel.__init__
  si     	{+"//1|||q T"TT *$"&/]rV   r   r  c                       j                  ||      ry  fd|j                  D        D ]"  }t        |t              sJ |j                  s" y  t        || j                        }|j                  r|S d S )Nc              3      K   | ]A  }t        |t        j                        r%j                  j                  |j
                      C y wr  r   r   r  r  r  r   r%  r  r:  s     rW   r'  z5CppVecKernel._try_get_const_stride.<locals>.<genexpr>
  s:      
a* HH  (
   AA
)r  r  r   rF   r   r  r@  r  )r:  r   r  indirect_varr{  s   `    rW   rw  z"CppVecKernel._try_get_const_stride
  s|    ))%9
''
 	L
 lN;;;""	 %UGT5G5GH))v3t3rV   r   r   c                     t        j                  | j                  |j                  z  dz  | j                  j                         z        }|dk\  sJ |S )N   r4   )mathr  r@  itemsizer  	bit_widthr:  r   num_vectorss      rW   r  zCppVecKernel._get_num_vectors
  sO    ii/!3dll6L6L6NN
 arV   c                 p    | j                   |j                  z  dz  | j                  j                         z  S )Nr  )r@  r  r  r  )r:  r   s     rW   r?  z!CppVecKernel._get_raw_num_vectors
  s0     !!ENN2Q69O9O9QQQrV   c                 h    | j                  |      }|dk(  rdt        |    dS dt        |    d| dS )Nr4   r  r   r  r  )r  rG   r  s      rW   rh  zCppVecKernel._get_vec_type
  sJ    ++E2!),u*=)>a@@*<+>*?qQOOrV   c                 l    |t         j                  k(  ry| j                  |      }dt        |    d| dS )NrR   r  r  r   )r   rk   r  rG   r  s      rW   r  zCppVecKernel._get_mask_type
  s<    EJJ++E2"<#6"7qQGGrV   r[  c                     |j                   t        j                  k(  sJ t        |             | j	                  |      }| dt
        |    d| dS )Nz.template cast<r  r   )r   r   rk   reprr  rG   )r:  r[  r   r  s       rW   rR  zCppVecKernel._get_mask_cast
  sP    zzUZZ'3d3'++E2|E':&;1[MMMrV   r   	load_maskc                    t         |   }| j                  |      }d}|rS|j                  s&| j                  t        j
                         d| d}n!| j                  |t        j
                         }|dk7  r| dt        |       n|}|t        j                  k(  r.| j                          d| dt        | j                         d}	|	S |r| d| d| d	| dn,| j                  |       d
| dt        | j                         d}	|	S )a  
        Get a load line str that loads a vector from `var` at `index` of type `dtype`.
        If `load_mask` is not None, we do a masked load accordingly.
        Notes on the `dtype`:
        1. We always load `self.tiling_factor` number of elements regardless of the `dtype`.
           It means we load half of the vector lanes for 16-bit data types and quarter of the
           vector lanes for 8-bit data types.
        2. `torch.bool` and `torch.uint8` could mean masks and we load them as float mask vectors.
        Nrg  r   r   r   r   z.template loadu<r  r  ::loadu()rG   r  r   r  r   rn   rR  rD   rk   r  rh  )
r:  r   r   r   r  cpp_typer  load_mask_strloadbufr   s
             rW   _get_vec_load_linezCppVecKernel._get_vec_load_line
  s*      &++E2###'#6#6u{{#C"DGI;VW X#'#6#6y%++#N"O5:aZSE[/01SEJJ))+,GG9B{4>>?Z>[[\]D  ! !/!1(1[MG9TUV**512(7)2kRVR`R`FaEbbcd 
 rV   r   store_value
accu_storec                 ^    |r	|J d       |r|sJ  j                   dt        j                  dt        f fddt        j                  dt        f fddt        dt        f fd}t               }|j                  d	       |j                         5   |      }	 |      }
d
t        |    d|
 d}|j                  |       |r |j                  | dt        |	       d       t         j                   j                      d      }i } fd|j                  D        D ]4  }t        |t              sJ |j                  s" ||      }| d| d||<   6  j!                  | j                  |      }d} j"                  l|rJ d       t         j"                  t              sJ  j"                          j"                  j                  r j"                   d| d}n j"                   d}t%        j&                         r|j                  d j(                          n|j                  d j(                          |j                  d| d| dt         j*                         dz   | dz          |j                         5  t-        j.                         5 }t        |      }|D ]#  }t1        j2                  d| z   dz   ||   |      }% || d| dn| }|r4|j                  d | d       |j5                  |j                                |r!|rd!nd"}|j                  | d#| d$| d%       n|j                  d&| d'| d(       ddd       ddd       |s( j7                  d)d*|      }|j                  d+| d(       ddd       |j                  d,       |r#|j                  d(       j9                  |       y j:                  j=                  ||-      }t        |t              sJ d.|_        |S # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w)/a  
        Load or store a vector in a non-contiguous way. The vector is initialized from an array that is
        filled in an inner loop over the tiling factor.
        :param var: buffer to load from or store to, i.e. `var[transformed(index)]`. If None, we load the index
                    as index expression, i.e. `transformed(index)`.
        :param index: index into the `var` or the index expression by its own if `var` is None.
                      The `index` could contain indirect indexing or the tiling itervar. When used in
                      the inner loop, the index is transformed as follows:
                      1. the index is linearized along the tiling dim.
                      2. the indirect indexing vector variables are transformed into arrays over the tiling dim.
        :param dtype: data type of `var` or `index` if `var` is None.
        :param buffer: the code buffer to write the generated code to. If None, we write to `self.loads`.
        :param store_value: the value to store. If None, we load the vector.
        :param accu_store: whether accumulate the store_value to store_ptr. If True, a store_value should be provided
        :return: a CppCSEVariable that represents the loaded vector or None if it is a store.
        Nzstore var must be providedr   r   c                 r    | j                   dk  rj                  d| j                   z  z  S j                  S N   )r  r  r   r:  s    rW   get_result_sizezCCppVecKernel._load_or_store_non_contiguous.<locals>.get_result_size)  s1    ~~!~~enn)<==~~%rV   c                 r    | j                   dk  rj                  d| j                   z  z  S j                  S r  )r  r@  r  s    rW   get_tiling_sizezCCppVecKernel._load_or_store_non_contiguous.<locals>.get_tiling_size/  s5    ~~!))Q%..-@AA)))rV   vec_varc                 L   | j                   sJ t               }|j                  d       |j                         5  | j                  }|J |t
        j                  k(  rt
        j                  } |      } 	|      }|j                  dt        |    d| d       |  dt        |       d}|j                  |       |j                  d       d d d        |j                  d       
j                  j                  |      }t        |t              sJ |S # 1 sw Y   JxY w)	Nrj  r  r   r  r  r   zreturn tmpbuf;r  )r   r6   r   r   r   r   rk   rn   rG   rD   r  r  r   rF   )r  r   r  result_sizetiling_sizer   r  r   r  r  r:  s          rW   vec_to_arrayz@CppVecKernel._load_or_store_non_contiguous.<locals>.vec_to_array5  s   >>!>>DNN5! 1#MM	 ,,,

* %I-i8-i8.|I/F.Gr+V_` ""8[9Q8RRTUt$/01 NN4 XX&&vt4Ffn555M!1 1s   BDD#rj  r  r   r  r  r   rP  c              3      K   | ]A  }t        |t        j                        r%j                  j                  |j
                      C y wr  r  r  s     rW   r'  z=CppVecKernel._load_or_store_non_contiguous.<locals>.<genexpr>\  s:      !!!TXX. $$QVV,!r  r  r  r  rG  zunexpected store with load maskz.is_masked(r   z != 0z#pragma GCC unroll z#pragma unroll 
for (long  = 0; r   r   r   r   rk  +==r   z tmpbuf[r   ztmpbuf[r  r   ztmpbuf.data()r   rY  r  r  T)r  r   r   rm   rF   r6   r   r   rG   rD   r,   ru  rv  r  r   r   r  r  r   is_gccr@  r  r   r   r   r   r   r  r$  r  r  )r:  r   r   r   r   r  r  r  r   r  r  result_declareitervar_innerreplacementsr  	array_varr  r   index_crhsr   	load_liner  r  r  s   `   `                  @@rW   ry  z*CppVecKernel._load_or_store_non_contiguous
  sd   2 #/O3OO1;>ZZF	&5;; 	&3 	&	*5;; 	*3 	*	. 	^ 	 	, ~u[[] @	7)%0K)%0K*<+>*?r+iX  NN>*"m#9+k:R9SSUV /==12&9M L!++! Q
 ",???&& ,\ :I4=;aa1PL.Q 004??= 1 E I*&I(II!$//>BSDOOSB??))#'??"3;}oQ OI#'??"35 9I!!#!4T5G5G4HIJ1C1C0DEFNN]O62"O3{4>>'B&C2FG"O3'(
  H
 4 4 6 H%%e,$0 L ff<.1E9$\2G .1_Qwiq)WINNT)A#67''6*4$#KNNcU!K=r#RSNNW]O4uA#FG#H H$  33OQN	156A@	7B 	tNN3MM$XX&&vt5&AFfn555 FMM=H H H HY@	7 @	7sE   B1P#EP#P(B2PP"2P#PPP 	P##P,r   c                 :   | j                   j                  |      }| j                  |      }t        j                  j                  |      }| j                  | j                     }| j                  ||      }|dk(  rt        	| )  ||      S |dk(  rG| j                  |||| j                        }| j                  j                  | j                  ||      }n| j!                  |||      }t#        |t$              sJ |j'                  d| ||fi        d|_        |S )Nr   r4   r  r  T)r  r  rT  r3   r  r  ru  rv  rw  r8  r  r  r  r  r  r  ry  r   rF   r  r   )
r:  r   r   r   r   rz  r{  r   r  r=  s
            rW   r  zCppVecKernel.load  s    iiood#$$U+!!$']]4??3
++E:>Q;7<e,,q[**3udooNDXX&&tzz4u&EF77UEJF&.111ftT5&92>rV   r   c           	         t        |t              s#t        |t              r|j                  sJ |       | j                  | j
                     }| dt        |       }| j                  ||      }t               }	|dk(  r|rl|t        j                  k(  r#| j                  | j                  |       d| dn,| j                  |       d| dt        | j                         d}
d| d|
 d}|t        j                  k(  r%| j                  |	j                  | d| d       |	S |	j                  | d| dt        | j                         d       |	S | j                  ||||	||	       |	S )
a2  
        Get a store line buffer that stores `value` into `var` at `index` of `dtype`. It handles
        both contiguous and non-contiguous store cases.
        :param value: Vectorized type templaterized on `dtype`.
        :param var: buffer to store into.
        :index: index into the `var`.
        r   r4   r  r   r   r  .store(r   )r   r  r  )r   ro   rF   r   ru  rv  rD   rw  r<   r   rn   r  rh  r  r   ry  )r:  r   r   r   r   r  rz  var_exprr{  r   r  s              rW   _get_store_linezCppVecKernel._get_store_line  s   " %%un-%,,		 
 ]]4??3
U#k%012++E:>Q; +0F ))%01(1E ..u56hxj;W[WeWeKfJgghi 
 E7#dV1-#(>%z<=  gWXJbT^^1L0MRP  ..UE$Ej /  rV   c                    dv sJ t        |t              sJ |       |j                  s| j                  |      }| j                  j                        }| j                  |      }t        j                  j                        }|B| j                  ||||      }| j                  j                  |j                  fd             y |dk(  rft        j                  j                   sT| j"                  dk(  rE| j                  | |||d      }| j                  j                  |j                  fd             y | j%                  |      }| j%                  t&        j(                        }	t*        |   }
t-        j.                  |t&        j(                        j0                  }t        |t              r|j                  sJ | j2                  r,d|
 d	|	 d	| d
| d	| d	| d	t5        | j2                         d}nd|
 d	|	 d	| d
| d	| d	| d}| j                  j7                  t9        |             y t;        d|       )Nr   c                     t        |       S r  r:   r  r   s    rW   <lambda>z$CppVecKernel.store.<locals>.<lambda>  s    ,tQ2G rV   r  r4   T)r  c                     t        |       S r  r  r  s    rW   r  z$CppVecKernel.store.<locals>.<lambda>  s    l46K rV   zatomic_add_vec<r   r  r   r  )r   rF   r   r  r  r  rT  r3   r  r  r  r  r$  mapr   r  r  r  r  r   r  rG   r1   rV  r   r  rD   r   r:   r  )r:  r   r   r   r  r   r   r   n_srcn_idxr   r   s    `          rW   r  zCppVecKernel.store  s   }}%07%70||NN5)Eiit$$$U+!!$'<''sE5ADKKtxx(GHI\!::--$2B2Ba2G++g# ,  ""488,K#LM--e4--ekk:%e,uekk:@@!%8U\\II>>,VHBugRwbRPUwVXY^X__abmnrn|n|b}a~  A  BD,VHBugRwbRPUwVXY^X__abD%%l4&>?%D6&:;;rV   c           
      X   |t         v sJ |dv }| j                  | j                  k\  }|r|n|}t        |t              sJ |       |j
                  s| j                  |      }|||f}|| j                  j                  v r| j                  j                  |   S d}	|	 dt        |    d}
t        ||      }| j                  ||      }| j                  j                  | j                  d| d      }t        |t              sJ | d}d	| }d	| }| xj                  | ||gz  c_        d
| _        | j                   j#                  | j%                  ||||t&                     | j                   j#                  | j%                  ||||| j(                               | j+                  ||d      }|r| j                   j#                  | j%                  ||||| j(                               | j                  J t-        j.                  t0        j2                  | j4                  | j                  d       }|dk(  r,| j6                  j                  | j8                  d| d      }n+| j:                  j                  | j8                  d| d      }d	| }t=        | j4                  | j                     | j>                        rl| j                  | j                  k\  rQt=        || j4                  | j                           t=        | j4                  | j                     | j>                        z  n|ntA        jB                  d      }| j4                  | j                     | j>                  z  r>| j                  | j                  k\  r#t=        || j4                  | j                           n|ntA        jB                  d      }d| }| jE                  |||||d
       | jE                  |||||       | jE                  |||||       | jF                  r|n|}| jF                  r|n|}|dk(  r4| jH                  jK                  | d| jM                  ||||       d       n| jH                  jK                  | d| jM                  ||||       d       n| j                  J | jN                  | j                     }tQ        | j                  dz   tS        | jN                              D ]$  }|| j4                  |   z  | jN                  |   z   }& ||||d}| jH                  jK                  | d | jL                  ||fi | d       | jU                  ||||| jL                  | j(                         | jU                  ||||tV        t&               |r+| jU                  ||||| jL                  | j(                         |tX        jZ                  k(  }|rDt]        |      rS| j_                  |      dv sJ d       d| d}d| d}| j`                  jK                  | dtW        |||       d       n|r	| d| d}n|r|dv rd| d}n|dk(  sJ | d}nd | jM                  |d!d"      z   d#z   } |tX        jZ                  k(  }|rtX        jb                  n|}!d$t        |!    d}
d%t        |!    d&| j_                  |!       d}"| }#|r|dk(  sJ | d'| }#|" d(|
 d)|
 d*|  d&|# d
}| j`                  jK                  | dtW        ||||+       d       |}$nq|}$t]        |      r2d	|$ }%| j`                  jK                  |$ dtW        ||$|%       d       n2|r0|dk(  sJ d	|$ }%| j`                  jK                  |$ d|$ d'|% d       te        ||$      }&|&| j                  j                  |<   |&S ),aw  
        Perform vectorized reduction operation.

        This method handles vectorized reduction for different reduction types.
        It manages special cases for low-precision floating point types and
        employs precision improvement techniques for certain reduction operations.

        Args:
            dtype: The output data type for the reduction result
            src_dtype: The source data type of the input value
            reduction_type: Type of reduction operation (sum, min, max, etc.)
            value: The input value to reduce

        Returns:
            The result of the reduction operation
        r+  zat::vecz::Vectorized<r   r,  Fr-  r(  masked_TNrg   r   r/  r0  rc   r   r   r4   )r   r   horizontal_reductionr   )r  r  )r4   r   z4Welford reduction does not support VectorizedN (N>2)zwelford_vec_reduce_all(r   z_vec_reduce_all()rf   rc   r_   rf  z.all_zero()r^   z.all_masked()z	{ return r  r
  z; }r  zat::vec::vec_reduce_all<r   r   z([](z& x, z& y) r  )3VECTORIZABLE_RTYPESrv  r  r   rF   r   r  r  r1  rG   r   reduction_acc_type_vecr  r  r  r  r  r  r  r   reduction_init_vecr  rU  rV  rW  rX  rY  r  r  r  r   r@  r   rZ  r)  r  r  r   reduction_combine_vecru  rS  r   r  r   r   rk   r)   r  r  rn   r   )'r:  r   r   r   r   r2  r  r4  r3  vec_nsr  r   acc_type_vecr   acc_vec
masked_accmasked_acc_vecuse_acc_helperr  r   masked_helper_valhelper_vec_rangemasked_helper_vec_ranger5  acc_vec_helper_val_r   r   r  r   r   masked_next_valuereduce_all_bodyr  vec_reduce_all_func
result_vectmpvarmasked_tmpvarr  s'                                          rW   r6  zCppVecKernel.reduction  s   & !4444)-AA#$2F2FF"2Y
%07%70||NN5)E!>58D..>>>%%55mDDl5&9%:!<%njA22>:N  ))JJ*]O4E * 
 #~...E,se_
"7),  uw$GG  ((//&&X~z>	

 	((//&&''	
 11.%O,,33**" "++ ''333&--dkk$*>*>*@AN !11!44==LLJ}o">e > 
 "44==LLJ}o">e > 
 #** 6 DKK8$:L:LM $*>*>> ^T[[-IJt{{4??;T=O=OPQ ( ]]1%   ;;t/$2D2DD $*>*>> ^T[[-IJ' ]]1% $ #** 6  ! !    Z1A5   !' *.~WH/3~~+:K&%%jD$>$>~xY^`k$l#mmno %%jD$>$>~xY^`k$l#mmno ''333MM$"6"67E4//!3S5GH BA.q1AAB $(<&	F KK!!)39t99.'\U[\]]^_ 	,,!%!;!;"55 	- 	
 	,,!2, 	- 	
 00%)%?%?"&"9"9 1  5::%#N3,,U3 8  J JJ   7wiqA
&=n=MQ$O!%%//e30FWXYYZ[ " .//?yJ
! & 
 $%WI[!9J)U222$+9M!:J  00cJK  
  5::-+2EKK	,\)-D,EQG(@iAX@YY[\`\q\qr{\|[}}~&# 'y
!)U222$+9C/?!@J 34DU3%u_L]]_`j_kklm
!!++%s,^S*Xabccde FF#N3")& 2%%//hc"3NFM"Z![[\]  %...")& 2%%//hc&]O1= #>6:<B**=9rV   c                 (   | j                  |      }| j                  j                        }t        j                  j                        }|j                  r$|t        j                  k7  rt        j                  }n|}t        j                  j                  |      }t        j                  j                  |      }t               }	| j                  | j                  k\  r.|	j                  | dt!        |       dt"        |    d| d       n||k7  rt"        |   j%                  dd       d| }
|t        j&                  k(  r&| d| j                  t        j&                         d}n?||cxk(  rd	k(  rn nd
t"        |    d| d}n d
t"        |    d| dt"        |    d| d| d}|	j                  d|
 d| d       |
}|	j)                  | j+                  ||||             | j,                  j)                  |	j/                  fd             y )Nr  z] = static_cast<r  r   z::r   z.template cast<bool,r   r4   zat::vec::convert<r   r  r   r   r   c                     t        |       S r  r  r  s    rW   r  z.CppVecKernel.store_reduction.<locals>.<lambda>  s    T18M rV   )rT  r  r  r3   r  r  is_floating_pointr   rj   rn   rf  r  r<   rv  r  r   rD   rG   r  rk   r$  r  r  r  )r:  r   r   r   r   	out_dtyper   out_num_vectorssrc_num_vectorsr   converted_valueconverts    `          rW   r8  zCppVecKernel.store_reduction  s   $$U+iit$GG%%d+	&&9+DKKEE((33I>((33E:??d222NN%qU+,,<\)=T<UUWX]W^^`a
 E!#I.66tSAB!E7K   

*!&';D<Q<QRWR\R\<];^^abG&/>Q>/Y0G/H5'QRS  
 0Y0G/H./qe1D0EQFWWYZ_Y``ac   &7s7)1EF'KK,,UC	JK$$TXX.M%NOrV   
scalar_varc                    |j                   rJ |j                  t        j                  k(  rE| j                  j                  | j                  | j                          d|j                   d      }n]|j                  J | j                  j                  | j                  | j                  |j                         d|j                   d      }t        |t              sJ |j                  |_        |j                  |_        d|_         |S )Nrg  r   r  T)r   r   r   rk   r  r  r  r  r   rh  r   rF   dependent_itervars)r:  r  r  s      rW   r  zCppVecKernel.broadcast  s    $$$$uzz)hh''!4!4!6 7wz>OqQG ##///hh''%%j&6&678*//9J!LG '>222"((%/%B%B"rV   r{  c           	      "   |j                   rJ |j                  J | j                  j                  | j                  | j                  |j                         d| d| d      }t        |t              sJ |j                  |_        d|_         |S )Nz	::arange(r   r   T)r   r   r  r  r  rh  r   rF   )r:  r   r{  r  s       rW   rx  zCppVecKernel.arange*  s    <<{{&&&""LL!!%++./yr&K
 &.111{{rV   c                 ^   t         |   }| j                  |      }t        |      rd| dS |dv r|t        j                  k(  rt        j
                  n|}t        |   }| j                  ||      }|dk(  r+t        |      s|t        j                  k(  rd| dnd| d}n*t        |      s|t        j                  k(  rd| dnd| d	}| d
| dS |dk(  r| j                          dS t        ||      }	| d
|	 d}
|t        j                  k(  r|dv sJ | j                          d|	 dS |
S )Nr   r   rx   r`   r{   rz   r}   ry   r|   r  r   rf   z	::from(0))r^   r_   rc   rg  )r;   rh  r)   r   rk   rn   rG   r  r
   r  r   )r:  r   r   r   vec_typecompute_dtyper   r   rO  scalar_initvec_inits              rW   r  zCppVecKernel.reduction_init_vec6  sm   07%%k2/hZs++11+0EJJ+>EKKKM!-0F22>5IH) &e,0C +6(-@/xx@  &e,0C ,F8=A/xx@ 
 ZqQ''U"))+,I66$^U;ZqQ/EJJ!%::::))+,GK=BBrV   c                    t         |   }| j                  |      }t        |      rd| dS |dv r| j                  t        j
                        }|t        j                  k(  r@| j                  t        j                        }dt        t        j                      d| d| dS | j                  |      }dt        |    d| d| dS |t        j                  k(  r|dv sJ | j                          S |S )Nr   r   rx   zIndexValueVec<r   )r^   r_   rf   rc   )
r;   rh  r)   r  r   r  rk   rn   rG   r  )r:  r   r   r   r  r  r  s          rW   r  z#CppVecKernel.reduction_acc_type_vecZ  s    07%%k2/hZq))11))%++6E

" --ekk:'U[[(A'B"UG2eWTUVV))+6E#L$=#>br%PQRREJJ!%AAAA))+,-rV   r  r   c                 @   |t         j                  k(  }|dk(  r=| j                  rd| d| dt        | j                         dS |r| d| S d| d| dS |dk(  r=| j                  rd| d| dt        | j                         dS |r| d	| S d
| d| dS |dk(  rq|r4| j                  rd| dt        | j                         d| dS d| d| dS | j                  rd| d| dt        | j                         dS |rdnd}	| d|	 d| S |dk(  r2| j                  rd| d| dt        | j                         dS | d| S |dk(  r2| j                  rd| d| dt        | j                         dS | d| S |dk(  rp|r:| j                  r"d| d| dt        | j                         d| d	S d| d| d| dS | j                  rd| d| dt        | j                         dS d| d| dS |dk(  rgt	        |t
              r|\  }
}}nt        ||      \  }
}}| j                  r%d| d|
 d| d| dt        | j                         dS d| d|
 d| d| d	S |dv r+|J t        |   }|}|t         j                  k(  r\t        t         j                     }t         j                  }t	        |t              r%|j                  rt        | j                  |f      \  }| j                  |      }| j                  t         j                        }d}d}|%|J dt        |      j!                          }d| }| j                  r.| d | d| d| | d!| d| | dt        | j                         dS | d | d| d| | d!| d| | dS |d"k(  rt	        |t              rF|j"                  t         j                  k(  sJ t        t$        j&                  j                  |f      \  }| j                  rd#| d| dt        | j                         dS | d| S t(        )$Nr_   zmax_masked_reduce(r   r   rw  rM  r^   zmin_masked_reduce(rq  rF  rc   r   r   zsum_masked_reduce(r   rZ   r   rd   zprod_masked_reduce(r   re   zxor_sum_masked_reduce(r   rg   r   rh   r   z}, r   rx   rR   z_combine_vec<r  rf   zany_masked_reduce()r   rk   r  rD   r   r   r   rG   rn   rF   r   rN   r  r  r  ro   r  r   r3   rf  r  )r:  r   r   r   r   r   r  r   r   r   r   r   r   r   r	  r  r  t_extra	arg_extras                      rW   r  z"CppVecKernel.reduction_combine_vecm  s8    uzz)U"~~+C5:,bT^^A\@]]^__  e3zl+ -SEJ<qA
 u$~~+C5:,bT^^A\@]]^__  e3zl+ -SEJ<qA
 u$>>1*RDNN@[?\\_`j_kklmm1*SANN>>/uBzl"[QUQ_Q_E`Daabcc)0#cK!U!K=*>>v%~~,SEJ<r+dnnB]A^^_``c*..y(~~/uBzl"[QUQ_Q_E`Daabccc*..//>>-cU"ZL;t~~C^B__bcmbnnopp-cU"ZLJ<qQQ>>-cU"ZL;t~~C^B__`aa-cU"ZLBB00*e,#- b& $5^Z#P b&~~)#d4&2$b[Y]YgYgMhLiijkk)#d4&2$bLL33(((!),F%MEJJ&%ekk2 %j.9j>O>O$8
}$UMZ))-8E))%++6EGI +777s#78>>@AB L	~~%&mF82eWBuggY WuBzl9+RDNN8S7TTUW
 ))vhbr%QXPYY[\_[``bcmbnoxnyyz{{u$*n5!''5::555 4QXX5E5E
} U~~+C5:,bT^^A\@]]^__c*..%%rV   c           	         t        |t              sJ |j                  J |j                  s4t        |t              r|j                  rd| d}t        	|   ||||      S |}|}|r!| j                  |j                         d| d}|r!| j                  |j                         d| d}|r|rd| d| d| d| d	}| d| d| }n#|r| d| }| d| }n|sJ | d| }| d| }| j                  |j                         d| d}|r6|j                  s!| j                  |j                         d| d}d| d| d}| j                  rS| j                  |j                         d	| j                  |j                         d
| dt        | j                         d}d| d}| j                   d| d| dS )Nr  z).all_masked()r   r  z) & (r   rg  z) | ~(z::set(z::from(1), (r   z, "index out of bounds: z"))r   rF   r   r   r8  r  rh  r  r  rD   rw  )
r:  r   r  r  r[  lower_scalarupper_scalarr  
cond_printr=  s
            rW   r  zCppVecKernel.indirect_assert  s$   #~...yy$$$zz$/DKK4&/7*3udCC))#))45QugQ?E))#))45QugQ?EUugT#eC5E7!<D(>cU#l^DJWD&D(>cU3JL5U#eW%D5L>2J%%cii014&:;;--cii89aHtfF4&*D>>&&syy12&9L9LSYY9W8X YV3{4>>:;1>  4&'&&'q.FzlRTUUrV   c                    t        |t              sJ |j                  st        
|   ||||      S t
        |   }| j                  |      }t
        |   }| j                  |      }d| d}	|t        j                  k7  r2|t        j                  k(  r| j                  |       d| d| d| d}	|	S |t        j                  k(  r |t        j                  k7  r| d| d| d}	|	S ||k7  r~d}	|rG|t        j                  t        j                  fv r%|t        j                  t        j                  fv rd	}	nd
}	||cxk(  rdk(  rn n|	d| d| dz   }	|	S |	d| d| d| d| d| dz   }	|	S )Nr  r   z::from<r  r  z.to<r   rR   zat::vec::round_convertzat::vec::convertr4   r  )r   rF   r   r8  r  rG   r  r   rk   r  rn   rj   r`  r_  )r:  r  r   r   r  src_cpp_typer   dst_cpp_typedst_num_vectorsr  r=  s             rW   r  zCppVecKernel.get_to_dtype_expr   s   #~...zz7,S%HMM#I.//	:#E*//63%qz

"u

':)))45W\N!OK\\^_b^ccdeD( ' %**$%**)<U$|nAo->cBD$ # %D%++u||!<<ejj%++66/)/6Q6,r#a88  ,q(9<./IZZ\]`\aabcd  rV   r  )NNFr  )'r  r  r  r  rm  r9  r   r  r   rw  r   r   rm   r  rn   r?  ro   rh  r  rF   rR  r  r<   rk   ry  r  r  r  r6  r8  r  rx  r  r  r   r  r  r  rl  rm  s   @rW   r  r  
  s   I C"45:: 4 4ekk c R%++ R% R
P5;; P3 P 38++ HEKK H# HN> N%++ N# N ,0## zz# {{	#
 "D(#T )-37 M4ZM zzM {{	M
 %M >)D0M M 
$	M^ UZZ 4 !*^#* * zz	*
 {{* *X"<Hrh%PNN ~ $
N 
ELL 
^ 
"H0 %),0(-l& ||d"l& #Tkl& ;;%l&\#VJ rV   r  c                        e Zd ZdZeZ	 	 d fd	Zd Zd Z	 ddZ	de
dej                  f fdZd fd		Zd
 Z fdZdej                  dej                  fdZ xZS )r  an  
    A vector kernel that handles the 2d tiles with the tile size defined in `tiling_factor` on
    the inner-most loop level and one of the outer loop level (`outer_tiling_idx`). When the data
    tile is accessed in a contiguous way from the outer loop axis, a transposition is applied on the
    tile to make the access contiguous from the inner-most loop axis. Then, the same vectorization
    logic from its parent `CppVecKernel` is leveraged for load/store/compute. The transposed tile load
    and store are generated into kernel.preloads and kernel.poststores buffers.

    The loop structure looks like below:
    for ...
      for i_outer ...
        for ...
          for inner_most ...
            // generated by CppTile2DKernel
            float tmp0[16*16]; at::vec::transpose_mxn<...>(tmp0, in_ptr0 + ..., ...); // into kernel.preloads
            float tmp1[16*16]; // into kernel.preloads
            for i_inner ... { // the kernel inner loop
              vectorized loads/compute/stores (e.g., load tmp0, store tmp1) // into kernel.loads/compute/stores
            }
            at::vec::transpose_mxn(out_ptr0 + ..., tmp1, ...) // into kernel.poststores
          for inner_most ... (tail)
            // generated by CppVecKernel
            ...
      for i_outer ... (tail)
        for ...
          for ...
            // generated by CppKernel
            ...
    c                     t         |   ||||d   |       || _        || _        || _        |r|n|| _        |r|n|| _        d| _        y )Nr4   T)r8  r9  tiling_indicesinner_tail_sizeouter_tail_sizeinner_num_elemsouter_num_elemsinner_is_tiling_idx)r:  r  r  r@  r  r  r  r=  s          rW   r9  zCppTile2DKernel.__init__B  s`     	1	
 -..2A}2A}#' rV   c                 L    t        | j                  | j                      d      S )NrP  )r,   ru  	outer_idxr?  s    rW   inner_itervarzCppTile2DKernel.inner_itervarY  s"    !T]]4>>%B$C6"JKKrV   c                 b   | j                   | j                     }| j                   | j                     }t        ||| j                        }t        ||| j                        }| j
                  d u xr@ |dk(  xr9 |j                  |      xr& |j                  |       xr |j                  |       S r   )ru  r!  rv  r  r@  r  r   )r:  r   	outer_var	inner_varouter_strideinner_strides         rW   need_vec_transposez"CppTile2DKernel.need_vec_transpose\  s    MM$..1	MM$//2	*5)T=O=OP*5)T=O=OPOOt# 0!0		)$0 !$$Y//0 !$$Y//	
rV   c                 n   t         j                  j                  |      }| j                  }| dt	        |       }d}	t	        t        || j                  | j                     | j                               }
t	        | j                         }|r|	|}	}||
}}
d}| j                  |z  r| j                  | j                  }}n| j                  | j                  }}|r|dk(  rdnd}t        |t        j                        r|j                  r&t        |t        j                        rA|j                  s5dt         |    d| d	| d
|
 d
|	 d
| d
t	        |       d
t	        |       d}n4dt         |    dt	        |       dt	        |       d| d	| d
|
 d
|	 d
| d}|r| j"                  j%                         }na| j"                  j'                  |      s)| j"                  j)                  | j*                  |d      }nd}| j"                  j-                  |      }|r>t         |   }d| d| d}| d| d| d| d| d
}| j*                  j/                  |       |j1                  dt3        |            }|r'| j4                  j/                  t7        ||             |S | j*                  j/                  |       |S )Nr   __place_holder__Tr  truefalseztranspose_mxn<r  r  r   r   Fr-  zalignas(std::max(std::size_t(z), alignof(z)))r   r  r[   r   )r3   r  r  r@  rD   r  ru  rv  r  r  r  r  r   r   r  r  rG   r  r"  containsr  r  getr   r  ro   r  r:   )r:  r   r   r   is_store
store_moder   factorr  r  ld_srcld_dstneed_defineMNr  load_or_storetile_var	cpp_dtypealignasdefine_lines                        rW   gen_transposed_tile_load_storez.CppTile2DKernel.gen_transposed_tile_load_storei  s    !!$'##SU+,-  3E4==;Y[_[m[m nop/0CC#VFF##h.'')=)=qA $$$$ A !)jL.HVPW
q%**%akkq%**%akk !e!4 5Qzl C56("SEF82k!n5ERTUGWWY[  !e!4 5Q{1~6FaTUGWWXYcXd e56("SEF827  xx(H""=1xx((U(SHKxx||M2H$U+I 6fX[SVWG$IQyk8*AfXQvhbQKMM##K0%--.@#h-POO%%l4&GH  MM##M2rV   r   r   c                 Z   | j                   j                  |      }| j                  |      }| j                         }| j	                  |      r| j                  |||d      }| dt        || j                  z         }t        j                  j                  |      }| j                  |d|      }| j                  j                  | j                  ||      }	|	j                  d| ||fi        t!        |	t"              sJ d|	_        |	S | j'                  |      }
t(        | U  ||
      S )NF)r/  r   r   r  r  T)r  r  rT  r"  r(  r<  rD   r  r3   r  r  r  r  r  r  r  r   rF   r   r  r8  r  )r:  r   r   r   re  r8  r  r   r   r  r   r=  s              rW   r  zCppTile2DKernel.load  s   iiood#$$U+""$""5)::c55 ; H "
#k%$..2H&I%JKGGG%%d+E**7Au=DXX&&tzz4u&EF!!&4u*=rBfn555 FMM//6I7<i00rV   c                 :   d|v sJ t        |t              sJ |       |j                  s| j                  |      }| j                  j                  |      }| j                         }| j                  |      }| j                  |      r| j                  |||d|      }| dt        || j                  z         }| j                  sdt        j                  j                  |      t         t"        j$                  t"        j&                  t"        j(                  t"        j*                  gz   v r| d| dt        | j                         d}	n| d| d}	| j,                  j/                  t1        ||	             y | j3                  |      }
t4        | m  ||
||       y )Nr   T)r/  r0  r   r  r   r   )r   rF   r   r  r  r  r"  rT  r(  r<  rD   r  r  r3   r  r  r   r   r_  r`  ra  rb  r  r   r:   r  r8  r  )r:  r   r   r   r  r   re  r8  storebufr   r   r=  s              rW   r  zCppTile2DKernel.store  sv   }}%07%70||NN5)Eiit$""$$$U+""5)::c54D ; H #3{54>>3I'J&KLH~~!2!24!8M

##!!	M = "  zK4O3PPRSz4KK!!,tT":;//6IGM$	5$7rV   c                    | j                         }| j                  r2|j                  d| d| dt        | j                         d| d	       y |j                  d| d| dt        | j
                         d| d	       y )Nr  r  r   r   r   )r"  r  r   rD   r  r  )r:  r   re  s      rW   r  z#CppTile2DKernel.codegen_inner_loops  s    ""$##NNUG6%K@T@T4U3VVXY^X__bc NNUG6%K@T@T4U3VVXY^X__bcrV   c                    t         |   ||      }| j                  d   | j                  k  r| j                  nt	        | j                        \  | _        | _        | j                  | j                  d   k(  r+| j                  | _        | j                  | _
        d| _        |S | j                  | _        | j                  | _
        d| _        |S )Nr4   r   FT)r8  r=  r  r  reversedr!  rv  r  r  r  r  r  r  r  )r:  groupreduction_groupr  r=  s       rW   r=  zCppTile2DKernel.set_ranges  s    w!%9 ""1%(<(<< $--. 	(
 ??d11!44!11DN!11DN',D$
  "11DN!11DN'+D$rV   r   c                 Z    | j                  || j                  | j                               S )Nr  )r  r!  r"  r  s     rW   r  z"CppTile2DKernel.transform_indexing  s0    ++%%' , 
 	
rV   r  r  )r  r  r  r  r  rm  r9  r"  r(  r<  ro   r   r  r  r  r  r=  r  rl  rm  s   @rW   r  r  !  ss    < #I (.L
 6::x1 1UZZ 1,8>	$


 
uzz 
rV   r  _bodyc                 j   | j                   gt        | j                  j                               z   }d}d}|D ]  }|j                  j
                  D ]  }|j                  dk(  s|j                  dv r!|j                  dvrd}t        |d      r|j                  rt        j                  |j                  v sJ |j                  t        j                     }|j                  r|j                  t        vrd}|&||j                  k7  st        j                  d       |j                  }d}  ||fS )	z
    Returns the low precision data type (torch.float16/torch.bfloat16) contained in the nodes
    and if all the nodes can codegen with this data type without converting to float.
    Otherwise returns None and True.
    NFplaceholder)	get_indexrV  )r  r  r  r  r  Trw  z.bf16 and fp16 are mixed in the scheduler node.)
root_blockr/  	subblocksrf  r  nodesoptargetr   rw  r@   rv  r   r   warningswarn)rF  
sub_blocks_lowp_fp_type	_use_fp32	sub_blockr<  rs  s          rW   get_loop_body_lowp_fprU    s$    ""#d5??+A+A+C&DDJ(,MI !	__** 	!Exx=(ELL = -  || $  !	uf%%***..%**<<</4zz:M:Q:Q/R}}](J $I".$5 &VW$+MMM 	9	!!> )##rV   c                   8    e Zd ZdZdeee   ee   f   fdZd Zy)TilingSelectz
    Implement the heuristic to select the tiling factors and tiling indices.
    In the future, we can implement advanced heuristic in a subclass.
    r   c           	        # t        |      }t        |      }|sJ t        d |D              rg g fS t        j                  }t        |d         d   ##rt        #fd|dd  D              r#}t        j                         j                  |      }| j                  |||      }|rt        |d       \  }}	t        |      t        |	      z   }
t        j                  j                  rVd }d	 }d
 }t!        t#        |
            D cg c]  }t%        t&        j(                  |       }}t#        |      }|d | ||d  }}i }i }|D ]n  }|j*                  gt-        |j.                  j1                               z   }|D ]4  }|j2                  j4                  D ]  }|j6                  dv r|j6                  dk(  rdnd}|j8                  j;                  ||f      |j<                  |   j<                  d      } |||      r4 |||||      }|j6                  dk(  r|n|dvr ||j6                  |       t?        |j6                  t@              s|j6                  jC                  d      r|j6                  dv r|j6                  |vrd||j6                  <   ||j6                  xx   dz  cc<    7 q tE        |j1                               }tE        |j1                               }d}d}||k\  s|dkD  r||z  |k\  rg g fS |	s9|r7t#        |      dk(  r)tG        ||d      g      s||d      |dz  k  r	|dk  rg g fS |tH        v rt        j                         j                  |      } |D ]  }!|!dk  r|!t#        |
      z   }!|!dk  s|!t#        |
      k\  r*tG        |
      retJ        j2                  jL                  jO                  |
|!   d      }"|"| k  sitJ        j2                  jL                  jQ                  |"|        | dz  } n|
|!   | k  s| dz  } n t#        |      dk(  r|g|fS t#        |      dk(  r||g|fS g g fS c c}w )Nc              3   ,   K   | ]  }|t         v  y wr  )rv   )r%  r   s     rW   r'  z-TilingSelect.select_tiling.<locals>.<genexpr>:  s     HEu//Hs   r   c              3   @   K   | ]  }t        |      d    k(    yw)r   N)rU  )r%  	loop_body_lowp_fp_dtypes     rW   r'  z-TilingSelect.select_tiling.<locals>.<genexpr>>  s(      "
 #9-a0NB"
   r4   r  c                     t        | d         S r   r   sizess    rW   r  z,TilingSelect.select_tiling.<locals>.<lambda>K  s    #eAh- rV   rv  c                 L    ||d      }t        | ||      }|j                  r|S d S ra  )r  r  )r   ru  r@  r  r  r{  s         rW   _try_get_stridez3TilingSelect.select_tiling.<locals>._try_get_strideQ  s4     '~a'89G0OF%+%5%56?4?rV   c                 2    | |vrd|| <   y || xx   dz  cc<   y r   rU   )	node_namenon_contig_indexing_op_counters     rW   _update_negative_op_countz=TilingSelect.select_tiling.<locals>._update_negative_op_count[  s(     !(FFDE6yA6yAQFArV   c                     t        |      dk(  xr: t        |       dkD  xr* |d   dk\  r|d   n|d   t        |       z   t        |       k  S Nr4   r   r_  )ru  r  s     rW   _is_valid_indicesz5TilingSelect.select_tiling.<locals>._is_valid_indicesc  sb    
 N+q0 (MA-(  .a0A5 +1-!/!2S]!Bh-(	rV   )rV  r  r  rV  r   r   r4   masked_subblock)r1   r  rP  rI  gQ?#   r  
   ry  ))rB   rA   rf   r   rn   rU  r.  r   r  	nelements_select_tiling_indicesr_   r   r   r  enable_tiling_heuristicsrS  r   r-   r   r:  rJ  r/  rK  rf  r  rL  rN  r\  indexing_from_argsr  r   ro   
startswithrc   r'   r   r3   r  r?  check_lt)$r:  fn_listvar_sizes_listloop_bodies
all_dtypesr   r@  r  rC  rD  r  rd  rh  rk  r4  ru  r  r  reduction_vars
op_counterrg  rF  rQ  rT  r<  arg_idxr   r{  op_numnon_contig_indexing_op_numratio_thresholdquantity_thresholdfactor_lowptiling_indice
call_ranger\  s$                                      @rW   select_tilingzTilingSelect.select_tiling1  s    %W-/<
zHZHHr6M.{1~>qAc "
(_"
 
 #E#002<<5<I44^]
 %($?&"E?  ,)??Kzz22@G" #3{#34 34;;B  #&e*-o._-. % .0
 BD.( BE"'"2"2!3d5??;Q;Q;S6T!TJ%/ B	%.__%:%: BE$||/NN/4|||/K!QR(1(I(I%)>$:)""'**W"5":":1"=)? $5X~#N-<(-x.&F
 ,1<<<+G )/-36-A(A,1LL:X)*  *%,,< % 7 78I J#(<<#M$N $)<<z#A?@Ju||$<$.u||$<$A$<7BBB@ Z..01-0299;.* #'%'"-1CCQJ2V;N
 r6M (N+q0,!."34
 nQ/0=13DD r6M% *668BBBO%3 M$q((5K8H(H$q(MS=M,M '4%&WW%5%5%G%G'6 &H &
 &3GG,,55j+N,71,<M!$]3kA(3q(8" >"a'%66>"a'%}5~EE2vQs   
!Qc           	         g }t        ||      D ]`  \  }}t        j                  |g| }|t        j                  |j
                  |j                        D cg c]  }|j                   c}z  }b t        t                  }	g }
t        t                  }t        t                  }|D ]  }|j                  D ]  }t        j                  d|j                        s$t        |||      }|dk(  r7|dk(  rO|	j                  t        |j                  dd               |
j!                  t        |j                  dd               t#        d |j                  D              r(|j                  t        |j                  dd               |j                  t        |j                  dd                
 |	|z
  |z
  }t%        |d       \  }}t'        |      t'        |      z   }t'        |	      dk(  r|dz
  gS |rt)        |      dd  S |	|z  |z
  }t)        |	      }t'        |      dk(  r|d   |v r|d   |dz
  k(  r|S t)        ||
j*                        dd  S c c}w )	Nz^d\d+$r   r4   c              3   P   K   | ]  }t        |t        j                           y wr  )r   r   SIZEr%  r  s     rW   r'  z6TilingSelect._select_tiling_indices.<locals>.<genexpr>  s     S!4995S   $&c                     t        | d         S r   r_  r`  s    rW   r  z5TilingSelect._select_tiling_indices.<locals>.<lambda>      s5QR8} rV   rb  r  r   )r%  r	   extract_read_writes	itertoolschainreadswritesr   r   rm   r  r   r   r   r  r  r  r.  r_   r   sortedcount)r:  rv  rw  r@  	all_indexfn	var_sizesrwdepcontig_varscontig_vars_listnon_contig_stride_constnon_contig_stride_otherr   r   r{  contig_onlyrC  rD  num_itervarscontig_and_const_stridecontig_vars_sorteds                         rW   rq  z#TilingSelect._select_tiling_indices  sV    	 .9 	UMB	11"AyAByrxx/ST#))TTI	U !o'",S/"3",S/"3 	CE)) CyyCHH5,UCGQ;q[OOC$56$++C,=>Sv?R?RSS+//CHHQRL0AB+//CHHQRL0ABC	C "$;;>UU!$^9T!U5zC$88{q  1$%%+&rs++11##$ $K0"#q("2&*AA"2&,*::%%(.>.D.DEbcJJK Us   IN)	r  r  r  r  r   r/  rm   r  rq  rU   rV   rW   rW  rW  +  s1    
i 
tCy$s)#	$	iV.KrV   rW  c                        e Zd ZU eZee   ed<   eZee   ed<   e	Z
ee	   ed<    fdZd ZdefdZdefd	Zd
 Zd Zd Zdee   fdZd Zd Zddedz  fdZdeded   fdZ xZS )rJ  
kernel_clsvec_kernel_clstile2d_kernel_clsc                     t         |   |j                  |j                  j                         || _        d | _        d | _        t        j                         | _
        g | _        y r  )r8  r9  r  wsr  rb  rT  r  r   r  picked_vec_isakernelsr:  rb  r=  s     rW   r9  zCppKernelProxy.__init__  sQ    **LOO,G,GH(2=2J2J2L(*rV   c                 `    |D ])  }t        |t              sJ t        j                  |       + y r  )r   r#   r9   propagate_scheduler_node)r:  rL  r<  s      rW   data_type_propagationz$CppKernelProxy.data_type_propagation   s1     	@Ee]33388?	@rV   scheduler_nodec                     t        |j                  t              syt        j                  |       t        |j                        d   d uxr t        |j                        d    S )NTr   r4   )r   rF  r   r9   r  rU  )r:  r  s     rW   is_lowp_fp_schedulerz#CppKernelProxy.is_lowp_fp_scheduler&  s\    ...944^D!."6"67:$F C).*>*>?BB	
rV   r[  c                     dt         j                  j                  fd}|j                  gt	        |j
                  j                               z   }|D ]  } ||j                          y )N	sub_graphc                 Z   dt         j                  j                  dt         j                  d z  fddt         j                  j                  dt         j                  d z  fddt         j                  j                  dt         j                  ffddt         j                  j                  dt         j                  ffddt         j                  j                  dt         j                  ffd}t	        | j
                        }g |D ]8  }|j                  d	v r |      xt        v rt        fd
|j                  D              rB|j                  d   }| j                  |      5  | j                  d||t         j                  f      |j                  fd       t        xj                   dz  c_        d d d        |j                  dk(  r |      xt        v r|j                  \  }}}}} ||      rt"        j$                  j'                  |      | j)                  |      5  | j                  d||f      |j+                  |       t        xj                   dz  c_        d d d        u|j                  dk(  r|j                  \  }}}	}
|t        v st         j                  t         j,                  t         j.                  t         j0                  fv sJ |t        v rt         j                  nt         j                  |	|
f|_
        |j                  dk(  r`|j                  d   t        v rK|j                  \  }}
t        fd|j                  D              rk||
t         j                  f|_
        |j                  dk(  rq|j                  d   t        v r\|j                  \  }}t        fd|j                  D              rډj3                  |       ||t         j                  f|_
        |j                  dk(  s|j                  \  }}}|t        v rd |||      s[| j)                  |      5  | j                  d|||f      |j+                  |       t        xj                   dz  c_        d d d        t        v st        fd|j                  D              r|j                  d   }| j                  |      5  | j                  d||t         j                  f      |j                  fd       t        xj                   dz  c_        d d d        ; dt         j                  j4                  ffd} ||        y # 1 sw Y   oxY w# 1 sw Y   |xY w# 1 sw Y   xY w# 1 sw Y   xY w)Nr&  r   c                 Z   | j                   dk(  r,t        j                  j                  | j                  d         S | j                   dk(  r| j                  d   S | j                   dk(  rCt        | j                        dkD  r| j                  d   S | j                  j                  dd      S y)	z6Get input dtype for nodes that may consumes lowp fp dtr  r4   r  r  r  r   r   N)rN  r3   r  r  r  r   r  r.  r&  s    rW   get_input_dtypez]CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.get_input_dtype2  s    ;;')77,,TYYq\::[[$6699R=([[J.499~)#yy|+#{{{DAArV   c                 $   | j                   dk(  rFt        | j                        dk(  sJ t        j                  j                  | j                  d         S | j                   dv r| j                  d   S | j                   dk(  r| j                  d   S y)	z6Get output dtype for nodes that may produce lowp fp dtr  r   r4   )r  rP  rV  r  r  r   N)rN  r   r  r3   r  r  r  s    rW   get_output_dtypez^CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.get_output_dtype@  sz    ;;&(tyy>Q...77,,TYYq\::[[$JJ99R=([[$6699Q<'rV   r  c                 .    |t         v sJ  |       |k(  S )z]Check if the given node produces output with expected low precision floating point data type.)r   )r&  r  r  s     rW   is_lowp_fp_sourcez_CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_sourceL  s!    ]***'-33rV   c                 X    |t         v sJ  |       x}r||k(  S | j                  dk(  ryy)zZCheck if the given node accept input with expected low precision floating point data type.r  TF)r   rN  )r&  r  input_dtyper  s      rW   is_lowp_fp_sinkz]CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_sinkQ  s>    ]***"1$"77;7&",,[[J. rV   c                 Z     |       xr t        fd| j                  D              S )zCheck if the node is a lowp fp sources which are all directly fed to ops that accepts lowp fp input
                thus no need to promote to float
                c              3   0   K   | ]  } |        y wr  rU   r%  userr  r  s     rW   r'  z}CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_source_no_promote.<locals>.<genexpr>`  s      ;26OD"-;   r.  users)r&  r  r  r  s    `rW   is_lowp_fp_source_no_promotezjCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_source_no_promote\  s1     )r2 s ;:>**; 8 rV   )r  rV  c              3   0   K   | ]  } |        y wr  rU   r  s     rW   r'  zWCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<genexpr>m       M?44Mr  r   r  r  c                     | uS r  rU   r4  to_type_nodes    rW   r  zVCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<lambda>u  s    A\4I rV   r4   r  r6  rP  r  c              3   0   K   | ]  } |        y wr  rU   r  s     rW   r'  zWCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<genexpr>  r  r  c              3   0   K   | ]  } |        y wr  rU   r  s     rW   r'  zWCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<genexpr>  r  r  r  c              3   0   K   | ]  } |        y wr  rU   )r%  r  r   r  s     rW   r'  zWCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<genexpr>  s     Ue <Ur  c                     | uS r  rU   r  s    rW   r  zVCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<lambda>  s    A\<Q rV   r  c                 V    dt         j                  j                  ffd} ||        y )Nr  c                 B   dt         j                  j                  fd}| j                  D cg c]  }|j                  dk(  s| }}|D cg c]  } ||      s||j
                  i }}|D ]  }|j                         D ]q  \  }| j                  v st        fd|D              sv s.t        d |D              sAj                  d   }j                  |       | j                         s  | j                  | j                          y y c c}w c c}w )Nto_nodec                 :    t        d | j                  D              S )Nc              3   :   K   | ]  }|j                   d k(    yw)r  N)rN  r%  usrs     rW   r'  zCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>._used_by_to.<locals>.<genexpr>  s     "U3::#;"Us   r  )r  s    rW   _used_by_tozCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>._used_by_to  s    ""Uw}}"UUUrV   r  c              3   \   K   | ]#  }|j                   d    j                   d    k(   % ywr  Nr  )r%  r  r&  s     rW   r'  zCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>.<genexpr>  s&     #ScCHHRLDIIbM$A#Ss   ),c              3   F   K   | ]  }|j                   d    t        v   ywr  )r  r   r  s     rW   r'  zCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>.<genexpr>  s"      ,&JM(E,&r<  r  )r   fxNoderL  rN  r  r  r.  all_input_nodesreplace_all_uses_with
erase_nodeowning_modulelint)	r  r  r&  all_to_nodesall_to_nodes_and_users
node_usersr  val_nodeto_lowp_fp_legalized_nodess	     `     rW   _eliminate_duplicate_to_nodezCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node  s-   VUXX]] V *3$!%DKK:<U$L $ 8D./3{SWGXtzz*.* . '= ;
+5+;+;+= ;KD%#y6 ##SU#S S$(,F$F(+ ,&QV,& )&
 ,0+?+?+C $ : :8 D ) 4 4T :;;, !..6!( 79$.s   DDDD)r   r  Graph)r  r  r  s     rW   eliminate_to_dtypez`CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype  s"    ')EHHNN ')R -Y7rV   )r   r  r  r   r/  rL  rN  r   r.  r  r  inserting_aftercall_methodrn   r  r   cpp_to_dtype_countr3   r  r  inserting_beforereplace_input_withr]  r^  r  r  r  )r  r  sub_graph_nodesr<  r1   r   r   	value_varr   r   r   r  r  r  r   r  r  r  r  r  r  s                @@@@@@@@rW   add_to_dtypezDCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype1  s    ehhmm  d8J  
 uxx}} 
 t9K 
 4 45;; 4
	!ehhmm 	! 	!588== ekk  #9??3O)+&( {@LL$::/66=H MMM **Q-C"2259 8'0'<'<&c5%++-F (= ( 33(*I  22a728 8 LLG+.u55-G16.Cq)Q3IrB GG--d3E"33E: 8'0'<'<&c9e-D (= ( 00LI22a728 8 \\[0 

!& M1  %!KK!NN!MM!KK	)       +0M+AEKKu!KK*!&
 \\Z/EJJrNm4S',zz$S%MMM "%uekk!:EJ\\Z/EJJrNm4S#(::LS!RMMM  /55e<"%q%++!6EJ\\%779>6S)UI !M1  <IyQ!*!;!;E!B @/8/D/D$.c9i5P 0E 0" !& 8 8L Q ' : :a ? :@ -  UUU"'**Q-C!*!:!:5!A @/8/D/D$.c5%++5N 0E 0" !& ; ;$02Q!" !( : :a ? :@ @g{@z*8ehhnn *8X y)8 8$8 8L@ @"@ @s4   7AU:>AVAV3AV :V	V	V	 V*	)r   r  r  rJ  r/  rK  rf  r  )r:  r[  r  rQ  rT  s        rW   legalize_lowp_fp_dtype_loopbodyz.CppKernelProxy.legalize_lowp_fp_dtype_loopbody0  s]    ^	*EHHNN ^	*@  **+d93F3F3M3M3O.PP
# 	*I)	*rV   c                     t         fd|D              r|D ]  }|j                  j                  gt        |j                  j                  j                               z   }|D ]  }|j                  j                  D ]n  }|j                  dv s|j                  sJ t        j                  |j                  v sJ |j                  t        j                     }|j                  t        v rnJ    y |D ]^  }t        |t              sJ t        |j                  t               sJ |j                  }|j#                         rN j%                  |       ` y )Nc              3   d   K   | ]'  }t        |t              xr j                  |       ) y wr  )r   r#   r  )r%  r<  r:  s     rW   r'  z8CppKernelProxy.legalize_lowp_fp_dtype.<locals>.<genexpr>  s3      
 um,Q1J1J51QQ
   -0)r  r  )r.  rF  rJ  r/  rK  rf  r  rL  rN  rw  r@   rv  r   r   r   r#   r   is_memory_copyr  )r:  rL  r<  rQ  rT  fx_noders  r\  s   `       rW   legalize_lowp_fp_dtypez%CppKernelProxy.legalize_lowp_fp_dtype  sD    

 

  B#kk445KK))0029 
 ", BI#,??#8#8 B">>->>#*<</<#6#:#:gll#JJ#J;B<< 3 7 7<G $+==M#AA#ABB	B  	;Ee]333ekk8444"[[D&&(44T:	;rV   c           	      J   ! t              t              k(  sJ | j                  t        d       \   | j                          !fd} fd! || j                        }t
        j                  xj                  |j                  z  c_        t
        j                  xj                  |j                  z  c_        t        j                  |      | _        | j                  r| j                  s6|g| _        | j                  dd        | j                  j!                  |        y t"        j$                  j&                  j)                  d      5  t+               }|j-                        \  }}t        |      t        |      k(  sJ d}d }	|r{d}
|d   }|dz   }t        | j                  j.                        |kD  rM| j                  j.                  |   j0                  }
| j                  j.                  |   j0                  }|
xr | }t        |      dk(  rt2        xj4                  dz  c_        | j                  j7                  |d   |d   	      } || j8                  |d   |d         }|j:                  |j<                  z
  }|j>                  d|j<                  fi|_         t&        jB                  jD                  r || j8                  |d   |d   |      }n|}|j>                  g|_#        |j>                  |j<                  |j:                  fi|_         ||g| _        |}	nt        |      d
k(  r|d   t        | j                        dz
  k(  r|d   |d   k(  sJ t2        xj4                  d
z  c_        | j                  j7                  |d   |d   	      }d|j<                  f|j<                  |j:                  fd}|j:                  |j<                  z
  }| j                  j7                  |d   |d   	      }d|j<                  f|j<                  |j:                  fd}|j:                  |j<                  z
  } || jH                  |d   |      }|j>                  |d   |j>                  |d   i|_         g }t&        jB                  jD                  rldD ]f  \  }}|dk(  r|nd }|dk(  r|nd } || jH                  |d   |||      }|j>                  ||   |j>                  ||   i|_         |jK                  |       h n || j8                  |d   |d         }|j>                  |d   |j>                  |d   i|_         |j>                  g|_#        |jK                  |       |j>                  |d   |j>                  d|j:                  fi|_         |j>                  |j>                  g|_#        |jK                  |       |g|z   | _        |}	n|g| _        | j                  ||	       | j                  j!                  |        d d d        y # 1 sw Y   y xY w)Nc                     t        | d         S r   r_  r`  s    rW   r  z2CppKernelProxy.codegen_functions.<locals>.<lambda>6  r  rV   rb  c                      j                   | g| 5 }t        xj                  dz  c_         |       |cd d d        S # 1 sw Y   y xY wr   )
new_kernelr   generated_kernel_count)r1  r  rf  rb  runs      rW   codegen_kernelz8CppKernelProxy.codegen_functions.<locals>.codegen_kernel:  sL    (((4t4  ..!3.F  s   #AAc           	      T   | j                        \  }}d}t        	      D ]u  \  }}|ft        t        j                              dffv r|rJ  |||       ;d}|dfk(  sJ d| d d        | j                         5   ||d       d d d        w y # 1 sw Y   xY w)NFrU   Tzunexpected group: rm  r   )r=  r%  r   r  r  r  )
rf  r  rz  	in_suffixr  r  rv  rC  rD  rw  s
         rW   r  z-CppKernelProxy.codegen_functions.<locals>.runD  s    #)#4#4UO#L D.I!$Wn!= %IO,9??5/BCRH!   )(=t^, $I$)  V ,I;d5'OCTUV 
  //1 %4% %%% %s   	
BB'	Finplace_buffersr   r4   )r1  r   maintailr  )r  )r  r  )r  r  r  )&r   rb  r_   r=  r  r3   r  removed_buffersinplaced_to_removerR  rq  rT  r  ru  r  aggregate_reduction_buffers
set_kernelr   	_inductorr   patchrW  r  rN  r  r   generated_cpp_vec_kernel_counttiler  rF  
tiled_sizer   r  r  enable_loop_tail_vecr  r  r  )"r:  rv  rw  r  scalar_kerneltiling_selecttiling_factorsr  _inner_loop_reduction_outer_not_outer_loopinner_loop_reductionouter_loop_levelinner_loop_levelouter_loop_reductionrU  
vec_kernelr  tail_kernel
outer_loopr^  r  
inner_loopinner_rangesr  tile2d_kernelouter_rinner_r_inner_tail_size_outer_tail_sizerf  rC  rb  rD  r  s"    ``                           @@@@rW   codegen_functionsz CppKernelProxy.codegen_functions3  s   7|s>2222((!$^9T!U/		%( 't7	=#@#@@	""m&F&FF"!6""$--)?DL,,UD9NN%%d+ __##))%)@ ~	,(NM-:-H-H.*NN ~&#n*====.3+K',$#1!#4 #3a#7 t~~++,/??+/>>+?+?(,"l ) ,0>>+?+?(,"l ) -I5I1I 4 >"a'66!;6~~**>!+<^TUEV*W+''):N1<M
 !II7	,0HHq$//6J+K
(::22"0++&q)&q)!	#K #0K48HH:M0-1XX7S,T) *K8"^$)"1%T]]);a)??&q)^A->>?
 66!;6!^^00"1%nQ.? 1 
 
 5 56'22JOOD  #-//J4I4I"I!^^00"1%nQ.? 1 
 
 5 56'22JOOD  #-//J4I4I"I .**"1%"! NNL$8NNL$8/+ !::22- 3( 07&/@Od ) 07&/@Od ) "0 22*1-*,," 'NNL,A&NNL,A0, $**62-30 "0++^A->q@Q"J #V(<"V(<0J, 2<0@J-&&z2"V(<"JOO(<3M/ 5?NNJNN3SM0&&}5 -<( -,,/ NN%%d+}~	, ~	, ~	,s   R5XX"c                     |D ](  }| j                  |       t        j                  |       * | j                  ||       y r  )r  r9   propagate_loopbodyr"  )r:  rx  rw  r\  s       rW   codegen_loop_bodiesz"CppKernelProxy.codegen_loop_bodies  s?     	9D0062248	9 	{N;rV   rL  c                    | j                  |       | j                  |       t        |      dk\  sJ d }|D cg c]  }t        j                  ||       }}t        t        j                  t              r2t        j                  j                  rd }|D cg c]
  } ||       }}|D cg c]  }|j                  d    }}| j                  ||       y c c}w c c}w c c}w )Nr4   c                     | j                          | j                          t        t        j                  t
              r | j                  | S | j                  |      S r  )decide_inplace_updatemark_runr   r3   rf  r0   rF  codegen)r&  
index_varss     rW   r  z(CppKernelProxy.codegen_nodes.<locals>.fn  sF    &&(MMO!(($56!tzz:..||J//rV   c                 R    t         j                  j                  |       }| |_        |S r  )r3   rd  localize_functionoriginal_fn)r  
wrapped_fns     rW   wrap_fnz-CppKernelProxy.codegen_nodes.<locals>.wrap_fn  s+    33EE
 *,
&!!rV   )r  r  r   rU  partialr   r3   rd  rJ   re  rC  r"  )r:  rL  r  r&  rv  r0  rw  s          rW   codegen_nodeszCppKernelProxy.codegen_nodes  s    ##E*""5)5zQ	0 <AA49$$R.AA q--/AB&&44" .55rwr{5G549:D$**Q-::w7# B 6:s   CC-C"c                 >    | j                  | j                  ||       y r  )ro  rT  )r:  r   rZ  s      rW   rr  zCppKernelProxy.codegen_loops  s    kBrV   c                 F    | j                   D ]  }|j                           y r  )r  r  r:  rf  s     rW   r  z4CppKernelProxy.update_stores_with_parallel_reduction  s!    ll 	;F88:	;rV   Nr   c                 (   |J d}| j                   D ]q  }t        j                         5 }|j                  ||      r@d}|j	                  |j                                |j                  |j                                d d d        s y # 1 sw Y   ~xY w)N
C10_LIKELYC10_UNLIKELY)r  r   r   r  r   r   r$  r  )r:  r   	if_prefixrf  r   s        rW   r  zCppKernelProxy.gen_body  s     	ll 	3F%%' 35,,T9= .I''6KK 12	3 3	33 3s   ABB	inner_loop_reduction_outer_notr  	LoopLevelc                     d fd} j                   d   }|r|sJ  ||       nZ|j                           j                  j                  |j                          j                  j                  |j                          j
                  j                  |j
                          j                  j                  |j                          j                  j                  |j                          j                  j                  |j                          j                  j                  |j                          j                  j                  |j                         y)z
        CppKernel/CppVecKernel/CppTile2dKernel have reduction buffers themselves.
        Here, we decide how to aggregate them together and place new reduction buffers
        under CppKernelProxy.
        c           
          t        j                        dk\  sJ j                  d   }j                  d   }t        |j                        sJ t	        |      j
                  u r^|j                  |j                         |j                          j                  j                  |j                  |j                  z          n5|j                          j                  j                  |j                         t               }t        j                         5 }|j                  |d| j                        r:|j                  |j!                                |j                  |j"                         d d d        t        j                         5 }|j                  |d| j                        r#|j                  |j!                                t	        |      j
                  u r|j$                  }|D ]X  }| d| j                   dt'        | j(                         d}t+        |j,                  ||       t+        |j"                  ||       Z t/        |j,                         |j                  t1        |j"                  | j                  | j                   d	| j(                  | j2                               n|j                  |j"                         d d d        |_        y # 1 sw Y   pxY w# 1 sw Y   |_        y xY w)
Nr   r   r  r7  r8  r   z_tail - r  r  )r   r  r   r  r#  r  r  r@  r  r$  r6   r   r   r  r   r   r   r  r  rD   r  r   r  r   r   rF  )	r  main_loop_kerneltail_loop_kernel
suffix_bufr   rz  r   r   r:  s	           rW   !aggregate_reduction_prefix_suffixzUCppKernelProxy.aggregate_reduction_buffers.<locals>.aggregate_reduction_prefix_suffix(  s   t||$)))#||A#||B/.0C0CDDD $%8 !::$22 !::<%%,,$55&778
 !::<%%,,-=-N-NO &J%%' I5#66jnn ''
(9(9(;<%%&6&G&GHI %%' M5#66
 ''
(9(9(;<,-@)9)M)M$2 D*.uZ^^4DH[YcYnYnMoLppq'rH,-=-D-DdHU, 0 A A4 55E5L5LM"))6 0 A A *#->>"2% 8 * 5 5 * #))*:*K*KL5M6 %/D!CI IM6 %/D!s   AK EK- K*-K=r   N)r  r;  )r  r  r  r$  r  r  r  r  r  r  r  )r:  r:  r  rA  main_kernels   `    rW   r  z*CppKernelProxy.aggregate_reduction_buffers  s   9	/v ll1o):-j9113!!(()E)EF!!(()E)EF&&--k.S.ST&&--k.S.ST!!(()I)IJ##**;+M+MN**1155	
 	**1155	
rV   r  )r  r  r  r  r  r#  r  r  r  r  r  r9  r  r#   r  r   r  r  r"  r%  r/  r2  rr  r  r6   r  rk   r   r  rl  rm  s   @rW   rJ  rJ    s     #,JY+)5ND&5/>tO,>+@
= 
c* c*J;<r,h<84#6 8BC;3\D0 3U
.2U
@H@UU
rV   rJ  c                   $     e Zd Z fdZd Z xZS )rc  c                 p    t         |   |j                  |j                  j                         g | _        y r  )r8  r9  r  r  r  re  r  s     rW   r9  zOuterLoopFusedKernel.__init__x  s)    **LOO,G,GH%'
rV   c           
         g }| j                   D cg c]  }|j                          }}|D ]h  }|j                  }|J |j                  |j	                  t        t        |      |j                  z
  |j                        |      j                         j t        t        |j                  t        |            |j                        S c c}w )Nr{  )re  rI  r  r  r`  r  r   r  r  r^   r_   )r:  ra  rY  kernels_parallel_depthrT  nested_kernelsrf  r  s           rW   r`  z*OuterLoopFusedKernel.decide_parallel_depth|  s    !#48JJ+
'0I  "+
 +
 % 	F !,,K***")),,!,/A/M/MM$6$B$B	  !.
	  "1137M3N +66	
 	
)+
s   C)r  r  r  r9  r`  rl  rm  s   @rW   rc  rc  w  s    (
rV   rc  c                       e Zd ZdZdZdZy)ReasonFusedNodessame_vars_reducecompatible_reductioncompatible_ranges_no_reductionN)r  r  r  SAME_VARS_REDUCECOMPATIBLE_REDUCTIONCOMPATIBLE_RANGES_NO_REDUCTIONrU   rV   rW   rI  rI    s    )1%E"rV   rI  c                       e Zd ZU eZee   ed<   dZ ee	j                  e	j                  g      Zedej                  dee	   fd       Z fdZdefdZd	 Zd
 Zd Zdedz  fdZd Zd Zd ZdededefdZd Zd Zd Z d Z!de"e#   fdZ$de%fdZ&de%e'z  e#z  fdZ(dedefdZ)dede*e   d e*e   fd!Z+d" Z,d# Z-d$ Z.d(d%Z/d& Z0d(d'Z1 xZ2S ))CppSchedulingkernel_proxy_clsi  devicer   c                     | j                   S r  )backend_features)r1  rS  s     rW   get_backend_featuresz"CppScheduling.get_backend_features  s    ###rV   c                 V    t         |   |       |r| j                          d| _        y r  )r8  r9  reset_kernel_group_ready_to_flush)r:  r-  r=  s     rW   r9  zCppScheduling.__init__  s'    ###%$rV   statusc                     || _         y r  rY  )r:  rZ  s     rW   _set_flush_statuszCppScheduling._set_flush_status  s
    %rV   c                 &    t        d |D              S )Nc              3      K   | ];  }t        t        t        j                  j                  j
                  |             = y wr  )r   r  r3   r  r  r   r  s     rW   r'  z)CppScheduling.group_fn.<locals>.<genexpr>  s,     M!U3qww//88!<=Ms   AA)r   )r:  ra  s     rW   group_fnzCppScheduling.group_fn  s    MuMMMrV   c                 "    t               | _        y r  )KernelGrouprb  r?  s    rW   rX  z CppScheduling.reset_kernel_group  s    'MrV   c                    |j                         s|j                         rt        j                  ||      S |j                         r(|j                         rJ t	        j                  ||      S | j                  ||      t        j                  k(  rt        |t        t        f      sJ t        |t        t        f      sJ |j                  \  }\  }}|j                  \  }\  }}|dk(  r|dk(  s	J ||f       fdt        |      t        |      k  r|n|}t        |t              sJ t        |      t        |      k  r|n|}	 |	      }
|j                  |
       |j                  \  }\  }}|j                  \  }\  }}||k(  rt	        j                  ||      S  |      }t        |	t              r|	j                  |       ngt        |	t              sJ |	j                  D ]&  }t        |t              sJ |j                  |       ( t	        |	j                  |	j                        }	|j                  \  }\  }}|j                  \  }\  }}||k(  s	J ||f       t	        j                  ||      S | j                  ||      r't         j                  ||| j#                  ||            S t	        j                  ||      S )NrU   c                 D   t        | t              rt        | j                        dkD  sJ | j                         d }t	        t
                  }| j                  D ];  } 	|      \  }}||}||k(  sJ ||| j                  f       |j                  |       = |t        |      fS t        | t              sJ | j                  }t        |t        j                        sJ |j                         \  }}}|j                  t        |j                  j                               fS ra  )r   r!   r   snodesr   r   updater/  r#   r&  r   ComputedBufferget_default_sizes_bodyr  indexing_exprsrf  )
r&  r  ri  snodevexprscomp_bufferr   r\  get_indexing_ranges_exprss
            rW   rn  z5CppScheduling.fuse.<locals>.get_indexing_ranges_exprs  s   !$(:;"4;;/!3@T[[@3%)
)3C):%)[[ 9E'@'GHAu)1-.
#-?PZDKK4PP?*11%89  *4+???)$>>>&*ii)+r7H7HIII%0%G%G%I
4#T5H5H5O5O5Q0RRRrV   )extra_indexing_constraints)
is_foreachr    r3  is_templater!   _why_fuse_nodesrI  rO  r   r#   rC  r   recompute_size_and_bodyre  r-  can_fuse_vertical_outer_loopr  _get_outer_loop_fusion_depth)r:  r  r   r   vars1reduce1vars2reduce2node_to_recompref_noderef_indexing_constraints#node_to_recomp_indexing_constraintsrj  rn  s                @rW   r3  zCppScheduling.fuse  s   !1!1!3-225%@@ ((***%**5%88 $$UE2#BBC "%-9K)LMMM!%-9K)LMMM&+kk##E7&+kk##E7"}BJ'8JJ6S& +.e*s5z*Au!.-@@@$'JU$;5+DX+N(66/G 7  !&:E1 %:E1E>-225%@@ 7P"73 h6443V 5  &h0BCCC!) )%???557Z 6 
  2(2D2DhooVH %:E1 %:E1~5u~5~)..ue<<225%@2775$"C"CE5"Q  *..ue<<rV   Nc                     |j                   \  }\  }}|j                   \  }\  }}||k(  r||k(  rt        j                  S |dk(  r|||z   k(  rt        j                  S | j	                  ||      rt        j
                  S y )NrU   )rC  rI  rM  rN  &_can_fuse_nodes_with_compatible_rangesrO  )r:  r  r   r   rv  rw  rx  ry  s           rW   rr  zCppScheduling._why_fuse_nodes  s    #kkE7#kkE7E>g0#444b=Uego5#88866ueD#BBBrV   c                 <   |j                   \  }\  }}|j                   \  }\  }}|dk(  xr |dk(  }t        j                  |      t        j                  |      k(  }	t        |      dk(  xs t        |      dk(  }
|r|	r|
syt        |      t        |      k  r|n|}t        |      t        |      k  r|n|}t	        |t
              ryt	        |t              sJ t	        |j                  t        j                        ryt	        |j                  t        j                        sJ |j                  j                  j                         }d }t	        |t
              rt        t        t        df             }|j                   D ]  }t	        |j                  t        j                        r ndt	        |j                  t        j                        sJ |j#                  t        |j                  j                  j                                       t        |      dk7  ryt%        t'        t)        |                  }n\t	        |t              sJ t	        |j                  t        j                        sJ |j                  j                  j                         }||k7  ryy)NrU   r4   F.T)rC  r  rd   r   r   r!   r#   r&  r   TemplateBufferrg  dataget_sizer   r   r   re  r  r/  nextiter)r:  r  r   r   rv  rw  rx  ry  c1c2c3rz  r{  ranges2ranges1
ranges_setrj  s                    rW   r  z4CppScheduling._can_fuse_nodes_with_compatible_ranges"  s    $kkE7#kkE7],w"}YYu5!11Z1_/E
arb"%e*s5z"9uJU35 n&89 .-888n))2+<+<=.--r/@/@AAA !%%**335h 23#E#s(O46J! Bejj"*;*;<!%**b.?.?@@@uUZZ__%=%=%?@A	B :!#4Z 012Gh666hmmR->->???mm((113GgrV   c                     t        |t        t        t        f      sJ t        |t        t        f      sJ t	        d ||fD              ry| j                  ||      d uS )Nc              3   H   K   | ]  }t        |t        t        f        y wr  )r   r  r   r$  s     rW   r'  z:CppScheduling._can_fuse_horizontal_impl.<locals>.<genexpr>_  s&      
 t9;TUV
s    "F)r   r!   r#   r   rf   rr  r:  r  r   s      rW   _can_fuse_horizontal_implz'CppScheduling._can_fuse_horizontal_implZ  sp    &7PQ
 	
 
 %"4m!DEEE 

 
 ##E51==rV   c                    |j                         s|j                         ryt        |j                               t        |j                               z   t        j                  j
                  kD  ry| j                  ||      S r  )rq  r   r7  r   r  max_horizontal_fusion_sizer  r  s      rW   can_fuse_horizontalz!CppScheduling.can_fuse_horizontalf  sf    %"3"3"5!"S):%;;jj334 --eU;;rV   r  r   c                 p   |j                         x}rt        |j                  t        j                        xr~ t        |j
                  t        j                        xrX t        |j
                  j                        dk(  xr4 |j
                  j                  d   j                         |j                  k(  S y)Nr4   r   F)get_template_noder   layoutr   MultiOutputLayoutr&  MultiOutputr   inputsrh  r   )r:  r  r   template_bufs       rW   can_fuse_multi_outputs_templatez-CppScheduling.can_fuse_multi_outputs_templateq  s     !2244<4<..0D0DE Iuzz2>>:I

))*a/I JJ%%a(113|7H7HH	 rV   c                    d}t        d ||fD              s|S t        |t              r|j                         d   n|}t        |t        t
        f      sJ t        |t              r|j                         d   n|}t        |t        t
        f      sJ |j                  \  }\  }}|j                  \  }\  }	}
|dk(  r|	dk(  r|dk7  r|
dk7  r|S t        d ||fD              r'|j                  |j                  k(  r|j                  S |S t        t        |      t        |	            }|dk\  rI|d | |	d | k(  r>t        d ||fD              r(t        |      t        u r|n|}|j                  |k(  r|S |S |S |S )Nr   c              3   T   K   | ]   }t        |      t        t        t        fv  " y wr  )r#  r  r!   r#   r$  s     rW   r'  z=CppScheduling._get_outer_loop_fusion_depth.<locals>.<genexpr>  s/      
  J+-?OP
r(  r  rU   c              3   >   K   | ]  }t        |      t        u   y wr  r*  r$  s     rW   r'  z=CppScheduling._get_outer_loop_fusion_depth.<locals>.<genexpr>  r+  r,  r4   c              3   >   K   | ]  }t        |      t        u   y wr  r*  r$  s     rW   r'  z=CppScheduling._get_outer_loop_fusion_depth.<locals>.<genexpr>  s      >BT
99r,  )r.  r   r  r0  r!   r#   rC  r2  r^   r   rf   r#  )r:  r  r   DISABLE_OUTER_LOOP_FUSION_node1_node2r   rv  rw  rx  ry  r2  _compare_nodes                rW   ru  z*CppScheduling._get_outer_loop_fusion_depth}  s   $%! 
 
 

 -, %!<= !!#B' 	
 &#5}"EFFF %!<= !!#A& 	
 &#5}"EFFF$llE7$llE7B;5B;7b=W],,TeU^TT 00E4Q4QQ -- /
 #&c%j#e*"=#q(../59Q:Q3RR GLen  "%[,GGEU  !88<SS2244 /.((rV   c                    |j                          xro |j                          xr\ |j                         |j                  z  xr= | j                  ||      xr |j	                           xr | j                  ||      dk\  S r   )rq  get_operation_names	ancestorsr  r  ru  r  s      rW   rt  z*CppScheduling.can_fuse_vertical_outer_loop  s    !!## E%%''E))+eoo=E ..ue< -**,,E 11%?1D		
rV   c                 *    | j                  ||      ryyrj  )rt  r  s      rW   get_fusion_pair_priorityz&CppScheduling.get_fusion_pair_priority  s    ,,UE:rV   c                     |j                         ry|j                         r%t        ||g      \  }}|j                          xr |S | j                  ||      xr |j                          xs | j	                  ||      S r  )rq  rM   r  r  rt  )r:  r  r   template_fusion_supportedr   s        rW   can_fuse_verticalzCppScheduling.can_fuse_vertical  s    +Sw,(%q ))++I0II**5%8UASASAU=U=..ue<	=rV   rL  c                    t        d |D              r|S ddd}d}d}d}d}g }|D ]  }t        |j                  t        j                        sJ |j                  j                         }	|j                  ||	f       |	\  \  }
}}}|j                  j                         D ]2  \  }t        |t        j                        s"|j                  t              D ]  t        fd|j                  D              r|k7  r}|dz  }|dkD  r|c c c S t        j                  d   t        j                  j                   j"                        svj                  d   |j                  v st%        fd|j                  j                         D              sÉj                  d   dkD  s։j                  d   j                  d   d	}|}|
} 5  |s|S |J t'        |      }|D ]0  \  }\  \  }
}}}|j                  vr|c S t'        |
      |k7  s.|c S  dfd
}|D ]  }||k(  s	|j)                  |        |D ]  }||k7  s	|j)                  |        |S )aI  
        Apply loop split optimization.
        When one of the indexing_exprs contains a division, we eliminate the division by splitting the loop
        to avoid non-contiguous loads, subject to the following conditions:
            1. No reduction and no mudular index for all nodes.
            2. The indexing_exprs of all nodes contain only one (or more, but all the same) division,
               where the divisor is an integer and not too small (the divisor > 8), the dividend is
               one of the iter_vars, and this var, i.e. the dimension that needs to be split, is
               contiguous in all other indexing_exprs.

        For example, if the node's var_ranges: {z0: 2, z1: 9216, z2: 960} and indexing_exprs:
        {'index0': 8847360*z0 + 960*z1 + z2, 'index1': 32*z0 + (z2//30), 'index2': z2},
        we will split z2 -> 30*z2 + z3, then the node's var_ranges will be changed to
        {z0: 2, z1: 9216, z2: 32, z3: 30} and indexing_exprs will be changed to
        {'index0': 8847360*z0 + 960*z1 + 30*z2 + z3, 'index1': 32*z0 + z2, 'index2': 30*z2 + z3}.
        c              3      K   | ]X  }t        |j                  d    d          dk7  xs4 t        d |j                  j                  j                         D               Z yw)r4   r   c              3   F   K   | ]  }|j                  t                y wr  )r   r   )r%  r  s     rW   r'  z9CppScheduling.try_loop_split.<locals>.<genexpr>.<genexpr>  s      .2)r<  N)r   rC  rf   rF  ri  rf  r$  s     rW   r'  z/CppScheduling.try_loop_split.<locals>.<genexpr>  sc      

 	 

1a !Q&  6:jj6O6O6V6V6X 
s   AA Nr   Fc              3   @   K   | ]  }j                  |        y wr  )r   )r%  r   div_exprs     rW   r'  z/CppScheduling.try_loop_split.<locals>.<genexpr>  s     Q#HLL-Qr]  r4   c              3   d   K   | ]'  \  }}|k7  rt        |j                  d          dv  ) yw)r   rl  N)r  r  )r%  name_expr_r  r   s      rW   r'  z/CppScheduling.try_loop_split.<locals>.<genexpr>  s9        ,u$} 0x}}Q7GHFR r  r  Tc                    | \  }}|\  }}|j                        }|j                         }||   z  ||<   |j                  |dz          t        j                  ||d      \  \  }	}
}|	j                         }|j                  |dz         }||   z  |z   ||<   t        j                  |||g||	|      }s/|j                  t        |j                  j                               f||f||	|ffS )Nr4   r
  )rO  )r   copyinsertr	   index_vars_no_squeezepopr   r   r  r/  ri  rf  )ra  r\  r  
index_sizereduce_sizer+  reduce_vars	split_idxnew_index_sizenew_index_varsr   r  	iter_varsdivisor_varro  split_number	split_vars                 rW   
loop_splitz0CppScheduling.try_loop_split.<locals>.loop_split/  s   &+#J&*#J"((3I'__.N(29(=(MN9%!!)a->.:.P.PC/+^Q '++-I#--	A6K#/)I2F#F#TIi ;;y+.
NKD .OO,,3356.*
  -- rV   )recompute_sizes_body_func)ro  r  )rf   r   r&  r   rg  rh  r  ri  r  r   r  findr   r  r  corenumbersrZ  r.  r   rs  )r:  rL  num_div	div_expr_	match_divmatched_nodematched_index_sizenode_bodiesr&  
sizes_bodyr  r   original_bodyr  matched_num_dimsr  r  ro  r   r  r  s                   @@@@@rW   try_loop_splitzCppScheduling.try_loop_split  s   &  

 
 
 L			! .0  	8Ddii):):;;;99;Jj120:-OZ]A+::@@B 8
d!$

3 $		( 3 8HQ9P9PQQ$	1$,	1{$"8==#3UZZ5G5G5O5OP$MM!,0G0GG ,  0=0L0L0R0R0T  
 %MM!,q0$,MM!$4	'/}}Q'7$(	'+-7*188 	8F L "---129D 	5D5?JM1 7 77:"22		 &*"	:  	SD|#,,z,R	S  	D|#,,/I.8 - 	 rV   r&  c                 X   	  j                   t        j                  }g g 	t        |t              sJ dt        f	 fd} ||      s|t        _        j                          	j                          t        j                  j                  j                  d      5  |j                         D ]^  }t        |t        t        f      sJ |j                         } j                        }|j                  |       j!                  ||       ` 	 ddd       yy# 1 sw Y   yxY w)a  
        Generate the code for the outer loop fused scheduler node.
        1. Codegen with fused outer loop: depends on the analysis of
            the outer loop fused scheduler node, with or without the local buffer.
        2. If failed, fallback to standard codegen.
        r&  c           	          t         t              sJ j                          j                          dt        fdg }i t	         fd j                         D              rt                j                         D ]  t        t              sJ j                  j                                j                         st        j                               dk7  rbj                         d   t	         fdj                  D              sj                  }t        |t         j"                        sJ |j%                         } j&                  t                     z
  }fd}|j)                         r |       s	g }|j*                  d   }       |d	 }t-        |      D ]  }	|j/                  d|       ||	z  } t!        j0                  |j2                  |j4                  ||      }
fd
}d} ||
|      }|sEt!        j6                  | dt        |       |
      }|j9                  |       g |j:                  <   |j:                     j9                  |        t=        j>                        5 }t        |      dkD  r4|D ]/  }|j:                  J |jA                  ||j:                            1  j                         D ]z  }t        |tB        t        f      sJ jE                        }|jG                  |j                                j9                  |       j9                  |j                                |  jI                   j&                        sD|jJ                  D ]+  }tL        jN                  jJ                  jQ                  |       - 	 d	d	d	       ytR        jT                  j9                  tS        jV                  t              t        |jX                                      j[                        }j]                  |g t^        j`                  jc                               d	d	d	       y# 1 sw Y   yxY w)zN
            Codegen code with fused outer loop and local Buffer.
            r&  c                     t        | t        t        f      sJ | j                         }t	        |d       j
                  \  }\  }}t        |      t        |      z   }|S )Nc                 4    t        | j                               S r  )rm   r  r  s    rW   r  z~CppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.get_call_ranges.<locals>.<lambda>t  s    Q^^-=)> rV   rb  )r   r#   r!   r7  r_   rC  r   )r&  rL  r   rC  rD  r  s         rW   get_call_rangeszlCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.get_call_rangesp  s`    !$8J(KLLL-1^^-=.1>/% ,+E? $ElU?-CC""rV   c              3   `   K   | ]%  }t         |            j                  d z   k(   ' yw)r4   N)r   r2  )r%  r<  r  r&  s     rW   r'  zfCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.<genexpr>|  s3       OE*+t/K/Ka/OOs   +.r4   r   c              3   V   K   | ]   }|j                   j                         v  " y wr  )r&  r7  )r%  r  r&  s     rW   r'  zfCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.<genexpr>  s&      :>		T^^%55   &)c                  B   dd} t        j                  j                  j                               D ]  \  }}| |z  z  | |z  }  j                  j	                  j                               }fd |      xr t        fdj                  D              S )Nr   r4   c                     | k(  S r  rU   )r  contiguous_index_exprs    rW   is_contiguous_indexzCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.is_all_write_read_contiguous.<locals>.is_contiguous_index  s    '(,A'A ArV   c              3      K   | ]Y  }t        |j                  t              xr9  |j                  j                  j	                  j                                      [ y wr  )r   r&  r#   rF  get_read_exprrh  )r%  r  r  scheduler_buffers     rW   r'  zCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.is_all_write_read_contiguous.<locals>.<genexpr>  s\      Q %) !+499m D !"$7$(IIOO$A$A(8(A(A(C%&%"!"Qs   AA")rB  rF  r  r  get_write_exprrh  r.  r  )r{  r   rS  write_index_exprr  r  r  r  s       @@rW   is_all_write_read_contiguouszyCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.is_all_write_read_contiguous  s    451%&F.6 . 4 4 ? ? E E G/ 0
U !6# E 5 &%0 0>/C/C/R/R 0 9 9 ;0,B $77G#H $S Q -=,B,BQ N rV   r  Nc                 ~    |D ]7  }| |j                   k(  st        fd|j                     D              s5|c S  y )Nc              3      K   | ]]  }|j                   Ot        fdt        j                  j                  j
                  |j                      j                  D               _ y w)Nc              3   V   K   | ]   }|j                   j                         v  " y wr  )r&  rh  )r%  r  visited_scheduler_nodess     rW   r'  zCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.try_share_local_buffer.<locals>.<genexpr>.<genexpr>  s,      (&,0 )-		(:(:(<@W(W(&r  )r   r.  r3   r  r-  name_to_bufr  )r%  global_bufferr  s     rW   r'  zCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.try_share_local_buffer.<locals>.<genexpr>  s`      S" )6 (5'9'9'E %( (&45GG4E4E4Q4Q,9,>,>5**/%	(& %&S"s   A#A&)r  r.  r   )local_buffer_layoutre  	local_buflocal_to_global_buffersr  s      rW   try_share_local_bufferzsCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.try_share_local_buffer  sS    -: 5	#6):J:J#Js S" :Q(1:&S" P" ,5$45 $(rV   local_buffer_datar   )r   r  F)local_buffer_numberT)2r   r  clearr   r.  r0  r   r7  r#   r  rh  r  r   get_outputsr  r&  r   rg  rg  r2  is_contiguousr{  rB  r  FixedLayoutrS  r   Bufferr  r   rJ   r  add_local_bufferr!   rR  r2  r_  r  r3   r  remover   !cpp_outer_loop_fused_inner_countsCppOuterLoopFusedCountre  rj  finalize_kernelr  r  from_iterable)r&  re  r  global_buffer_layoutsize_offsetr  local_buffer_strider{  local_buffer_sizeszr  r  local_buf_prefixlocal_buffer_usedscoperi  r<  r]  removed_bufferouter_fusion_cpp_kernel_proxyr  r  r  r  r  r[  rb  
nodes_listr:  s   `                   @@@@@rW   $try_outer_loop_fusion_with_local_bufzSCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_bufh  sO    d$?@@@!'')#&7 # .0MBD# !113  <F<'&*nn&6 fN%nmDDD+//0G0G0IJ&335~99;<A '5'A'A'CA'F$ BRBXBX  )9(=(=)-9J9JKKK/</G/G/I,&*&B&BS+N;F '8 1>>@ < >$9;+!5!<!<R!@,;N,K'L-) #++<"= )B/66q&A"bLF) /1nn077066-/	/+(" ,?(,B/-)  102		(8'93};M:N%O':1- *001BCNP34E4J4JK/0A0F0FGNN)IfP $L$5$56 "%}%)(5 +00<<<..(*A,BSBS*T
 "113 9E%e.@--PQQQ'+'<'<\'J$$225??3DE)001AB%%eoo&789 >>)4+G+G +0*?*? G //66~FG !+" ", 99@@2212,/0C0C,D 150O0O)1- ,,1@ioo33J?@?"H I"H s   D'Q<BQQ Fr   N)rb  r   r  r   r  r  r   r	  r   r
  r0  r!   r#   r7  rR  r2  r  )
r:  r&  r  r  r<  _nodesr]  r[  rb  r  s
   `      @@@rW   codegen_outer_loop_nodez%CppScheduling.codegen_outer_loop_nodeX  s%    (()0)O)O&=?02
$ ;<<<i	7R i	 i	V 4D95SG2!'') ''--e-D K!113 KE%e.@--PQQQ27//2CF'+'<'<\'J$$226: 001A6JKK K :K Ks   #A2D  D)c                 t   | j                   }t        |t              r| j                  |       nU|j	                         }| j                  |      }| j                  |      }|j                  |       |j                  ||       | j                         }|t        j                  kD  r| j                  d       yy)zC
        Turn an set of pre-fused nodes into a C++ kernel.
        TN)rb  r   r  r  r7  r  rR  r2  r  _get_scheduled_num_argsrQ  MAX_FUSED_KERNEL_ARGS_NUMr]  )r:  r&  rb  rL  r]  args_nums         rW   codegen_nodezCppScheduling.codegen_node#  s     ((d78((.)-)9E''.E#44\B**51(()95A//1m===""4( >rV   c                 n    t        |t              xr$ t        |j                  t        j                        S r  )r   r#   r&  r   CppTemplateBuffer)r:  r&  s     rW   is_cpp_templatezCppScheduling.is_cpp_template9  s,    $. 
:IIr++4
 	
rV   template_nodeepilogue_nodesprologue_nodesc                    |rJ |D cg c]  }t        |t        t        f      r| }}t        d   dxx   dz  cc<   t        d   dxx   t	        |      z  cc<   | j                  |      sJ d       t        t        |      }|j                  \  }\  }}|dk(  sJ t        t        j                  |j                        }|D cg c]  }|j                   }	}t        d |	D              sJ d       d	 }
 |
||j                  |	      }|j                  |||	
      \  }}|5  t        |j                        s|j                          |D ]  }|j                            |       }ddd       t!        j"                  |      5  |g|}| j%                  ||j&                        }ddd       t        |j                        rt	        |j(                        dk(  sJ d       |j(                  d   j*                  D ]r  }t        |j                  t,              sJ d       t        |j                  j                  t        j.                        sJ d       |j                  j                          t | j1                         |j3                  ||       t         j4                  xj6                  |j6                  z  c_        | j9                          yc c}w c c}w # 1 sw Y   xxY w# 1 sw Y   ExY w)zG
        Codegen a CPP template, possibly with fused epilogues
        inductorcpp_templated_kernel_counterr4   cpp_epilogue_fusion_counterzlTemplate node passed to CppScheduler.codegen_template must be a SchedulerNode that wraps a CppTemplateBufferrU   c              3   P   K   | ]  }t        |t        j                           y wr  )r   r   rg  )r%  r4  s     rW   r'  z1CppScheduling.codegen_template.<locals>.<genexpr>[  s     O:a!2!23Or  z9Epilogue nodes must all be instances of ir.ComputedBufferc                     sy| j                         |v sJ || j                            j                  }t        fd|D               S )NFc              3      K   | ]8  }t        |j                  t              xr |j                  j                  v  : y wr  )r   r&  r   )r%  r  r  s     rW   r'  zZCppScheduling.codegen_template.<locals>.template_buffer_has_other_users.<locals>.<genexpr>g  sA        499&78 5IINNn45s   >A)rh  r  r.  )template_bufferoutputs_by_namer  r  s     ` rW   template_buffer_has_other_userszGCppScheduling.codegen_template.<locals>.template_buffer_has_other_users_  s\     ""++-@@@#O$<$<$>?EEE  "   rV   )$flag_template_buffer_has_other_usersr  NzSMulti outputs template should be with 1 output template buffer of MultiOutputLayoutr   z?Multi outputs template should be with ExternKernelSchedulerNodez7Multi outputs template has multi users with MultiOutput)r   r#   r!   r   r   r  r   rC  r   r  r&  r.  r  make_kernel_renderr(   r)  r3   set_kernel_handlerdefine_kernelr  outputsr  r   r  codegen_commentcall_kernelr  r  free_buffers_in_scheduler)r:  r  r  r  epilogue_noder   rnumelctbr4  epilogue_ir_nodesr  r  rf  renderr&  src_codenode_schedulekernel_namer  s                      rW   codegen_templatezCppScheduling.codegen_template>  s    "!!
 "0
--9K)LM 
 
 	;<A<:;s>?RR;##M2 	
z	
2 ]M:&,,;Av||$()=)=}?Q?Q$RHV7W17W7WO=NOO 	
G	
O	 0O..0A0
, //1U, 0 

  	 ,]-?-?@&&(&   xH	  !!&) 	S*<^<M,,X}fkkRK	S %]%7%78 },,-2 e2 &--a066 %!$))-FG UG "$)).."..A MA 		""$% 	]K8;,	6#9#99&&(Q
  8X6	  	 	S 	Ss$   KK"AK#K)K&)K3c                 6    | j                   j                         S r  )rb  get_num_argsr?  s    rW   r	  z%CppScheduling._get_scheduled_num_args  s      --//rV   c                     | j                   S r  r\  r?  s    rW   ready_to_flushzCppScheduling.ready_to_flush  s    ###rV   c                      y r  rU   r?  s    rW   codegen_synczCppScheduling.codegen_sync  s    rV   c                    t         j                  j                  }||j                  v r|j                  |   }|S t        j
                  j                  r$t        |t        j
                  j                        nd}dj                  d||j                         g      }||j                  |<   t         j                  j                  r|nd}|j                  t        t        j                        |      }|j                  t        t        j                        |      }|j                  dd      }|j!                  d      }|j#                  d|      }	t$        r|j#                  d|	d	z         }	|||	d	z     d
}
t'               }|| j(                  j*                  n|}|j-                         \  }}}t         j                  j                  s|j/                  d|d       |j1                  |d       t         j                  j                  s|j/                  d       |j3                  ||j5                         d|
       |S )NrR   r   r  rf  z#pragma CMTz//z
extern "C"r   r4   z;
zasync_compile.cpp_pybinding(z, r'''T)stripz''')F)gpucpp_definition)r3   r  wrapper_codesrc_to_kernelr   r  descriptive_namesr&   r  next_kernel_suffixcpp_wrapperr  ro   r+   KERNEL_NAMEDESCRIPTIVE_NAMErfindr  rT   r<   rb  r  cpp_argdefsr   r$  r   getvalue)r:  r*  rL  kernel_argsr  r,  
fused_namekernel_decl_name
first_char	last_charkernel_definitioncompile_wrapperr  r   	arg_typess                  rW   r   zCppScheduling.define_kernel  s   ''&&w,,,!//9KR K ::// &eVZZ-I-IJ 
 ((E:w7Q7Q7S#TUK.9G!!(+./gg.A.A{x''K,C,C(DFVWH''K,H,H(I;WH  ''t<H "5J c:6I$MM#y1}=	#+JQ#G"H L,.O-8-@4$$))kD"..0OAq)77&&))29-vF ""84"877&&))&1!!((*0	 "  rV   c                    | j                   j                         }|rB| j                  || j                   j                        }| j	                  | j                   j                  |       t
        j                  j                  rft        j                  j                  j                          t        j                  j                  j                  || j                   j                         | j                   j                  t        j                  j                  |       t
        j                  j                  r(t        j                  j                  j                          | j                          | j!                  d       y r  )rb  codegen_groupr   scheduled_nodesr"  r   r  enable_kernel_profiler3   r  r8   write_kernel_context_guard_beginwrite_kernel_context_guardr#  write_kernel_context_guard_endrX  r]  )r:  r*  r,  s      rW   flushzCppScheduling.flush  s   $$224,,$++;;K   !2!2!B!BKPzz//$$EEG$$??%%55 ))!''*>*>Lzz//$$CCE!u%rV   c                 t    t         j                  j                  }t        ||      }|j	                  ||       y r  )r3   r  r8  r   write_provenance_debug_handle)r:  r+  r,  r  debug_handles        rW   r"  zCppScheduling.codegen_comment  s3    ''&&>

 	--k<HrV   r  )3r  r  r  rJ  rR  r#  r  r
  r   r5   INPLACE_BUFFERSREDUCE_TO_SINGLE_ELEMENTrU  rk  r   rS  rV  r9  rk   r]  r`  rX  r3  rI  rr  r  r  r  r   r  ru  rt  r  r  r/  r#   r  r  r  r!   r  r  r   r-  r	  r1  r3  r   rQ  r"  rl  rm  s   @rW   rQ  rQ    s    .<d>*; !$!**33	
 $%,, $:n;U $ $%& &N*P=d/?$/F 6p
>	<
&
/@
	
4)l

=CD$7 CJIK)IKV)),>>N),
$5 
$ 

T)(T) !!23T) !!23	T)l0$,\&(IrV   rQ  c                   D     e Zd Z fdZd Zd Zd ZddefdZd Z	 xZ
S )	rb  c                    t         |           t               | _        t	               | _        t        | j
                        | _        t        j                         | _
        | j                  j                  | j                         g | _        y r  )r8  r9  r>   r  r6   
loops_codeWorkSharingr  r   r   r   r   rL  )r:  r=  s    rW   r9  zKernelGroup.__init__  s^    L	&.doo.))+


  )!rV   c                 :     || j                   t               g| S r  )r  r*   )r:  r1  r  s      rW   r  zKernelGroup.new_kernel  s    49924<t<<rV   c                     | xj                   |z  c_         | j                  }| j                  }|j                  ||       y r  )rL  rY  r  rr  )r:  r  rL  r   r  s        rW   r  zKernelGroup.finalize_kernel  s5    %WW  r*rV   c                 X    | j                   j                         \  }}}t        |      }|S r  )r  r@  r   )r:  arg_defs
_call_args
_arg_typesr  s        rW   r/  zKernelGroup.get_num_args  s)    +/99+@+@+B(*jx=rV   r   c                 X   | j                   j                          | j                  syt               }t        j
                  j                  xr t        j                  dv }|r|j                  dg       |j                  d       |t        t        j                        n|}|t        t        j                        n|}| j                  j!                         \  }}}dj#                  d      j%                  |      }t'               }t        j
                  j(                  rdnd}	|j                  d| d	|	 d
| d| d	       |j+                         5  |rNt,        j.                  j0                  }
|
dt        |
      z   dz   nd}|j                  d||z    d||z    dg       | j                  j3                         D ]  \  }}|j                  d| d| d        |j5                  | j6                         d d d        |j9                         S # 1 sw Y   |j9                         S xY w)NrR   )linuxrP   z3#include <torch/csrc/inductor/aoti_runtime/utils.h>z+#include <torch/csrc/inductor/cpp_prefix.h>z,
   C10_ALWAYS_INLINE_ATTRIBUTEzextern "C" z void r   r  r   graph_r   z9torch::aot_inductor::RAIIAtenRecordFunctionHandle record_z_("z", nullptr);r   r   r   )r   rS  rL  r6   r   r  rM  sysplatformr   r   ro   r+   r=  r>  r  r@  ljustr  rX   force_inline_kernelr   r3   r  graph_idaliasesr$  rY  rA  )r:  r   r   rM  rD  r,  r^  r   func_export_declinline_attrrj  rO  oldnews                 rW   rK  zKernelGroup.codegen_group  s   

##~ !'

 @ @ !
S\\ V
 F
 !OORSTDE <@<3{667T;?<c+667T..0!Q;;r?''113-3ZZ-K-K)QS 	 	*+6+a@P?QQRS[R\\]^	

 [[] 	)$77++;C;OCM1C7UW&&,{&:%;3v?S>TT`b !II--/ 7Sse3se1567KK(	) }}	) }}s   B&HH)c                 j    | j                   j                         \  }}}|j                  ||d|       y )NF)tritonrI  )r  r@  generate_kernel_call)r:  r  r,  r   	call_argsrI  s         rW   r#  zKernelGroup.call_kernel0  s:    "&))"7"7"99i$$	 	% 	
rV   r  )r  r  r  r9  r  r  r/  ro   rK  r#  rl  rm  s   @rW   rb  rb    s)    "=+
,# ,\
rV   rb  c                   0    e Zd Zd Zd Zd Zd Zd Zd Zy)rZ  c                 `    || _         d| _        d | _        t        j                         | _        y r  )r   in_parallelr  r   r   r   )r:  r   s     rW   r9  zWorkSharing.__init__;  s)    	 ))+
rV   c                 V   | j                   r|| j                  k7  r| j                          | j                   s|| _        d| _         d}t        j                  j
                  dk\  rd}n|t        j                         k(  rd}nd}|st        j                  j                  r| j                  j                  d       n| j                  j                  d| d       | j                  j                  | j                  j                                | j                  j                  d       y y )NTFr4   z#pragma omp parallelz!#pragma omp parallel num_threads(r   zint tid = omp_get_thread_num();)rv  r  rS  r   r  rY  os	cpu_countr  r   r   r   r   r   )r:  rY  use_dynamics      rW   rE  zWorkSharing.parallelA  s    4+;+; ;JJL&D#DKzz!!Q&#BLLN*" $fjj88		##$:;		##&GyPQ$RSJJ$$TYY%5%5%78II1)  rV   c                 h    | j                   r| j                  j                  d       | j                   S )Nz#pragma omp single)rv  r   r   r?  s    rW   rc  zWorkSharing.single]  s*    II 45rV   c                 F    | j                   j                          d| _        y r  )r   rS  rv  r?  s    rW   rS  zWorkSharing.closeb  s    

 rV   c                 :    | j                   j                          | S r  )r   ry  r?  s    rW   ry  zWorkSharing.__enter__f  s    

rV   c                 >    | j                   j                  |||       y r  )r   r  r{  s       rW   r  zWorkSharing.__exit__j  s    

Hgv6rV   N)	r  r  r  r9  rE  rc  rS  ry  r  rU   rV   rW   rZ  rZ  :  s     ,8 
!7rV   rZ  c                      e Zd ZU dZej
                  dz  ed<   dZej
                  dz  ed<   ej                  j                  Z
ej
                  ed<   ej                  j                  Zej
                  ed<   ej                  j                  Zej
                  ed<   dZeed<   d	Zeed
<   d	Zeed<   d	Zeed<   d	Zeed<   d Zd Zd Zy)r;  Nr   rF  rG  r  rH  r   rE  Fsimd_ompsimd_vec	collapsedr  c                 j    t        j                         }|r|j                         | _        y d| _        y ra  )r   r  rp  simd_nelements)r:  r  s     rW   __post_init__zLoopLevel.__post_init__  s-     .9-E-E-GAO>#;#;#=UVrV   c                    t        j                  |      }t        | j                  | j                        }||_        d|_        t        |j                  |      |z  |_        | j                  |_	        d|_
        | j                  |_        |S )NTF)r   rZ  r;  r   rF  rH  r  r   r  rE  r  r  )r:  r1  sympy_factorrU  s       rW   r  zLoopLevel.tile  sn    }}V,499-!
"499l;lJ --rV   c                    t        | j                        }t        | j                        }t        j                  j
                  r||k(  ry | j                  r| j                  dkD  rd| j                   dnd}| j                  rFd}| j                  dkD  r|d| j                   dz  }| j                  r\|j                  dd|       }nF| j                  rd}n7| j                  rd	| }n%| j                  st        j                         rd
}nd}t         d| j                   d| }| j                   d| }| j                   j"                  r%| j                   dt        | j                          }n;| j                   dt        | j                          dt        | j                          d}d| d| d| d}| j$                  s|s|gS ||gS )Nr4   zsimd simdlen(z) rR   z#pragma omp forz
 collapse(r   z for z#pragma omp z#pragma GCC ivdepr   r  r  r  z+=(z == 0 ? 1 : zfor(r   )rD   rG  rF  r   r  no_redundant_loopsr  r  rE  r  r  r  r   r  rI   r   rH  r  r  )	r:  offset_expr	size_exprsimdline1
offset_strr  	steps_strline2s	            rW   r]  zLoopLevel.lines  s   !$++.		*	::(([I-E }}!4!4q!8 D//03 	
 ==%E}}q :dmm_A66}}gtf~>]]E]]"4&)E""{'9'9';'EE"|1TXXJa}=
hhZq,::88*B{4::'>&?@I
 88*CDJJ 78 9"4::./q2  zl"XJb1=>>7Nu~rV   )r  r  r  r   r   r  r  rF  r   r   rG  r  OnerH  rE  rm   r  rk   r  r  r  r  r  r]  rU   rV   rW   r;  r;  n  s    !Cd	!"D%**t
"FEJJ% #WW\\J

)E5::#HcHdHdItL$
W	'rV   r;  c                       e Zd ZU dZdZee   dz  ed<   dZe	dz  ed<   e
de	fd       Zd Zed        Zd Zd	 Zd
e	fdZd ZdefdZy)rR  aV  
    A loop-nest-like structure. It is built with the `build` method
    as a loop nest and then will perform loop-tiling at some depth.

    A typical case is for vectorization, where we typically do loop-tiling
    at the innermost loop level. A more complicated case is when we do
    2D tiling at both the innermost and outer levels.
    NrN  rf  c                 $   | j                   }| j                  }| j                  }|J d}t        t	        ||            D ]B  \  }\  }}t        ||      }|s|g}n|j                  |       ||k\  s2| j                  |_        D t        |      }	|	S )z4Build a LoopNest with the given `kernel` as the leafN)	ru  rY  r  r   r%  r;  r  r  rR  )
rf  ru  rY  r  rN  loop_idxr   rF  rU  rT  s
             rW   rq  zLoopNest.build  s     ?? 00***(,%.s8V/D%E 	8!HksDS$'DT"?*$*$7$7!	8 UO	rV   c                 ,    t        | j                        S r  )rk   rN  r?  s    rW   __bool__zLoopNest.__bool__  s    DJJrV   c                    | j                   t        dd      S d}d}| j                   d   j                  }t        j                  d      }| j                   D ];  }|j                  |k7  r n*|t        |j                  |j                        z  }|dz  }= d } || j                         }dt        fd}|t        | j                         k  rt        |t        j                        rt        | j                   |   j                  t        j                        r|dz  t        | j                   |   j                  | j                   |   j                        k  r|&||kD  r!| j                   |   j                  r ||       sd|}d}| j                   |   j                  }t        |t        | j                               D ]%  }	| j                   |	   j                  |k7  r n|dz  }' t        ||      S )a  
        Maximal allowed depth for parallelism: All reduction or non-reduction levels.
        When the range of the first inner loop beyond the maximum parallel depth is much
        larger than the range of all outer loops within the maximum parallel depth,
        change the starting depth of parallelism to the first inner loop and recalculate
        the maximum parallel depth.
        r   r{  r4   c                 J    t        |       D ]  \  }}|j                  s|c S  y r  )r   r  )rN  r   rU  s      rW   get_simd_vec_depthz7LoopNest.max_parallel_depth.<locals>.get_simd_vec_depth   s+    $U+ 4==H rV   rT  c                     t        | j                  t              sJ t        d | j                  j                  D              S )Nc              3   >   K   | ]  }t        |t                 y wr  )r   r  )r%  rf  s     rW   r'  zILoopNest.max_parallel_depth.<locals>.has_scalar_kernel.<locals>.<genexpr>  s"       v|44r,  )r   rf  rJ  rf   r  )rT  s    rW   has_scalar_kernelz6LoopNest.max_parallel_depth.<locals>.has_scalar_kernel	  s>    i..??? '..66  rV   rQ  )rN  r  r  r   rZ  r   rF  rH  rR  r   r   rS  )
r:  r  	max_depthr  	num_stepsrU  r  simd_vec_depthr  r   s
             rW   ra  zLoopNest.max_parallel_depth  s    :: qAA	zz!}11MM!$	JJ 	D  L0!HTYY

$CCINI			 ,DJJ7	 	 DJJ'9emm44::i055u}}ECtzz),114::i3H3N3NOP *.JJy)66%d+ $KI::k2??L;DJJ8 ::a=--=Q	 I;OOrV   c                    |j                   | j                         j                   k  sJ d       | j                  J t        | j                        |j                   k\  sJ | j                  |j                     }|j                   |_        |j                  rt        xj                  dz  c_        t        |j                  dz   |j                         D ]  }d| j                  |   _
         y )Nz?Parallel depth cannot exceed the maximal allowed parallel depthr4   T)r  ra  rN  r   r  rE  r  r   parallel_reduction_countrS  r  )r:  rG  rU  r   s       rW   rb  zLoopNest.mark_parallel)  s    ''4+B+B+D+S+SS 	
M	
S zz%%%4::)":"::::zz)//0!00,,1,y,,q0)2J2JK 	+A&*DJJqM#	+rV   c                     | j                   sJ | j                   |   j                  |      | j                   |<   | j                   |   S )z
        Do loop-tiling at the `depth` level with `factor`.
            for (x0 = 0; x0 < x0_end; x0++)
            ->
            for (x0 = 0; x0 < x0_end; x0 += factor)
        See details in Note [tiled_size].
        )rN  r  )r:  rQ  r1  s      rW   r  zLoopNest.tile7  sA     zzz JJu-226:

5zz%  rV   r   c                 6    | j                   sJ | j                   S r  rf  r?  s    rW   rI  zLoopNest.get_kernelC  s    {{{{{rV   c                     || _         y r  r  r5  s     rW   r  zLoopNest.set_kernelG  s	    rV   levelc                     | j                   sJ t        | j                         |k\  sJ |t        | j                         k(  rd n| j                   |d  }t        || j                        S r  )rN  r   rR  rf  )r:  r  rN  s      rW   rd  zLoopNest.from_loop_levelJ  sV    zzz4::%'''TZZ0djj6Ht{{++rV   )r  r  r  r  rN  r/  r;  r  rf  r  r  rq  r  r$   ra  rb  r  rI  r  rm   rd  rU   rV   rW   rR  rR    s     %)E4	?T!(#FI#i  (  <P <P|+
!I ,S ,rV   rR  )NNNr  )r   dataclassesrU  r  r  rW  rx  r   rf  rO  collections.abcr   r   enumr   typingr   r   r   r   r   torch.fxtorch._inductorr	   torch._prims_commonr
   r   torch.utils._ordered_setr   torch.utils._sympy.functionsr   r   r   torch.utils._sympy.symbolr   r   r   _dynamo.utilsr   rR   r   r   r   r   r   debugr   r[  r   r-  r   r   r   r    r!   r"   r#   utilsr$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   virtualizedr0   r1   r2   r3   commonr5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   	cpp_utilsrA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rg  rT   cacherX   _logginggetArtifactLoggerr  schedule_logNATIVE_OMP_RTYPESRTYPE_TO_CPPr  PYTHON_TO_CPPCONTAINER_PYTHON_TO_CPPr]  r^  r   r\  rn   rk   r_  r`  r#  r  ra  rb  rv   r/  r   r  r   r   r   r   r   r  ro   r   rm   r   r   r   	lru_cacher   r  r  	dataclassr  r  ro  r  r  _initialize_pointwise_overridesr  r  r  r  r  r  r   rU  rW  rJ  rc  rI  rQ  rb  rZ  r;  rR  rU   rV   rW   <module>r     s         	 	 
  .  & &    ( @ / K K O O % < < ;        > =       & llg% : : ~~//*EBC   !   #&   
NN	MM 
MM	KK	NN	MM	JJ	KK	JJ	KK	KK		* T%++& )D !%2)
 <<$2)j-
-jj- - 

	-
 jj- -`;  ;;	
 
sBV^ V3 V# V/ /2 -UZZ -ell - - ;uzz ;

 ;PS ; ;| CG!::!!LL!69Dj! !   ^!"4 ^!B! !B&|"; |"~  , ,U 3M7l M7`  / / 9  % % '7 7R Rjy9 yx[
l [
|)$ )$eEKK$4F4L.M )$X_K _KDg	
Y g	
T 
9  
FFt FGIN GITM
 M
`17 17h R R Rj I, I, I,rV   