
    9jٗ                     <
   d dl Z d dlZd dlmZ d dlZd dlmZ d dlmZ d dl	m
Z
mZmZmZ d dlmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZ d dlmZ ddlmZmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'm(Z( ddl)m*Z*m+Z+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4m5Z5 ddl6m7Z7m8Z8m9Z9m:Z:m;Z; ddl<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZEmFZFmGZGmHZHmIZI ddlJmKZKmLZLmMZMmNZNmOZOmPZP 	 d dlQZQ eeQj                        ZSdZT ej                  eW      ZXej&                  j                  ZYej&                  j                  ZZ e;deNej                  j                  eSdk\  r eLd      n eLd      dd       Z] e;d!eO eLd"      #      Z^ e;d$eO eLd%      #      Z_ e;d&eO eLd'      #      Z` e;d(eO eLd)      #      Za e;d*eO eLd+      #      Zbe j                  d,        Zd e8ej                  d-eYj                  j                  .      Zg e8ej                  d/d0eYj                  j                  1      Zi e8ej                  d2eYj                  j                  .      Zk e8ej                  d3eYj                  j                  .      Zm e8ej                  d4deYj                  j                  5      Zp e8ej                  d6eYj                  j                  .      Zrd7 Zsdddd8d9Ztdid;Zu e8etd      Zvd< Zw G d= d>e(      Zx ex       Zy G d? d@e(      ZzdA Z{dB Z| ezdCdDe{      Z} ezdEdFe|      Z~ e4eYj                  dG      djddHdI       Z e4eYj                  dG      ddHdJ       Z e4eYj                  dG      ddddKdL       Z e4eYj                  dG      dddMdN       Zej                  ej                  fej                  ej                  fej
                  ej                  fej
                  ej
                  fej                  ej
                  fgZej                  ej                  gZej
                  ej                  gZdOed:efdPZdOedQed:efdRZdOedSedTedQed:ef
dUZdOedSed:efdVZ	 dkdWedXej                   dYedQed:ef
dZZd:efd[Zd\ed]ed^ej                   d_ej                   d:eeef   f
d`Z e4eYj                  j                  dG      	 	 	 	 	 dlda       Ze j                  dbedz  d:efdc       Zdd Z	 	 dmdeedz  fdfZdg Zdh Zy# eU$ r  ed      ZSdZTY 0w xY w)n    N)Any)counters)AutoHeuristicSelectAlgorithm)	AHContextcontext_add_stridescontext_add_using_tf32mm_operations)CppGemmTemplate)gen_best_config)opsV)make_fx)ScalingType)TorchVersion   )configdistributed_autotune)CUTLASS2xGemmTemplateCUTLASS3xGemmTemplate)CKTileGemmTemplate)CKGemmTemplate)SubgraphChoiceCallerSubgraphTemplate)BufferChoiceCaller	is_tritonLayout)MMKernelInputs)	loweringsmake_pointwisemake_reductionregister_loweringtransform_args)autotune_select_algorithmExternKernelChoiceKernelTemplaterealize_inputsTritonTemplate)_use_cutlass_for_opceildivuse_aten_gemm_kernelsuse_ck_gemm_templateuse_ck_tile_gemm_templateuse_cpp_gemm_templateuse_cutlass_templateuse_decompose_k_choiceuse_nv_universal_gemm_template!use_triton_blackwell_tma_templateuse_triton_scaling_templateuse_triton_templateuse_triton_tma_template   )_is_static_problemload_kernel_templatemm_argsmm_gridpersistent_mm_griduse_native_matmulTz0.0.0Fmmz3.3.0	triton_mmtriton_mm_rocm)namegridsource"cache_codegen_enabled_for_templateprologue_loads_all_inputsmm_persistent_tmatriton_persistent_tma_mm)r@   rA   rB   mm_persistenttriton_persistent_mm%scaled_mm_device_tma_epilogue_scalingtriton_epilogue_scaled_mm&scaled_mm_device_tma_main_loop_scalingtriton_main_loop_scaled_mm"blackwell_ws_persistent_device_tma,triton_blackwell_ws_persistent_device_tma_mmc                     t        |       S N)r%   )fns    Y/media/conek/DATA/Code/OCR/venv/lib/python3.12/site-packages/torch/_inductor/kernel/mm.pylazy_register_extern_choicerS      s    b!!    z
at::mm_out)op_overloadzat::mm_dtype_outmm_dtype)r@   rU   zat::addmm_outzat::_int_mm_outzat::_sparse_semi_structured_mm)has_out_variantrU   zat::_scaled_mm_outc                 b    | j                         t        j                  t        j                  fv S rP   )	get_dtypetorchint8uint8)mats    rR   _is_int8_matr^      s     ==?uzz5;;777rT   outalphabetac                    | j                  d      dk(  r| j                  d      dk7  s| j                  d      dk(  rt        j                  | d   |||||      S t        j                  | |||||      S )z
    Giving torch.addmm a 1D tensor calls a different (faster) cublasLt
    kernel under the hood.  There are a few shapes where this is slower,
    but they are rare.
    r   r6   r_   )stridesizerZ   addmm)inpmat1mat2r`   ra   rb   s         rR   
bias_addmmrj      sh     	

1sxx{a/CHHQK14D{{3q643e$OO;;sD$Cu4HHrT   returnc                 X    dt         fd}dt         fd}dt         fd}t        j                   | j                               xs  | j	                                fd       t        j                   |j                               xs  |j	                               fd       y )Nrk   c                 \    t         j                  j                  j                  | d   d      S )Nr6   r   graphsizevarsstatically_known_equalsrd   s    rR   is_row_majorz.check_supported_striding.<locals>.is_row_major   #    ww77q	1EErT   c                 \    t         j                  j                  j                  | d   d      S Nr   r6   rn   rr   s    rR   is_col_majorz.check_supported_striding.<locals>.is_col_major   rt   rT   c                     t        t        j                  j                  j	                  | d   d      xs- t        j                  j                  j	                  | d   d            S rv   )boolr   ro   rp   rq   )re   s    rR   has_zero_dimz.check_supported_striding.<locals>.has_zero_dim   sQ    GG44T!Wa@ Dww77QC
 	
rT   c                  *    d j                          S )Nz$mat_a must be row_major, got stride 
get_stride)mat_as   rR   <lambda>z*check_supported_striding.<locals>.<lambda>       6u7G7G7I6JK rT   c                  *    d j                          S )Nz$mat_b must be col_major, got stride r|   )mat_bs   rR   r   z*check_supported_striding.<locals>.<lambda>   r   rT   )ry   rZ   _checkr}   get_size)r~   r   rs   rw   rz   s   ``   rR   check_supported_stridingr      s    F FF F
d 
 
LLU%%'(JL9I,JK 
LLU%%'(JL9I,JKrT   c                    | j                   d   }|j                   d   }| j                   d   }||z  }|}t        j                  | j                  |||      d      }|j                  |||      }	t        j                  ||	t        j
                        }
t        j                  |
d      }|j                  | j                        S )Nr   r6   )r6   r   r   	out_dtype)	shaperZ   permutereshapebmmfloat32sumtodtype)abk_splitsmnkk_partsB
a_reshaped
b_reshapedresultreduced_bufs               rR   
decomposeKr      s    	
A	
A	
A8mGAqyyAw7CJ1gq)JYYz:GF))FA&K>>!''""rT   c                   @     e Zd Z fdZdee   dededef fdZ	 xZ
S )DecomposeKSugraphTemplatec                 &    t         |   d       y )Ndecompose_kr@   )super__init__)self	__class__s    rR   r   z"DecomposeKSugraphTemplate.__init__   s     	 	
rT   input_nodeslayoutk_splitrk   c                     ddl m} ddlm} d| d}d|} |       5   |       }t	        t        j                  t        |      |      }	t        
| %  ||||	|	      cd d d        S # 1 sw Y   y xY w)
Nr   enable_python_dispatcherr   select_decomp_tabledecompose_k_mm__splitzk_split=)r   r@   r   r   make_fx_graphdescription)
torch._dispatch.pythonr   decompositionr   r   	functoolspartialr   r   generate)r   r   r   r   r   r   r@   r   decompositionsrQ   r   s             rR   r   z"DecomposeKSugraphTemplate.generate   s     	D7 	0!
m%' 	02N!!*w?B
 7#' ' $ 	 	 	s   A A**A3)__name__
__module____qualname__r   listr   r   intr   r   __classcell__r   s   @rR   r   r      s<    

&\  	
 
 rT   r   c                   J     e Zd Zdededef fdZdee   dede	f fdZ
 xZS )	ContiguousTemplater@   r   rQ   c                 P    || _         || _        || _        t        |   |       y )Nr   )r@   r   rQ   r   r   )r   r@   r   rQ   r   s       rR   r   zContiguousTemplate.__init__  s.    	& 	 	
rT   r   r   rk   c                     ddl m} ddlm}  |       5   |       }t	        | j
                  |      }t        |   | j                  |||| j                        cd d d        S # 1 sw Y   y xY w)Nr   r   r   r   r   )
r   r   r   r   r   rQ   r   r   r@   r   )r   r   r   r   r   r   rQ   r   s          rR   r   zContiguousTemplate.generate  sp    
 	D7%' 	02NB
 7#YY'  ,, $ 	 	 	s   AA$$A-)r   r   r   strr   r   r   r   r   r   r   r   r   s   @rR   r   r     sG    
S 
s 
 
&\  
	 rT   r   c                 J    t        j                  | |j                               S rP   )rZ   r=   
contiguous)r   r   s     rR   contiguous_mmr   (  s    88Aq||~&&rT   c                 L    t        j                  | ||j                               S rP   )rZ   rf   r   )rg   r   r   s      rR   contiguous_addmmr   ,  s    ;;sAq||~..rT   r   zcontiguous mmr   zcontiguous addmm)type_promotion_kindr   c                p    | j                         }t        j                  |j                         |k(  d        t        j                   j                         j                  dv d        t        j                  ||k(  xs7 |t        j
                  k(  xr" |t        j                  t        j                  fv d        t         |      rt        t        j                      d       t        t        j                     |d      }t         |gi ddd	
      \  }}t        j                  j                  rO j                   t        j                  t        j                  fv r# fd}|D cg c]  } t#        |      |       }} t#        t$        j&                        | }	 t)        d      |	d      }
|
S t+         |||      \  }}}} }t-        |      \  }}d}t/         |g|      }t0        d   d| d| d| xx   dz  cc<   t2        j5                  d||| j                         |j                         |       g }t-        |      \  }}t6        }i }|
t8        }d|i}g }i }t;               r"|j=                  |       |r|||j>                  <   ||rtA        |d      rtC        |||      r|j=                  tD               t        jF                  dk(  }|stC        |||d      s|j=                  tH               tK         ||d      r|j=                  tL               nTtO         ||d      rEt        jP                  jR                  |j=                  tT               n|j=                  tV               |j=                  tX               |j[                  t\        j^                  ja                  ||d|             |@|r>tc        ||||      r0te        d      r%tg        jh                  |||jk                                |5|r3tm        ||||      r%to        jp                  |||jk                                |5|r3ts        ||||      r%tu        jv                  |||jk                                |"|r ty        |||| |      rddl=m>}  ||||       |2t        | |      r%t        jv                  |||jk                                 |g}||rtA        |      rt        j                  j                  j                  |      rt               rg }t;               r|j=                  d       t        |      }|j[                  t\        j^                  ja                  |tH        gd             t         |||||||t               dd|       }t        j                  j                  j                  |      s*|#t        |      dkD  r|D cg c]	  }||v s| }}n|d| }|Mt        j                  D ]:  }|j=                  t        |      j                  |jk                         |             < d}|0t        j                  j                  j                  rt         |      }t        j                  |||jk                         |      x}r|S t        |||jk                         ||!      \  } }!| S c c}w c c}w )"z_
    Lowering for autotuning aten.mm with different backends (Aten, Triton, CUTLASS, etc.)
    Nc                       y)Nzinput dtypes must be the same r   rT   rR   r   ztuned_mm.<locals>.<lambda>A      rT   )cudaxpuc                       y)Nz+out_dtype is only supported for CUDA or XPUr   r   rT   rR   r   ztuned_mm.<locals>.<lambda>E  r   rT   c                       y)NzFout_dtype must be the same as input dtype or fp32 for fp16/bf16 inputsr   r   rT   rR   r   ztuned_mm.<locals>.<lambda>M  r   rT   r   TF)argskwargs	broadcastr   convert_input_to_boolc                 H    t        j                  | j                  d      S )NF)use_compute_types)r   to_dtyper   )xrh   s    rR   	_to_dtypeztuned_mm.<locals>._to_dtypen  s    ||AtzzUKKrT   dotr6   r   r   r=   r   aten_mm_infozaten.mm__zOTuned aten.mm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%sr   check_max_autotune
exhaustiver   )threshold_multipleoutput_layout
add_guardskwarg_overrides)add_nv_universal_gemm_choices	extern_mmzmm-ah
   )top_kalways_included)best_config_future)QrY   rZ   r   
get_devicetyper   float16bfloat16r<   r   aten	unsqueezer#   inductor_configtritoncodegen_upcast_to_fp32r   r    r   r   r!   r9   r7   r   r   loginfoaten_mmaten_mm_dtyper+   appenduidr4   r0   decompose_k_subgraph_templatemax_autotune_gemm_search_spacemm_templater2   .blackwell_ws_persistent_device_tma_mm_templater5   versionhippersistent_tma_mm_templatepersistent_mm_templatemm_contiguous_subgraph_templateextendr   choicesget_template_configsr/   r)   r   add_cutlass_gemm_choicesnodesr,   r   add_ck_gemm_choicesr-   r   add_choicesr1   codegen.nv_universal_gemmr   r.   r
   	_inductorr   run_autoheuristicr   lenmm_autoheuristicr	   collect_autoheuristicexternal_matmulrS   bindremote_gemm_autotune_cacher   r   maybe_autotune_remoter$   )"rh   ri   r   r   input_dtyper   r   r   r   mul_pointwisedot_reductionr   r   r   static_shape
is_nonzeror@   kernel_inputsr  aten_handleraten_extra_kwargstemplates_to_user   is_exhaustiver   r   r    num_choices_before_extra_configs
ah_choiceschoicer   boxnoder   s"   `                                 rR   tuned_mmr*  8  s   
 nn&NN+3	
 	OO""o5A	
 	$ U]]* CEMM5>>#BB\	
. t$(r2(q1% $"'
f !!88TZZMMNNL
 >

L ;??Q-N9-a0?D?/sww/6-u-mQ? #*d6Y#Aq!VT4  2&9L*D #D$<9EM ^xs!A3as3494HHY			 #%G1&9L*'.L(*$()4BD13O-0AOL,,- 	4@!!Q*##$AB (FF,V 6q!QST U##K00d&T !''(VW(d&T ==$$,$++,FG$++,BC ?@NN			&&+	 	' 	
 	 Aq1%66V]002	
 Z,@Aq,Q**7FM<O<O<QRZ,EfaQRTU,V&&w8K8K8MN 	*61aD$GM%gv}E264F##!	
 ,K'OO""44T:dO """;/+.w<(II** 		
 &O+

 %%;;DA%#j/A*=
 18Pf6Z;O6PP!"C#CD 00 	ANN+A.33M4G4G4I6R	
 U__33NN -T48"88g}**,f s  
'-GD! Kw @~ Qs   *Z.	Z3Z3c          	         t        | ||t        j                        \  }}}}} }d}t        d   d| d| d| xx   dz  cc<   t        j                  d|||| j                         |j                         |       t        |      \  }}|xr |xr t        ||||      }	g }
t        | |gt        j                        }g }t               r|j                  t               |r#t        |d	d
      r|j                  t               |
j                  t         j"                  j%                  |||             |	r3t'        |      r(t)        j*                  |
||j-                         d	d	       t/        ||
|j-                         |      \  }}|S )Nr   int_mmr   zaten._int_mm_r   r6   zTTuned aten._int_mm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%sr   TF)enable_int32r   fuseablenon_fuseable)r9   rZ   int32r   r   r   rY   r7   r/   r   r+   r   aten__int_mmr4   r  r
  r   r  r  r)   r   r  r  r$   )rh   ri   r   r   r   r   r@   r  r  use_cutlassr  r   r#  r)  r   s                  rR   tuned_int_mmr4  /  s    #*d6U[[#Aq!VT4 D^}QCq1QC89Q>9HH^			  2&9L*W:W2FvqRSUV2WK"$G #D$<5;;GM CE-)Te 	, NN			&&}6FM *4066V]002TPT	
 (g}7J7J7LfUGD!KrT   )ra   rb   r   c          	      t   t        ||      r|dk(  rd}nt        t        j                     ||       }|dk(  rd}n8t        t        j                     |t        t        j                     ||            }t        t        j
                     ||      S t        ||| |      \  }}	}
}}}}t        |      \  }}d}t        |||gt        ||            }t        | ||gt        ||            }g }t        d   d| d|	 d|
 xx   d	z  cc<   t        j                  d
||	|
|j                         |j                         |       |r t        j                  sft        j                   sV|j#                  t$        j&                  j)                  |t*        g|             t-        |||j/                         |      \  }}|S g }t1               rt*        g}| j3                         d   dk(  ret5        | j7                               dk(  rIt        j8                  j:                  r/t$        j<                  j>                  s|jA                  tB               |j#                  t$        j&                  j)                  |||             |rtE        |d      r|jA                  tF               tI        |||d      r|jA                  tJ               nTtM        |||d      rEtN        jP                  jR                  |jA                  tT               n|jA                  tV               |jA                  tX               |j#                  t$        j&                  j)                  |||             |rHt[        |||	|
      r:t]        |      r/t_        j`                  |||j/                  g d      ||g d       |r=tc        |||	|
      r/te        jf                  |||j/                  g d      ||g d       ti        |||      r)tk        jl                  |||j/                         ||d       t-        |||j/                         |      \  }}|S )zb
    Lowering for autotuning aten.addmm with different backends (Aten, Triton, CUTLASS, etc.)
    r   r   rf   )ra   rb   )scalarsr   zaten.addmm_r   r6   zRTuned aten.addmm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%sr   Fr   Tr   )r6   r   r   )reorder)r   r   r6   )ra   rb   input_reorder)ra   rb   has_bias)7r<   r   r   mulr=   addr9   r7   r   dictr   r   r   rY   r   max_autotunemax_autotune_gemmr
  r   r  r  
aten_addmmr$   r  r+   r}   r  r   r   autotune_cublasLtro   cpp_wrapperr   aten_bias_addmmr4   r  r2   r  r5   rZ   r  r  r  r  "addmm_contiguous_subgraph_templater/   r)   r   r  r,   r   r  r.   r
   r  )rg   rh   ri   ra   rb   r   arg1arg2r   r   r   inp_expandedr  r  r@   r   kernel_inputs_atenr  r)  r   r#  aten_templatess                         rR   tuned_addmmrI  a  s   
 t$19DTXX&tS1DA:DTXX&ui.@t.LMD"4.. 18dCPV0W-Aq!VT41&9L*D #	tT"Du4,HM (	dD4e$#? #%G ^{1#Qqc1#671<7HH\			 ))_-N-NII**"	
 ,'=..0&
a BDEOLNNQ1$CLLN#q(&&88GG''!!/2 	II**+=~tT	
 )&UK,,$f
 ##$RS$T4vRVW}}  ( ''(BC ''(>? BC NN			&&}6FM
 	 Aq1%66 	2#		
 *61a;** 	2#		
 VT40##!	
 (g}7J7J7LfUGD!KrT   )r   r   c                   ddl m}  || ||      \  } }}| j                         \  }}|j                         \  }}	|j                         \  }
}t        j                  j
                  j                  ||      }t        j                  j
                  j                  d|z  |
      }|6ddlm}  ||j                         |r|n|j                         ||g|dg      }n	|J d       t               rt        j                  | ||f||      gng }||z  dk7  r6t        ||||      r(t        d      rt!        j"                  ||| ||gd	d	
       t%        d|| ||f|      \  }}	|S )Nr   )r'   r   )FixedLayoutr6   z,out_dtype is ignored if layout is specified.r   sparse_semi_structured_mmTr.  ) torch._inductor.select_algorithmr'   r   r   ro   rp   check_equals_and_simplifytorch._inductor.irrK  r   rY   r+   aten__sparse_semi_structured_mmr  r/   r)   r   r  r$   )rh   	mat1_metari   r   r   r'   m1k1m2r   k2r   r   r   rK  r  r)  s                    rR   tuned_sparse_semi_structured_mmrV    s}    @ +4DAD)T]]_FB EBMMOEB	222r:A	221r62>A~2OO"I(8FF	
  P"PP  !"	 ,00y$'9 1 	
   	
A
 Aq1 ;<66VdD)4tRV	
 (#WtY.EvGD! KrT   szc                 F    t        |       dk(  xs t        d | D              S )Nr   c              3   p   K   | ].  }t         j                  j                  j                  |d        0 yw)r6   Nrn   ).0ds     rR   	<genexpr>z)_is_tensorwise_scaling.<locals>.<genexpr>+  s,      !;<00A6!s   46)r  all)rW  s    rR   _is_tensorwise_scalingr^  *  s+    GqL S !@B!  rT   	transposec                 h    |rdnd}t         j                  j                  j                  | |   d      S )Nr   r   r6   rn   )rW  r_  idxs      rR   _is_rowwise_scalingrb  0  s,    !bC7733BsGQ??rT   	tensor_sz	tile_sizec                     |rdnd}|rdnd}t         j                  j                  j                  | |   ||         xr: t         j                  j                  j                  | |   t	        ||   |            S )Nr6   r   r   ro   rp   rq   r*   )rW  rc  rd  r_  lhsrhss         rR   _is_blockwise1xTILESIZE_scalingri  5  sq     !aC!aC7733
33 
''


2
2
333rT   c                     t         j                  j                  j                  | d   t	        |d   d            xr: t         j                  j                  j                  | d   t	        |d   d            S )Nr      r6   rf  )rW  rc  s     rR   _is_blockwise128x128_scalingrl  A  sd    7733
1wy|S) V
''


2
22a5')A,PS:T
UVrT   t
scale_sizescaling_typec                 X   |xt         j                  k(  r t        |      S xt         j                  k(  r t	        ||      S xt         j
                  k(  r t        || j                         d|      S t         j                  k(  rt        || j                               S 	 t        d|       )Nrk  Unsupported scaling type )r   
TensorWiser^  RowWiserb  BlockWise1x128ri  r   BlockWise128x128rl  AssertionError)rm  rn  ro  r_  s       rR   is_desired_scalingrw  G  s     #[##)*55 [  &z9=='[''2AJJL#y  ))/
AJJLII #<\N!KLLrT   c                 t    | xt         j                  k(  r yt         j                  k(  ry	 t        d|  d      )Nrk  rq  z in get_tile_size)r   ru  rt  rv  )scale_options    rR   get_tile_sizerz  \  s<    
)[))'' +L>9JK rT   r~   r   scale_a_sizescale_b_sizec                     t         D ](  \  }}t        | ||      st        |||d      s$||fc S  t        d| d|       )NT)r_  z1Inductor Triton does not support scale_a.shape = z, scale_b.shape = )scaling_pairsrw  rv  )r~   r   r{  r|  scale_option_ascale_option_bs         rR   get_scaling_optionsr  h  sc     +8 2&<
 nPTU!>11	2 
;L>I[\h[ij rT   c	           	         t        | |||      \  }	}
}}} }t        d   d|	 d|
 d| xx   dz  cc<   t        j                  d|	|
|| j	                         |j	                         |       d}t        | |       t        ||      \  }}|s| |||g}nt        |      }| ||||g}t        |dd|	      }g }g }i }t               r3|j                  t               t        ||
      |t        j                  <   t        |      \  }}|j                  t        j                   k(  r|rt#        |dd      rqt        |      }|j$                  |j$                  }}t'        | |||      \  }}t)        | ||d      r|s|j*                  |d<   |j*                  |d<   t-        ||t.              r)|j                  t0               ||t0        j                  <   nat-        ||t2              rEt5        |      |d<   t5        |      |d<   |j                  t6               ||t6        j                  <   nt9        d      t;        | ||d      r*|s(|j                  t<               ||t<        j                  <   t-        ||t.              r(|j                  t>               ||t>        j                  <   |jA                  tB        jD                  jG                  ||||             |r"tI        ||	|
|| |      rddl%m&}  |||||       |j                  t        j                   k7  rtO        ||||      \  }}|S |r@tQ        ||	|
|      r2tS        |      r'tU        jV                  |||jY                         |       |r3t[        ||	|
|      r%t]        j^                  |||jY                                tO        |||jY                         |      \  }}|S )a9  
    Performs an optimized matrix multiplication where scaling factors are applied
    to the inputs and/or output.

    Args:
        mat1 (Tensor): First input matrix
        mat2 (Tensor): Second input matrix
        scale1 (Tensor): Scale factor applied to mat1 (supports broadcasting)
        scale2 (Tensor): Scale factor applied to mat2 (supports broadcasting)
        bias (Tensor, optional): Optional bias tensor to add to the result
        layout: Layout hint for optimization

    Returns:
        Tensor: The result of the scaled matrix multiplication
    r   r   zaten._scaled_mm.default_r   r6   z_Tuned aten._scaled_mm.default: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%s	scaled_mmr   )mat1_idxmat2_idxr   )r   use_fast_accumTF)enable_float8r   )USE_FAST_ACCUMr   SCALE_RECIPE_ASCALE_RECIPE_BTILE_SIZE_ATILE_SIZE_BzpInductor Triton does not support scaling options that are present in both epilogue scaling and main loop scalingr   r   )$add_nv_universal_scaled_gemm_choices)r   )r  )0r9   r   r   r   rY   r   r'   r   r+   r   aten__fp8_mmr<  r   r7   r   rZ   r   r4   r   r  r5   valuer3   epilogue_scaling_types.scaled_mm_device_tma_epilogue_scaling_templatemain_loop_scaling_typesrz  /scaled_mm_device_tma_main_loop_scaling_templaterv  r2   r  r  r
  r   r  r  r1   r  r  r$   r/   r)   r   r  r  r,   r   r  )r~   r   scale_ascale_bbiasscale_resultr   r  r   r   r   r   r@   scale_a_realscale_b_realr   	bias_realr   r  r#  r   r   r  
overridersr{  r|  r  r  r  r)  s                                 rR   tuned_scaled_mmr  y  s   8 %,uVy%!Aq!VUE ^7s!A3asCDIDHHi			 DUE*!/!AL, e\<@"4(	e\<K #a!yM #%G CEO-,0-
(() 'v.MAz 	&duU8
%1%7%79K9Kl)<5,*
& $E5SWX+9+?+?J'(+9+?+?J'(*0F !''(VW   N R RS -0G -:.,I
=),9.,I
=) ''(WX   O S ST %G  .uFt ##$RS JNNO 'N,B
 ##K0/9OKOO, NN			&&+	 	' 	
 4VQ1eUST,'		
 }}%+D';Oa 	 Aq1%66!)		
 *61a;**7FM<O<O<QR'g}7J7J7LfUGD!KrT   indexc                 f    t         j                  j                  | xs d      }|j                  dk  S )Nr      )rZ   r   get_device_propertiesmajor)r  propss     rR   _is_sm7x_or_older_gpur  /  s)    JJ,,UZa8E;;!rT   c                 &    t        d | D              S )Nc              3   <   K   | ]  }t        |t                y wrP   )
isinstancer   )rZ  dims     rR   r\  zdims_are_int.<locals>.<genexpr>6  s     4z#s#4s   )r]  )dimss    rR   dims_are_intr  5  s    4t444rT   r   c           	          t        | ||||      \  }}}t        |||g      sy t        | |      \  }}fd}d } ||||| |||      }t        ||||||	      }|
|j	                  |
|      S |j                         S )Nc                 V   t               }|j                  d|        |j                  d|       |j                  d|       |j                  d|j                  j                  d       |j                  d|j                  j                  d       t	        |d|       t	        |d	|       |j                  d
|j                  j                         d       |j                  d|j                  j                         d       dk(  r t        ||j                  j                         |S )Nr   r   r   
mat1_dtypeT)is_categorical
mat2_dtyperh   ri   mat1_iscontigmat2_iscontigr=   )r   add_featurer   r   r   is_contiguousr   )	r   r   r   rh   ri   mat1_stridemat2_stridecontextr@   s	           rR   get_contextz%mm_autoheuristic.<locals>.get_contextL  s   +C#C#C#L$++*;*;DQL$++*;*;DQGV[9GV[9T[[668 	 	
 	T[[668 	 	
 4<"7DKK,=,=>rT   c                       y rP   r   r   rT   rR   fallbackz"mm_autoheuristic.<locals>.fallback_  s    rT   )r  r  r   r  r@   augment_contextprecondition)r   )get_size_hintsr  get_size_hints_stridesr   get_top_k_choices_callerget_choice_caller)rh   ri   r   r   r   r  r@   r   r   r  r   r   r  r  r  r  r  autoheuristics         `           rR   r  r  9  s     T4Aq1GAq!Aq	"5dDAK& !Q4{KHG0!M 55? 6 
 	
 **,,rT   c                 t   t        |t              rt        |t              s:t        j                  j                  j                  | j                               \  }}t        |t              rt        |t              s:t        j                  j                  j                  |j                               \  }}|||fS rP   )r  r   r   ro   rp   optimization_hintsr   )rh   ri   r   r   r   s        rR   r  r  v  s{    aZ3%7!!44T]]_EAaZ3%7!!44T]]_EAa7NrT   c                    | j                   j                  }|j                   j                  }||g}g }|D ]L  }t        |t              s)t        j
                  j                  j                  |      }|j                  |       N |d   |d   fS rv   )	r   rd   r  r   r   ro   rp   r  r   )rh   ri   r  r  stridesstrides_hintsrd   s          rR   r  r    s    ++$$K++$$KK(GM %&#&WW%%88@FV$% ]1---rT   )rk   NrP   )F)NNNFN)NN)r   loggingtypingr   rZ   torch._dynamo.utilsr   +torch._inductor.autoheuristic.autoheuristicr   1torch._inductor.autoheuristic.autoheuristic_utilsr   r   r   r	   )torch._inductor.codegen.cpp_gemm_templater
   *torch._inductor.remote_gemm_autotune_cacher   torch._inductor.virtualizedr   r   "torch.fx.experimental.proxy_tensorr   torch.nn.functionalr   torch.torch_versionr    r   r   r   codegen.cutlass.gemm_templater   r   ,codegen.rocm.ck_tile_universal_gemm_templater   'codegen.rocm.ck_universal_gemm_templater   codegen.subgraphr   r   irr   r   r   r   r   r   loweringr   r    r!   r"   r#   select_algorithmr$   r%   r&   r'   r(   utilsr)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   	mm_commonr7   r8   r9   r:   r;   r<   r   __version__triton_version
has_tritonImportError	getLoggerr   r   r   primsr  r  r  r  r  r  r  r  cacherS   r=   r`   r   	dtype_outr   rf   r?  _int_mmr2  _sparse_semi_structured_mmdefaultrP  
_scaled_mmr  r^   rj   r   rB  r   r   r  r   r   r   r	  rC  r*  r4  rI  rV  rr  rs  rt  ru  r~  r  r  ry   r^  rb  r   ri  rl  Tensorrw  rz  tupler  r  r  r  r  r  r  r   rT   rR   <module>r     s       ( T  F F . 6 + , > X M D E 8 8 *      !&"4"45NJ
 g!yy~~		
 		!n&?  ,
 
.	/'+" ,		 :;  (		 67  2@	0	 ;<2 . 3A	1	 <=3 / 2@	-	 NO2 . " " UXX|
M"	HH	!!	  	KKdjjnn
 "	MM$$,,2B2B #5	$$$//77	#  "	*8K8K
8 (,11 I4 %Z6#  0  F !: ; ) D'/ #5_m#  &8*,<& "
 4775s4 s 6sl 4<<T:'+ . ;.b 4::48*+!D F 9FR 422M(,T. N.d [334+--.!=!=>!;!;<!!;#=#=> &00+2E2EF &55{7S7ST s t @C @D @T @
			(+	8<			VS VS VT V 	M
MM M 	M
 
M*	3 	 ,, ,,	
 ;#$" 4??**E 
r Frj t   
5 :- ::-z	.m!  !'*NJs   T TT