
    9j'                     V   d dl Z d dlmZ d dlZd dlmZ d dlmZ d dlm	Z	 ddl
mZmZmZ ddlmZ dd	lmZmZmZmZ dd
lmZ ddlmZmZmZmZ ddlmZmZm Z m!Z!m"Z"m#Z#m$Z$ ddl%m&Z&m'Z' ddl(m)Z)m*Z*m+Z+m,Z, erddlm-Z- ddlm.Z.  e j^                  e0      Z1ejL                  jd                  Z2ed        Z3 ede3 e	d      d      Z4 eejj                  de2jj                  jl                        Z7 eejj                  ejp                  js                         rdndde2jj                  jt                        Z; eejx                  de2jx                  jl                        Z= ej|                  e2jj                        d"ddd       Z? ej|                  e2jx                        dddd d!       Z@y)#    N)TYPE_CHECKING)counters)CKGemmTemplate)load_kernel_template   )configirlowering)MMKernelInputs)	loweringsmake_pointwisemake_reductiontransform_argsget_max_y_grid)autotune_select_algorithmExternKernelChoiceSymbolicGridFnTritonTemplate)_use_cutlass_for_opuse_aten_gemm_kernelsuse_ck_gemm_templateuse_cpp_bmm_templateuse_cutlass_templateuse_nv_universal_gemm_templateuse_triton_template)opsV   )_is_static_problemis_batch_stride_largest_or_zeromm_argsuse_native_matmul)ChoiceCaller)KernelTemplatec                     |||d          |||d         z  }t               } | || |      d      } || |      }	||	|fS )NBLOCK_MBLOCK_Nr   r   )
bmnmetacdivmaxtiles
max_y_gridgrid_zgrid_ys
             Z/media/conek/DATA/Code/OCR/venv/lib/python3.12/site-packages/torch/_inductor/kernel/bmm.pybmm_gridr4   .   sW    DO$tAtI'??E  !Ja$a(F!V_F66""    bmm
triton_bmmT)namegridsource"cache_codegen_enabled_for_templatezat::bmm_out)op_overloadzat::_bmm_out_dtype_xpuzat::_bmm_out_dtype_cuda	bmm_dtype)r8   r<   zat::baddbmm_outlayoutc          
           t        d  |fD              r j                         d   dk(  s|j                         d   dk(  rWt        j                   d       t        j                  |d      }t        j                  t        j
                   |      d      S d }d   fd} |       r0t        j                  j                  j                  d	   } | |        ||      r0t        j                  j                  j                  d   } |||      }t         |      rt        t        j                      d       t        t        j                     |d      }t         |gi d
dd      \  }}	t        j                  j                   rO j"                  t$        j&                  t$        j(                  fv r# fd}
|D cg c]  } t+        |
      |       }} t+        t,        j.                        | } t1        d      |d      }|S t3         |||      \  }}}} }d}t5         |g|      } j                         d	   }t6        d   d| d| d| d| xx   dz  cc<   t8        j;                  d|||| j=                         |j=                         |       t>        }i }|r- jA                         jB                  dv sJ d       tD        }d|i}g }g }i }tG               r |jI                  |       |||jJ                  <   tM        |d      r|jI                  tN               |jQ                  t        jR                  jU                  ||||             tW        |      \  }}tY         ||      }|rB|r@t[        ||||      r2t]        |      r'ddl/m0} |jc                  |||je                                tg        | |      r'ddl4m5} |jm                  |||je                                to        ||||      r%tq        jr                  |||je                                |r tu        |||| |      rddl;m<}  ||||       t{        |||je                         |      \  }}|S c c}w )z`
    Lowering for autotuning aten.bmm with different backends (Aten, Triton, CUTLASS, etc.)
    c              3   V   K   | ]!  }|j                         j                  d k(   # yw)cpuN)
get_devicetype).0xs     r3   	<genexpr>ztuned_bmm.<locals>.<genexpr>T   s!     
>A1<<>%'
>s   ')r   r   )axisc                     t        j                  |       syt        j                  | d      \  }}t        |t         j                        S )NTF)freeze)r	   is_storage_and_layoutas_storage_and_layout
isinstanceFlexibleLayout)t_r?   s      r3   is_valid_to_require_contiguousz1tuned_bmm.<locals>.is_valid_to_require_contiguous[   s<    ++A.005AIAvfb&7&788r5   c                     |d   dk(  xr | d   dk(  xs |d   | d   k\  xs |d   dk(  xr | d   dk(  xs |d   | d   k\  S )NrH   r    )sizesstridess     r3    is_preferred_layout_as_bmm_inputz3tuned_bmm.<locals>.is_preferred_layout_as_bmm_inputa   sf     q QeBi1n&PuRy8PU"+"Sb	Q(R'"+r:RUr5   c                     |j                   d   j                         }|j                   d   j                         } ||      st        j                  j                  |       } | S )Nval)r,   sizestrider	   ExternKernelrequire_contiguous)rP   meta_trV   rW   rX   s       r3   may_require_contiguousz)tuned_bmm.<locals>.may_require_contiguousk   sT    KK&++-Ekk%(//1G3E7COO66q9Hr5   r   TNF)argskwargs	broadcasttype_promotion_kindconvert_input_to_boolc                 H    t        j                  | j                  d      S )NF)use_compute_types)r   to_dtypedtype)rF   mat1s    r3   	_to_dtypeztuned_bmm.<locals>._to_dtype   s    ||AtzzUKKr5   dot)r?   	out_dtyper6   )rm   aten_mm_infoz	aten.bmm_rQ   zZTuned aten.bmm: batch=%s, m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%s)cudaxpuz+out_dtype is only supported for CUDA or XPUrm   check_max_autotune)kwarg_overrides)CUTLASS3xGemmTemplate)CppBmmTemplate)add_nv_universal_gemm_choices)>allget_sizeL	unsqueezesum_mulr   graphcurrent_nodera   r#   r   atenr   inductor_configtritoncodegen_upcast_to_fp32ri   torchfloat16bfloat16r   r   rl   r   r"   r   r   loginfo	get_dtypeaten_bmmrC   rD   aten_bmm_dtyper   appenduidr   bmm_templateextendchoicesget_template_configsr    r!   r   r   codegen.cutlass.gemm_templatert   add_cutlass_gemm_choicesnodesr   codegen.cpp_bmm_templateru   add_choicesr   r   add_ck_gemm_choicesr   codegen.nv_universal_gemmrv   r   )!rj   mat2rm   r?   rR   r`   	meta_mat1	meta_mat2ra   rb   rk   rF   mul_pointwisedot_reductionr*   r+   kr8   kernel_inputs
batch_sizeaten_handleraten_extra_kwargsr   templates_to_users   rQ   
is_nonzerobatch_stride_largest_or_zerort   ru   rv   noderX   s!   `                               @r3   	tuned_bmmr   O   s;   
 
>$
>>==?1"dmmoa&8A&=;;tR(D;;tQ'D66!%%d+!44	9	U	 *$/,,11!4I)$	:D)$/,,11!4I)$	:Dt$(r2(q1% $"'
f !!88TZZMMNNL
 >

L ;??Q-N9-a0?D?/sww/6-u-mQ? #*d6Y#Aq!VT4 D #D$<9EM #J^yAaS!AaSABaGBHHd				 (0L %%8 	
9	
8 &()4"$G CEO-,=(()6e<- NN			&&+	 	' 	
 'v.MAz#B4v#V $ Aq1%I66V]002	
 FD$/=""!	
 FAq!,**7FM<O<O<QR4VQ1dDQM%gv}E'g}7J7J7LfUGD!K{ @s   ?Q)alphabetar?   c                   t        ||      r|dk(  rd}nt        t        j                     ||       }|dk(  rd}n8t        t        j                     |t        t        j                     ||            }t        t        j
                     ||      S t        ||| |      \  }}	}
}}}} t        | ||gt        ||            }|j                         d   }t        d   d| d| d|	 d|
 xx   dz  cc<   t        j                  d	|||	|
|j                         |j                         | j                         |	       d
}g }g }t               r|j                  t                t#        |d      r|j                  t$               |j'                  t(        j*                  j-                  |||             t/        |||j1                         |      \  }}|S )z_
    Lowering for autotuning aten.mm with different backends (Aten, Triton, CUTLASS, etc.)
    r   r>   )r   r   )scalarsrn   zaten.baddbmm_rQ   r   zkTuned aten.baddbmm: batch_size=%s, m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, inp=%s, output_layout=%sbaddbmmFrq   )r#   r   r   r|   r6   addr"   r   dictrx   r   r   r   r   r   r   aten_baddbmmr   r   r   r   r   r   r   r   )inprj   r   r   r   r?   arg1arg2r*   r+   r   r   r   r8   r   r   r   rQ   s                     r3   tuned_baddbmmr      s   
 t$19DTXX&tS1DA:DTXX&ui.A$.MND"4.. (/tT3v'N$Aq!VT4 #	dD4e$#?M
 #J^}ZL!AaS!EF!KFHHu			
 D"$G CE-6e<- NN			&&}6FM (g}7J7J7LfUGD!Kr5   )N)Aloggingtypingr   r   torch._dynamo.utilsr   7torch._inductor.codegen.rocm.ck_universal_gemm_templater    torch._inductor.kernel.mm_commonr    r   r   r	   r
   ry   r   r   r   r   r   r   runtime.runtime_utilsr   select_algorithmr   r   r   r   utilsr   r   r   r   r   r   r   virtualizedr   r   	mm_commonr    r!   r"   r#   r$   r%   	getLogger__name__r   r   r4   r   r6   outr   rp   _is_compiled	dtype_outr   r   r   register_loweringr   r   rU   r5   r3   <module>r      s}       ( R A ; ; * P P 2    !  !1g!yy~~ # # 		-'+	 eiiDHHLLQ#	II %		 6 6 8>W	""	 "	MM$$,,2B2B
 TXXYD Y Yx T\\",-Ad 9 #9r5   