
    9jt                     
   d dl Z d dlZd dlmc mZ d dlmZ ddlm	Z	m
Z
 ddlmZ ddlmZ ddlmZ ddl
mZ dd	lmZmZmZmZmZmZmZ dd
lmZmZmZ ddlmZmZ ddl m!Z!m"Z"m#Z# dejH                  de
j"                  de
j"                  de
j"                  de
j"                  de%e&e
j"                  e
j"                  dz  f   fdZ'de&de"de"de"dz  de"dz  de"dz  de"dz  de"fdZ(	 	 	 	 d"dede)e   de)e   fdZ*d e*_+        d! Z,y)#    N)mm_args   )configir)CppGemmTemplate)CppGroupedGemmTemplatecreate_epilogue_with_attr)	TensorBox)addadd_needs_realized_inputsatenpermuteregister_loweringto_dtypeview)autotune_select_algorithmChoiceCallerExternKernelChoice)use_aten_gemm_kernelsuse_cpp_gemm_template)opsOpsValueVW_tensorpacked_weightx_scalex_zpw_scalereturnc                 R   d }t        d |||fD              }|r#t        j                  j                  |j	                            t        j                  j                  |j	                            z  }t        j                  j                  ||j	                         dz         }t        j                  | j                  t        j                        d      }t        j                  j                  |j	                            }	||z  |	z  }t        j                  j                  ||j	                         dz         }
nft        j                  | j                  t        j                        d      }t        j                  j                  ||j	                         dz         }
||
|fS )Nc              3   8  K   | ]  }t        |t        j                        xrr |j                         t        j
                  j                  v xrF t        |j                  d       xr. t        |j                  j                  t        j                          yw)dataN)

isinstancer   r   get_namer   graph	constantshasattrr#   ConstantBuffer).0items     `/media/conek/DATA/Code/OCR/venv/lib/python3.12/site-packages/torch/_inductor/mkldnn_lowerings.py	<genexpr>z+create_int8_compensation.<locals>.<genexpr>+   sy      *
 	 	4& 	:MMOqww000	:DIIv&	: tyy~~r'8'89	:*s   BB_x_w_compensnamer   )dim_BMatrixCompens)
allr   r&   r'   r%   add_tensor_constanttorchsumtofloat)r   r   r   r   r   	x_w_scaleuse_int8_fast_compensation_pathx_w_scale_tensorweight_compens_tensorx_zp_tensorweight_compenss              r,   create_int8_compensationr?      sz    &*I&) *
 dG,* '# 'GGg..01gg 0 0 234 	 GG//'')N: 0 
	 !&		(++ekk*B Jgg''8 58H H; V44!''),== 5 

 !&		(++ekk*B J44!''),== 5 

 	(     r:   input_weight_compo_x_scale_x_zp_w_scale
_x_w_scalec                 Z   | r,t        j                  t        j                  ||      |      }|S t        j                  t        j                  ||      |      }t        j                  |t        j                  t        j                  t        j                  ||      |      |            }|S N)r   submul)r:   rA   rB   rC   rD   rE   rF   temps           r,   'codegen_int8_gemm_template_compensationrL   O   s     'wwGG 
H K9 wwGG 
 wwGGGG    	
 Kr@   xwbc           	         | j                         }t        |      dkD  rt        | d|d   g      } t        |      }t        j                  st        j
                  sJ |D 	cg c]%  }	|	|	nt        j                  j                  |	      ' }}	g }
t        | t        |d   ddg      |      ^ }}} }|D 	cg c]  }	|	d u c}	dd t        j                  t        |      |       d}| g|}|j                  |D 	cg c]  }	|	|		 c}	       t        j                   |
||fi | t        |
      dk7  sJ t#        d|
||      \  }}|j$                  j$                  }t        |      D cg c]   }t        j&                  ||t(        |fg      " }}t        j*                  |d   j-                         	      |_        ||_        t        |      D cg c]$  }t        j2                  j5                  ||         & }}t        |      dkD  r>t        |      D ]0  }t        ||   g |d d ||   j                         d         ||<   2 |S c c}	w c c}	w c c}	w c c}w c c}w )
N   r   r   layoutT)has_biastrans_wepilogue_creatoract_mappinggrouped_gemm)device)get_sizelenr   r   max_autotunemax_autotune_gemmr   ExternKernelrealize_inputr   r   dictfromkeysrangeextendr   add_choicesr   r#   MultiOutputlistMultiOutputLayout
get_devicerT   outputsr   create)rM   rN   rO   attrscalars	algorithmrT   x_sizenum_gemmbiaschoices_kwargsinput_nodesresulttemplate_bufgemm_idxreturn_bufsreturn_tensorss                      r,   grouped_gemm_loweringr{      sy    ZZ\F
6{QR$%1vH&":":::STU42??#@#@#F	FUAU"$Gq'!A$A"7GQ1 344$T%4 }}U8_a8	F 'q'K?d.>?@&& 	 w<1)	IFA ;;##L h 	v|tX.>-?@K 
 ..k!n6O6O6QRL&LCH?7?K12N  6{Qh 	H'+x(G&"+G~h7@@B2FG(N8$	
 ] 	V 5 @"s$   !*I5I9II(%I)ITc            !         t         j                  j                  rIddlm t        t         j                  j                  j                  ddj                  j                        t        t         j                  j                  j                  j                  ddj                  j                        t        t         j                  j                  j                  ddj                  j                        t        t         j                  j                  j                  j                  ddj                   j                        t         j                  j                  j"                  t         j                  j                  j$                  t         j                  j                  j&                  t         j                  j                  j                  t(        j*                  j,                  t         j                  j                  j.                  g} t1        t         j                  j                  j"                        dt2        dt2        d	t2        ffd
       }t1        t         j                  j                  j"                  j                        dt2        dt2        dt2        d	t2        ffd       }t1        t         j                  j                  j$                  j                        dt2        dt2        dt2        d	t2        ffd       }t1        t         j                  j                  j                        	 d3dt2        dt2        dt2        ffd       }t1        t         j                  j                  j                  j                        	 d3dt2        dt2        dt2        dt2        ffd       }t1        t         j                  j                  j&                        dt2        dt2        d	t2        ffd       }t1        t(        j*                  j,                        dt2        dt2        dt2        dt2        dt2        dt2        dt2        dt4        dt6        t8           dt8        dt8        dt8        dt4        d t4        d!t4        d"t4        f fd#       }t1        t         j                  j                  j.                  d $      dt2        d%t2        d&t2        d	t2        ffd'       }t1        t         j                  j                  j:                  j                  d $      t1        t         j                  j                  j:                  j<                  d $      dt2        d%t2        d&t2        d(t2        d	t2        f
fd)              }	t1        t         j                  j                  j                  d $      	 d3dt2        d%t2        d&t2        d*t2        d	t2        f
fd+       }
t1        t         j                  j                  j                  j                  d $      t1        t         j                  j                  j                  j<                  d $      	 d3dt2        d%t2        d&t2        d*t2        d,t2        d	t2        ffd-              }t         j                  j>                  rt        t         j                  j@                  jB                  d.djD                  j                        | jG                  t         j                  j@                  jB                         t1        t         j                  j@                  jB                        d d/dt2        d0t2        d1t2        dt2        d z  ffd2       }tI        |        y y )4Nr   )	mkldnn_irzmkldnn::_linear_pointwiseF)has_out_variantkernel_creatorzonednn::qlinear_pointwiserM   weightrq   c
                 r    t        j                  
j                  j                  | |||||||||	
            S rH   )r   rk   ConvolutionUnary)rM   r   rq   paddingstridedilationgroupsrl   rm   rn   r}   s             r,   convolution_unaryz5register_onednn_fusion_ops.<locals>.convolution_unary   sJ     ##**11 r@   otherc                 x    t        j                  j                  j                  | |||||||||	|
||            S rH   )r   rk   ConvolutionBinaryrM   r   r   rq   r   r   r   r   binary_attrbinary_alpha
unary_attrunary_scalarsunary_algorithmr}   s                r,   convolution_binaryz6register_onednn_fusion_ops.<locals>.convolution_binary  sS      ##++22 !# r@   c                 x    t        j                  j                  j                  | |||||||||	|
||            S rH   )r   rk   ConvolutionBinaryInplacer   s                r,   convolution_binary_inplacez>register_onednn_fusion_ops.<locals>.convolution_binary_inplace*  sS      ##2299 !# r@   rN   rO   c                    | j                         }t        |      dkD  rt        | d|d   g      } |t        j                  j                  |      }g }t        j                  st        j                  rnt        |ddg      }	t        | |	|      ^ }
}} }	t        || |	      r@fd}|d uddk(  rd n|d	}|g d
|d<   t        j                  |||| |gn| ||gfi | t        |      dk(  s
t               rAt              }|d |d<   |j!                   j"                  || |gn| ||g|fi |       |j%                         t&        j(                  j*                  v sJ dd i}t-        d||| |gn| ||g||      \  }}
t        |      dkD  r%t        |g |d d |j                         d         }|S )NrQ   rR   r   r   rS   c                 "    t        |       S )Nrm   rn   r	   )bufrn   rl   rm   s    r,   rW   zJregister_onednn_fusion_ops.<locals>.linear_unary.<locals>.epilogue_creatorb  s    8w)  r@   TnonerU   rV   rW   )rQ   r   r   input_indices)rl   rm   rn   Bc                 X    t         j                  j                  | j                            S rH   r   r&   r'   r%   rM   s    r,   <lambda>zBregister_onednn_fusion_ops.<locals>.linear_unary.<locals>.<lambda>      QWW..qzz|< r@   linear_unaryinput_gen_fnsr[   r\   r   r   r_   r`   r   r]   r^   r   r   r   r   re   r   ra   appendbindr%   r   r&   r'   r   )rM   rN   rO   rl   rm   rn   rT   ro   rr   transposed_wrs   rW   rt   r   rv   aten_mkldnn_linear_unarys      ```         r,   r   z0register_onednn_fusion_ops.<locals>.linear_unaryL  s    ZZ\F6{QR,-}OO11!4*,G""f&>&>&q1a&1.5af.U+FA|(LA %&TM#'$(FND8H	F }2;/#//"#)A!Q !	 7|q $9$;4IN9"&F3K1,11"#)A!Q ! ::<177#4#4444<M 2)A!Q+IFA 6{Qf&Ks&KV__5Fr5J&KLMr@   yc                    | j                         }t        |      dkD  rt        | d|d   g      } j                         }t        |      dkD  rt        d|d   g      |t        j                  j                  |      }g }t        j                  st        j                  rnt        |ddg      }	t        | |	|      ^ }
}} }	t        || |	      r>fd}|d ud|d}|g d	ng d
|d<   t        j                  |||| |gn| ||gfi | t        |      dk(  s
t               rAt              }|d |d<   |j!                   j"                  || |gn| ||g|fi |       |j%                         t&        j(                  j*                  v sJ dd i}t-        d||| |gn| ||g||      \  }}
t        |      dkD  r%t        |g |d d |j                         d         }|S )NrQ   rR   r   r   rS   c                      t        |       S )N)r   r	   )r   rl   r   s    r,   rW   zKregister_onednn_fusion_ops.<locals>.linear_binary.<locals>.epilogue_creator  s    8d!LLr@   Tr   )r   rQ   r   )   r   rQ   r   r   )rl   r   c                 X    t         j                  j                  | j                            S rH   r   r   s    r,   r   zCregister_onednn_fusion_ops.<locals>.linear_binary.<locals>.<lambda>  r   r@   linear_binaryr   r   )rM   r   rN   rO   rl   rT   ro   y_sizerr   r   rs   rW   rt   r   rv   aten_mkldnn_linear_binarys    `  `          r,   r   z1register_onednn_fusion_ops.<locals>.linear_binary  s0    ZZ\F6{QR,-ZZ\F6{QR,-}OO11!4*,G""f&>&>&q1a&118|Qv2.FA|Q )LAM %&TM#',<F <=9i,F?+#//%&YAq	Q1aL !	 7|q $9$;49"&F3K2-22%&YAq	Q1aL ! ::<177#4#4444<M 2YAq	Q1aL+IFA 6{Qf&Ks&KV__5Fr5J&KLMr@   c                 t    t        j                  j                  j                  | |||||||||	|
            S rH   )r   rk   ConvolutionTransposeUnary)rM   r   rq   r   output_paddingr   r   r   rl   rm   rn   r}   s              r,   convolution_transpose_unaryz?register_onednn_fusion_ops.<locals>.convolution_transpose_unary  sM     ##33::" r@   w0w1w2w3hxcxreversebatch_sizesmodehidden_size
num_layers
has_biasesbidirectionalbatch_firsttrainc                     t        j                  t        j                  j                  j                  | |||||||||	|
|||||            S rH   )pytreetree_mapr   rk   MkldnnRnnLayer)rM   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r}   s                   r,   mkldnn_rnn_layerz4register_onednn_fusion_ops.<locals>.mkldnn_rnn_layer  sc    & ??  ((//!! r@   )type_promotion_kindr   r   c                 \   t        |t        j                        sWt        |      t        u sJ t
        j                  j                  t        j                  |t        j                        d      }|Dt
        j                  j                  t        j                  dt        j                        d      }t        |t        j                        sWt        |      t        u sJ t
        j                  j                  t        j                  |t        j                        d      }|Dt
        j                  j                  t        j                  dt        j                        d      }t        j                  j                  j                  | |||||||||	|
||||||            S )Ndtyper   r/   r   r   w_zp)r$   r   r   typer8   r   r&   r4   r5   tensorfloat32int32intrk   QConvPointWisePT2E)rM   r   r   r   r   r   rq   r   r   r   r   o_inv_scaleo_zero_pointoutput_dtyperl   rm   rn   r}   s                    r,   qconvolution_unaryz6register_onednn_fusion_ops.<locals>.qconvolution_unary  se   ( gr||4G}---''55LL>Y 6  |ww22LL%++6V 3  dBLL1DzS(((ww22LLU[[9 3  |ww22LL%++6V 3  ##,,33!  # r@   accumc                 R   t        |t        j                        sWt        |      t        u sJ t
        j                  j                  t        j                  |t        j                        d      }|Dt
        j                  j                  t        j                  dt        j                        d      }t        |t        j                        sWt        |      t        u sJ t
        j                  j                  t        j                  |t        j                        d      }|Dt
        j                  j                  t        j                  dt        j                        d      }|dk(  rq|t        j                  t        j                  fv rO|j                         t        j                  t        j                  fv r|j                         |k7  rt        ||      }t        j                   j"                  j!                  | |||||||||	|
|||||||||||            S )Nr   r   r/   r   r   r   r6   )r$   r   r   r   r8   r   r&   r4   r5   r   r   r   r   bfloat16	get_dtyper   rk   QConvPointWiseBinaryPT2E)rM   r   r   r   r   r   r   rq   r   r   r   r   r   r   r   accum_scaleaccum_zpr   alphar   r   r   r}   s                         r,   qconvolution_binaryz7register_onednn_fusion_ops.<locals>.qconvolution_binaryV  s   < gr||4G}---''55LL>Y 6  |ww22LL%++6V 3  dBLL1DzS(((ww22LLU[[9 3  |ww22LL%++6V 3 
 u$ U]]ENN$CCOO%%--)HHOO%5 !5##2299!  !#- r@   r   c                 ~  	
 |j                         t        j                  t        j                  fv sJ d       | j	                         }t        |      dkD  rt        | d|d   g      } t        t        j                        sXt              t        u sJ t        j                  j                  t        j                  t        j                         d      n^j#                          t%        d j	                         D              rt        g       t        j	                               dv sJ d	       Dt        j                  j                  t        j                  d
t        j&                        d      t        t        j                        sXt              t(        u sJ t        j                  j                  t        j                  t        j&                        d      nj#                          j+                         dk(  sJ d       |Dt        j                  j                  t        j                  d
t        j&                        d      }j#                          |j#                          |j                         t        j&                  k7  rt        t        j,                  j/                  |      t        j0                        rt        j                  j2                  |j5                            j7                  t        j&                        }t        j                  j                  t        j                  |t        j&                        |j5                               }d nj                         g }t8        j:                  st8        j<                  rt?        | ||	      ^ }}} }t        t        j,                  j/                  |      t        j0                        rNt        j@                  t        jB                  t        j                  j2                  |j5                                  t        j                  j2                  |j5                                  rtE        || |      rt        j                  j2                  |j5                            jG                         }tI        ||      \  
	fd}| j                         t        jJ                  t        j                  fv sJ tM        jN                  ||| ||gn| ||gd u|g dng d       t        |      d
k(  s
tQ               rLtS        	
      }d |d<   |jU                   jV                  | ||fn| ||f|fi |       |j5                         t        j                  j2                  v sJ d d d d d}t        t        j,                  j/                        t        j0                        rd |d<   t        t        j,                  j/                        t        j0                        rd |d<   tY        d|| ||gn| ||g||      \  }}t        |      dkD  r%t        |g |d d |j	                         d         }|S )Nz=Only int8 and e4m3fn weights are supported by oneDNN qlinear.rQ   rR   r   r   r/   c              3   &   K   | ]	  }|d k(    ywr   N r*   r1   s     r,   r-   zDregister_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.<genexpr>       >Csax>   r   r   x_scale must be 0D or 1Dr   r   r   z(x_zp is incompatible with oneDNN qlinearr   rT   	out_dtypec                 Z  	
 t         j                  t         j                  t         j                  t         j                  fv sJ | j                         j                         d rJ j                         j                         j                         
j                         d j                         
f
d}t        j                  | j                         t         j                  || j                               }dk7  rt        |      }t         j                  k(  rM|j                         fd}t        j                  |j                         ||j                               }|S t         j                  t         j                  fv rzddlm |j                         		fd}t        j                  |j                         t        j                  |t!              t#              	      |j                               }|S )
Nc           	        
  |       }t        j                  |t        j                        }| d   f}d }d }d }s d      } d      } |      } |      }d }rJ  |      }t	        ||||||      }
y |      }	t        j                  t        j
                  fv sJ t        j
                  k(  r$t        j                  |	t        j                        }	t        j                  ||	      }|S NrR   r   r   r   r5   r   rL   r   r   )indexrA   weight_compens_indexrC   rD   rE   rB   rF   rK   _biasrq   
bias_dtypebias_loaderinput_loaderr:   w_scale_loaderweight_compens_loaderx_scale_loaderx_w_scale_loaderx_zp_loaders             r,   inner_fnz]register_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.epilogue_creator.<locals>.inner_fn5  s   $0$7E %(LL$FE49"I<0'+H$(E'+H#B+9"+=(3B+9:N+O,ABV,WM)-J>'7'C C'C-=>R-S
#J ? % - ( % ( *$D  $/(34H(I'1emmU^^5T'T T'T#-#?,/LL,NE'*wwtU';#'Kr@   rZ   r   r   rangesr   r   c                 @     |       }t        j                  |      S rH   r   r   r   rA   output_cast_loaderr   s     r,   inner_fn_cast_output_to_bf16zqregister_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.epilogue_creator.<locals>.inner_fn_cast_output_to_bf16p      (:5(A'*||E<'H Hr@   r   _create_constantsc                     |       } 	d|z  |t         j                        \  }}t        j                  ||z        |z   }
t         j                  k(  r 	ddt         j                        \  }}n 	ddt         j                        \  }}t        j
                  t        j                  ||      |      }t        j                  |
      S Ng      ?r   r      i   r5   r   r   rounduint8minimummaximumr   r   scale
zero_pointrA   	inv_scalevalqminqmaxclampedr  r   requant_input_loaders            r,   inner_fn_requantzeregister_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.epilogue_creator.<locals>.inner_fn_requant  s    (<U(C8I$'%K5==9" 5	: '*ii	0A&BZ&O#/5;;#>1B()3emm2&JD$ 2C(,c2&JD$ +.++ckk#t6Ld*S'*||G\'J Jr@   r  r  r5   r   r   r  int8make_loaderr   	Pointwiseri   r[   r
   get_device_or_errorloweringr  	functoolspartialr8   r   )input_bufferr   
output_bufr  r  r  r   r   r  r  r   r   r   r   r   rn   rl   rq   r   o_scaler   r   rm   r:   r   r>   r   r9   r   s        @@@@@@@@@@r,   rW   zKregister_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.epilogue_creator  s   +!MM!NN!KK!JJ	0      (4'?'?'A0>0J0J0L-+/(:#,#88#8/8/D/D/F,)0)<)<)>)0)<)<)>&*&6&6&8&*+*.*:*:*<K(( ((T &(\\#/#:#:#<"'--%-#/#8#8#:	&
  6>)B *D'Y*J
 (5>>91;1G1G1I.I *,'1'E'E'G&2)E'1':':'<	*JN  *)C *ekk5::-FFC3=3I3I3K0K" *,'1'E'E'G&2)2):):$4*/./2</@*"
 (2':':'<	*J  *)r@   )r   r   r   rQ         )   r   r   r   rQ   r&  r'  rU   rW   r   )output_scaleoutput_zero_pointr   post_op_namepost_op_argspost_op_algorithmrq   c                 X    t         j                  j                  | j                            S rH   r   r   s    r,   r   zCregister_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.<lambda>  r   r@   c                 X    t         j                  j                  | j                            S rH   r   r   s    r,   r   zCregister_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.<lambda>  r   r@   c                 X    t         j                  j                  | j                            S rH   r   r   s    r,   r   zCregister_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.<lambda>  r   r@   c                 X    t         j                  j                  | j                            S rH   r   r   s    r,   r   zCregister_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.<lambda>  r   r@   )r   r&  r'  r(  c                 X    t         j                  j                  | j                            S rH   r   r   s    r,   r   zCregister_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.<lambda>      QWW->->qzz|-L r@   c                 X    t         j                  j                  | j                            S rH   r   r   s    r,   r   zCregister_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.<lambda>  r4  r@   qlinear_unaryr   )-r   r5   r  float8_e4m3fnr[   r\   r   r$   r   r   r   r8   r   r&   r4   r   r   realizer3   r   r   	get_numelInputsKernelunwrap_storage_for_inputr)   r'   r%   r7   r   r]   r^   r   equal
zeros_liker   to_denser?   r  r   re   r   ra   r   r   r   )rM   r   r   r   r   r   rq   r%  r   r   rl   rm   rn   rT   ro   w_zp_tensorrr   rs   r   rW   rt   r   rv   r   r:   r>   r9   aten_mkldnn_qlinear_unarys    `` ` ```````          @@@@r,   r6  z1register_onednn_fusion_ops.<locals>.qlinear_unary  s   " !**,U=P=P0QQ OQ ZZ\F6{QR,-gr||4G}---''55LL>Y 6  !>7+;+;+=>> #7B/G7++-.&8T:TT8|
 ww22LL%++6V 3  dBLL1DzS(((ww22LLU[[9 3  >>#q(T*TT(
 |
 ww22LL%++6V 3  OOLLN~~5;;.:88>!!4
  gg//@CCEKKPww22LLEKK@t}} 3  "&4>>3CJ*,G""f&>&>/6}V|0,FA} @@F)) (():):4==?)KL))$--/: ,FA}E ww001G1G1IJSSUH 1 %	7&!|* |* |*| ;;=U[[%**,EEEE#//< GT='4H$wdS!%T!1)9< '92 7|q $9$;!(&2!-!%!(&/ <%)F6N2-22< GT='4H$wdS	
 ! !))+qww/@/@@@@<<<<	M 88A!!
 $Ma 88>!! $Ma 1< GT='4@$wdK+IFA 6{Qf&Ks&KV__5Fr5J&KLMr@   x2c                   	
 ! | j                         }j                         }t        |      t        |      k(  sJ t        |      dkD  r&|dv r"t        | d|d   g      } t        d|d   g      t        t        j
                        sXt              t        u sJ t        j                  j                  t        j                  t        j                        d      n^j                          t        d j                         D              rt        g       t        j                               dv sJ d	       Dt        j                  j                  t        j                  d
t        j                         d      |Dt        j                  j                  t        j                  d
t        j                         d      }t        t        j
                        sXt              t"        u sJ t        j                  j                  t        j                  t        j                         d      nj                          j                          |j                          |j%                         t        j                   k7  rt        t        j&                  j)                  |      t        j*                        rt        j                  j,                  |j/                            j1                  t        j                         }t        j                  j                  t        j                  |t        j                         |j/                               }|dk(  r
t        j                  t        j2                  fv rPj%                         t        j                  t        j2                  fv r j%                         
k7  r't5        
      nj%                         
k(  sJ d       j%                          j%                         nd g }t6        j8                  st6        j:                  r|dv rt=        | ||
      ^ }}} }t        t        j&                  j)                        t        j*                        rt        j?                         j@                        d
k(  rZt        t        j&                  j)                  |      t        j*                        r"t        jB                  t        jD                  t        j                  j,                  |j/                                  t        j                  j,                  |j/                                  rtG        || |      rt        j                  j,                  |j/                            }|jI                         }tK        ||      \  !	
 !fd}tM        jN                  ||	| ||gn	| ||gd u|g dng d       t        |      d
k(  s
tQ               rRtS        	
||||
      }d |d<   |jU                   "jV                  	| ||fn	| ||f|fi |       |j/                         t        j                  j,                  v sJ d d d d}d |d<   tY        d|	| ||gn	| ||g||      \  }}t        |jZ                  jZ                  t        j\                        r|dk(  r|jZ                  jZ                  j^                  j?                         k(  rt        j
                  ja                  t	        j\                  t	        jb                  t	        jd                  j?                                     |jZ                  jZ                  jf                  |jZ                  jZ                  jh                  |jZ                  jZ                  jj                  |jZ                  jZ                  jl                              }t        |      dkD  r)|dv r%t        |g |d d |j                         d         }|S ) NrQ   )r   r6   rR   r   r   r/   c              3   &   K   | ]	  }|d k(    ywr   r   r   s     r,   r-   zEregister_onednn_fusion_ops.<locals>.qlinear_binary.<locals>.<genexpr>  r   r   r   r   r   r   r   r6   zCdtype of accum for qlinear post op sum should be the same as outputr   c                   	
 t         j                  t         j                  t         j                  t         j                  fv sJ | j                         j                         j                         d rJ j                         j                         j                         
j                         d j                         
fd}t        j                  | j                         t         j                  || j                               }dk7  rt        |      }t         j                  k(  rM|j                         fd}t        j                  |j                         ||j                               }|S t         j                  t         j                  fv rddlm |j                         		fd}t        j                  |j                         t         j                  t        j                  |t!              t#              	      |j                               }|S )
Nc           	          |       } |       }d }d }d }| d   f}s d      } d      } |      }t        j                  |t        j                        } |      }d }rJ  |      }t	        ||||||      }	y |      }
t        j                  t        j
                  fv sJ t        j
                  k(  r$t        j                  |
t        j                        }
t        j                  |	|
      }	t        j                  t        j
                  fv sJ t        j
                  k(  r$t        j                  |t        j                        }t        j                  |	|      }	|	S r   r   )r   rA   _x2rC   rD   rE   r   rB   rF   rK   r   rq   r   r   r   r:   r   r   x2_dtype	x2_loaderr   r   r   s              r,   r   z^register_onednn_fusion_ops.<locals>.qlinear_binary.<locals>.epilogue_creator.<locals>.inner_fn|  s`   $0$7E"+E"2C'+H$(E'+H49"I<0#B+9"+=(3B+9:N+O$'LL$FE,ABV,WM)-J>'7'C C'C-=>R-S
#J ? % - ( % ( *$D  $/(34H(I'1emmU^^5T'T T'T#-#?,/LL,NE'*wwtU'; $,u~~/N#NN#N'5>>9&)ll3&F#&774#5D#'Kr@   r   r   r   c                 @     |       }t        j                  |      S rH   r   r  s     r,   r  zrregister_onednn_fusion_ops.<locals>.qlinear_binary.<locals>.epilogue_creator.<locals>.inner_fn_cast_output_to_bf16  r  r@   r   r  c                     |       } 	d|z  |t         j                        \  }}t        j                  ||z        |z   }
t         j                  k(  r 	ddt         j                        \  }}n 	ddt         j                        \  }}t        j
                  t        j                  ||      |      }t        j                  |t         j                        S r  r  r  s            r,   r  zfregister_onednn_fusion_ops.<locals>.qlinear_binary.<locals>.epilogue_creator.<locals>.inner_fn_requant  s    (<U(C8I$'%K5==9" 5	: '*ii	0A&BZ&O#/5;;#>1B()3emm2&JD$ 2C(,c2&JD$ +.++ckk#t6Ld*S'*||GU[['I Ir@   r  r  ) r#  r   r$  r  r  r  r   r   r  r  r   r   rH  r   r   r   rq   r   r%  r   r   r   r   r   r:   r   r>   rA  rG  r   r9   r   s         @@@@@@@@@@@r,   rW   zLregister_onednn_fusion_ops.<locals>.qlinear_binary.<locals>.epilogue_creatord  s	   +!MM!NN!KK!JJ	0      (4'?'?'A$&NN$4	0>0J0J0L-+/(:#,#88#8/8/D/D/F,)0)<)<)>)0)<)<)>&*&6&6&8&*+*.*:*:*<K.( .(` &(\\#/#:#:#<"'--%-#/#8#8#:	&
 &/)B * *(5*9	*J (5>>91;1G1G1I.I *,'1'E'E'G&2)E'1':':'<	*JN  *)C *ekk5::-FFC3=3I3I3K0J" *,'1'E'E'G&+kk)2):):$4*/./2</@*"
 (2':':'<	*J  *)r@   )r   r   r   rQ   r&  r'  r(  )   r   r   r   rQ   r&  r'  r(  r)  )
r*  r+  r   other_scaleother_zpbinary_post_opr   unary_post_opunary_post_op_argsunary_post_op_algorithmrq   c                 X    t         j                  j                  | j                            S rH   r   r   s    r,   r   zDregister_onednn_fusion_ops.<locals>.qlinear_binary.<locals>.<lambda>  r   r@   c                 X    t         j                  j                  | j                            S rH   r   r   s    r,   r   zDregister_onednn_fusion_ops.<locals>.qlinear_binary.<locals>.<lambda>  r   r@   c                 X    t         j                  j                  | j                            S rH   r   r   s    r,   r   zDregister_onednn_fusion_ops.<locals>.qlinear_binary.<locals>.<lambda>  r   r@   )r   r&  r'  c                 X    t         j                  j                  | j                            S rH   r   r   s    r,   r   zDregister_onednn_fusion_ops.<locals>.qlinear_binary.<locals>.<lambda>  r4  r@   rK  qlinear_binaryr   )r#   rT   )rT   inputsmake_kernel_rendertemplatechoice)7r[   r\   r   r$   r   r   r   r8   r   r&   r4   r5   r   r   r8  r3   r   r   r   r:  r;  r)   r'   r%   r7   r   r   r   r]   r^   r   
get_layoutsizer<  r=  r   r>  r?   r   re   r   ra   r   r   r   r#   CppTemplateBufferrT   rk   NonOwningLayoutReinterpretViewrW  rX  rY  rZ  )#rM   r   r   r   r   r   rA  rq   r%  r   r   x2_scalex2_zpr   r   r   r   r   rT   ro   x2_sizer?  rr   rs   r   rW   rt   r   rv   r   r:   r>   rG  r9   aten_mkldnn_qlinear_binarys#    `` ` `````    ```           @@@@@r,   rV  z2register_onednn_fusion_ops.<locals>.qlinear_binary  s!   6 ZZ\FkkmGv;#g,...6{Q;.#@R,-"r72;/0gr||4G}---''55LL>Y 6  !>7+;+;+=>> #7B/G7++-.&8T:TT8|ww22LL%++6V 3  |ww22LL%++6V 3  dBLL1DzS(((ww22LLU[[9 3  
 OOLLN~~5;;.:88>!!4  gg//@CCEKKPww22LLEKK@t}} 3  e#MMNN$  lln(GG||~5
 &b,7<<>\9 ]9 ||~H-1-=)4J*,G##v'?'?[ U F 4;}b<40FA}b @@F)) DOO-223q8"@@F)) (():):4==?)KL))$--/: .faG ww001G1G1IJH'002H
 1 %	7&!G* G* G*R $//< GT='4L$wbRVW!%T!1)9  < '<5 7|q $9$;!(&2!- ("#.!&",'4,; <%)F6N3.33< GT='4L$wbRVW	
 ! !))+qww/@/@@@@<<<M
 #La 1 < GT='4D$wb$O+IFA 6;;++R-A-AB5(KK$$++r}}> ,,((!11..Br}}O   &{{//66+1;;+;+;+N+N!'!1!1!:!:%{{//66
 6{Q;.#@f&Ks&KV__5Fr5J&KLMr@   zmkl::_mkl_linearrS   packed_worig_wc                j   g }t         j                  st         j                  rMt        |ddg      }t	        | ||      ^ }}} }t        || |      rt        j                  ||| ||gdddg       t        |      dk(  s
t               r'|j                  j                  | ||f|d |             |j                         t        j                  j                  v sJ |j                         t        j                  j                  v sJ d d	 d
}	t!        d|| ||g||	      \  }
}|t#        |
|      }
|
S )Nr   r   rS   TrQ   )rV   r   )r   
batch_sizec                 X    t         j                  j                  | j                            S rH   r   r   s    r,   r   zGregister_onednn_fusion_ops.<locals>.mkl_packed_linear.<locals>.<lambda>h      !2!21::<!@ r@   c                 X    t         j                  j                  | j                            S rH   r   r   s    r,   r   zGregister_onednn_fusion_ops.<locals>.mkl_packed_linear.<locals>.<lambda>i  ri  r@   )r   rQ   packed_linearr   )r   r]   r^   r   r   r   r   re   r\   r   r   r   r%   r   r&   r'   r   r   )rM   rd  re  rO   rg  rT   rr   r   rs   r   rv   aten_mkl_linears              r,   mkl_packed_linearz5register_onednn_fusion_ops.<locals>.mkl_packed_linearC  sM    /1&&&*B*B#*6Aq6#:L29<3/Q< -VQE'33#"&1$(+,a& w<1$(=(?NN',,&16Tj -   ((*agg.?.????(AGG,=,==== A@!
 6#&)"/	 = ^Fr@   rH   )%r5   _C_has_mkldnn r}   r   r   mkldnn_linear_pointwiseLinearUnaryrk   binaryLinearBinaryonednnqlinear_pointwiseQLinearPointwisePT2EQLinearPointwiseBinaryPT2E_convolution_pointwise_convolution_pointwise_ _convolution_transpose_pointwiser   r   defaultqconv_pointwiser   r   boolrg   r   qconv2d_pointwisebinary_tensorhas_mklmkl_mkl_linearMKLPackedLinearr   r   )cpu_needs_realized_inputsr   r   r   r   r   r   r   r   r   r6  rV  rm  rl  r   r   rc  r@  r}   s                @@@@@@r,   register_onednn_fusion_opsr     s   xx#5II..'!$0077	$
  %7II..55'!$1188	%
! %7II..'!$99@@	%
! &8II..55'!$??FF	&
" II33II44II==II..!!))II,,
 	" 
599++BB	C			 	 
D	6 
599++BBII	J			 	 		 
K	B 
599++CCJJ	K			 	 		 
L	B 
599++==	> A	A	A	 A	 
?A	F 
599++==DD	EQU=	=	&=	+4=	9B=	 
F=	~ 
599++LL	M			 	 
N	: 
40088	9&	&	&	 &	 	&	
 &	 &	 &	 &	 c&	 &	 &	 &	 &	  &	 &	  !&	 
:&	P 
599++;;QU	V<	<	 %	<	
 <	 <	 
W<	| 
II..554

 
II..<<RV

Q	Q	 %	Q	
 Q	 Q	 Q	



Q	f 
599++==SW	X l	l	 %	l	
 l	 l	 l	 
Yl	\	 
II..554

 
II..<<RV

, 'T	T	 %	T	
 T	 T	 T	 T	



T	l
 880		))" %(88??	O &,,UYY]]-F-FGuyy}}889 11#1 "1 t#	1 :1f 	"";<c% r@   )NNNN)-r!  r5   torch.utils._pytreeutils_pytreer    torch._inductor.kernel.mm_commonr   rp  r   r   codegen.cpp_gemm_templater   !codegen.cpp_grouped_gemm_templater   codegen.cpp_utilsr
   r   r   r   r   r   r   r   r   r   select_algorithmr   r   r   r   r   virtualizedr   r   r   Tensortupler  r?   rL   rg   r{   _inductor_lowering_functionr  r   r@   r,   <module>r     sc     $ $ 4  6 E 8    
 @ ) )-ll-<<- \\- ,,	-
 \\- LLLL4-`.%).. . o	.
 d?. o. 4. .j 
??I? I?D 59  1r=r@   