
    9j^                        d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlZd dl	Z	d dl
mc mZ d dlmZ d dlmZ d dlmZmZmZ d dlmZ d dlmZmZmZmZmZmZmZ d d	lm Z  d d
l!m"Z" d dl#m$Z$ d dl%m&Z&  ejN                  e(      Z)de	jT                  jV                  de,e   de-defdZ. G d dej^                        Z0 G d de      Z1y)    N)Callable)Any)counters)ir)SubgraphCPUBenchmarkRequestSubgraphGPUBenchmarkRequest
TensorMeta)KernelTemplate)BufferFixedLayoutget_free_symbolsget_symbolic_inputsgm_original_output_stridesir_node_to_tensorLayout)benchmarker)do_bench_using_profilingV)
OrderedSetgminputsnamereturnc                     ddl m} t        j                  j                  }	 | t        j                  _         || |      |t        j                  _        S # |t        j                  _        w xY w)a.  Inline a subgraph by converting its FX operations to individual IR nodes.

    This converts a subgraph to multiple ComputedBuffer nodes (fusable),
    enabling epilogue fusion with subsequent operations.

    Returns:
        TensorBox containing the final operation result as individual IR nodes
    r   )process_subgraph_nodes)torch._inductor.loweringr   r   graphmodule)r   r   r   r   original_modules        `/media/conek/DATA/Code/OCR/venv/lib/python3.12/site-packages/torch/_inductor/codegen/subgraph.pyinline_subgraph_to_ir_nodesr"   &   sH     @ ggnnO)%b&1(s   A A,c                       e Zd ZdZ	 ddedee   dededede	f   d	e
eee	gej                  f   f   dz  d
df fdZd
ee   fdZdede	f   de
ee	f   d
dfdZd
efdZd
e	fdZd
eez  fdZed
eez  fd       ZddZdee	   dej                  d
efdZdee	   dej                  d
dfdZd
efdZd
ej<                  fdZd
e
ee	f   fdZ d
efdZ! xZ"S )SubgraphChoiceCallerz
    Represents a Subgraph Autotuning choice, and the subgraph can be any arbitrary
    GraphModule. Compiles the Subgraph down to a module for benchmarking.
    Nr   input_nodeslayoutdescriptionmake_fx_graph.input_gen_fnsr   c                     t         
|   ||||       g }g | _        t        j                  5  t        | j                        D ]  \  }}	t        t        |	j                         d            dk(  sJ t        t        |	j                         d            dk(  sJ |	j                  j                          |j                  t        |	             |)||v r%| j                  j                   ||   |	             | j                  j                  t        |	d              	 d d d         || | _        t!        | j                         || _        t%        | j                        | _        | j)                         | _        d | _        i | _        i | _        d | _        d | _        t6        j8                  rDt        j                  5  | j;                         | _        | j=                         | _        d d d        y y # 1 sw Y   xY w# 1 sw Y   y xY w)NT)unbacked_onlyr   )replace_symbols_with_hints)super__init__benchmark_inputsr   	fake_mode	enumerater%   lenr   get_size
get_stridedatafreeze_layoutappendr   r   r   example_inputsr   
sym_inputs_compute_sym_input_valuessym_input_valuesdecompositiondecomposition_kwargsconfig_patches_compiled_module_bmreqconfigpipeline_max_autotune_gemm_compile_for_benchmarking_create_benchmark_request)selfr   r%   r&   r'   r(   r)   trace_inputsiinp	__class__s             r!   r.   zSubgraphChoiceCaller.__init__B   s    	{FK@  "[[ 	#D$4$45 3+CLLN$OPTUUUU+CNN,<DQRVWWWW&&( ##$5c$:; !,m1C))001Aq1A#1FG))00)#$O	(  ."477+*-d.>.>? $ > > @ 9=46!.0%)  	 ,, ?(,(F(F(H%"<<>? ? -U	 	V? ?s   C5G8+H8HHc           	      h   t        | j                  D cg c]  }t        |d      s|j                   c}      }i }t	        | j
                  | j                        D ]  \  }}t        |t        j                        s!t	        |j                         |j                        D ]]  \  }}t        |t        j                        rt        |      ||j                  <   9t        |      |v sGt        |      |t        |      <   _  g }| j                  D ]  }	t        |	t        j                        r-|	j                  |v r|j!                  ||	j                            Jt"        j$                  j&                  j(                  j+                  |	d      }
|j!                  t        |
              |S c c}w )a)  Extract concrete dimension values for sym_inputs from benchmark_inputs.

        The compiled module expects symbolic dimension values as runtime arguments.
        This maps each symbolic variable to its concrete value from the benchmark tensors.
        Used for range based autotuning.
        r      )fallback)r   r9   hasattrr   zipr%   r/   
isinstancetorchTensorr3   shapesympySymbolintstrr7   r   r   sizevars	shape_envoptimization_hint)rE   ssym_input_namessym_name_to_valueinp_nodebenchmark_inpsym_dim
actual_dimresultsym_varhints              r!   r:   z.SubgraphChoiceCaller._compute_sym_input_values   sj    %!__C60BQVVC

 -/'*4+;+;T=R=R'S 	J#Hm-6+.%%')<)<, J'GZ "'5<<8:=j/)',,7W8:=j/)#g,7J	J  	)G'5<<0W\\EV5V/=>ww''11CCGVWCXc$i(	) - Ds
   F/F/r<   kwargsc                      || _         || _        y)zHCache decomposition function and kwargs for range-based dispatch lookup.N)r<   r=   )rE   r<   rd   s      r!   cache_decompositionz(SubgraphChoiceCaller.cache_decomposition   s     +$*!    c                 "    d| j                    dS )NzSubgraphCaller()r   rE   s    r!   __str__zSubgraphChoiceCaller.__str__   s     1--rg   c                 @   ddl m} | j                  j                  dd      j                  dd      }| j                  r| j
                  n| j                  }t        j                  d| j                  | j                         | j                  J  || j                  |t        j                  j                  t        j                  j                  t        j                  j                  t        j                  j                  t        j                  j                   t        j                  j"                  d| 		      }| j                  D ]@  }||j$                  |j                  <   |j&                  j)                  |j                         B t        j*                  |      5  d
d
dd
d
d| j,                  }t/        j0                  |      5   |j2                  |  |j5                         cddd       cddd       S # 1 sw Y   nxY w	 ddd       y# 1 sw Y   yxY w)zCCompile the subgraph for benchmarking, returns the compiled module.r   )GraphLoweringz::_.z#Benchmark compile %s: sym_inputs=%sN
benchmark_)	r   r8   rX   cpp_wrapperaot_modeextern_node_serializeris_inferenceis_backwardr   FATEN)max_autotunemax_autotune_gemmmax_autotune_gemm_backendsbenchmark_fusionrB   )torch._inductor.graphrn   r   replacer9   r8   r/   logdebugr   r   r   
_shape_envrr   rs   rt   ru   rv   graph_inputsgraph_input_namesr7   set_graph_handlerr>   rA   patchruncompile_to_module)rE   rn   	safe_namecompile_inputsbm_graph_loweringsym_inpbenchmark_configs          r!   rC   z.SubgraphChoiceCaller._compile_for_benchmarking   s   7II%%dC088cB	 $(??D8M8M 	 			7DOOTww""")ww)gg((++WW%%#$77#A#A--++i[)

  	EG;B**7<<8//66w||D	E   !23 	= !&%*.4$).30 %%0 ./ =%!%%~6(::<= =	= 	== = =	= 	= 	=s$   "*HG>+	H>H	HHc           	         | j                   J d       t        j                  | j                        }t        j                  | j                        }| j                  j
                  j                  dk(  rt        }nt        } || j                  ||t               | j                   j                  | j                   j                  | j                        S )z0Create a benchmark request for async autotuning.z9Module must be compiled before creating benchmark requestcpu)kernel_nameinput_tensor_metaoutput_tensor_meta
extra_argsmodule_pathmodule_cache_keyr;   )r?   r	   from_irnodesr%   r&   devicetyper   r   r   tuple__file__keyr;   )rE   r   r   	bmreq_clss       r!   rD   z.SubgraphChoiceCaller._create_benchmark_request   s     $$0 	
G	
0 '33D4D4DE'44T[[A;;""e+3I3I		/1w--66!2266!22
 	
rg   c                 @    | j                   J d       | j                   S )z`Benchmark request for async autotuning. Pre-compiled when pipeline_max_autotune_gemm is enabled.zMbmreq accessed but pipeline_max_autotune_gemm was not enabled during __init__)r@   rk   s    r!   bmreqzSubgraphChoiceCaller.bmreq   s*    
 {{& 	
[	
& {{rg   c                 H    | j                   | j                         | _         yy)zKEnsure the module is compiled. Used for lazy compilation in non-async path.N)r?   rC   rk   s    r!   _ensure_compiledz%SubgraphChoiceCaller._ensure_compiled   s$      ($($B$B$DD! )rg   argsoutc                X   | j                          | j                  j                  | j                  dt        ffd}| j
                  rt        j                  |      S t        j                  rt        |      S t        j                  |t        j                  g        S )z>Regular benchmarking: compile if needed, then use benchmarker.r   c                       g        S N )r   bm_funcr9   s   r!   fnz*SubgraphChoiceCaller.benchmark.<locals>.fn
  s    /Z/$/00rg   )r   )r   r?   callr;   r   _benchmark_with_cudagraphsr   benchmark_gpu_with_cuda_graphrA   /profile_bandwidth_with_do_bench_using_profilingr   	benchmarkinfer_device)rE   r   r   r   r   r9   s     ` @@r!   r   zSubgraphChoiceCaller.benchmark  s    '',,**
	1C 	1 **<<R@@AA+B//$$++?Z?$?
 	
rg   c                v    | j                          | j                  j                  g | j                  |       y)zFRun once for collective benchmarking (barrier sync handled by caller).N)r   r?   r   r;   )rE   r   r   s      r!   benchmark_collectivez)SubgraphChoiceCaller.benchmark_collective  s3    ""#BT%:%:#BT#BCrg   c           
         | j                   J dj                  | j                  j                  dd      d   g| j                  D cg c]  }t        |j                                c}| j                  D cg c]  }t        |j                                c}t        | j                   j                              S c c}w c c}w )N-ro   rK   r   )	r   joinr   rsplitr%   rV   r3   r4   r   )rE   rH   s     r!   hash_keyzSubgraphChoiceCaller.hash_key  s    ww"""xx		  a(+151A1AB##clln%B 483C3CDC#cnn&'D DGGMM"	
 	
 CDs    B?
8 C
c           
      4   | j                   J t        j                  j                  t        j                  | j
                  | j                  | j                   | j                  | j                  | j                  r| j                              S d             S )N)r&   r%   r   r8   subgraph_namer>   )
r   r   	TensorBoxcreateSubgraphBufferr&   r%   r8   r   r>   rk   s    r!   output_nodez SubgraphChoiceCaller.output_node'  s    ww"""||""{{ ,,77#22"ii6:6I6It22	
 		
 PT	
 		
rg   c                      d| j                   dS )zRInformation returned here is logged to the autotune log file when that is enabled.subgraph)backendr   rj   rk   s    r!   	info_dictzSubgraphChoiceCaller.info_dict4  s     "99
 	
rg   c                      d| j                    S )N	subgraph_rj   rk   s    r!   autoheuristic_idz%SubgraphChoiceCaller.autoheuristic_id;  s    499+&&rg   r   )r   N)#__name__
__module____qualname____doc__rV   listr   r   r   r   dictrU   rP   rQ   r.   r:   rf   rl   rC   r   r   rD   propertyr   r   floatr   r   r   r   r   r   r   r   __classcell__rI   s   @r!   r$   r$   <   s    JN@?@? &\@? 	@?
 @?  S)@? C3%*=!>>?$F@? 
@?D49 @+%c3h/+9=c3h+	+. .,=3 ,=\
	$'B	B
2 	$'B	B E

tCy 
u|| 
 
&D$s) D%,, D4 D
	
# 	

R\\ 

4S> 
'# 'rg   r$   c                   Z    e Zd ZdZ ej
                         Zdef fdZ	 	 ddede	e
   dededef   d	ed
eeeegej"                  f   f   dz  dedefdZ	 	 	 ddede	edef      de	e
   de	eeef      dedef   dz  d
eeeegej"                  f   f   dz  de	eeef      dz  de	e   fdZdedef   deeef   defdZdeeef   ddfdZdede	edef      de	e   ddfdZ	 	 dde	e
   dedef   deeef   dedef   dz  d
eeeegej"                  f   f   dz  defdZ xZS )SubgraphTemplatea  
    A template for subgraph evaluation to be used in autotuning.

    This class allows creating customized subgraphs that can be appended
    as choices during the autotuning process, enabling the selection of
    optimal implementations for complex operations.
    r   c                 &    t         |   |       y)z
        Initialize a subgraph template.

        Args:
            name: The name of this template
            graph: The FX graph
        rj   N)r-   r.   )rE   r   rI   s     r!   r.   zSubgraphTemplate.__init__J  s     	d#rg   Nr%   r&   r(   .r'   r)   rd   r   c                 \    t        | dt        t        j                         |||||      S )a  
        Generate a SubgraphChoiceCaller instance for autotuning.

        Args:
            name: The name for this subgraph choice
            input_nodes: List of input nodes to the subgraph
            layout: Memory layout information for the output
            make_fx_graph: Callable that creates the FX graph for this subgraph
            description: Optional description of this choice
            input_gen_fns: Optional dict mapping input indices to tensor generators
            **kwargs: Additional keyword arguments

        Returns:
            SubgraphChoiceCaller: A callable object that can be used for autotuning
        ro   )r   r%   r&   r'   r(   r)   )r$   nextr   index_counter)rE   r   r%   r&   r(   r'   r)   rd   s           r!   generatezSubgraphTemplate.generateW  s;    4 $64 0 > >?@A##''
 	
rg   decompositionsnon_tensor_argsdefault_implconfig_patches_listc                 8   |sg S t        |      t        |      k(  s J dt        |       dt        |       d       ||D cg c]  }i  }}t        ||      D 	
cg c]  \  }	}
| j                  ||	|
||       }}	}
| j                  |||       |d   }g }t        |||      D ]  \  }	}}ddlm} |	|ddt        dt        d	t        f   d
t        t        t        f   dt        fd}| j                  |	|      }	 | j                  | d| |||d|	j                   |      }|j!                  |	|       ||_        |j%                  |        |S c c}w c c}
}	w # |$ r7 t        j                  d|	j                         t        d   dxx   dz  cc<   Y w xY w)a  
        Generate multiple SubgraphChoiceCaller instances for custom op autotuning.

        This method extends SubgraphTemplate to support custom op decompositions,
        allowing multiple implementations to compete in autotuning.

        Args:
            name: Base name for the choices
            decompositions: List of decomposition functions to compete in autotuning
            input_nodes: List of tensor inputs. All tensor arguments must be passed here.
            non_tensor_args: List of non-tensor kwargs only, one dict per corresponding decomposition.
            default_impl: Default implementation for layout inference
            input_gen_fns: Optional dict mapping input indices to tensor generators
            config_patches_list: Optional list of config patches per decomposition

        Returns:
            List of SubgraphChoiceCaller instances for autotuning
        z>decompositions and non_tensor_args must have same length, got z decompositions and z kwargsr   )_ShapeEnvGuardError)decompdecomp_kwargsr   r   .r   r   c           	         ddl m} ddlm}  |       }t        j
                  j                  }||j                         nt        j                         }|5    |t        j                  | fi ||d      | cd d d        S # 1 sw Y   y xY w)Nr   )make_fx   )select_decomp_tablesymbolic)decomposition_tabletracing_mode)"torch.fx.experimental.proxy_tensorr   r<   r   r   r0   rX   error_on_new_guards
contextlibnullcontext	functoolspartial)r   r   r   r   r   r   rX   	guard_ctxs           r!   r(   zBSubgraphTemplate.generate_custom_op_choices.<locals>.make_fx_graph  s     G?&9&;#KK11	
 !, 113#//1 
  7!))&BMB,?%/ 	  s   "BBro   z	CustomOp )r   r%   r&   r(   r'   r)   z5Skipping decomposition %s: adds guards during tracinginductorcustom_op_decomp_guard_skipsrK   )r2   rN   _infer_custom_op_layout_validate_layout_equivalence%torch.fx.experimental.symbolic_shapesr   r   r   r   rV   _generate_variant_namer   r   r~   infor   rf   r>   r7   )rE   r   r   r%   r   r   r)   r   ro   r   rd   layoutsr&   choicesr   r>   r   r(   variant_namechoices                       r!   generate_custom_op_choicesz+SubgraphTemplate.generate_custom_op_choicesz  s   8 I>"c/&:: 	
~&'';C<P;QQXZ	
: &/=">!2">"> #&no"F	
  ((VV\=
 
 	))$H.058O-@6
 ;	#1FM>
 R .40= c*  $CH~ 	8  66v}ML 6<.1 +!"/"+FOO+< ="/ ' " &&v}=$2F!NN6"w;	#z Y #?
v ' KOO $%CDIDs   	EE;)E9FFr   c                     ddl |j                  }|s|S dt        dt        ffddj	                  fdt        |j                               D              }| d| S )zLGenerate a descriptive name for a decomposition variant with its parameters.r   Nvr   c                 x    t        |       }j                  dd|      }|r|d   j                         rd|z   }|S )z7Convert a value to a valid Python identifier component.z[^a-zA-Z0-9_]ro   r   )rV   subisdigit)r   rZ   res     r!   sanitize_valuez?SubgraphTemplate._generate_variant_name.<locals>.sanitize_value  s<    AA'a0AQqT\\^!GHrg   ro   c              3   >   K   | ]  \  }}| d  |         yw)ro   Nr   ).0kr   r   s      r!   	<genexpr>z:SubgraphTemplate._generate_variant_name.<locals>.<genexpr>  s+       
+/1aqc>!$%& 
s   )r   r   r   rV   r   sorteditems)rE   r   rd   	base_nameparam_suffixr   r   s        @@r!   r   z'SubgraphTemplate._generate_variant_name  sh     	OO		c 	c 	 xx  
39&,,.3I 
 
 Al^,,rg   c                     |j                         D ]<  \  }}t        |t        j                  t        f      s'J d| dt        |       d        y)z8Validate that kwargs contains only non-tensor arguments.zkwargs['z'] contains tensor zo. Tensor arguments should be in input_nodes, not kwargs. Only scalar/non-tensor parameters should be in kwargs.N)r   rO   rP   rQ   r   r   )rE   rd   r   values       r!   _validate_non_tensor_kwargsz,SubgraphTemplate._validate_non_tensor_kwargs  sZ     ,,. 	JC!%%,,)?@ 3%24;- @I J@	rg   op_namer   c                 ,   |sy|d   }t        |dd d      D ]  \  }}|j                  |j                  |j                  |j                  f|j                  |j                  |j                  |j                  fk7  sdt        d| d||   j                   d|j                   d|j                   d|j                   d|j                   d	|d   j                   d|j                   d|j                   d|j                   d|j                   d
       y)zXEnsure all layouts have consistent stride, device, dtype, and sizes for fair autotuning.Nr   rK   )startzLayout mismatch in custom op 'z': decomposition 'z' produces (z, z) but 'ri   )r1   r   dtypesizestrideAssertionErrorr   )rE   r  r   r   	referencerG   r&   s          r!   r   z-SubgraphTemplate._validate_layout_equivalence  s(    AJ	"712;a8 	IAvv||V[[&--H    	M  %4WI >&&4Q&7&@&@%A BbbR W*1-667 8!(()IOO+<By~~>NbQZQaQaPbbc	e 	rg   function_decompositionc           	      ,   ddl }ddlm} | j                  |       |j                  5  g }t        |      D ]  \  }	}
|r|	|v r ||	   |
      }n|
j                         }|j                  j                  j                  |      }|
j                         }|j                  j                  j                  |      }t        j                  |||
j                         |
j                               }|j                  |         |j                   |fi |} || }t#        |t        j$                        sJ dt'        |       d       t)        |j*                  |j,                  |j.                  |j1                               cddd       S # 1 sw Y   yxY w)zInfer output layout for custom ops using the default implementation when available.
        Note that the Subgraph assumes custom ops return exactly one tensor output.
        TODO: Add support for multiple output custom ops.
        r   Nr   )r	  r   z#Expected single tensor output, got z:. Multi-output custom ops not yet supported in autotuning.)r   r	  r
  r  )r   torch._inductor.virtualizedr   r  r0   r1   r3   r   rW   optimization_hintsr4   rP   empty_strided	get_dtype
get_devicer7   r   rO   rQ   r   r   r   r	  rR   r  )rE   r%   r  rd   r   r)   r   r   r8   rG   rH   fake_tensor	raw_shapeconcrete_shape
raw_strideconcrete_strider   outputs                     r!   r   z(SubgraphTemplate._infer_custom_op_layout*  ss    	1 	((0[[  	N#K0 33 Q-%7"2-"23"7K #I%&WW%5%5%H%H%SN!$!1J&'gg&6&6&I&I*&UO"'"5"5&'!mmo"~~/	#K %%k23  #""#9DVDB(F fell3 5d6l^ DK L3
 }}ll\\}}	7 	  	  	s   EF

F) N)NNN)NN)r   r   r   r   	itertoolscountr   rV   r.   r   r   r   r   r   r   rU   rP   rQ   r$   r   r   r   r  r   r   r   r   s   @r!   r   r   ?  s    $IOO%M$$& IM!
!
 &\!
 	!

  S)!
 !
 C3%*=!>>?$F!
 !
 
!
R 37IM;?rr Xc3h/0r &\	r
 d38n-r sCx(4/r C3%*=!>>?$Fr "$sCx.1D8r 
"	#rh-sCx(-26sCx.-	-2$sCx. T  Xc3h/0 f	
 
> 37IM3&\3 !)c 23 S#X	3
 sCx(4/3 C3%*=!>>?$F3 
3rg   r   )2r   r   r  loggingcollections.abcr   typingr   rS   rP   torch._inductor.config	_inductorrA   torch._dynamo.utilsr   torch._inductorr    torch._inductor.autotune_processr   r   r	   torch._inductor.codegen.commonr
   torch._inductor.irr   r   r   r   r   r   r   $torch._inductor.runtime.benchmarkingr   torch._inductor.utilsr   r  r   torch.utils._ordered_setr   	getLoggerr   r~   fxGraphModuler   rV   r"   ChoiceCallerr$   r   r   rg   r!   <module>r/     s        $    ' ' (  
 :   = : ) / g!))&*3i)7:)),@'2?? @'F^~ ^rg   