
    9j                       d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZ	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZmZmZ d dlmZmZmZ d dlmZmZmZmZ d dlm Z m!Z!m"Z" d dl#Z#d dl$Z#d dl%m&Z& d dl'm(Z( d d	l)m*Z* d d
l+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1 d dl2m3Z3 d dl4m5Z5m6Z6m7Z7m8Z8m9Z9 d dl:m;Z; d dl<m=Z=  e>e
j~                  j                  dd            ZAe"rd dlBmCZC d dlDmEZEmFZFmGZG ddlHmIZI ddlJmKZK ddlLmMZM dZN e;eOd      ZP G d deQ      ZR G d d      ZS G d d      ZTe*j                  e*j                  z  ZWej                   G d d              ZYej                   G d! d"             ZZ G d# d$eZ      Z[ G d% d&      Z\ G d' d(      Z] G d) d*eZ      Z^ G d+ d,e\e^      Z_ G d- d.e]e^      Z` G d/ d0eZ      Za G d1 d2e\ea      Zb G d3 d4e]ea      Zc G d5 d6eZ      Zd G d7 d8e\ed      Ze G d9 d:e]ed      Zf G d; d<e\eZ      Zg G d= d>e]eZ      Zh G d? d@e\eZ      Ziej                  dLdA       Zk	 	 	 	 dMdBZl G dC dD      ZmdNdEZndOdFZo	 	 	 	 dPdGZp G dH dI      Zq G dJ dK      Zry)Q    )annotationsN)CallableIterableSequence)FutureProcessPoolExecutorThreadPoolExecutor)byrefc_size_tc_void_pCDLL)AnyIOTYPE_CHECKING)get_interface_for_device)rand_strided)ir)CppCodeCacheCUDACodeCache
DLLWrapperget_hashPyCodeCacheXPUCodeCache)Timer)do_bench_using_profilingget_gpu_typeget_ld_library_pathis_gpupython_subprocess_env)getArtifactLogger)
OrderedSet.TORCHINDUCTOR_AUTOTUNE_POOL_INACTIVITY_TIMEOUT600)
ModuleType)ChoiceCallerPartialRenderTritonTemplateCaller   )config)benchmarker)VCUDA_VISIBLE_DEVICES
autotuningc                      e Zd Zy)!NonzeroWorkspaceNotSupportedErrorN__name__
__module____qualname__     `/media/conek/DATA/Code/OCR/venv/lib/python3.12/site-packages/torch/_inductor/autotune_process.pyr/   r/   K       r5   r/   c                      e Zd ZdZedd       Ze	 d	 	 	 	 	 	 	 dd       Zedd       ZddZd Z	ddZ
ddd	Zddd
ZdddZddZddZddZddZy)TuningProcesszF
    Class to launch and interact with a benchmarking subprocess.
    c                     t         j                  dt        j                         t        j                  j                  t                      fd}	  |        y# t        $ r Y yw xY w)z4
        Entry point for the child process.
        z3Started autotune subprocess %s. Visible devices: %sc                     	 t         j                        \  } }| y 	 |rt        j                  j	                  |        |        }t         j                  |       [# t
        $ r}|}Y d }~'d }~ww xY wN)r9   recvosenvironupdate	Exceptionsend)job	extra_envresulte	read_pipe
write_pipes       r6   workloopz,TuningProcess.process_main.<locals>.workloop_   sq    !.!3!3I!>Y; 

)))4 UF ""6:6  ! Fs   (A 	A3'A..A3N)autotuning_logdebugr>   getpidr?   getr,   EOFError)rG   rH   rI   s   `` r6   process_mainzTuningProcess.process_mainT   sQ    
 	AIIKJJNN/0	
	7	J 		s   A 	A('A(Nc                T    t        j                  | |f|       |j                          y r<   )pickledumpflush)objrH   rD   s      r6   rB   zTuningProcess.sends   s#     	S)$j1r5   c                ,    t        j                  |       S r<   )rQ   load)rG   s    r6   r=   zTuningProcess.recvz   s    {{9%%r5   c                2    || _         | j                          y r<   )devicestart)selfrX   s     r6   __init__zTuningProcess.__init__~   s    

r5   c                   t         j                  j                  t         j                  j                  t              d      }t        j
                         \  }}t        j
                         \  }}t        j                  |d      | _        t        j                  |d      | _        t        j                         | _        | j                  j                  | j                  t        j                         t        j                  |dt        j                           dt#        |       dt#        |       g}i t%               dt'               t(        j*                  rdndd	}| j,                  t#        | j,                        |t.        <   t1        j2                  ||||f      | _        t        j6                  |       t        j6                  |       d| _        y
)z4
        Start the benchmarking subprocess.
        z__autotune_main__.pywbrbz	--parent=z
--read-fd=z--write-fd=01)TORCH_WARM_POOLLD_LIBRARY_PATH3TORCHINDUCTOR_PROFILE_WITH_DO_BENCH_USING_PROFILINGN)envpass_fdsT)r>   pathjoindirname__file__pipefdopenrH   rG   	selectorsDefaultSelectorselectorregister
EVENT_READsys
executablerL   strr   r   r)   /profile_bandwidth_with_do_bench_using_profilingrX   r,   
subprocessPopenprocesscloserunning)rZ   entrysubproc_read_fdwrite_fdread_fdsubproc_write_fdcmdrd   s           r6   rY   zTuningProcess.start   sw    RWW__X68NO$&GGI!$&GGI!!))Hd37D1!113t~~y/C/CD NN		}%_-./#./01

#%
  #24 EE DG
 ;;"(+DKK(8C$%!''%'78

 	!
!"r5   c                V    | j                   xr | j                  j                         du S )z:
        True if the subprocess is still running.
        N)ry   rw   pollrZ   s    r6   alivezTuningProcess.alive   s%     ||; 1 1 3t ;;r5   c                    | j                         s| j                          t        j                  || j                  |       y)z8
        Push a work item to the child process.
        rD   N)r   rY   r9   rB   rH   )rZ   reqrD   s      r6   putzTuningProcess.put   s/     zz|JJL39Er5   c                   	 | j                   j                  |      s"t        d| j                  j                         t
        j                  | j                        \  }}t        |t              r||S # t        $ r | j                           t        $ r | j                           t        $ r< t        j                  d| j                  j                         | j                           w xY w)z
        Get a response from the child process. Raises TimeoutError on timeout;
        raises EOFError if the subprocess crashes.
        zTimeout in autotune subprocess z.Unexpected exception in autotune subprocess %s)rn   selectTimeoutErrorrw   pidr9   r=   rG   killrN   rx   rA   rJ   	exception
isinstance)rZ   timeoutrE   _s       r6   rM   zTuningProcess.get   s    
	==''0"%DT\\EUEUDV#WXX%**4>>:IFA fi(L!  	IIK 	JJL 	$$@$,,BRBR IIK	s   AA5 5A7C,c                    | j                         r t        j                  d| j                         |r| j	                          yy)zC
        Signal the child process to shut down gracefully.
        N)r   r9   rB   rH   waitrZ   r   s     r6   shutdownzTuningProcess.shutdown   s2     ::<tT__5IIK r5   c                x    | j                         r| j                  j                          | j                          y)z5
        Wait for the child process to exit.
        N)r   rw   r   rx   r   s    r6   r   zTuningProcess.wait   s&     ::<LL

r5   c                    | j                   j                          | j                  j                          | j                  j                          d| _        y)z"
        Close resources.
        FN)rn   rx   rG   rH   ry   r   s    r6   rx   zTuningProcess.close   s;     	r5   c                    | j                         rDt        j                  d| j                  j                         | j                  j                          | j                          y)z6
        Send a SIGKILL to the child process.
        z)Sending SIGKILL to autotune subprocess %dN)r   rJ   errorrw   r   r   rx   r   s    r6   r   zTuningProcess.kill   sF     ::<  ;   LL

r5   c                H    | j                  d       | j                          y)z8
        Gracefully restarts the child process.
        Tr   N)r   rY   r   s    r6   restartzTuningProcess.restart   s     	4 

r5   )rG   	IO[bytes]rH   r   returnNoner<   )rT   r   rH   r   rD   dict[str, str] | Noner   r   )rG   r   r   r   )rX   
int | Noner   bool)r   r   rD   r   r   r   )g      ^@)r   floatr   r   )T)r   r   r   r   r   r   )r1   r2   r3   __doc__staticmethodrO   rB   r=   r[   rY   r   r   rM   r   r   rx   r   r   r4   r5   r6   r9   r9   O   s      < LP'4I	  & &+Z<F6
r5   r9   c                  J    e Zd ZdZddZed	d       ZddZd
dZ	 	 	 	 ddZ	y)TuningProcessPoolz
    Maintains a pool of TuningProcesses to benchmark kernels in parallel
    across devices. By default, we create one TuningProcess per device and
    set the sub-process environment to make only that device visible.
    c                V   | j                         }t        j                  d|       |D cg c]  }t        |       c}| _        t        j                         | _        | j                  D ]  }| j                  j                  |        t        t        |            | _        yc c}w )z,
        Start the child processes.
        z$Sub-process autotune device list: %srX   max_workersN)get_device_listrJ   rK   r9   	processesqueueQueueprocess_queuer   r	   lenexecutor)rZ   devicesrX   ps       r6   r[   zTuningProcessPool.__init__  s     &&(CWM FMM6-v6M9> 	&A""1%	& +s7|D Ns   B&c                 l   t         j                  sdgS t               } t        |       }|j	                         }t
        t        j                  v rNt        j                  t
           j                  d      D cg c]  }t        |       }}t        |      |k  sJ |S t        t        |            S c c}w )zD
        Gather the list of devices to be used in the pool.
        N,)r)   autotune_multi_devicer   r   device_countr,   r>   r?   splitintr   listrange)gpu_typedevice_interfacecountdr   s        r6   r   z!TuningProcessPool.get_device_list   s    
 ++6M>3H= --/  2::-')zz2F'G'M'Mc'RS!s1vSGSw<5(((NE%L!!	 Ts   7B1c                    | j                   j                          | j                  D ]  }|j                  d        | j                  D ]  }|j                           y)z5
        Signal all child processes to exit.
        Fr   N)r   r   r   r   )rZ   r   s     r6   r   zTuningProcessPool.shutdown5  sQ     	  	#AJJEJ"	# 	AFFH	r5   c                R   |j                   J ddg}|D ci c])  }|t        j                  v s|t        j                  |   + }}| j                  j	                         }|j                  |j                   j                  |       	 |j	                  t        j                        | j                  j                  |       S c c}w # t        $ rB t        j                  d| d       t        d      cY | j                  j                  |       S t        $ rl}t        j                  d| d       t        |      }d	|v sd
|v r|j                          t        d      cY d}~| j                  j                  |       S d}~ww xY w# | j                  j                  |       w xY w)z
        Entry point for the thread-pool helper threads: Wait for an open TuningProcess,
        remove it from the queue, execute the benchmark in that subprocess, and return
        the TuningProcess to the queue.
        NTORCHINDUCTOR_CACHE_DIRTRITON_CACHE_DIRr   zTimed out benchmarking choice 'z['. It will be ignored. Please debug the root cause in case the choice can bring perf gains.infzFailed to benchmark choice 'cudaErrorLaunchFailurecudaErrorIllegalAddress)bmreqr>   r?   r   rM   r   	benchmarkr)   +max_autotune_subproc_result_timeout_secondsr   warningswarnr   rA   rs   r   )rZ   choiceenv_varsvrD   rw   process_exception	error_msgs           r6   targetzTuningProcessPool.target@  s    ||'''-/AB/7K!1

?Q

1%K	K$$((*FLL**i@	,;;BB4 ""7+= L  	 MM1& :W W
 <" ""7+!  	 MM.vh 7W W -.I(I5,	9!<""7+!	   ""7+sG   CC	C -F5F	 FAF F!F	 FF	 	F&c           	     x    t        t        || j                  j                  | j                  |                  }|S )z>
        Benchmark each choice in a separate process.
        )dictzipr   mapr   )rZ   choicesresultss      r6   r   zTuningProcessPool.benchmarki  s/     s7DMM$5$5dkk7$KLMr5   Nr   )r   zSequence[int | None])r   r'   r   r   r   zlist[TritonTemplateCaller]r   z!dict[TritonTemplateCaller, float])
r1   r2   r3   r   r[   r   r   r   r   r   r4   r5   r6   r   r     sC    E& " "(	',R+ 
+r5   r   c                  p    e Zd ZU ded<   ded<   ded<   ded<   d	ed
<   dZded<   e	 	 	 	 dd       ZddZy)
TensorMetaztorch.devicerX   ztorch.dtypedtypeztorch._prims_common.ShapeTypesizesztorch._prims_common.StrideTypestridesr   offsetNz
str | Nonenamec           
        ddl m} t        |t              r4|D cg c]  }| j	                  |       }}t        d |D              sJ |S |}t        |t        j                        rt        j                  d|      }|j                         }|J |j                         }|J t        ||t        j                  j                  j                  |j!                               t        j                  j                  j                   ||            t        j                  j                  j#                  |j%                         j&                        |j)                               S c c}w )Nr   )#get_strides_with_layout_constraintsc              3  <   K   | ]  }t        |t                y wr<   )r   r   .0xs     r6   	<genexpr>z*TensorMeta.from_irnodes.<locals>.<genexpr>  s     AQz!Z0A   fake)r   layout)rX   r   r   r   r   r   ) torch._inductor.select_algorithmr   r   r   from_irnodesallr   LayoutBuffer	get_dtype
get_devicer   r+   graphsizevarsoptimization_hintsget_sizeoptimization_hint
get_layoutr   get_name)clsirnodesr   r   rE   noder   rX   s           r6   r   zTensorMeta.from_irnodes  s#    	Ygx(>E F!1!1!!4 FF FA&AAAAMdBII&99&6D    "!!!''""55dmmoFGG$$773D9 77##55doo6G6N6NO	
 		
 !Gs   E&c                    t        | j                  | j                  | j                  | j                  | j
                        S )N)rX   r   
extra_size)r   r   r   rX   r   r   r   s    r6   	to_tensorzTensorMeta.to_tensor  s2    JJLL;;**{{
 	
r5   )r   z)LayoutOrBuffer | Sequence[LayoutOrBuffer]r   TensorMeta | list[TensorMeta])r   torch.Tensor)r1   r2   r3   __annotations__r   classmethodr   r   r4   r5   r6   r   r   {  sP    ((++KD*
?
	&
 
<
r5   r   c                  x    e Zd ZdZ	 	 	 	 	 	 	 	 	 	 d	dZ	 	 	 	 	 	 d
dZddZdd	 	 	 	 	 ddZdd	 	 	 	 	 ddZy)BenchmarkRequesta1  
    Only handle triton template benchmark for now. The extern kernel benchmark
    can be done inside the same process since they usually don't cause crash.

    Important: Instances of this class and subclasses have to be serializable
    across process boundaries. Do not put CUDA Tensors in here!
    c                   || _         t        |t              r	|g| _        n|| _        rEt        t        t
        f      r/t              dkD  rt        fdD              sJ d   | _        n| _        || _	        d| _
        y )Nr(   c              3  d   K   | ]'  }d D ]   }t        d   |      t        ||      k(   " ) yw))rX   r   r   r   r   r   N)getattr)r   r   attroutput_tensor_metas      r6   r   z,BenchmarkRequest.__init__.<locals>.<genexpr>  sG       Q  .q148GAt<LLLs   -0r   F)kernel_namer   r   input_tensor_metatupler   r   r   r
  
extra_argsbenchmark_with_cudagraphs)rZ   r  r  r
  r  s      ` r6   r[   zBenchmarkRequest.__init__  s     ''48I7JD"7HD"*-?%"O%&* /   
 '9&;D# 3ED#$).&r5   c                   t         r<   NotImplementedErrorrZ   outinput_tensorss      r6   make_run_fnzBenchmarkRequest.make_run_fn  s
     "!r5   c                     y r<   r4   r   s    r6   cleanup_run_fnzBenchmarkRequest.cleanup_run_fn  s    r5   Nr  c                   t         r<   r  rZ   fnr  r  s       r6   do_benchzBenchmarkRequest.do_bench  s
     "!r5   c               p   t         j                  t        j                        }|rt	        j                         }|e| j
                  r| j                  sJ d       t        |      dk(  sJ t        d | j
                  D              }| j                  j                         }|r+t	        j                         z
  }t	        j                         }	  | j                  |d|i}|r+t	        j                         z
  }t	        j                         }| j                  rt        j                   |      }n | j"                  |g|| }|r0t	        j                         z
  }	t         j%                  d| |	       | j'                          |S # t        $ r# t         j                  d       t        d      cY S w xY w)NzJInput and output tensor meta must be populated when input_tensors is emptyr   c              3  <   K   | ]  }|j                           y wr<   )r   r   s     r6   r   z-BenchmarkRequest.benchmark.<locals>.<genexpr>  s     !PA!++-!Pr   r  z0Skipping op due to nonzero workspace requirementr   z6InChildProcess %s: load %f, create tensor %f, bench %f)rJ   isEnabledForloggingDEBUGtimer  r
  r   r  r   r  r/   infor   r  r*   benchmark_gpu_with_cuda_graphr  rK   r  )
rZ   r  r  rK   start_tscreate_tensor_elapser  load_elapseresbench_elapses
             r6   r   zBenchmarkRequest.benchmark  s   
 ++GMM:yy{H ;))d.E.E \E }%***!!P9O9O!PPM))335C#'99;#9 yy{H	 !!!=:c:B ))+0Kyy{H));;B?C$--8]8C8C99;1L  H$ 	
1 1 	  RS<	 s   F	 	)F54F5)
r  rs   r  r   r
  r   r  Iterable[Any]r   r   r  r  r  r  r   zCallable[[], None]r   r  r  r  torch.Tensor | Noner   r   )	r1   r2   r3   r   r[   r  r  r  r   r4   r5   r6   r  r    s    // 9/ :	/
 "/ 
/>"*"1="	"
 $(	" %" !	"
 
" $(/$/ !/ 
	/r5   r  c                  N    e Zd ZdZ	 	 	 	 	 d	 	 	 	 	 	 	 	 	 ddZdd	 	 	 	 	 ddZy)	_TestBenchmarkRequestz
    Supports unit testing. Defined in this file instead of the test file so the
    TuningProcess sub-process can unpickle these objects.
    Nc                J    || _         || _        || _        || _        || _        y r<   )rE   rX   sleepexccrash)rZ   rE   rX   r2  r3  r4  s         r6   r[   z_TestBenchmarkRequest.__init__  s'     

r5   r  c               r   | j                   <t        j                  j                  t        d       t        | j                         k(  sJ | j                  rt        j                  | j                         | j                  r| j                  | j                  rt        j                  d       | j                  S )Nr(   )rX   r>   r?   rM   r,   rs   r2  r#  r3  r4  rq   exitrE   r  s      r6   r   z_TestBenchmarkRequest.benchmark,  sx     ;;"::>>"6=T[[AQQQQ::JJtzz"88((N::HHQK{{r5   )        NNNF)
rE   r   rX   r   r2  zfloat | Noner3  zException | Noner4  r   r-  )r1   r2   r3   r   r[   r   r4   r5   r6   r0  r0    sq     !" $  	
   HL*1D	r5   r0  c                  $    e Zd Zdd	 	 	 	 	 ddZy)GPUDeviceBenchmarkMixinNr  c                  t        d g ||D              }t        |      dk  s
J d|        t        d |D        d      }t        |      }t        |      dk(  rt        t	        |            }n|j                         }|j                  |      5  t        j                  ||      }|j                          d d d        |S # 1 sw Y   S xY w)Nc              3     K   | ]i  }t        |t        j                        rMt        |j                  j
                        r.|j                  j                  |j                  j                   k y wr<   )r   torchTensorr   rX   typeindexr   tensors     r6   r   z3GPUDeviceBenchmarkMixin.do_bench.<locals>.<genexpr>A  sR      $
&%,,/v}}))*##/	 MM$
s   A/A1r(   zCan not mix devices c              3     K   | ]9  }t        |j                  j                        r|j                  j                   ; y wr<   )r   rX   r>  r@  s     r6   r   z3GPUDeviceBenchmarkMixin.do_bench.<locals>.<genexpr>J  s4      &--,,- ""s   ?Acudar   )
r!   r   nextr   itercurrent_devicerX   r*   r   synchronize)	rZ   r  r  r  device_idx_setdevice_typer   
device_idxr)  s	            r6   r  z GPUDeviceBenchmarkMixin.do_bench;  s     $ $
/M/3/$
 
 >"a'P+??O)PP'+
 
 4K@~!#d>23J)88:J$$Z0 	+'';?C((*	+ 
		+ 
s   (CCr-  r1   r2   r3   r  r4   r5   r6   r9  r9  :  s*    
 $(	 % !	
 
r5   r9  c                  $    e Zd Zdd	 	 	 	 	 ddZy)CPUDeviceBenchmarkMixinNr  c               ,    t        j                  |      S r<   )r*   benchmark_cpur  s       r6   r  z CPUDeviceBenchmarkMixin.do_bench^  s     ((,,r5   r-  rK  r4   r5   r6   rM  rM  ]  s*    
 $(	- %- !	-
 
-r5   rM  c                       e Zd ZdZ	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZ	 	 	 	 	 	 ddZd Zd	dZ xZS )
TritonBenchmarkRequestz
    Represents a standalone benchmark request for a Triton Template.

    Important: Instances of this class have to be serializable
    across process boundaries. Do not put CUDA Tensors in here!
    c                    t         |   ||||       || _        || _        || _        || _        |	| _        |
| _        || _        || _	        || _
        || _        || _        y r<   )superr[   module_pathmodule_cache_key
num_stages	num_warpsnum_consumer_groupsnum_buffers_warp_specmatrix_instr_nonkdimwaves_per_eukpackworkspace_sizeworkspace_zero_fill)rZ   r  r  r
  r  rT  rU  rV  rW  rX  rY  rZ  r[  r\  r]  r^  	__class__s                   r6   r[   zTritonBenchmarkRequest.__init__o  so    $ 	&79KZX& 0$"#6 %:"$8!(
,#6 r5   c               |   t        j                  | j                  | j                        }t        j                  d| j                  | j                         t        || j                        j                  }t        | j                        }| j                  sddlm} t        j                  | j                  ft        j                   |j"                        }| j$                  r|j'                          |j)                  |      }|||<   d|j*                  _        i }	dd l}
d|
j1                  |      j2                  v rd|	d<   |j"                  j4                  dk(  rd}nP|j"                  j4                  }t7        |      }|j9                  | j:                  j"                  j(                        }t=        t        || j                        t        j>                  j@                  jB                  jD                        r!tG        jH                  |g|||i |	d|iS tG        jH                  |g|||i |	|d	d
S )Nz"benchmark module key: %s, path: %sr   )WORKSPACE_ARG_PLACEHOLDERr   rX   FwarmupcpustreamT)re  benchmark_run)%r   load_by_key_pathrU  rT  rJ   rK   r  r  runr   r  r]  r   ra  r<  emptyuint8rX   r^  zero_r?  __self__with_bandwidth_infoinspect	signature
parametersr>  r   get_raw_streamr
  r   	_inductorruntimetriton_heuristicsDebugAutotuner	functoolspartial)rZ   r  r  mod
run_methodr  ra  workspace_tensorworkspace_index
warmup_argrn  re  rI  r   s                 r6   r  z"TritonBenchmarkRequest.make_run_fn  s,    **4+@+@$BRBRS0!!	
 S$"2"2377
$//*

 *R${{$$&kkzz 
 '' &&((../HIO*:J'27
/ 
w((4???#(Jx ::??e#F**//K7D%44''..44F C))*OO##55DD
 $$  	
    $$  	
  " r5   c                    t        j                  | j                  | j                        }t	        || j
                        }|j                          |j                  d   j                  | _        y Nr   )	r   rg  rU  rT  r  r  
precompile	launchersn_regs)rZ   rx  kernels      r6   r  z!TritonBenchmarkRequest.precompile  sV    **4+@+@$BRBRSd../&&q)00r5   c                T    d| j                   d| j                  d| j                  S )Nself.kernel_name=z, self.module_path=z, self.module_cache_key=)r  rT  rU  r   s    r6   __str__zTritonBenchmarkRequest.__str__  s2    #$""$$8t'7'7&99RD<Q<Q;STTr5   )r   r   r   r   r   NF) r  rs   r  r   r
  r   r  r+  rT  rs   rU  rs   rV  r   rW  r   rX  r   rY  r   rZ  r   r[  r   r\  r   r]  r   r^  r   r   r   r,  r   rs   	r1   r2   r3   r   r[   r  r  r  __classcell__r_  s   @r6   rQ  rQ  g  s    " $%%&$%%)$)!77 97 :	7
 "7 7 7 7 7 !7  #7 "7 7 7 #7  "!7" 
#7>E*E1=E	EN1Ur5   rQ  c                      e Zd Zy)TritonGPUBenchmarkRequestNr0   r4   r5   r6   r  r    r7   r5   r  c                      e Zd Zy)TritonCPUBenchmarkRequestNr0   r4   r5   r6   r  r    r7   r5   r  c                       e Zd ZdZ	 	 d
	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZ	 	 	 	 	 	 ddZddd fdZddZd Zdd	Z	 xZ
S )ExternKernelBenchmarkRequesta*  
    A class to handle extern kernel benchmark requests. This allows extern kernels
    (like aten::mm) to be benchmarked in a subprocess, similar to Triton kernels.

    Important: Instances of this class have to be serializable across
    process boundaries. Do not put CUDA Tensors in here!
    Nc                \    t         |   ||||       || _        |xs i | _        || _        y r<   )rS  r[   callable_pathkwargshas_out_variant)	rZ   r  r  r
  r  r  r  r  r_  s	           r6   r[   z%ExternKernelBenchmarkRequest.__init__  s5     	&79KZX*l.r5   c                   | j                         }| j                  rt        j                  |g|d|iS t        j                  |g| S )Nr  )to_callabler  rv  rw  )rZ   r  r  r  s       r6   r  z(ExternKernelBenchmarkRequest.make_run_fn   sN     $$RA-ASAA $$R8-88r5   r  c               R   ||j                         dk(  ry| j                  st              dk(  rt        |   d|iS | j                           }|tt        j                  j                  j                  j                  |t        |j                               t        |j                                      |j                  |       | j                  rt!        j"                  fd      S t$        j&                  rt)        fd      S t!        j                  i       S )Nr   r7  r  c                        S r<   r4   algor  s   r6   <lambda>z8ExternKernelBenchmarkRequest.benchmark.<locals>.<lambda>  s    D-0 r5   c                        S r<   r4   r  s   r6   r  z8ExternKernelBenchmarkRequest.benchmark.<locals>.<lambda>  s    m8L r5   )numelr  r   rS  r   r  r<  _C_dynamoguardsassert_size_strider  sizestridecopy_r  r*   r%  r)   rt   r   )rZ   r  r  out_newr  r_  s     ` @r6   r   z&ExternKernelBenchmarkRequest.benchmark  s    ?syy{a/3}#5#:7$m===##%DM*G  ''::U388:.cjjl0C 		'"--"@@0  EE/0LMM((}bAAr5   c                     y r<   r4   r   s    r6   r  z'ExternKernelBenchmarkRequest.precompile!      r5   c                    ddl m} t        || j                        }| j                  r t        j                  |fi | j                  S |S )Nr   )extern_kernels)r   r  r  r  r  rv  rw  )rZ   r  r  s      r6   r  z(ExternKernelBenchmarkRequest.to_callable%  s@     	D^T%5%56;;$$R74;;77	r5   c                "    d| j                    dS )NzExternKernelBenchmarkRequest())r  r   s    r6   r  z$ExternKernelBenchmarkRequest.__str__1  s    .t/A/A.B!DDr5   )NT)r  rs   r  r   r
  r   r  r+  r  rs   r  zdict[str, Any] | Noner  r   r   r   r,  )r  r  r  r.  r   r  )r1   r2   r3   r   r[   r  r   r  r  r  r  r  s   @r6   r  r    s     )- $// 9/ :	/
 "/ / &/ / 
/	9*	91=	9		9 RV B,
Er5   r  c                      e Zd Zy)ExternKernelGPUBenchmarkRequestNr0   r4   r5   r6   r  r  5       	r5   r  c                      e Zd Zy)ExternKernelCPUBenchmarkRequestNr0   r4   r5   r6   r  r  ;  r  r5   r  c                  h     e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZ	 	 	 	 	 	 ddZddZd	dZ xZS )
SubgraphBenchmarkRequestz
    Benchmark request for subgraph choices.

    Pre-compiles the subgraph in the main process and stores
    the module path/cache key for loading in subprocess.
    c                T    t         |   ||||       || _        || _        || _        y r<   )rS  r[   rT  rU  sym_input_values)	rZ   r  r  r
  r  rT  rU  r  r_  s	           r6   r[   z!SubgraphBenchmarkRequest.__init__I  s2     	&79KZX& 0 0r5   c                   t        j                  | j                  | j                        | j                  fdS )Nc                 .    j                  g        S r<   )call)r  rx  r  s   r6   r  z6SubgraphBenchmarkRequest.make_run_fn.<locals>.<lambda>^  s    sxx C"2 C] CD r5   )r   rg  rU  rT  r  )rZ   r  r  rx  r  s     `@@r6   r  z$SubgraphBenchmarkRequest.make_run_fnX  s5     **4+@+@$BRBRS00DDr5   c                     y r<   r4   r   s    r6   r  z#SubgraphBenchmarkRequest.precompile`  r  r5   c                <    d| j                    d| j                   dS )NzSubgraphBenchmarkRequest(z, r  )r  rT  r   s    r6   r  z SubgraphBenchmarkRequest.__str__d  s&    *4+;+;*<Bt?O?O>PPQRRr5   )r  rs   r  r   r
  r   r  r+  rT  rs   rU  rs   r  z	list[int]r   r   r,  r   r  r  r  s   @r6   r  r  A  s    11 91 :	1
 "1 1 1 $1 
1E*E1=E	ESr5   r  c                      e Zd Zy)SubgraphGPUBenchmarkRequestNr0   r4   r5   r6   r  r  h  r7   r5   r  c                      e Zd Zy)SubgraphCPUBenchmarkRequestNr0   r4   r5   r6   r  r  l  r7   r5   r  c                       e Zd ZdZ	 d	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZd Z	 	 	 	 	 	 ddZddZd ZddZ	ddZ
dd	Zdd
Z xZS )CUTLASSBenchmarkRequestae  
    A class to handle CUDA (CUTLASS) benchmark requests. This class is for
    managing the lifecycle of a CUDA kernel benchmark, including compiling
    the source code, managing workspace memory, and executing the kernel.

    Important: Instances of this class have to be serializable across
    process boundaries. Do not put CUDA Tensors in here!
    c                L   t         |   ||||       || _        d| _        d | _        d | _        d| _        d| _        d| _        || _	        |dk(  rt        nt        | _        t        |      | _        | j                  j                  | j                  d      \  | _        | _        y )Nr   F xpuso)rS  r[   source_coder]  	workspaceDLL_workspace_size_updatedhash_keysource_filerI  r   r   codecache_clsr   r   write)rZ   r  r  r
  r  r  rI  r_  s          r6   r[   z CUTLASSBenchmarkRequest.__init__z  s     	&79KZX&#$.2&*',$ "&-8E-A\} 8 E*.*<*<*B*Bd+
't'r5   c                    t         j                  d|        | j                  j                  | j                  d       t         j                  d|        y)z
        Precompile the CUDA source code to populate the CUDACodeCache.
        This may happen in a separate thread pool.
        Precompiling %sr  Done precompiling %sN)rJ   rK   r  compiler  r   s    r6   r  z"CUTLASSBenchmarkRequest.precompile  sB    
 	.5""4#3#3T:3T:r5   c          	        | j                          | j                          t        |      |gz   D cg c]  }t        |j	                                }}t
        j                  d| j                  | j                  | j                  | j                  || j                         t        | j                  j                  | j                  j                                     }t        | j                  | j                        }t        d      }| j                   dkD  rht#        j$                  | j                   dz   dz  t"        j&                  |j(                        | _        t        | j*                  j	                               }t-        j.                  |g|| j                  d|| }	  |        |S c c}w # t0        $ r,}	t3        |	      fd}
| j5                          |
cY d}	~	S d}	~	ww xY w)zg
        Create a function to run the CUDA/XPU kernel with the given input and output tensors.
        zqmake_run_fn: self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%sr         rb  Nc                     t               r<   )RuntimeError)err_msgs   r6   raise_runtime_errorz@CUTLASSBenchmarkRequest.make_run_fn.<locals>.raise_runtime_error  s    "7++r5   )ensure_dll_loadedupdate_workspace_sizer   r   data_ptrrJ   rK   r  r  r  r  r  r   rq  rF  r  r]  r<  zerosfloat64rX   r  rv  rw  r  rs   r  )rZ   r  r  rA  args
stream_ptrry  workspace_ptrretrF   r  r  s              @r6   r  z#CUTLASSBenchmarkRequest.make_run_fn  s    	 ""$:>}:MQTPU:UV*+VVMMHHOO	
 !!001F1F1U1U1WX

 TXXt'7'78
 ""[[$$q(Q.mmzzDN
 %T^^%<%<%>?M 

 __
 	

 
 
		'E 
[ WH  	'!fG, !&&	's#    F2)F7 7	G, !G'!G,'G,c           
        | j                   ry | j                          t        t        j	                  d | j
                  D                    }t        |dz         D cg c]  }t        d        }}t        | j                  j                  | j                  j                                     }t        | j                  | j                        }t               } |g || j                  t!        |      d |  | j                  j#                          |j$                  | _        t(        j+                  d| j&                  | j                  | j,                  | j.                  | j                  || j                         d| _         y c c}w )Nc              3  4   K   | ]  }|j                     y wr<   )r   )r   metas     r6   r   z@CUTLASSBenchmarkRequest.update_workspace_size.<locals>.<genexpr>  s     G$))Gs   r(   zupdate_workspace_size called: new workspace size=%d, self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%sT)r  r  r   r   fromkeysr  r   r   r   rq  rF  r  r  r  r   r  r
   rG  valuer]  rJ   rK   r  r  )rZ   unique_input_countr   r  r  ry  c_workspace_sizes          r6   r  z-CUTLASSBenchmarkRequest.update_workspace_size  si   ''  MMG0F0FGG
 )..@1.D(EF1FF!!001F1F1U1U1WX

 TXXt'7'78
#: 	
	
__	
  	
 	
 	
 	))+.44 hMMHHOO		
 (,$; Gs   Fc                    | j                   9| j                  j                  | j                  d      \  | _         | _        | _        | j                   j                          y )Nr  )r  r  rV   r  r  r  openr   s    r6   r  z)CUTLASSBenchmarkRequest.ensure_dll_loaded  sJ    888<8J8J8O8O  $95DHdmT%5 	r5   c                l    | j                   !| j                   j                          d | _         d | _        y r<   )r  rx   r  r   s    r6   r  z&CUTLASSBenchmarkRequest.cleanup_run_fn   s(    88HHNNDHr5   c                T    d| j                   d| j                  d| j                  S )Nr  z, self.source_file=z, self.hash_key=)r  r  r  r   s    r6   r  zCUTLASSBenchmarkRequest.__str__  s0    #$""$$8t'7'7&99JDMM;KLLr5   c                X    | j                   j                         }d |d<   d |d<   d|d<   |S )Nr  r  Fr  )__dict__copyrZ   states     r6   __getstate__z$CUTLASSBenchmarkRequest.__getstate__	  s7    ""$e!k+0'(r5   c                :    | j                   j                  |       y r<   )r  r@   r  s     r6   __setstate__z$CUTLASSBenchmarkRequest.__setstate__  s    U#r5   )rC  )r  rs   r  r   r
  r   r  r+  r  rs   rI  rs   r   r   r,  r   r  )r   dict[str, Any])r  r  r   r   )r1   r2   r3   r   r[   r  r  r  r  r  r  r  r  r  r  s   @r6   r  r  p  s      "

 9
 :	

 "
 
 
 

0;6*61=6	6p$,LM$r5   r  c                  Z     e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 d fdZd Z	 	 	 	 	 	 ddZddZ xZS )CppBenchmarkRequestc                f    t         |   ||||       || _        t        |      | _        d | _        y r<   )rS  r[   r  r   r  r  )rZ   r  r  r
  r  r  r_  s         r6   r[   zCppBenchmarkRequest.__init__  s5     	&79KZX& --1r5   c                    t         j                  d|        t        j                  | j                  d       t         j                  d|        y )Nr  rd  rI  r  )rJ   rK   r   rV   r  r   s    r6   r  zCppBenchmarkRequest.precompile%  s<     	.5$**>3T:r5   c               \   t        j                  | j                  d      | _        t	        |      |gz   D cg c]  }|j                          }}t        j                  d| j                  | j                  || j                         t        | j                  | j                        }t        d | j                  D              sJ t        j                  gt        |      t        t	        | j                              z   z  |_        t!        j"                  |g|| j                   S c c}w )Nrd  r  zJmake_run_fn: self.kernel_name=%s, self.DLL=%s, args=%s, self.extra_args=%sc              3  P   K   | ]  }t        |t        j                           y wr<   )r   ctypesc_ulonglong)r   args     r6   r   z2CppBenchmarkRequest.make_run_fn.<locals>.<genexpr>;  s     R3:c6#5#56Rs   $&)r   rV   r  r  r   r  rJ   rK   r  r  r  r   r  r  r   argtypesrv  rw  )rZ   r  r  rA  r  ry  s         r6   r  zCppBenchmarkRequest.make_run_fn,  s     $$T%5%55I04]0Cse0KLf!LLXHHOO	
 TXXt'7'78
R$//RRRR%112ID122


   

 __
 	
! Ms   D)c                     d| j                   S )Nr  )r  r   s    r6   r  zCppBenchmarkRequest.__str__G  s    #$""$%%r5   )r  rs   r  r   r
  r   r  r+  r  rs   r   r   r,  r  )r1   r2   r3   r[   r  r  r  r  r  s   @r6   r  r    si    22 92 :	2
 "2 2 
2;
*
1=
	
6&r5   r  c                  P     e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 d fdZ	 	 	 	 	 	 ddZ xZS )CuteDSLBenchmarkRequestz;Benchmark request for CuteDSL (CUTLASS Python DSL) kernels.c                    t         |   ||||       |j                         }t        j                  |      \  | _        | _        y r<   )rS  r[   finalize_allr   r  rU  rT  )rZ   r  r  r
  r  r  finalized_coder_  s          r6   r[   z CuteDSLBenchmarkRequest.__init__N  sC     	&79KZX$1132=2C2CN2S/t/r5   c          	     T  	 t        j                  | j                  | j                        }ddlm} | j                   d| }t        ||      s?t        |      D cg c]  }t        t        ||            s| }}t        d| d|       t        ||      		fd}|S c c}w )z
        Create a function to run the CuteDSL kernel with the given input and output tensors.
        Similar to TritonBenchmarkRequest.make_run_fn but for CuteDSL kernels.
        r(   )MAIN_SUFFIXr   z-Could not find CuteDSL main kernel function 'z'. Available callables: c                 ~    t        d      } | j                  j                  j                        } g d|iS )NrC  re  )r   rq  rX   r?  )r   re  r  kernel_funcr  s     r6   
run_kernelz7CuteDSLBenchmarkRequest.make_run_fn.<locals>.run_kernelq  s@    7?%44SZZ5E5EFFBBsB6BBr5   )r   rg  rU  rT  codegen.cutedsl.cutedsl_kernelr  r  hasattrdircallabler  r  )
rZ   r  r  rx  r  main_func_namer   	availabler  r  s
    ``      @r6   r  z#CuteDSLBenchmarkRequest.make_run_fn[  s     **4+@+@$BRBRS 	@ ,,-Q{m<sN+*-c(S$hwsD?Q6RSIS??OOghqgrs  c>2	C
  Ts   B%9B%)r  rs   r  r   r
  r   r  ztuple[Any, ...]r  r&   r   r   r,  )r1   r2   r3   r   r[   r  r  r  s   @r6   r   r   K  si    ETT 9T :	T
 $T #T 
T*1=	r5   r   c                 X    t               } t        j                  | j                         | S r<   )r   atexitro   r   )pools    r6   get_tuning_process_poolr  y  s    D
OODMM"Kr5   c                4    t               j                  |       S )zO
    Do benchmarking in a subprocess and return the perf number (latency).
    )r  r   )r   s    r6   benchmark_in_sub_processr    s     #$..w77r5   c                      e Zd ZU dZdZded<    ej                         Zded<   dZ	ded	<   d
 Z
ed        Zed        ZddZddZddZd ZddZddZddZd Zed        Zy)AutotuneProcessPoolzr
    Singleton pool manager for running autotuning (precompilation + benchmarking)
    in a separate process.
    NzAutotuneProcessPool | None	_instancezthreading.Lock_lockFr   _shutdown_for_inactivityc                t    | j                         | _        d | _        d | _        | j	                         | _        y r<   )
_init_pool_pool_warmup_future_warmup_start_time_init_timer_timerr   s    r6   r[   zAutotuneProcessPool.__init__  s0    151B
2604$($4$4$6r5   c                    | j                   9| j                  5  | j                    |        | _         ddd       | j                   S | j                   S # 1 sw Y   | j                   S xY w)z*Get or create the singleton pool instance.N)r  r  r   s    r6   get_instancez AutotuneProcessPool.get_instance  sY     ==  *==($'ECM* }}s}}	* }}s   AA&c                    t         j                  sJ d       | j                  *| j                         | _        | j	                         | _        | j                  S )zGet the process pool.zFTo use AutotuneProcessPool, pipeline_max_autotune_gemm must be enabled)r)   pipeline_max_autotune_gemmr  r  r  r   r   s    r6   r  zAutotuneProcessPool.pool  sQ     00 	
T	
0 ::*DJ**,DKzzr5   c                J    t         dkD  rt        t         | j                        S y r~  ) AUTOTUNE_POOL_INACTIVITY_TIMEOUTr   _on_inactivity_timeoutr   s    r6   r  zAutotuneProcessPool._init_timer  s!    +a/94;V;VWWr5   c                R    | j                   | j                   j                          y y r<   )r   record_callr   s    r6   _record_activityz$AutotuneProcessPool._record_activity  s!    ;;"KK##% #r5   c                    t         j                  dt               | j                  5  | j                  #| j                  j                  d       d | _        d | _        dt        _        d d d        y # 1 sw Y   y xY w)NzAAutotuneProcessPool shutting down due to inactivity (timeout=%ds)Fr   T)	rJ   r$  r'  r  r  r   r   r  r  r   s    r6   r(  z*AutotuneProcessPool._on_inactivity_timeout  sq    O,	

 ZZ 		@zz%

###/!
DK
 <@8		@ 		@ 		@s   AA22A;c                    t        j                  d      }t        d|      }t        j                  | j
                         t        j                  d       |S )a  
        Get or create the process pool.

        Uses ProcessPoolExecutor with 'spawn' context for CUDA safety.
        ProcessPoolExecutor is lazily initialized - workers are not spawned
        until the first submit() call, making this property non-blocking.
        spawnr(   )r   
mp_contextz2AutotuneProcessPool created (workers spawn lazily))mpget_contextr   r  ro   	_shutdownrJ   r$  )rZ   ctxr  s      r6   r  zAutotuneProcessPool._init_pool  sH     nnW%"
 	'PQr5   c                   | j                   | j                  5  | j                   t        j                         | _        | j
                  j                  t        t        j                  j                  j                  j                        | _         | j                   j                  | j                         t        j!                  d       ddd       | j                   S | j                   S # 1 sw Y   | j                   S xY w)z
        Submit a warmup job to eagerly spawn workers and initialize CUDA.

        This is optional - call it early to hide spawn latency.
        Returns the warmup future which can be ignored or awaited.
        N)fp32_precisionzWarmup job submitted)r  r  r#  perf_counterr  r  submit_init_autotune_subprocessr<  backendsrC  matmulr5  add_done_callback_on_warmup_completerJ   r$  r   s    r6   warm_upzAutotuneProcessPool.warm_up  s     & @&&..2.?.?.AD+*.))*:*:1',~~':':'A'A'P'P +; +D' ''99$:R:RS"''(>?@ """t"""@ """s   B2C++C?c                $   d}| j                   !t        j                         | j                   z
  }	 |j                         }t        j                  d||       | j                          y# t        $ r}t        j                  d|       |d}~ww xY w)z/Callback invoked when the warmup job completes.NzEAutotuneProcessPool warmup completed successfully in %.4f seconds: %sz4AutotuneProcessPool warmup failed after %.4f seconds)	r  r#  r6  rE   rJ   r$  r+  rA   r   )rZ   futurewarmup_elapsed_timerE   rF   s        r6   r<  z'AutotuneProcessPool._on_warmup_complete  s    """."&"3"3"58O8O"O	]]_FW#
 !!# 	  F# G	s   7A) )	B2B

Bc                       j                   j                  |g|i |} j                  |j                   fd       |S )z-Submit a job to the pool and return a Future.c                $    j                         S r<   )r+  )r   rZ   s    r6   r  z,AutotuneProcessPool.submit.<locals>.<lambda>	  s    t/D/D/F r5   )r  r7  r   r;  )rZ   r  r  r  r?  s   `    r6   r7  zAutotuneProcessPool.submit  sA    !!!"6t6v6;;"$$%FGr5   c                    | j                   !| j                   j                          d| _         | j                  $| j                  j                  d       d| _        yy)zShutdown the pool on exit.NFr   )r   quitr  r   r   s    r6   r2  zAutotuneProcessPool._shutdown  sN    ;;"KKDK::!JJU+DJ "r5   c                    | j                   C| j                  5  | j                   !| j                   j                          d| _         ddd       yy# 1 sw Y   yxY w)z+Explicitly shutdown the singleton instance.Nr  r  r2  r"  s    r6   shutdown_instancez%AutotuneProcessPool.shutdown_instance  sU     ==$ )==,MM++-$(CM) ) %) )s   .AA)r   zTimer | Noner   )r   Future[Any])r?  rH  r   r   )r1   r2   r3   r   r  r  	threadingLockr  r  r[   r  r#  propertyr  r  r+  r(  r  r=  r<  r7  r2  rG  r4   r5   r6   r  r    s    
 -1I)0*INN,E>,%*d*7    
&@"(#(* ) )r5   r  c                 H    t         j                  xr t        j                   S r<   )r)   r%  r  r  r4   r5   r6   use_pipelined_autotuningrM    s!    )) 	=#<<<r5   c                    ddl }|j                  j                         r |j                  dd       | |j                  j                  j
                  _        y)z9
    Warmup function run in the autotune subprocess.
    r   Nr(   rC  r   T)r<  rC  is_availabler  r9  r:  r5  )r5  r<  s     r6   r8  r8  &  sD      zz Af%0>ENN-r5   c                    	 | j                         }|S # t        $ r& t        j                  d| d       t	        d      cY S w xY w)a*  
    Run autotuning benchmarks in a subprocess.

    This function is submitted to AutotuneProcessPool and runs in isolation
    to prevent GPU contention with the main compilation process.

    Args:
        picklable_choices: List of picklable choice information

    Returns:
        timing
    zFailed to benchmark choice %sT)exc_infor   )r   rA   rJ   warningr   )benchmark_requesttimings     r6   run_autotune_in_subprocessrU  5  sQ     ",,. + 	 	
 U|s    ,AAc                      e Zd ZU dZdZded<    ej                         Zd
ddZ	e
dd       Zd ZdddZe
dd	       Zy)PrecompileThreadPoolz
    Thread pool for running precompilation asynchronously.

    This allows the main compilation process to continue while
    precompilation happens in background threads.
    NzPrecompileThreadPool | Noner  c                &    t        |      | _        y )Nr   )r	   	_executor)rZ   r   s     r6   r[   zPrecompileThreadPool.__init__`  s    +Dr5   c                    ddl m} | j                  ?| j                  5  | j                   |  |             | _        d d d        | j                  S | j                  S # 1 sw Y   | j                  S xY w)Nr   )get_num_workers)r   r[  r  r  )r   r[  s     r6   r#  z!PrecompileThreadPool.get_instancec  s`    D==  ;==($'(9$:CM; }}s}}; }}s   AA2c                    t        j                         }t        j                  |j                  |      } | j
                  j                  |g|i |S r<   )contextvarscopy_contextrv  rw  rh  rY  r7  )rZ   r  r  r  r3  s        r6   r7  zPrecompileThreadPool.submitm  sG    &&(sww+$t~~$$R9$9&99r5   c                :    | j                   j                  |      S )Nr   )rY  r   r   s     r6   r2  zPrecompileThreadPool._shutdowns  s    ~~&&D&11r5   c                    | j                   E| j                  5  | j                   #| j                   j                  d       d | _         d d d        y y # 1 sw Y   y xY w)NFr   rF  r"  s    r6   rG  z&PrecompileThreadPool.shutdown_instancev  sX    ==$ )==,MM+++7$(CM) ) %) )s   0AA)   )r   r   )r   rW  )F)r   r   r   )r1   r2   r3   r   r  r  rI  rJ  r  r[   r  r#  r7  r2  rG  r4   r5   r6   rW  rW  U  sZ     .2I*1INNEE  :2 ) )r5   rW  c                  V    e Zd ZdZi Zedd       Zedd       Ze	 	 	 	 	 	 dd       Z	y)	AsyncAutotunera  
    Handles asynchronous autotuning of kernel choices in a separate process.

    This class manages the lifecycle of autotuning:
    1. Accepts precompiled choices from the main process
    2. Submits benchmarking work to AutotuneProcessPool
    3. Returns results via a Future

    Usage:
        autotuner = AsyncAutotuner(choices)
        autotuner.start()  # Kicks off async benchmarking
        timings = autotuner.get_results()  # Blocks until complete
    c                (    | j                         |z   S r<   )r  )r   
inputs_keys     r6   get_choice_hashzAsyncAutotuner.get_choice_hash  s     :--r5   c                   |D ]  }t         j                  ||      }|t         j                  v r,t        |dd      J d       t        j                         j                  t        |j                        }|t         j                  |<    y)z
        Start asynchronous autotuning in a subprocess.

        This method:
        1. Extracts picklable benchmark requests from choices
        2. Submits benchmarking work to AutotuneProcessPool
        3. Returns immediately (non-blocking)
        r   Nzbmreq is None for choice)	rc  rf  choice_hash_to_futurer  r  r#  r7  rU  r   )r   r   re  r   choice_hashautotune_futures         r6   rY   zAsyncAutotuner.start  s      	PF(88LKnBBB67D1= *= 2>>@GG*O
 APN00=	Pr5   c                    i }|D ]<  }t         j                  ||      }t         j                  |   j                         ||<   > |S )z
        Get autotuning results, blocking until complete.

        Args:
            timeout: Maximum time to wait in seconds. None means wait forever.

        Returns:
            Dict mapping ChoiceCaller to benchmark timing
        )rc  rf  rh  rE   )r   r   re  timingsr   ri  s         r6   get_resultszAsyncAutotuner.get_results  sP      	YF(88LK,BB;OVVXGFO	Y r5   N)r   r%   re  rs   r   rs   )r   list[ChoiceCaller]re  rs   )r   rn  re  rs   r   zdict[ChoiceCaller, float])
r1   r2   r3   r   rh  r   rf  r  rY   rm  r4   r5   r6   rc  rc    sc     . . P P6 (69	" r5   rc  )r   r   r   r   )r5  rs   r   r   )rS  r  r   r   )s
__future__r   r  r]  r  dataclassesrv  r!  multiprocessingr0  r>   rQ   r   rl   ru   rq   rI  r#  r   collections.abcr   r   r   concurrent.futuresr   r   r	   r
   r   r   r   typingr   r   r   r<  torch._inductor.async_compiletorch._dynamo.device_interfacer   torch._dynamo.testingr   torch._inductorr   torch._inductor.codecacher   r   r   r   r   r   $torch._inductor.compile_worker.timerr   torch._inductor.utilsr   r   r   r   r   torch._loggingr    torch.utils._ordered_setr!   r   r?   rM   r'  typesr$   r   r%   r&   r'   r  r)   runtime.benchmarkingr*   virtualizedr+   r,   r1   rJ   rA   r/   r9   r   r   r   LayoutOrBuffer	dataclassr   r  r0  r9  rM  rQ  r  r  r  r  r  r  r  r  r  r  r   cacher  r  r  rM  r8  rU  rW  rc  r4   r5   r6   <module>r     s   "        	     
    8 8 N N 2 2 ) )  $ C .   7  - /
 $'JJNNCUK$       -  . "8\:		 	t tno od RYY& .
 .
 .
b g g gT, D   F- -vU- vUr	 79O 		 79O 	JE#3 JEZ	9		9	$S/ $SN	"9;S 		"9;S 	a$57G a$H4&13C 4&n+57G +\  8'8&8S) S)l'
@') ')TC Cr5   