
    9j0                        U d dl mZ d dlZd dlZd dlmZmZ d dlmZ d dl	Z	d dl
Zd dlmZ d dlZd dlmZ ddlmZmZ ddlmZmZmZmZmZmZmZ dd	lmZmZ dd
l m!Z! ddl"m#Z#m$Z$ erd dl%m&Z&m'Z' dZ(da)de*d<   ejV                   G d d             Z,ejV                   G d d             Z-ddZ.d dZ/ej`                  d!d       Z1	 	 	 	 	 	 	 	 	 	 d"dZ2 G d de      Z3d#dZ4 G d d      Z5	 	 	 	 d$dZ6	 	 	 	 	 	 d%dZ7y)&    )annotationsN)AnyTYPE_CHECKING)patch)
OrderedSet   )configselect_algorithm)BufferChoiceCallerLayoutMultiTemplateBufferOperationBuffer
StorageBox	TensorBox)KernelInputsMMKernelInputs)SchedulerNode)NullHandlerV)	GeneratorSequencedistributed_autotunedist.ProcessGroup | None_AUTOTUNE_PGc                  .    e Zd ZU dZdZded<   dZded<   y)_DistributedAutotuneStatezA
    State used to track autotuning during a graph_context()
    r   intautotuned_indexautotuned_local_countN)__name__
__module____qualname____doc__r   __annotations__r         d/media/conek/DATA/Code/OCR/venv/lib/python3.12/site-packages/torch/_inductor/distributed_autotune.pyr   r   '   s      OS "#3"r'   r   c                  "    e Zd ZU ded<   ded<   y)_DistributedAutotuneInfor   indexboollocalN)r!   r"   r#   r%   r&   r'   r(   r*   r*   6   s    JKr'   r*   c                     t        j                         r@t        j                         r,t         t         j                  j                  d      at        S y )Npt2_distributed_autotune_pg)pg_tag)distis_availableis_initializedr   distributed_c10d_new_group_with_tagr&   r'   r(   get_autotune_pgr6   <   sI    t22400DD4 E L r'   c                l    t         j                  sJ t        |       }t        |      }t	        | |       y)z
    Finish the distributed autotuning by propagating the autotuning results
    between the ranks and then replacing the placeholder with the real Buffer.
    N)r	   distributed_max_autotune_gemm_autotune_local_nodes_sync_autotune_remote_nodes)	schedulerautotune_resultschoices_by_indexs      r(   scheduler?   H   s5    
 ////,Y7-.9&67r'   c               #    K   t        t        j                  d      t              rJ t        j                  t                      	 d t        j                  t                      y# t        j                  t                      w xY ww)zd
    Wrapped around processing a graph, sets up figuring out which ranks tune
    which shapes.
    F)check_poisonedN)
isinstancer   get_distributed_autotune_stater   set_distributed_autotune_stater   r&   r'   r(   graph_contextrE   S   sg      	((>!   $$%>%@A8	((7((7s   ABA) B)BBc                   t         j                  syt               x}syt        |      dk  ryt        j
                  }|j                  }|xj                  dz  c_        ||j                         z  |j                         k(  }t        ||      t        j                  j                  t        <   |r|xj                  dz  c_        yt        j                  j                   j"                  j%                  t'        | ||            S )z
    Used by an op (like `mm`) to determine if the op should be autotuned
    locally (returns None) or remotely (returns a placeholder Buffer).
    Nr   )r	   r8   r6   lenr   distributed_autotune_stater   sizerankr*   current_nodemeta_DISTRIBUTED_AUTOTUNE_KEYr    torch	_inductorirr   create_DistributedAutotuneBuffer)namechoicesinputslayoutautotune_pgstater+   r-   s           r(   maybe_autotune_remoterY   d   s     //*,,K,
7|q((E!!E	QK$$&&+*:*:*<<E5Mu6ANN12 ##q(#??''.."48 r'   c                  X     e Zd ZU dZded<   	 	 	 	 	 	 	 	 d fdZ	 	 	 	 ddZd	dZ xZS )
rR   z
    A MultiTemplateBuffer which represents a kernel being autotuned on a
    different rank. When `schedule` is called this will be replaced by the
    "real" buffer.
    str_kernel_namec           	     b    t         |   ||| j                  g t        i              || _        y )N)choice_timings_fnunfiltered_choicesallowed_prologue_inps)super__init___dummy_choice_timingsr   r\   )selfkernel_namerU   rV   	__class__s       r(   rb   z#_DistributedAutotuneBuffer.__init__   s:     	"88!",R. 	 	
 (r'   c                    t         N)NotImplementedError)rd   _hint_overrides     r(   rc   z0_DistributedAutotuneBuffer._dummy_choice_timings   s
    
 "!r'   c                   ddl m} t        j                  t        j
                  dd      5  t        g | j                        }t        | j                  t              sJ |j                  | j                  |      } || j                  |g|j                         | j                        \  }}t        |t              sJ |cddd       S # 1 sw Y   yxY w)zu
        Given a _SerializedChoice (autotune results from another rank)
        compute the final TensorBox.
        r   )autotune_select_algorithmr<   N)r
   rl   r   objectr   graphr   original_inputsrB   rV   r   
get_choicer\   nodesr   )rd   
ser_choicerl   kernel_inputschoicebuffer_s          r(   autotunez#_DistributedAutotuneBuffer.autotune   s     	@\\!'';5 	*+BT-A-A+BCMdkk6222**4;;FF1!!##%	IFA fi000	 	 	s   BC

C)re   r[   rU   list[Buffer]rV   r   returnNone)rj   z
int | Nonery   zdict[ChoiceCaller, float])rr   _SerializedChoicery   r   )	r!   r"   r#   r$   r%   rb   rc   rw   __classcell__)rf   s   @r(   rR   rR      sU     (( ( 	(
 
( "("	""r'   rR   c                p   t               }|sJ dg|j                         z  }t        j                  j	                  || |       t        d |D              }dg|z  }d}|D ]@  }|D ]9  }t        |t              sJ ||j                     J |||j                  <   |dz  }; B ||k(  sJ d| d|        |S )zT
    Perform the all_gather to collect the autotune results from all the ranks.
    N)groupc              3  2   K   | ]  }t        |        y wrh   )rG   ).0xs     r(   	<genexpr>z_sync.<locals>.<genexpr>   s     0SV0s   r   r   zcount mismatch:  != )	r6   rI   rN   distributedall_gather_objectsumrB   r{   r+   )r=   rW   
all_states
node_countr>   check_countother_resultsrt   s           r(   r:   r:      s    
 "#K; 269I9I9K0KJ	''
4DK'X0Z00J150CK# # 	Ff&7888#FLL1999-3V\\*1K		 $V(8D&VV$r'   c                  L    e Zd ZdZddZd	dZed
d       Zedd       ZddZ	y)r{   z
    This is a serializer for the autotune choice. KernelTemplateChoice can't
    be serialized directly (the template and inputs prevent this) so we need to
    serialize it by parts and reconstruct later on.
    c                    || _         t        j                  |      | _        | j	                  |j
                        | _        y rh   )r+   r{   _template_uid_from_choicetemplate_uid_compute_kwargsdescriptionkwargs)rd   r+   rt   s      r(   rb   z_SerializedChoice.__init__   s4    
-GGO**6+=+=>r'   c                &   | j                         }i | j                  }d|v rF|j                         d   j                         d   }t	        j
                  ||d         |d   k(  |d<   i }ddlm}m}  ||      }	 |||	|||      }
|
j                  S )z=
        Deserialize the ChoiceCaller and return it.
        BLOCK_Kr   r   EVEN_K)DictKernelTemplateParamsKernelTemplateChoice)
_template_from_uidr   rq   get_sizesympygcdkernel_template_choicer   r   rt   )rd   rV   rU   templater   kextra_kwargsr   r   paramsktcs              r(   rp   z_SerializedChoice.get_choice   s    
 **, DKK
 q!**,Q/A$yyF9,=>&BSSF8')	

 *&1"8V\66Rzzr'   c                j   | si S i }| j                  d      D ]  }|j                  dd      \  }}|j                         |j                         }}|dk(  rd||<   C|dk(  rd||<   N|j                         rt        |      ||<   m|j	                  d      r|j                  d      sJ |dd	 ||<    |S )
zI
        Given a template description turn it into input kwargs.
        ,=r   TrueTFalseF')splitstripisdigitr   
startswithendswith)r   r   cfgkeyvals        r(   r   z!_SerializedChoice._compute_kwargs   s    
 I /1$$S) 	(Cyya(HCyy{CIIKCf}"s#s!#hs~~c*s||C/@@@!!Bis	( r'   c                   t        | t        j                        r<| j                  j                  dk(  ryt        d| j                  j                        t        | t        j                        ryt        dt        |              )z
        Given a ChoiceCaller figure out which template represents it. This
        is reversed by _template_from_uid().
        mmz!torch._inductor.kernel.mm.aten_mmzTODO: kernel z%torch._inductor.kernel.mm.mm_templatezTODO: )rB   r
   ExternKernelCallerrt   rS   RuntimeErrorTritonTemplateCallertype)rt   s    r(   r   z+_SerializedChoice._template_uid_from_choice  sq     f.AAB}}!!T):"]6==3E3E2H#IJJ 0 E EF:V~677r'   c                    | j                   j                  d      }t               |d      }|dd D ]  }t        ||      } |S )z2
        See _template_uid_from_choice().
        .r   r   N)r   r   globalsgetattr)rd   partsobjr   s       r(   r   z$_SerializedChoice._template_from_uid+  sO     !!'',ia!qr 	"A#q/C	"
r'   N)r+   r   rt   r   ry   rz   )rV   r   rU   r   ry   zChoiceCaller | None)r   r[   ry   zdict[str, int | str | bool])rt   r   ry   r[   )ry   r   )
r!   r"   r#   r$   rb   rp   staticmethodr   r   r   r&   r'   r(   r{   r{      s>    ?
4  0 8 8$r'   r{   c                >   g }| j                   D ]  }t        |t              s|j                  x}#t        |t              r4t        |t
              sE|j                  x}T|j                  x}c|j                  t              }|{|j                  sJ |j                         \  }}t        |j                  |      }	|j                  |	        t        j                   }
t#        |      |
j$                  k(  s!J dt#        |       d|
j$                   d       |S )zt
    Go through the nodes in the scheduler and autotune the kernels which
    should be autotuned by this rank.
    z'incorrect local autotuned nodes found (r   ))rq   rB   r   noderR   r   origin_noderL   getrM   r-   get_min_choicer{   r+   appendr   rH   rG   r    )r<   r=   r   
inner_noder   rL   info
min_choicerv   rt   rX   s              r(   r9   r9   6  s.    13  ($.))#J,j"<=*&9:%111K:$$$D-xx12<zzz
 #113
A"4::z:'A (D ((E E$?$?? 
1#6F2G1HUMhMhLiijk? r'   c                   t        | j                        D ]  \  }}t        |t              st        |j                  x}t
              s4|j                  J |j                  j                  t           }|j                  ||j                           }|j                  }t        |t              sJ |j                  }t        |t              sJ |j                  |j                  k(  sJ | j                  ||||        y)zo
    Go through the nodes in the scheduler and autotune the nodes that were
    autotuned on remote ranks.
    N)	enumeraterq   rB   r   r   rR   r   rL   rM   rw   r+   datar   r   rV   _replace_node)	r<   r>   ir   	dist_noder   out_tensorboxout_storage
out_buffers	            r(   r;   r;   i  s     Y__- D4dM*z))#Y&@0
 ((444((--.GHD%../?

/KLM',,Kk:666$))Jj/:::$$	(8(8888##J	1dCDr'   )ry   r   )r<   #torch._inductor.scheduler.Schedulerry   rz   )ry   zGenerator[None, None, None])
rS   r[   rT   zlist[ChoiceCaller]rU   rx   rV   r   ry   zTensorBox | None)r=   list[_SerializedChoice]ry   Sequence[_SerializedChoice])r<   r   ry   r   )r<   r   r>   r   ry   rz   )8
__future__r   
contextlibdataclassestypingr   r   unittest.mockr   r   torch._loggingrN   torch.distributedr   r1   torch.fxtorch.utils._ordered_setr    r	   r
   rP   r   r   r   r   r   r   r   rs   r   r   r<   r   virtualizedr   r   collections.abcr   r   rM   r   r%   	dataclassr   r*   r6   r?   contextmanagerrE   rY   rR   r:   r{   r9   r;   r&   r'   r(   <module>r      sD   "   %       / &   8 $ ' 3 3 )-& - # # #   
	8 8 8 
*4@JPB4!4 4p8Z Zz0200fD2D1D 
Dr'   