
    9jG                       d dl mZ d dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
 d dlZd dlmZmZ d dlmZmZ d dlmZmZ d d	lmZ d
dlmZ e	rd dlmZmZ d dlmZ  e
dd      Zej>                  jA                  e!d      Z"ej>                  jA                  e!d      Z#e$e%ejL                  z  dz     Z'ee$e   ge'f   Z( G d d      Z) ejT                  dd       G d d             Z+ ejT                  dd       G d d             Z, ejT                  dd       G d d             Z-	 	 	 	 d7dZ.d8dZ/d9dZ0d:d Z1d;d!Z2	 	 	 	 	 	 d<d"Z3	 	 	 	 	 	 	 	 d=d#Z4d>d$Z5	 	 	 	 d?d%Z6	 	 	 	 d?d&Z7d@d'Z8ejT                   G d( d)             Z9	 	 	 	 	 	 	 	 	 	 dAd*Z:dBd+Z; G d, d-e      Z<	 	 	 	 	 	 	 	 	 	 	 	 dCd.Z=	 	 	 	 	 	 dDd/Z> ejT                  d0       G d1 d2             Z? ejT                  d0       G d3 d4             Z@	 	 	 	 	 	 dEd5ZAdFd6ZBy)G    )annotationsN)Callable)Enum)AnyTYPE_CHECKINGTypeVar)countersget_metrics_context)GraphPartitionMap	InputType)get_plain_tensorsis_fake)
OrderedSet   )is_using_cudagraph_partition)SequenceSet)
OutputCode_OCr   )bound
cudagraphscudagraph_static_inputsc                  H    e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZddZddZy)	CUDAGraphPolicya^  Pluggable policy controlling CUDA graph wrapping in Inductor's post_compile.

    Override methods to customize:
      - HOW compiled functions are cudagraph-wrapped (cudagraphify)
      - WHETHER inner CompiledFxGraphs should be wrapped (should_wrap)
      - OUTER wrapping of compound outputs like RegionalOutputCode (wrap_output)

    Set via ``torch._inductor.config.cudagraph_policy``.  When ``None``
    (the default), the existing built-in behaviour is used unchanged.

    Example usage::

        class MyCUDAGraphPolicy(CUDAGraphPolicy):
            def cudagraphify(self, model, example_inputs, static_input_idxs, **kwargs):
                return my_custom_wrapper(model, example_inputs, static_input_idxs)


        with torch._inductor.config.patch("cudagraph_policy", MyCUDAGraphPolicy()):
            compiled_fn = deserialize_artifacts(...)
    c               *    ddl m}  |||f|||d|S )a  Wrap a single compiled callable with CUDA graph capture/replay.

        Called by ``cudagraph_post_compile`` for each ``CompiledFxGraph``.
        The default delegates to ``compile_fx.cudagraphify`` (cudagraph_trees).

        ``example_inputs`` are the example inputs at post_compile time.
        The default implementation does not forward them because
        ``compile_fx.cudagraphify`` defers graph recording to the first
        real call via an inner closure.  Subclasses that need the
        example inputs for warmup or static-input detection may use them.

        When ``config.graph_partition=True``, setting a CUDAGraphPolicy
        bypasses ``cudagraph_partition_post_compile`` (which wraps each
        partition individually) and routes through ``cudagraph_post_compile``
        instead, so this method wraps the *entire* callable, not individual
        partitions.  Subclasses that need per-partition control should
        handle partitioning internally.
        r   )cudagraphify)device_indexis_backwardis_inference)torch._inductor.compile_fxr   )	selfmodelexample_inputsstatic_input_idxsr   r   r   kwargsr   s	            _/media/conek/DATA/Code/OCR/venv/lib/python3.12/site-packages/torch/_inductor/cudagraph_utils.pyr   zCUDAGraphPolicy.cudagraphify:   s4    : 	<
 &#%
 
 	
    c                     y)aQ  Whether to apply cudagraph wrapping to this CompiledFxGraph.

        Called for each inner ``CompiledFxGraph`` during ``post_compile``.
        Return ``False`` to skip wrapping (e.g. when wrapping at the outer
        level via ``wrap_output`` instead).

        Default: ``True`` (wrap everything, same as current behaviour).
        T )r!   compiled_graphs     r&   should_wrapzCUDAGraphPolicy.should_wrapb   s     r'   c                    |S )a  Optional outer-level wrapping after inner post_compile completes.

        Called by ``_compile_fx_inner``, ``BundledOutputCodeLoadable.post_compile``,
        and ``FxGraphCacheLoadable.post_compile`` on the ``OutputCode`` returned
        from ``post_compile``.  Subclasses that only want to wrap specific
        output types should check ``isinstance`` and return the input
        unchanged for types they don't handle.

        Default: identity (no outer wrapping).
        r)   )r!   output_codes     r&   wrap_outputzCUDAGraphPolicy.wrap_outputm   s
     r'   N)r"   Callable[..., Any]r#   zSequence[InputType]r$   Sequence[int]r   intr   boolr   r2   r%   r   returnr/   )r*   r   r3   r2   )r-   r   r3   r   )__name__
__module____qualname____doc__r   r+   r.   r)   r'   r&   r   r   $   se    *&
!&
 ,&
 )	&
 &
 &
 &
 &
 
&
P	r'   r   T)frozenslotsc                      e Zd ZU dZded<   y)
FunctionIDz9Unique counter of a function wrapped in cudagraphify_implr1   idNr4   r5   r6   r7   __annotations__r)   r'   r&   r;   r;   {   s
    ?Gr'   r;   c                  :    e Zd ZU dZded<   ded<   ded<   ded<   y	)
PlaceholderInfoz
    A serializable version of torch.fx.Node that contains information
    pertinent to placeholder stack traces. We use these in logging and error messages
    related to cudagraphs, and will cache these results.
    strname
str | Nonestack_tracelist[PlaceholderInfo]usersmutating_use_stack_traceNr=   r)   r'   r&   r@   r@      s      I  ((r'   r@   c                  N    e Zd ZU dZded<   ded<   ded<   ded	<   d
ed<   ded<   y)WrappedFunctionz
    Represents a function that you want to record for CUDA graph replay,
    with a little more metadata so we can identify if we have an applicable
    CUDA graph in our CUDA graph tree for it.
    r/   r"   r0   r$   r;   r<   ztuple[torch.Tensor, ...]	constantsSequence[PlaceholderInfo]placeholdersmutated_input_idxsNr=   r)   r'   r&   rI   rI      s,     $$N''++%%r'   rI   c                   t        | j                        dk(  r8t        t        | j                              j                  j                  dd       S | j                  D ]_  }|j                  t        j                  j                  j                  j                  u s>|j                  j                  dd       x}s]|c S  y )Nr   rD   )lenrF   nextitermetagettargettorchopsatencopy_default)placeholder_nodeuserD   s      r&   &get_mutating_use_stack_trace_from_noder\      s     !!"a'D)//0166::=$OO%% #::--555!hhll=$??{?""#
 r'   c                    | j                   S N)rG   )placeholder_infos    r&   get_mutating_use_stack_tracer`      s    444r'   c                    | j                   }| j                  j                  dd       }g }d }| j                  dk(  r-| j                  D cg c]  }t        |       }}t        |       }t        ||||      S c c}w )NrD   placeholder)rB   rR   rS   oprF   to_placeholder_infor\   r@   )rZ   rB   rD   rF   rG   is         r&   rd   rd      s      D"''++M4@KE#m+1A1G1GHA$Q'HH#I$
  4e5MNN Is   
A7c                r    | j                   D cg c]  }|j                  dk(  st        |       c}S c c}w )Nrb   )nodesrc   rd   )graphnodes     r&   get_placeholder_inforj      s4    .3kk&*TWW=UD!  s   44c                    d|  S )Nzskipping cudagraphs due to r)   )reasons    r&   format_default_skip_messagerm      s    (11r'   c                    d}|D ]  }| |   }t        |      x}s n t        dt        |       d      }|r| d| S |S )N zmutated inputs (z instances). Found from : 
 )r`   rm   rO   )rL   mutation_indicesrD   idxrb   msgs         r&   get_mutation_stack_tracert      sp     !K "3'6{CC;C
 &
3/01=C (66Jr'   c                   t         j                  j                  j                  j                  r3| j
                  D cg c]  }|| j                  v s |||         s| }}n| j
                  }t        j                  d| j                         t        j                  d|       |rt        | j                  |      S d S c c}w )Nz'check mutation static input indices: %sz#check mutation mutation indices: %s)rU   	_inductorconfigtritoncudagraph_treesrM   r$   static_inputs_logdebugrt   rL   )funcinputsis_cuda_graph_recorded_tensorrr   rq   s        r&   check_for_mutationr      s     $$44 ..+
t---0=	 +
 +
  22143I3I ACST  	!!2!24DE !+
s   "B>c                j    | j                   D ]$  }|j                  j                  dd       x}s"|c S  y )NrD   )rF   rR   rS   )ri   r[   rD   s      r&   _get_use_stack_tracer      s:    zz ((,,}d;;;; r'   c                .   | j                  t        j                  d      d        t               r%| j                  t        j                  d      d        | j	                  t        j                  d            x}r8d|j
                   d}t        |      x}rt        | d|       S t        |      S t        |       dk(  r0t        t        | j                                     j                  dk(  ry d | D        }t        d	d
j                  |             S )NrR   cpuzcpu device ()rp   r   cudac              3  2   K   | ]  }t        |        y wr^   )repr).0keys     r&   	<genexpr>z:check_multiple_devices_or_any_cpu_nodes.<locals>.<genexpr>  s     :sc:s   zmultiple devices: z, )poprU   devicer   rS   rB   r   rm   rO   rP   rQ   keystypejoin)device_node_mappingcpu_noders   rD   	keys_reprs        r&   'check_multiple_devices_or_any_cpu_nodesr     s     ELL0$7 $%U 3T:&**5<<+>??x?X]]O1-.x88;8.#6H/VWW*3// 	 A%)..012776A:&9:I&);DIIi<P;Q'RSSr'   c                    t        |       S r^   )r   )r   s    r&    check_lowering_disable_cudagraphr   #  s     33FGGr'   c                &   t         j                  |        t        d   dxx   dz  cc<   t        j                  j
                  j                  j                  rt        |       t               }|j                         r|j                  d| d       y y )Ninductorcudagraph_skipsr   cudagraph_skip_reasonT)	overwrite)cudagraphs_logwarningr	   rU   rv   rw   rx   cudagraph_or_errorRuntimeErrorr
   in_progressset)rs   metrics_contexts     r&   #log_cudagraph_skip_and_bump_counterr   )  sy    3Z*+q0+$$773)+O""$3SDI %r'   c                       e Zd ZU ded<   ddZy)BoxedDeviceIndex
int | Nonevaluec                :    |t        |t              sJ || _        y r^   )
isinstancer1   r   )r!   
device_idxs     r&   r   zBoxedDeviceIndex.set9  s    !Z
C%@@@
r'   N)r   r   r3   None)r4   r5   r6   r>   r   r)   r'   r&   r   r   5  s     r'   r   c                H   t        d      }t        j                  j                  j                  j
                  rQt        |      }|D cg c]	  }||vs| }}t        |      dk7  }|sy t        | j                        }	t        |	|      S t        |      dk7  }|sd S |S c c}w )Nzmutated inputsr   )rm   rU   rv   rw   rx   ry   r   rO   rj   rh   rt   )
gmmutated_inputsrM   r$   default_msgunique_idxsrr   rq   has_mutationrL   s
             r&   3check_for_mutation_ignore_cuda_graph_managed_tensorr   >  s     ..>?K $$44 !23+=XCKAWCXX+,1+BHH5'6FGG >*a/'t8[8 Ys   		BBc                    | j                   r| j                   S | j                  D ]  }|j                   s|j                   c S  y)zM
    Gets the first non-empty stack trace of a placeholder or its users.
    N)rD   rF   )rb   users     r&   get_placeholder_stack_tracer   V  sH     &&&!! $###$ r'   c                  $    e Zd ZdZdZdZdZddZy)CheckInvariantStatusr            c                    | j                   dk(  ry| j                   dk(  ry| j                   dk(  ry| j                    d| j                   S )NCudagraphManagedIdxMismatchz-cudagraph managed tensor data pointer changedStaticInputIdxMismatchz!static input data pointer changed&ExpectedDeadIndicesBeforeGraphMismatchz+expected dead indices before graph are livez: )rB   r   )r!   s    r&   __str__zCheckInvariantStatus.__str__q  sK    9955BYY226YYBB@ii[4::,//r'   Nr3   rA   )r4   r5   r6   SUCCESSr   r   r   r   r)   r'   r&   r   r   d  s$    G #$  ./*0r'   r   c                   t        |      t        |      k(  rt        |      t        |       k(  sJ d       |D cg c]  }||   	 }}|D cg c]  }||   	 }}| d}t        t        ||            D ]t  \  }\  }	}
t        |	t        j
                        sJ ||   }|	j                         |
k7  s>| |   }| d|j                   d|
 d|	j                          dt        |       d
}v |S c c}w c c}w )z}
    Logs the mismatch between input data pointers and recorded data pointers.
    This checks only idxs in target_idxs.
    zClength mismatch between inputs, recorded_data_ptr, and placeholdersz.
zinput name: z. data pointer changed from z to z. input stack trace: 
)	rO   	enumeratezipr   rU   Tensordata_ptrrB   r   )rL   r}   recorded_data_ptrtarget_idxsmismatchre   	t_tensorst_data_ptrs	error_msgtensorr   indexrb   s                r&   log_data_ptr_mismatchr   |  s'    v;#/00S[CDU5U MU %00q0I01<=A$Q'=K=*C I!*3y++F!G 	FH&%,,///A??(&u-K+\+*:*:); <--5Jd6??;L:M N&&A+&N%OrS 	  1=s   C-C2c                >   t        | j                               dz   dfd}t        j                  j                  j
                  j                  rLt        j                  j                  j
                  j                  kD  rt        j                   |              yy)Nr   c                     d  dS )NzCUDAGraph supports dynamic shapes by recording a new graph for each distinct input size. Recording too many CUDAGraphs may lead to extra overhead. We have observed a0   distinct sizes. Please consider the following options for better performance: a) padding inputs to a few fixed number of shapes; or b) set torch._inductor.config.triton.cudagraph_skip_dynamic_graphs=True. Set torch._inductor.config.triton.cudagraph_dynamic_shape_warn_limit=None to silence this warning.r)   )num_cudagraphss   r&   warn_msgz4maybe_warning_due_to_dynamic_shape.<locals>.warn_msg  s    00>/? @''		
r'   TFr   )	rO   r   rU   rv   rw   rx   "cudagraph_dynamic_shape_warn_limitr   r   )fn_cachenew_int_keyr   r   s      @r&   "maybe_warning_due_to_dynamic_shaper     st     )A-N

 	%%HH
//
 
 
'
'
J
JK 	xz*r'   )r8   c                  0    e Zd ZU dZded<   ded<   ded<   y)	CudagraphCachedInfoz'
    Info needed to realign inputs
    rK   rL   list[str | None]stack_tracesz	list[str]cudagraph_fail_reasonsNr=   r)   r'   r&   r   r     s     ,+""%%r'   r   c                  D    e Zd ZU dZded<   ded<   ded<   ded<   d	ed
<   y)CudagraphMetadataz.
    Metadata for recording a CUDA graph.
    rK   rL   OrderedSet[int]r$   rM   r   r   zdict[str, torch.Tensor]rJ   Nr=   r)   r'   r&   r   r     s'     ,+&&''""&&r'   r   c                h   g }t               }t               }t        | j                        D ]  \  }}||j                  v r|j	                  |       ||j
                  v r|j	                  |       ||j                  |   }nt        d| j                   d| dg d      }|j                  |        g }| j                  D ]4  }	|	|j                  |j                  |	          $|j                  d       6 | j                  D 
ci c]  }
|
|j                  |
    }}
t        |||||      S c c}
w )z
    Convert the cudagraph metadata at the graph level to the graph partition level,
    given the graph partition info (i.e., mapping from partition input/output index
    to graph input/output index).
    N
partition__placeholder_)rB   rD   rF   rG   )r   r   input_index_mappingr$   addrM   rL   r@   r<   appendoutput_index_mappingr   constant_namesrJ   r   )partition_mapmetadatapartition_placeholderspartition_static_input_idxspartition_mutated_input_idxspartition_input_idxgraph_input_idxrb   partition_stack_tracesgraph_output_idxrB   partition_constantss               r&    get_partition_cudagraph_metadatar     sp     3=<4>L 09))1 3,_ h888'++,?@h999(,,-@A&"//@K *!-"2"2!3=AT@UV )-	K 	%%k2'3*  )>> 0'"))(*?*?@P*QR"))$/	0 4A3O3O+/h  &&  #$ 	s   D/c                ~   t        | t        j                        s
t               S t               }t	        | g       D ]o  }t        |      t        j                  urt        |      s%|j                  s|j                  j
                  dk7  rP	 |j                  |j                                q |S # t        $ r Y w xY w)zODebug helper that collects the data pointers of all CUDA tensors in the object.)outr   )r   rU   r   r   r   r   r   is_metar   r   r   	Exception)objptrsbases      r&   collect_cuda_data_ptrsr     s    c5<<(|&LD!#2. :U\\)4=DLLDKK,<,<,F	HHT]]_% K  		s   B00	B<;B<)rZ   torch.fx.Noder3   rC   )r_   r@   r3   rC   )rZ   r   r3   r@   )rh   ztorch.fx.Graphr3   rE   )rl   rA   r3   rA   )rL   rK   rq   z AbstractSet[int] | Sequence[int]r3   rA   )r|   rI   r}   list[InputType]r~   zCallable[[torch.Tensor], bool]r3   rC   )ri   r   r3   rC   )r   z!dict[torch.device, torch.fx.Node]r3   rC   )rs   rA   r3   r   )
r   ztorch.fx.GraphModuler   zOrderedSet[str]rM   r   r$   r0   r3   rC   )rb   r@   r3   rC   )rL   rK   r}   r   r   zSequence[int | None]r   r0   r   r   r3   rA   )r   z)dict[tuple[int, ...], Callable[..., Any]]r   r   r3   r2   )r   r   r   r   r3   r   )r   objectr3   r   )C
__future__r   dataclassescollections.abcr   enumr   typingr   r   r   rU   torch._dynamo.utilsr	   r
   torch._inductor.utilsr   r   torch._subclasses.fake_tensorr   r   torch.utils._ordered_setr   utilsr   r   r   AbstractSettorch._inductor.output_coder   r   _logginggetArtifactLoggerr4   r   rz   listr1   r   
OutputType	ModelTyper   	dataclassr;   r@   rI   r\   r`   rd   rj   rm   rt   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r)   r'   r&   <module>r     s   "  $  . .  = > D / / <6e<( 11(LINN44' 
 #$t+,
d9o&
23	T Tn d$/  0 d$/) ) 0) d$/& & 0&#5O2+6 	(
 $B 	>T:TT8H:HH	J      99#9 (9 %	9
 9004 00+ , 	
 # 	>7 
: d#& & $& d#	' 	' $	'3$33 3lr'   