
    9jt                       U d dl mZ d dlZd dlZd dlmZ d dlmZmZmZm	Z	 d dl
mZmZmZ d dlZd dlmZ erd dlmZ dd	lmZ g d
Z ed      Z ed      Z eej0                  d      s] ed      ej0                  j2                  d<    ed      ej0                  j2                  d<    ed      ej0                  j2                  d<   d dlmZmZmZ ddZd dZ G d de      Z  G d d      Z!e	dede"f   f   Z#de$d<   e	 	 	 d!	 	 	 	 	 	 	 	 	 	 	 d"d       Z%e	 	 	 d!	 	 	 	 	 	 	 	 	 	 	 d#d       Z%	 	 	 d!	 	 	 	 	 	 	 	 	 	 	 d$dZ%y)%    )annotationsN)Callable)overloadTYPE_CHECKING	TypeAliasUnion)	ParamSpecSelfTypeVar)Tensor)_POOL_HANDLE   )_dummy_type)is_current_stream_capturinggraph_pool_handle	CUDAGraphgraphmake_graphed_callables_R_P_CudaStreamBase
_CUDAGraph_graph_pool_handle_cuda_isCurrentStreamCapturing)r   r   r   c                     t               S )zReturn True if CUDA graph capture is underway on the current CUDA stream, False otherwise.

    If a CUDA context does not exist on the current device, returns False without initializing the context.
    )r        Q/media/conek/DATA/Code/OCR/venv/lib/python3.12/site-packages/torch/cuda/graphs.pyr   r   -   s    
 *++r   c                 P    t         j                  j                  t                     S )zReturn an opaque token representing the id of a graph memory pool.

    See :ref:`Graph memory management<graph-memory-management>`.

    .. warning::
        This API is in beta and may change in future releases.
    )torchcudar   r   r   r   r   r   r   6   s     ::""#5#788r   c                       e Zd ZdZdd fdZ	 d	 	 	 	 	 d fdZd fdZd fdZd fdZd fdZ	d fdZ
d fd	Zd fd
Zd fdZd fdZ xZS )r   a-  Wrapper around a CUDA graph.

    Arguments:
        keep_graph (bool, optional): If ``keep_graph=False``, the
            cudaGraphExec_t will be instantiated on GPU at the end of
            ``capture_end`` and the underlying cudaGraph_t will be
            destroyed. Users who want to query or otherwise modify the
            underlying cudaGraph_t before instantiation can set
            ``keep_graph=True`` and access it via ``raw_cuda_graph`` after
            ``capture_end``. Note that the cudaGraphExec_t will not be
            instantiated at the end of ``capture_end`` in this
            case. Instead, it will be instantiated via an explicit called
            to ``instantiate`` or automatically on the first call to
            ``replay`` if ``instantiate`` was not already called. Calling
            ``instantiate`` manually before ``replay`` is recommended to
            prevent increased latency on the first call to ``replay``. It
            is allowed to modify the raw cudaGraph_t after first calling
            ``instantiate``, but the user must call ``instantiate`` again
            manually to make sure the instantiated graph has these
            changes. Pytorch has no means of tracking these changes.

    .. warning::
        This API is in beta and may change in future releases.

    c                $    t         |   | |      S N)super__new__)cls
keep_graph	__class__s     r   r&   zCUDAGraph.__new__]   s    wsJ//r   c                (    t         |   ||       y)a  Begin capturing CUDA work on the current stream.

        Typically, you shouldn't call ``capture_begin`` yourself.
        Use :class:`~torch.cuda.graph` or :func:`~torch.cuda.make_graphed_callables`,
        which call ``capture_begin`` internally.

        Arguments:
            pool (optional): Token (returned by :func:`~torch.cuda.graph_pool_handle` or
                :meth:`other_Graph_instance.pool()<torch.cuda.CUDAGraph.pool>`) that hints this graph may share memory
                with the indicated pool.  See :ref:`Graph memory management<graph-memory-management>`.
            capture_error_mode (str, optional): specifies the cudaStreamCaptureMode for the graph capture stream.
                Can be "global", "thread_local" or "relaxed". During cuda graph capture, some actions, such as cudaMalloc,
                may be unsafe. "global" will error on actions in other threads, "thread_local" will only error for
                actions in the current thread, and "relaxed" will not error on these actions. Do NOT change this setting
                unless you're familiar with `cudaStreamCaptureMode <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85>`_
        )poolcapture_error_modeN)r%   capture_begin)selfr+   r,   r)   s      r   r-   zCUDAGraph.capture_begin`   s    & 	4<NOr   c                "    t         |           y)aG  End CUDA graph capture on the current stream.

        After ``capture_end``, ``replay`` may be called on this instance.

        Typically, you shouldn't call ``capture_end`` yourself.
        Use :class:`~torch.cuda.graph` or :func:`~torch.cuda.make_graphed_callables`,
        which call ``capture_end`` internally.
        N)r%   capture_endr.   r)   s    r   r0   zCUDAGraph.capture_endu   s     	r   c                "    t         |           y)a$  Instantiate the CUDA graph. Will be called by
        ``capture_end`` if ``keep_graph=False``, or by ``replay`` if
        ``keep_graph=True`` and ``instantiate`` has not already been
        explicitly called. Does not destroy the cudaGraph_t returned
        by ``raw_cuda_graph``.
        N)r%   instantiater1   s    r   r3   zCUDAGraph.instantiate   s     	r   c                "    t         |           y)z,Replay the CUDA work captured by this graph.N)r%   replayr1   s    r   r5   zCUDAGraph.replay   s    r   c                "    t         |           y)z1Delete the graph currently held by this instance.N)r%   resetr1   s    r   r7   zCUDAGraph.reset   s    r   c                     t         |          S )zReturn an opaque token representing the id of this graph's memory pool.

        This id can optionally be passed to another graph's ``capture_begin``,
        which hints the other graph may share the same memory pool.
        )r%   r+   r1   s    r   r+   zCUDAGraph.pool   s     w|~r   c                     t         |          S )z/Enable debugging mode for CUDAGraph.debug_dump.)r%   enable_debug_moder1   s    r   r:   zCUDAGraph.enable_debug_mode   s    w(**r   c                "    t         |   |      S )z
        Arguments:
            debug_path (required): Path to dump the graph to.

        Calls a debugging function to dump the graph if the debugging is
        enabled via CUDAGraph.enable_debug_mode()
        )r%   
debug_dump)r.   
debug_pathr)   s     r   r<   zCUDAGraph.debug_dump   s     w!*--r   c                     t         |          S )a}  Returns the underlying cudaGraph_t. ``keep_graph`` must be True.

        See the following for APIs for how to manipulate this object: `Graph Managmement <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html>`_ and `cuda-python Graph Management bindings <https://nvidia.github.io/cuda-python/cuda-bindings/latest/module/runtime.html#graph-management>`_
        )r%   raw_cuda_graphr1   s    r   r?   zCUDAGraph.raw_cuda_graph   s    
 w%''r   c                     t         |          S )a  Returns the underlying cudaGraphExec_t. ``instantiate`` must have been called if ``keep_graph`` is True, or ``capture_end`` must have been called if ``keep_graph`` is False. If you call ``instantiate()`` after ``raw_cuda_graph_exec()``, the previously returned cudaGraphExec_t will be destroyed. It is your responsibility not to use this object after destruction.

        See the following for APIs for how to manipulate this object: `Graph Execution <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH__EXEC.html>`_ and `cuda-python Graph Execution bindings <https://nvidia.github.io/cuda-python/cuda-bindings/latest/module/runtime.html#graph-execution>`_
        )r%   raw_cuda_graph_execr1   s    r   rA   zCUDAGraph.raw_cuda_graph_exec   s    
 w*,,r   )F)r(   boolreturnr
   )Nglobal)r+   _POOL_HANDLE | Noner,   strrC   NonerC   rG   rC   r   )r=   rF   rC   rG   )rC   int)__name__
__module____qualname____doc__r&   r-   r0   r3   r5   r7   r+   r:   r<   r?   rA   __classcell__)r)   s   @r   r   r   B   sg    40 KSP'PDGP	P*	+.(- -r   r   c                  T    e Zd ZU dZdZded<   	 	 	 	 d	 	 	 	 	 	 	 	 	 d	dZd
dZddZy)r   a	  Context-manager that captures CUDA work into a :class:`torch.cuda.CUDAGraph` object for later replay.

    See :ref:`CUDA Graphs <cuda-graph-semantics>` for a general introduction,
    detailed use, and constraints.

    Arguments:
        cuda_graph (torch.cuda.CUDAGraph): Graph object used for capture.
        pool (optional): Opaque token (returned by a call to :func:`~torch.cuda.graph_pool_handle()` or
            :meth:`other_Graph_instance.pool()<torch.cuda.CUDAGraph.pool>`) hinting this graph's capture
            may share memory from the specified pool. See :ref:`Graph memory management<graph-memory-management>`.
        stream (torch.cuda.Stream, optional): If supplied, will be set as the current stream in the context.
            If not supplied, ``graph`` sets its own internal side stream as the current stream in the context.
        capture_error_mode (str, optional): specifies the cudaStreamCaptureMode for the graph capture stream.
            Can be "global", "thread_local" or "relaxed". During cuda graph capture, some actions, such as cudaMalloc,
            may be unsafe. "global" will error on actions in other threads, "thread_local" will only error for
            actions in the current thread, and "relaxed" will not error on actions. Do NOT change this setting
            unless you're familiar with `cudaStreamCaptureMode <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85>`_
        enable_annotations (bool, optional): If ``True``, enables kernel annotation
            recording on entry and automatically calls
            :func:`~torch.cuda._graph_annotations.resolve_pending_annotations` before
            the capture ends.  Annotations are **not** cleared on exit so that multiple
            graphs in the same workload can accumulate annotations.
            Requires ``cuda.bindings`` package and cuda-compat >= 13.1 or CUDA driver >= 13.1.

    .. note::
        For effective memory sharing, if you pass a ``pool`` used by a previous capture and the previous capture
        used an explicit ``stream`` argument, you should pass the same ``stream`` argument to this capture.

    .. warning::
        This API is in beta and may change in future releases.

    .. _cudaStreamCaptureMode:
        https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85
    Ntorch.cuda.Stream | Nonedefault_capture_streamc                   |C| j                   j                  -t        j                  j	                         | j                   _        |dn|f| _        ||n| j                   j                  | _        | j                  t        d      t        j                  j                  | j                        | _	        || _
        || _        || _        y )Nr   zcapture_stream must not be None)r)   rR   r    r!   Streamr+   capture_streamAssertionErrorstream
stream_ctx
cuda_graphr,   _enable_annotations)r.   rY   r+   rW   r,   enable_annotationss         r   __init__zgraph.__init__   s     >dnnCCK49JJ4E4E4GDNN1;?<RdW	(Fdnn.S.S 	 & !BCC**++D,?,?@$"4#5 r   c                   t         j                  j                          t         j                  j                  j
                  rt        j                          t         j                  j                          t         j                  j                          | j                  rddlm}  |        | j                  j                           | j                   j"                  | j$                  d| j&                  i y )Nr   )r[   r,   )r    r!   synchronizecompilerconfigforce_cudagraph_gcgccollectempty_cache_C_host_emptyCacherZ   torch.cuda._graph_annotationsr[   rX   	__enter__rY   r-   r+   r,   )r.   _enable_anns     r   rh   zgraph.__enter__   s    

 >>  33 JJL

 !!###WM 	!!#%%%YY	
  $66		
r   c                    | j                   rddlm}  |        | j                  j	                           | j
                  j                  |  | j                   rddlm}  || j                         y y )Nr   )resolve_pending_annotations)remap_to_exec_graph)rZ   rg   rk   rY   r0   rX   __exit__rl   )r.   argsrk   rl   s       r   rm   zgraph.__exit__  sU    ##Q')##%   $'##I0 $r   )NNrD   F)
rY   r   r+   rE   rW   rQ   r,   rF   r[   rB   rH   )rn   objectrC   rG   )	rK   rL   rM   rN   rR   __annotations__r\   rh   rm   r   r   r   r   r      sd    !F 8<4;
 %)+/"*#(66 "6 )	6
  6 !62
@1r   r   torch.nn.Module.r   _ModuleOrCallablec                     y r$   r   	callablessample_argsnum_warmup_itersallow_unused_inputr+   s        r   r   r   (  s     r   c                     y r$   r   rt   s        r   r   r   2  s     %(r   c                |   t        j                         rt        j                         rt        d      d}t	        | t
              s*d}| f} t        j                  t
        t        df   |      f}n,t        j                  t
        t
        t        df   df   |      }g }t        | |      D ]  \  }}	t	        |t         j                  j                        r~t        |j                        dk(  r0t        |j                        dk(  rt        |j                        dk(  st!        d      t#        d |j%                         D              st!        d      t        j&                  j(                  j*                  |	 }
|j-                  t        |
             t#        d	 |
D              rt!        d
       |D 	cg c]  }	t        |	       }}	| D cg c]A  }t	        |t         j                  j                        rt        |j/                               ndC }}t1        t        |             D cg c]  }||   ||   z    }}t1        t        |             D cg c]   }t         j2                  j5                         " }}t1        t        |             D cg c]   }t         j2                  j5                         " }}|
t7               n|}t         j2                  j9                          t         j2                  j;                  t         j2                  j=                               5  t        | ||      D ]  \  }}	}d\  }}}t1        |      D ]  }t         j&                  j(                  j?                   ||	       }t        d |D              }t        |      dkD  sPt         j@                  jC                  |t        d |D              t        d |D              d|      } |||fD ]  }~  	 ddd       t         j2                  j9                          g }g }t        | ||      D ]  \  }}	}t         j2                  jE                  ||      5   ||	 }ddd       t         j&                  j(                  jG                        \  }}|j-                  t        |             |j-                  |        g }g } t        tI        |      tI        |      tI        |            D ]  \  }}!}"t        d |!D              }#t        d |!D              }d}t        |      dkD  rnt         j2                  jE                  |"|      5  t         j@                  jC                  |t        d |D              t        d |#D              d|      }ddd       g }$d}%|D ];  }&|&jJ                  r||$j-                  ||%          |%dz  }%+|$j-                  d       = t        |$      }$|j-                  |#       | j-                  |$        |jM                          | jM                          	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd}'g }(tO        |       D ]  \  }} |'||   ||   ||   ||   ||   ||   ||   ||   | |   	      })t	        |t         j                  j                        rD	 	 	 	 	 	 	 	 	 	 dd}* |*||jP                  |)|jR                        |_)        |(j-                  |       |(j-                  |)        |r|(d   S t        |(      S c c}	w c c}w c c}w c c}w c c}w # 1 sw Y   .xY w# 1 sw Y   xY w# 1 sw Y   xY w)a  Accept callables (functions or :class:`nn.Module<torch.nn.Module>`\ s) and returns graphed versions.

    Each graphed callable's forward pass runs its source callable's
    forward CUDA work as a CUDA graph inside a single autograd node.

    The graphed callable's forward pass also appends
    a backward node to the autograd graph. During backward, this node runs the
    callable's backward work as a CUDA graph.

    Therefore, each graphed callable should be a drop-in replacement for its source callable
    in an autograd-enabled training loop.

    See :ref:`Partial-network capture<partial-network-capture>` for detailed use and constraints.

    If you pass a tuple of several callables, their captures will use the same memory pool.
    See :ref:`Graph memory management<graph-memory-management>` for when this is appropriate.

    Arguments:
        callables (torch.nn.Module or Python function, or tuple of these): Callable or callables to graph.
            See :ref:`Graph memory management<graph-memory-management>` for when passing a tuple of callables
            is appropriate.  If you pass a tuple of callables, their order in the tuple must be the same order
            they'll run in the live workload.
        sample_args (tuple of Tensors, or tuple of tuples of Tensors): Samples args for each callable.
            If a single callable was passed, ``sample_args`` must be a single tuple of argument Tensors.
            If a tuple of callables was passed, ``sample_args`` must be tuple of tuples of argument Tensors.
        num_warmup_iters (int): The number of warmup iterations. Currently, ``DataDistributedParallel`` needs
            11 iterations for warm up. Default: ``3``.
        allow_unused_input (bool): If False, specifying inputs that were not used when computing outputs
            (and therefore their grad is always zero) is an error. Defaults to False.
        pool (optional): Token (returned by :func:`~torch.cuda.graph_pool_handle` or
            :meth:`other_Graph_instance.pool()<torch.cuda.CUDAGraph.pool>`) that hints this graph may share memory
            with the indicated pool.  See :ref:`Graph memory management<graph-memory-management>`.
    .. note::
        The ``requires_grad`` state of each Tensor in ``sample_args`` must match the state
        that's expected for the corresponding real input in the training loop.

    .. warning::
        This API is in beta and may change in future releases.

    .. warning::
        ``sample_args`` for each callable must contain only Tensors. Other types are not allowed.

    .. warning::
        Returned callables do not support higher order differentiation (e.g., double backward).

    .. warning::
        In any :class:`~torch.nn.Module` passed to :func:`~make_graphed_callables`, only parameters
        may be trainable. Buffers must have ``requires_grad=False``.

    .. warning::
        After you pass a :class:`torch.nn.Module` through :func:`~make_graphed_callables`,
        you may not add or remove any of that Module's parameters or buffers.

    .. warning::
        :class:`torch.nn.Module`\s passed to :func:`~torch.cuda.make_graphed_callables` must not have module hooks
        registered on them at the time they are passed. However, registering hooks on modules *after* passing them
        through :func:`~torch.cuda.make_graphed_callables` is allowed.

    .. warning::
        When running a graphed callable, you must pass its arguments in the same order and format
        they appeared in that callable's ``sample_args``.

    .. warning::
        The automatic mixed precision is supported in :func:`~torch.cuda.make_graphed_callables` only with disabled
        caching. The context manager `torch.cuda.amp.autocast()` must have `cache_enabled=False`.
    z_make_graphed_callables does not support the autocast caching. Please set `cache_enabled=False`.FT.r   zModules must not have hooks registered at the time they are passed. However, registering hooks on modules after passing them through make_graphed_callables is allowed.c              3  8   K   | ]  }|j                   d u   yw)FNrequires_grad.0bs     r   	<genexpr>z)make_graphed_callables.<locals>.<genexpr>  s     EAq%/Es   zIn any :class:`~torch.nn.Module` passed to :func:`~make_graphed_callables`, only parameters may be trainable. All buffers must have ``requires_grad=False``.c              3  P   K   | ]  }t        |t        j                           y wr$   )
isinstancer    r   )r   args     r   r   z)make_graphed_callables.<locals>.<genexpr>  s     HS:c5<<0Hs   $&zfIn the beta API, sample_args for each callable must contain only Tensors. Other types are not allowed.r   N)NNNc              3  :   K   | ]  }|j                   s|  y wr$   r|   r   os     r   r   z)make_graphed_callables.<locals>.<genexpr>  s     $K11??Q$K   c              3  :   K   | ]  }|j                   s|  y wr$   r|   r   is     r   r   z)make_graphed_callables.<locals>.<genexpr>  s      %"#qA%r   c              3  `   K   | ]&  }|j                   st        j                  |       ( y wr$   r}   r    
empty_liker   s     r   r   z)make_graphed_callables.<locals>.<genexpr>  s&      +45AOOE,,Q/+s   ..)outputsinputsgrad_outputsonly_inputsallow_unused)r+   c              3  b   K   | ]'  }|j                   rt        j                  |      nd  ) y wr$   r   r   s     r   r   z)make_graphed_callables.<locals>.<genexpr>  s+      $
AB1??EQ<$
s   -/c              3  :   K   | ]  }|j                   s|  y wr$   r|   r   s     r   r   z)make_graphed_callables.<locals>.<genexpr>  s     J1!//QJr   c              3  :   K   | ]  }|j                   s|  y wr$   r|   r   s     r   r   z)make_graphed_callables.<locals>.<genexpr>   s      TqAOO Tr   c              3  &   K   | ]	  }||  y wr$   r   r   s     r   r   z)make_graphed_callables.<locals>.<genexpr>  s     &WQq&Ws      c	           	         
  G  fddt         j                  j                        
d
fd}	|	S )Nc                      e Zd Zedfd       Zeej                  j                  j                  d fd              Z	y)Omake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphedc                4   t              D ]A  }|   j                         ||   j                         k7  s+|   j                  ||          C j                          t	        t
              st        dt                     t        d D              S )Nz"static_outputs must be tuple, got c              3  <   K   | ]  }|j                           y wr$   detachr   s     r   r   zjmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.forward.<locals>.<genexpr>3  s     @AQXXZ@s   )rangedata_ptrcopy_r5   r   tuplerV   type)ctxr   r   	fwd_graphlen_user_argsstatic_input_surfacestatic_outputss      r   forwardzWmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.forward'  s     }- AA+A.779VAY=O=O=QQ,Q/55fQi@A   "!.%8(<T.=Q<RS  @@@@r   c                   t        |      t              k7  r#t        dt        |       dt                     t        |      D ];  \  }}|	|j                         |j                         k7  s+|j	                  |       = j                          t        t              st        dt                     t        d D              S )Nzlen(grads)=z != len(static_grad_outputs)=z&static_grad_inputs must be tuple, got c              3  D   K   | ]  }||j                         n|  y wr$   r   r~   s     r   r   zkmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.backward.<locals>.<genexpr>J  s)        #$-AHHJQ6s    )	lenrV   zipr   r   r5   r   r   r   )r   gradsggrad	bwd_graphstatic_grad_inputsstatic_grad_outputss       r   backwardzXmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.backward5  s     u:%8!99(%c%j\1NsSfOgNhi   ##6> *GAt} ::<4==?:GGDM*   " ""4e<(@FXAY@Z[    0  r   N)r   ro   r   r   rC   tuple[Tensor, ...])r   ro   r   r   rC   r   )
rK   rL   rM   staticmethodr   r    autogradfunctiononce_differentiabler   )r   r   r   r   r   r   r   s   r   Graphedr   &  sC    
A 
A ^^$$88 9 r   r   c                     t        j                  j                  j                  |  } j                  t        |      z    }t         j                  j                  j                  |      S r$   )r    utils_pytreearg_tree_leavesapplyr   tree_unflatten)	user_argsflatten_user_argsoutr   module_paramsoutput_unflatten_specs      r   functionalizedzVmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.functionalizedP  sY     !& 3 3 C CY O'--%(9":]"JLC;;&&55c;PQQr   )r   ro   rC   ro   )r    r   Function)r   r   r   r   r   r   r   r   r   r   r   s   ````````` @r   make_graphed_autograd_functionz>make_graphed_callables.<locals>.make_graphed_autograd_function  s.    (	 (	enn-- (	T	R r   c                      d fd}|S )Nc                 B    j                   k(  r | i |S  | i |S r$   )training)r   user_kwargsfuncgraph_training_stategraphedorig_fwds     r   new_fwdzEmake_graphed_callables.<locals>.make_graphed_forward.<locals>.new_fwdq  s2     }}(<<&	A[AA'BkBBr   )r   z_P.argsr   z	_P.kwargsrC   r   r   )r   r   r   r   r   s   ```` r   make_graphed_forwardz4make_graphed_callables.<locals>.make_graphed_forwardk  s    C C r   )r   r   r   r   r   ztuple[torch.nn.Parameter, ...]r   rJ   r   ztorch.utils._pytree.TreeSpecr   r   r   r   r   ztuple[Tensor | None, ...]r   r   rC   zCallable[..., object])
r   rq   r   rB   r   Callable[_P, _R]r   r   rC   r   )*r    is_autocast_enabledis_autocast_cache_enabledRuntimeErrorr   r   typingcastr   r   nnModuler   _backward_hooks_forward_hooks_forward_pre_hooksrV   allbuffersr   r   r   append
parametersr   r!   r   r   r^   rW   rT   tree_leavesr   r   r   tree_flattenreversedr}   reverse	enumerater   r   )+ru   rv   rw   rx   r+   just_one_callable_sample_argsflatten_sample_argscrn   flatten_argper_callable_len_user_argsper_callable_module_paramsr   "per_callable_static_input_surfaces_
fwd_graphs
bwd_graphsmempoolr   r   grad_inputsr   outputs_gradvper_callable_static_outputs"per_callable_output_unflatten_specr   func_outputsflatten_outputsspec per_callable_static_grad_outputsper_callable_static_grad_inputsr   r   r   r   grad_idxr   r   retr   r   s+                                              r   r   r   <  sz   R   "u'F'F'Hm
 	
  i' L	E&#+$6DF{{5vs{);S)@#A;Oy,/ 4a)A%%&!+(()Q.,,-2$a  EEE$1 
 kk))994@""5#56HKHH ^ )6 9L!L#d)!L!L " ",Auxx!?allnRG" " s9~&* 	A!;A!>>*& *
 38I2GHQ%**&&(HJH27I2GHQ%**&&(HJH%)\!tG
 
JJ			5::,,.	/ 03|%G1
 	,D$, 2B.K,+, ++--99$+F$$K$KK|$q("'.."5"5 ,$ %';%   &+ +9@+ & %)%7 #6 
#K	 |[9 '	. 
JJ #%)+&!$Yj!I 8dIZZig6 	';L	' !& 3 3 @ @ N#**5+AB*11$78 (*$&(#;>34,-< %C7ni $ $
FT$
 
 JJJ|q !!)'!: #nn11(  T,@ TT!&&W2E&W!W $!3 2   ' 	0C  [%<"))+h*?@A"))$/	0 ##56(//0CD'../ABK%CP %,,.#++-=== 6= 	=
  <= 1= += 7= /= 
=@ $&CY' $ 40qMqM&q)&q).q1.q1'*,Q/+A.

 dEHHOO,%&* * +	
 "  0dmmWdllDL JJtJJwI$ L 1v:i "M"*
 IH B	' 	'0 sL   6[>A\,\%\%\!A5\A\
\$:A\1\!$\.	1\;	)rC   rB   rI   )   FN)ru   rr   rv   r   rw   rJ   rx   rB   r+   rE   rC   rr   )ru   tuple[_ModuleOrCallable, ...]rv   ztuple[tuple[Tensor, ...], ...]rw   rJ   rx   rB   r+   rE   rC   r  )ru   1_ModuleOrCallable | tuple[_ModuleOrCallable, ...]rv   z3tuple[Tensor, ...] | tuple[tuple[Tensor, ...], ...]rw   rJ   rx   rB   r+   rE   rC   r  )&
__future__r   rb   r   collections.abcr   r   r   r   r   typing_extensionsr	   r
   r   r    r   
torch.cudar   _utilsr   __all__r   r   hasattrre   __dict__torch._Cr   r   r   r   r   r   r   ro   rr   rp   r   r   r   r   <module>r     s   " 	  $ < < 6 6   '   T]t_ uxx*+&1,&?EHHl#.9:N.OEHH*+:E(;EHH67 T S,9q-
 q-hk1 k1^  %%6f8M%MN 9 N 
 $ $ #  	
   
 
 $ $(,(/( ( 	(
 ( #( 
( $ $I@IDI I 	I
 I 7Ir   