
    9j~                       U d dl mZ d dlZd dlZd dlmZmZ d dlmZ d dl	m
Z
mZmZmZmZmZ d dlZd dlmZ d dlmZ d dlmZ d d	lmZmZ erd d
lmZ d dlmZ  ej:                  e      Z G d de      Z e!e!e"df   edz  f   Z#de$d<    G d de%      Z& edd       G d d             Z' edd       G d de'             Z(e'e(z  Z)de$d<    ed       G d d             Z* edd       G d d             Z+ edd       G d d              Z,	 	 	 	 	 	 d<d!Z-	 	 	 	 d=d"Z. G d# d$      Z/ G d% d&e      Z0d'd(d>d)Z1d* Z2	 d?	 	 	 	 	 	 	 d@d+Z3	 d?	 	 	 	 	 	 	 dAd,Z4 ed       G d- d.             Z5dBd/Z6edd0	 	 	 	 	 dCd1       Z7e	 	 	 	 	 	 dDd2       Z7d'd0	 	 	 	 	 dEd3Z7dFdGd4Z8e	 dH	 	 	 	 	 dId5       Z9e	 dH	 	 	 	 	 dJd6       Z9	 dF	 	 	 	 	 dKd7Z9d'd'd8	 	 	 	 	 	 	 	 	 	 	 dLd9Z:dd'd8	 	 	 	 	 	 	 	 	 	 	 dMd:Z;	 	 	 	 	 	 	 	 	 	 dNd;Z<y)O    )annotationsN)	dataclassfield)Enum)castLiteraloverloadProtocolTYPE_CHECKING	TypeAlias)fx)_MeshLayout)DTensor)tree_flattentree_unflatten)
DeviceMesh)	Placementc                  $    e Zd ZdZ	 	 	 	 	 	 ddZy)GetMeshCallbackzGCallback to create/retrieve a DeviceMesh from its cache key components.c                     y N )selfmesh_dim_namesmesh_layouts      c/media/conek/DATA/Code/OCR/venv/lib/python3.12/site-packages/torch/distributed/pipelining/_utils.py__call__zGetMeshCallback.__call__   s         N)r   tuple[str, ...]r   _MeshLayout | Nonereturnr   )__name__
__module____qualname____doc__r   r   r   r   r   r      s&    Q' ( 
	r   r   .r   MeshCacheKeyc                      e Zd ZdZy)PipeliningMetadataErrorz<Raised on metadata mismatches during pipeline communication.N)r"   r#   r$   r%   r   r   r   r(   r(   ,   s    Fr   r(   T)frozenslotsc                  \    e Zd ZU dZded<   ded<   ded<   ded	<   edd
       ZddZddZy)_TensorMetazTensor metadata for recv buffer allocation and validation.

    For plain tensors, these are the tensor's actual attributes.
    For DTensors, these are LOCAL shard attributes; global attributes
    are stored in :class:`_DTensorMeta`.
    
torch.Sizeshapetuple[int, ...]strideztorch.dtypedtypeboolrequires_gradc                    t        | t              rt        d      t        | j                  | j                         | j                  | j                        S )a  Create metadata from a plain tensor.

        Args:
            tensor: A plain ``torch.Tensor`` (not DTensor).

        Returns:
            Metadata capturing shape, stride, dtype, and requires_grad.

        Raises:
            TypeError: If ``tensor`` is a DTensor.
        zJExpected plain tensor, got DTensor. Use _DTensorMeta.from_dtensor instead.r.   r0   r1   r3   )
isinstancer   r(   r,   r.   r0   r1   r3   tensors    r   from_tensorz_TensorMeta.from_tensor>   sM     fg&)\  ,,==?,, ..	
 	
r   c                T    t        | |      }|j                  | j                         |S )zReconstruct a tensor on ``device`` from this metadata.

        Args:
            device: Target device for the tensor.

        Returns:
            An empty strided tensor on ``device``.
        )_make_tensor_from_metarequires_grad_r3   )r   devicets      r   	to_tensorz_TensorMeta.to_tensorV   s)     #40	++,r   c                   | |k(  rg S g }| j                   |j                   k7  r+|j                  d| j                    d|j                           | j                  |j                  k7  r+|j                  d| j                   d|j                          | j                  |j                  k7  r+|j                  d| j                   d|j                          |S )zReturn field-by-field differences with ``other``.

        Args:
            other: Metadata to compare against.

        Returns:
            List of human-readable difference strings (empty if equal).
        zshape mismatch:  vs zstride mismatch: zdtype mismatch: )r.   appendr0   r1   r   otherdiffss      r   get_diffz_TensorMeta.get_diffc   s     5=I::$LL+DJJ<tEKK=IJ;;%,,&LL,T[[Mell^LM::$LL+DJJ<tEKK=IJ r   N)r8   torch.Tensorr!   r,   )r=   torch.device | strr!   rG   rD   r,   r!   	list[str])	r"   r#   r$   r%   __annotations__staticmethodr9   r?   rF   r   r   r   r,   r,   0   s=     
 
.r   r,   c                      e Zd ZU dZ ed       Zded<    ed      Zded	<    ed      Zd
ed<    ed      Z	ded<    ed      Z
ded<   edd       Zedd       ZddZddZy)_DTensorMetaa  DTensor metadata extending :class:`_TensorMeta` with distribution info.

    Inherited fields (shape, stride, etc.) are LOCAL shard attributes.
    Additional fields capture global shape and placement information
    needed to reconstruct a :class:`DTensor` via ``DTensor.from_local()``.

    The :class:`DeviceMesh` is **not** stored (not serializable for P2P);
    it is looked up from :class:`_MeshCache` using
    ``(mesh_dim_names, mesh_layout)`` as the key.
    c                 ,    t        j                  g       S r   )torchSizer   r   r   <lambda>z_DTensorMeta.<lambda>   s    UZZ^ r   )default_factoryr-   global_shaper   )defaultr/   global_strideztuple[Placement, ...]
placementsr   r   Nr    r   c                p   | j                   }t        | j                  j                  | j                  j	                         | j
                  | j                  | j                  | j	                         | j                  j                  |j                  rt        |j                        nd|j                  	      S )zCreate metadata from a DTensor.

        Args:
            dtensor: The DTensor to extract metadata from.

        Returns:
            Metadata capturing both local and global attributes.
        r   )	r.   r0   r1   r3   rT   rV   rW   r   r   )device_meshrN   _local_tensorr.   r0   r1   r3   _specrW   r   tuple_layout)dtensorrY   s     r   from_dtensorz_DTensorMeta.from_dtensor   s     ))''--((//1--!// !..*}}//5@5O5Ok001UW#++
 	
r   c                2    | j                   | j                  fS )z<Cache key ``(mesh_dim_names, mesh_layout)`` for mesh lookup.)r   r   r   s    r   mesh_cache_keyz_DTensorMeta.mesh_cache_key   s     ##T%5%566r   c                    t        | |      }t        t        t        j                  ||| j                  | j
                  | j                  d      j                  | j                              S )zReconstruct a DTensor on ``device`` with placements.

        Args:
            device: Target device for the local tensor.
            mesh: The ``DeviceMesh`` to attach.

        Returns:
            A DTensor on ``device``.
        F)rY   rW   r.   r0   	run_check)	r;   r   r   
from_localrW   rT   rV   r<   r3   )r   r=   meshlocal_tensors       r   
to_dtensorz_DTensorMeta.to_dtensor   sd     .dF;  ??'')) nT//0

 
	
r   c                0   | |k(  rg S t         j                  | |      }t        |t              rV| j                  |j                  k7  r+|j                  d| j                   d|j                          | j                  |j                  k7  r+|j                  d| j                   d|j                          | j                  |j                  k7  r+|j                  d| j                   d|j                          | j                  |j                  k7  r+|j                  d| j                   d|j                          | j                  |j                  k7  r+|j                  d| j                   d|j                          |S |j                  d       |S )zReturn field-by-field differences, including DTensor-specific fields.

        Args:
            other: Metadata to compare against.

        Returns:
            List of human-readable difference strings (empty if equal).
        zglobal_shape mismatch: rA   zglobal_stride mismatch: zplacements mismatch: zmesh_dim_names mismatch: zmesh_layout mismatch: z!type: _DTensorMeta vs _TensorMeta)
r,   rF   r6   rN   rT   rB   rV   rW   r   r   rC   s      r   rF   z_DTensorMeta.get_diff   s    5=I
 $$T51 e\*  E$6$66-d.?.?-@UEWEWDXY !!U%8%88.t/A/A.B$uGZGZF[\ %"2"22+DOO+<DAQAQ@RS ""e&:&::/0C0C/DDI]I]H^_ 5#4#44,T-=-=,>d5CTCTBUV  LL<=r   )r^   r   r!   rN   )r!   r&   )r=   rH   rf   r   r!   r   rI   )r"   r#   r$   r%   r   rT   rK   rV   rW   r   r   rL   r_   propertyrb   rh   rF   r   r   r   rN   rN   {   s    	  %5KLL*L%*2%6M?6 ).)J% 
 ',B&7NO7&+'K#  
 
8 7 7
2*r   rN   
TensorMeta)r*   c                  b    e Zd ZU dZdZded<   dZded<   dZded<   dZded<   dd	Z	dd
Z
ddZy)
_StageMetazPConsolidated tensor metadata for a pipeline stage's forward and backward passes.Ntuple[TensorMeta, ...] | Noneinputsoutputs$tuple[TensorMeta | None, ...] | Noneinput_gradsoutput_gradsc                ~    t        d | j                  | j                  | j                  | j                  fD              S )z)Check if any metadata field is populated.c              3  $   K   | ]  }|d u 
 y wr   r   ).0vs     r   	<genexpr>z%_StageMeta.has_any.<locals>.<genexpr>  s      
 TM
s   )anyro   rp   rr   rs   ra   s    r   has_anyz_StageMeta.has_any  s9     
kk4<<1A1A4CTCTU
 
 	
r   c                l    | j                   | j                  fD ]  }|st        d |D              s y y)z3Check if any input/output metadata is DTensor type.c              3  B   K   | ]  }|st        |t                y wr   )r6   rN   rv   ms     r   rx   z*_StageMeta.has_dtensors.<locals>.<genexpr>  s     MQ1Z<8Ms   TF)ro   rp   ry   )r   metass     r   has_dtensorsz_StageMeta.has_dtensors  s6    kk4<<0 	EM%MM	 r   c                >    | j                   duxr | j                  duS )z-Check if forward metadata is fully populated.N)ro   rp   ra   s    r   is_complete_for_forwardz"_StageMeta.is_complete_for_forward  s    {{$&C4<<t+CCr   )r!   r2   )r"   r#   r$   r%   ro   rK   rp   rr   rs   rz   r   r   r   r   r   rm   rm     sA    Z,0F)0-1G*18<K5<9=L6=
Dr   rm   c                      e Zd ZU dZded<   y)_StageForwardMetazLForward metadata transmitted from stage *i* to stage *i+1* during inference.tuple[TensorMeta, ...]forward_metasNr"   r#   r$   r%   rK   r   r   r   r   r   !  s    V))r   r   c                      e Zd ZU dZded<   y)_StageBackwardMetau   Backward metadata transmitted from stage *i* to stage *i-1* during inference.

    Gradient placements may differ from forward activations
    (e.g., ``Replicate`` → ``Partial``).
    tuple[TensorMeta | None, ...]backward_metasNr   r   r   r   r   r   (  s     r   r   c                p    t        j                  | j                  | j                  | j                  |      S )zCreate a tensor from metadata.

    Args:
        meta: Metadata with shape, stride, and dtype.
        device: Target device for the tensor.

    Returns:
        Empty tensor preserving the exact memory layout.
    )sizer0   r1   r=   )rP   empty_stridedr.   r0   r1   )metar=   s     r   r;   r;   5  s/     ZZ{{jj	 r   c                &    t        d | D              S )zDerive gradient metadata from tensor metadata.

    Returns metadata with the same shape/stride/dtype but ``requires_grad=False``.
    Entries where the source has ``requires_grad=False`` become ``None``.
    c              3     K   | ]?  }|j                   r-t        |j                  |j                  |j                  d       nd A yw)Fr5   N)r3   r,   r.   r0   r1   r}   s     r   rx   z%_derive_grad_metas.<locals>.<genexpr>R  sC        ?? 	!''!((!''QVW	s   AA)r\   )tensor_metass    r   _derive_grad_metasr   J  s        	  r   c                  B    e Zd ZdZd	d
dZddZddZddZddZddZ	y)
_MeshCachezCache for :class:`DeviceMesh` objects keyed by ``(mesh_dim_names, mesh_layout)``.

    Assumes all pipeline stages share the same rank tensor (true for
    TorchTitan-style frameworks where meshes derive from a common world).
    Nc                     i | _         || _        y r   )_cache_get_mesh_cb)r   get_mesh_cbs     r   __init__z_MeshCache.__init__a  s    68'r   c                    || j                   v r| j                   |   S |\  }}| j                  t        d| d| d      | j                  ||      }|t        d| d| d      || j                   |<   |S )a  Return a cached mesh, or create one via the callback.

        Args:
            key: Cache key ``(mesh_dim_names, mesh_layout)``.

        Returns:
            The ``DeviceMesh``.

        Raises:
            PipeliningMetadataError: If not cached and no callback provided.
        z+Mesh not found in cache for mesh_dim_names=z, mesh_layout=z`, and no get_mesh callback provided. Provide a get_mesh callback or use DTensors in static mode.z>Mesh lookup failed: callback returned None for mesh_dim_names=z6. Ensure all stages use meshes from the same universe.)r   r   r(   )r   keyr   r   rf   s        r   get_meshz_MeshCache.get_meshe  s     $++;;s##&)#$)=n=M N*m ,NO    =<)""0!1} MGH 
  Cr   c                "    || j                   |<   y)zAdd a mesh to the cache.Nr   )r   r   rf   s      r   putz_MeshCache.put  s    Cr   c                    |D ]p  }t        |t              s|j                  }|j                  rt	        |j                        nd}|j
                  }||f}|| j                  vsb|| j                  |<   r y)zJExtract and cache meshes from any :class:`DTensor` instances in *tensors*.r   N)r6   r   rY   r   r\   r]   r   )r   tensorsr8   rf   	dim_namesr   r   s          r   update_from_tensorsz_MeshCache.update_from_tensors  sn     	,F&'*)):>:M:ME$"5"56SU	"ll +.dkk)'+DKK$	,r   c                    || j                   v S r   r   )r   r   s     r   __contains__z_MeshCache.__contains__  s    dkk!!r   c                ,    t        | j                        S r   )lenr   ra   s    r   __len__z_MeshCache.__len__  s    4;;r   r   )r   zGetMeshCallback | Noner!   None)r   r&   r!   r   )r   r&   rf   r   r!   r   )r   tuple[torch.Tensor | None, ...]r!   r   )r   r&   r!   r2   )r!   int)
r"   r#   r$   r%   r   r   r   r   r   r   r   r   r   r   r   Z  s&    ( D 	," r   r   c                  *    e Zd ZdZdZdZedd       Zy)InferenceModea  Pipeline-level metadata inference mode, determined collectively across all PP ranks.

    The mode is set by the schedule (not individual stages) because
    ``has_backward`` is only known at schedule creation time and all
    stages must agree to avoid P2P hangs.

    .. attribute:: STATIC

        All stages have sufficient metadata; runtime inference is skipped.

    .. attribute:: DYNAMIC

        At least one stage requires runtime metadata inference.
    staticdynamicc                    |j                         sy|j                         sy|sy|j                  |j                  yy)a'  Determine whether dynamic metadata inference is needed for a stage.

        Args:
            meta: Stage metadata from user-provided args.
            stage_has_backward: Whether a backward pass will be performed.

        Returns:
            ``True`` if dynamic inference is needed.
        TF)r   r   rr   rs   )clsr   stage_has_backwards      r   needs_dynamiczInferenceMode.needs_dynamic  sM     ++-   " " #t'8'8'@ r   N)r   rm   r   r2   r!   r2   )r"   r#   r$   r%   STATICDYNAMICclassmethodr   r   r   r   r   r     s%     FG r   r   Fdetachc                   t        |       \  }}|rb|D cg c]G  }t        |t        j                        r)|j	                         j                  |j                        n|I }}t        ||      }||fS |S c c}w )a;  Flatten ``args`` into a list, optionally detaching tensors.

    Args:
        args: Nested arguments to flatten.
        detach: If ``True``, detach tensors while preserving ``requires_grad``.

    Returns:
        ``(new_args, flat_detached_args)`` when ``detach=True``;
        ``flat_args`` list otherwise.
    )r   r6   rP   Tensorr   r<   r3   r   )argsr   	flat_argstreespecaflat_detachednew_argss          r   flatten_argsr     s     't,Ix
 	
  !U\\* HHJ%%aoo6
 
 "-:&&
s   AA5c                    t        | d      S )zHFlatten and detach. Deprecated: use ``flatten_args(args, detach=True)``.Tr   )r   )r   s    r   flatten_args_detachr     s    T**r   c                   i }|dk(  rt        |      D ]
  }|| z  ||<    |S |dk(  rU|| z  dk7  rt        d| d|  d      d}t        |      D ])  }|||<   |dz   | z  dk(  r|| z  dz  dk(  r|dz  }%|dz  }+ |S t        d	| d
      )z
    Compute the stage id to rank mapping for either a looped or V-style schedule.

    Most commonly num_stages == pp_size * 2, but this function can be used to
    compute the mapping for any number of stages per rank.
    looprw   r   znum_stages z% must be evenly divisible by pp_size z for V schedules      zStyle z is not supported.)range
ValueError)pp_size
num_stagesstylemappingstage_index
rank_indexs         r   generate_stage_to_rank_mappingr     s     G , 	9K#.#8GK 	9( N% 
#1$j\)NwiWgh  
 , 	 K#-GK a7*a/w&!+q0a
a
	  N 6%(:;<<r   c                    t        | ||      }i }|j                         D ]"  \  }}||vrg ||<   ||   j                  |       $ |j                         D ]  }|j	                           |S )a  
    Compute the rank to stage id mapping for either a looped or V-style schedule.

    This function inverts the stage_to_rank_mapping to get which stages are assigned to each rank.

    Returns a dictionary mapping rank -> list of stage indices assigned to that rank.
    )r   itemsrB   valuessort)r   r   r   stage_to_rankrank_to_stagesstage_idrankstagess           r   generate_rank_to_stage_mappingr     s     37JNM ,.N'--/ .$~%#%N4 t##H-. !'')  r   c                  0    e Zd ZU dZded<   ded<   ded<   y)	PipeInfoz>
    Captures information for a pipeline (`Pipe` object).
    zfx.Graphgraphr   r   r2   has_loss_and_backwardNr   r   r   r   r   r   6  s     OOr   r   c                v    t        | t              rt        j                  |       S t        j                  |       S )a  Extract metadata from a tensor.

    Handles both plain Tensor and DTensor correctly: DTensors are
    dispatched to ``_DTensorMeta.from_dtensor`` which captures local
    shard attributes plus global shape/placement info, while plain
    tensors use ``_TensorMeta.from_tensor``.

    Args:
        tensor: A plain tensor or DTensor.

    Returns:
        ``_TensorMeta`` for plain tensors, ``_DTensorMeta`` for DTensors.
    )r6   r   rN   r_   r,   r9   r7   s    r   extract_tensor_metar   F  s0     &'"((00&&v..r   )
allow_nonec                    y r   r   r   r   s     r   extract_tensor_metasr   Z  s    
 %(r   c                    y r   r   r   s     r   r   r   b  s    
 ,/r   c                   | yg }d}| D ]J  }t        |t        j                        r|j                  t	        |             8d}|j                  d       L |s|rt        d      t        |      S )a  Extract metadata from a tuple of tensors.

    Args:
        tensors: Tuple of tensors (may include ``None`` when ``allow_none=True``).
        allow_none: If ``True``, preserve ``None`` elements (for gradients).

    Returns:
        Tuple of ``TensorMeta``, or ``None`` if ``tensors`` is ``None``.

    Raises:
        PipeliningMetadataError: If ``None`` found and ``allow_none=False``.
    NFTz_None values are not allowed in tensor metadata tuples. Use allow_none=True for optional values.)r6   rP   r   rB   r   r(   r\   )r   r   metas_with_nonehas_noner>   s        r   r   r   j  s    " /1OH )a&""#6q#9:H""4() (%7
 	
 !!r   c                n    |r| j                         n| }t        |t              r|j                         S |S )u  Convert a DTensor to its local shard, or return a plain tensor as-is.

    When ``detach=True``, the tensor is detached before conversion —
    this applies to both DTensors and plain tensors.

    Args:
        tensor: A tensor that may be a DTensor.
        detach: If ``True``, detach before ``to_local()`` to avoid
            redistribution during backward.

    Returns:
        The local tensor component.
    )r   r6   r   to_local)r8   r   maybe_detached_tensors      r   to_local_if_dtensorr     s4     06FMMO6'1$--//  r   c                     y r   r   r   r   s     r   validate_and_normalize_to_tupler     s     '*r   c                     y r   r   r   s     r   r   r     s     .1r   c           	        | yt        | t        j                        r| fS t        | t        t        f      rt        |       D ]X  \  }}||st        d| d      t        |t        j                        r5t        d| dt        |      j                   d       t        | t              rt        |       S | S t        dt        |       j                   d      )a  Normalize ``args`` to a tuple and validate that all elements are tensors.

    Args:
        args: A single tensor, tuple/list of tensors, or ``None``.
        allow_none: If ``True``, permit ``None`` elements (for gradients).

    Returns:
        Tuple of tensors, or ``None`` if ``args`` is ``None``.

    Raises:
        PipeliningMetadataError: On non-tensor values
            (or ``None`` when ``allow_none=False``).
    Nz
Stage arg[zF] is None. Stage args must be tensors. Use kwargs for optional values.z] has type zC. All stage args must be tensors. Use kwargs for non-tensor inputs.z<Stage args must be a tensor, tuple, or list of tensors, got .)	r6   rP   r   r\   list	enumerater(   typer"   )r   r   iargs       r   r   r     s    , |	D%,,	'w	D5$-	(o 	FAs{!1$QC (V W  c5<<0- ;tCy/A/A.B CX Y 	 )t4uT{>$>%J4PT:K^K^J__`a
 	
r   raise_on_mismatchwarn_on_mismatchc               *   t        |t        j                        rt        |      }n|}t	        |      t	        |      urmdt	        |      j
                   dt	        |      j
                   g}|rt        |  d|d          |r%t        j                  |  d|d    dt        d       |S |j                  |      }|rT|rt        |  dd	j                  |             |r1t        j                  |  d
d	j                  |       dt        d       |S )al  
    Compare expected metadata against actual tensor or metadata.

    This is the unified validation/comparison function that uses get_diff() from
    metadata classes. Works with both plain tensors and DTensors.

    For plain tensors: compares shape/stride/dtype/requires_grad.
    For DTensors: compares all properties including global shape and placements.

    Args:
        desc: Description for error/warning messages.
        expected: Expected tensor metadata (_TensorMeta or _DTensorMeta).
        actual: Actual tensor or metadata to compare against.
        raise_on_mismatch: If True, raise PipeliningMetadataError on mismatch.
        warn_on_mismatch: If True, issue a warning on mismatch.

    Returns:
        List of differences (empty if metadata matches).

    Raises:
        PipeliningMetadataError: If raise_on_mismatch=True and differences exist.
    ztype: expected , got : r   z: Metadata type mismatch. z.. Using dynamically inferred metadata instead.r   
stacklevelz; z: Metadata mismatch. )r6   rP   r   r   r   r"   r(   warningswarnUserWarningrF   join)descexpectedactualr   r   actual_meta	type_diffrE   s           r   validate_metadatar
    s0   > &%,,')&1 H~T+..d8n556fT+=N=W=W<XY
	 )TF"Yq\N*CDDMM&29Q<. A? @	  k*E)TF"TYYu5E4F*GHHMM&-dii.>-? @? @	 Lr   c                  t        |      t        |      k7  rJ|  dt        |       dt        |       }|rt        |      |rt        j                  |t        d       |gS g }t        t        ||d            D ]  \  }\  }}	||	||	R|  d| d|d	nd
 d|	d	nd
 }|rt        |      |rt        j                  |t        d       |j                  |       dt        |  d| d||	||      }
|j                  |
        |S )a2  Validate metadata for a tuple of tensors element-wise.

    Args:
        desc: Description prefix for error/warning messages.
        expected: Tuple of expected metadata (may include ``None`` for grads).
        actual: Tuple of actual tensors or metadata to compare against.
        raise_on_mismatch: If ``True``, raise on the first mismatch.
        warn_on_mismatch: If ``True``, issue warnings for mismatches.

    Returns:
        Aggregated list of difference strings.

    Raises:
        PipeliningMetadataError: If lengths differ or on mismatch.
    z: expected z tensors, got r   r   Tstrict[z]: expected r   metadatar   ]r   )
r   r(   r  r  r  r   ziprB   r
  extend)r  r  r  r   r   msg	all_diffsr   expactrE   s              r   validate_tensors_metadatar  -  s>   . 8}F#k#h-s6{mL)#..MM#{q9uI"3x#EF  :C;3;;#+&!L3;J(O P!$v*=?  !-c22c;1=S!!fAaSN/-
 	+ , r   c                   |rdnd}| d}| d}t        |      t        |      k7  r-t        d|  d| dt        |       d| dt        |       d	      t        t        ||d
            D ]  \  }\  }}	|j                  s3|	1t        d|  d| d| d| d| dt        |	      j                   d      |j                  r.|	,t        j                  d|  d| d| d| d| dt        d       t        |t              s|j                  s|	t        |	t              rt        d|  d| d| d| d| dt        |	      j                   d       y)u/  
    Validate the args↔grads contract for static mode.

    Enforces four rules for each (arg, grad) pair:
      1. len(args) must equal len(grads).
      2. If arg.requires_grad is False, grad must be None.
      3. If arg.requires_grad is True and grad is None, emit a warning
         (this is legal at pipeline boundaries but may indicate a bug).
      4. If arg is a DTensor with requires_grad=True and grad is not None,
         grad must also be a DTensor.

    Args:
        stage_index: The stage index for error messages.
        args: Tuple of forward tensors.
        grads: Tuple of gradient tensors (can include None).
        is_input: True for input_args/input_grads, False for output_args/output_grads.

    Raises:
        PipeliningMetadataError: If any hard rule (1, 2, or 4) is violated.
    inputoutput_args_gradszStage r   z	 length (z) does not match zo). Each forward tensor must have a corresponding gradient entry (use None for tensors that don't require grad).Tr  Nr  z] has requires_grad=False, but z] is not None (zE). Non-differentiable tensors must have None as their gradient entry.z] has requires_grad=True, but zT] is None. This is legal at pipeline boundaries but may indicate a missing gradient.r   r   z,] is a DTensor with requires_grad=True, but z] is za, expected DTensor or None. DTensor gradients may have different placements than forward tensors.)r   r(   r   r  r3   r   r"   r  r  r  r6   r   )
r   r   gradsis_inputkind	args_name
grads_namer   r   grads
             r   'validate_static_arg_grad_correspondencer#  f  s   4 7HD&I6J 4yCJ%[MJ<yUDUk3t9+ .[\
 	
 $CeD$AB ;C  T%5)R	{!A3 7!l!A3od4j6I6I5J KUV  MMR	{!A3 7!l!A3 '78  sG$!! tW-)R	{!A3 7!l!A3eDJ,?,?+@ AXY 5r   )r   r,   r=   rH   r!   rG   )r   r   r!   ztuple[_TensorMeta | None, ...])r   r2   )r   )r   r   r   r   r   strr!   zdict[int, int])r   r   r   r   r   r$  r!   zdict[int, list[int]])r8   rG   r!   rk   )r   tuple[torch.Tensor, ...] | Noner   Literal[False]r!   rn   )r   &tuple[torch.Tensor | None, ...] | Noner   Literal[True]r!   rq   )r   Atuple[torch.Tensor | None, ...] | tuple[torch.Tensor, ...] | Noner   r2   r!   rq   )F)r8   rG   r   r2   r!   rG   ).)r   zCtorch.Tensor | tuple[torch.Tensor, ...] | list[torch.Tensor] | Noner   r&  r!   r%  )r   zQtorch.Tensor | tuple[torch.Tensor | None, ...] | list[torch.Tensor | None] | Noner   r(  r!   r'  )r   ztorch.Tensor | tuple[torch.Tensor, ...] | tuple[torch.Tensor | None, ...] | list[torch.Tensor] | list[torch.Tensor | None] | Noner   r2   r!   r)  )r  r$  r  rk   r  ztorch.Tensor | TensorMetar   r2   r   r2   r!   rJ   )r  r$  r  r   r  z,tuple[torch.Tensor | TensorMeta | None, ...]r   r2   r   r2   r!   rJ   )
r   r   r   ztuple[torch.Tensor, ...]r  r   r  r2   r!   r   )=
__future__r   loggingr  dataclassesr   r   enumr   typingr   r   r	   r
   r   r   rP   r   torch.distributed._mesh_layoutr   torch.distributed.tensorr   torch.utils._pytreer   r   torch.distributed.device_meshr   (torch.distributed.tensor.placement_typesr   	getLoggerr"   loggerr   r\   r$  r&   rK   RuntimeErrorr(   r,   rN   rk   rm   r   r   r;   r   r   r   r   r   r   r   r   r   r   r   r   r
  r  r#  r   r   r   <module>r7     s	   #   (  N N   6 , < 8B 
		8	$h   c3ht1C CDi DGl G $d#G G $GT $d#@; @ $@H $l2
I 2
 D D D6 $d#* * $* $d#	 	 $	
 *(# @  @ P0D 0p */ 6+ 17!*-F 17!*-4      /( 
 "%(,( ( #	( 
( 
/3/ / *	/ 
/ !"N!" !" *	!"H!( 
 "%*
M** %* 
* 
 !$11
 1 ,1 
1  -
-
 -
 G-
t $"B
BB &B
 B B BT #"6
6+6 96
 6 6 6rDD
"D +D 	D
 
Dr   