
    9jH                    h   U d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlZd dlmc mZ d dlmc mc mZ d dlmZ d dlmZ d dlmZmZmZmZmZ d d	l m!Z! d d
l"m#Z# d dl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* d dl+m,Z, d dl-m.Z.  ej^                  e0      Z1da2e3dz  e4d<   da5e3e4d<   e jl                  dKde3fd       Z7e jl                  dKde3fd       Z8 ejr                  dd       G d d             Z: ejr                  dd       G d de:             Z;de:de<e(   de=e>e<e>   f   ddfdZ?d e#d!e@e>d"f   de#dz  fd#ZAd e#d!e@e>d"f   de#dz  fd$ZB eC       ZDeCe@e>e@e>d"f   eEf      e4d%<   d&e#d!e@e>d"f   d'e@e(d"f   d(e@e(d"f   d)e>d*eEd+eEddfd,ZFd-e<e:   d&e#d'e@e(d"f   d(e@e(d"f   de<e:e;z     f
d.ZGi ZHe=e@ej                  e#   ef   d/f   e4d0<   d&e#d1edd/fd2ZJdLd3ZK G d4 d/      ZL	 dMd5ed6ed7e3dz  de<e:   fd8ZMe	 dMd5ed6ed7e3dz  de<e:   fd9       ZNdddd:d;ej                  d<ed=ed>e3d7e3dz  d?e3dej                  fd@ZP	 	 	 dNdAdBdCedDej                  dz  dEej                  dz  d>e3f
dFZR G dG dHej                  j                        ZU G dI dJej                  j                        ZVy)O    N)defaultdict)Sequence)cache)cast)_are_we_tracingone_step_redistribute_cost)_StridedShardNotDecodableErrorDTensorSpec
ShardOrderShardOrderEntry
TensorMeta)assert_no_mixed_partial_types)
DeviceMesh)_is_shard_like_StridedShardPartial	Placement	ReplicateShard)IntLikeType)get_active_debug_mode#_FORCE_MIN_COST_REDISTRIBUTION_PLANF,_DISABLE_REDISTRIBUTE_TRANSFORM_OPTIMIZATIONTenabledc              #   8   K   t         }| a 	 d |a y# |a w xY ww)uX	  
    Context manager to control the redistribution planning strategy for DTensor operations.

    This context manager allows you to choose between two algorithms for computing the
    sequence of collective operations needed to redistribute a DTensor from one placement
    to another:

    - **Graph-based**: Uses Dijkstra's algorithm to find the minimum-cost path
      through all possible placement transformations. This approach considers the global
      cost of all collective operations and finds the optimal sequence. Best for complex
      redistribution patterns where reducing communication cost and memory overhead is critical.

    - **Greedy**: Uses a heuristic approach that makes locally optimal choices
      at each step. This is faster to compute but may not produce the globally optimal
      transformation sequence. Best for simple redistribution patterns or when planning
      speed is more important than optimal communication.

    **Default Behavior (without this context manager):**

    When this context manager is NOT used, the algorithm selection follows this priority:

    1. **Non-default shard orders**
       → Always use graph-based algorithm (required for correctness)

    2. **Explicit `use_graph_based_transform` parameter** to `_gen_transform_infos_non_cached`
       → Use the specified algorithm (True = graph-based, False = greedy)

    3. **No explicit parameter** (default case)
       → Use greedy algorithm for faster planning

    **Behavior with this context manager:**

    This context manager overrides the default selection by setting the global flag
    `_FORCE_MIN_COST_REDISTRIBUTION_PLAN`, which takes precedence over the explicit
    `use_graph_based_transform` parameter (but not over non-default shard order requirements).

    **Cache Considerations:**

    The redistribution planner caches transform info for performance via the `@cache`
    decorator on `_gen_transform_infos`. If you need to change the algorithm selection
    for the same input specs, clear the cache using `_gen_transform_infos.cache_clear()`
    to ensure the new setting takes effect and doesn't reuse cached results from a
    previous run.

    Args:
        enabled (bool): If True, forces the use of the graph-based algorithm.
                       If False, forces the use of the greedy algorithm.
                       Default: True
    N)r   )r   	old_values     f/media/conek/DATA/Code/OCR/venv/lib/python3.12/site-packages/torch/distributed/tensor/_redistribute.py use_min_cost_redistribution_planr   7   s(     j 4I*1'8.7+i+   	 disabledc              #   8   K   t         }| a 	 d |a y# |a w xY ww)a  
    Context manager to disable the transform optimization pass that merges
    consecutive same-type collectives into single flattened operations.

    When the optimization is disabled, ``_optimize_transform_infos`` becomes a
    no-op and returns the original list of ``_TransformInfo`` objects unchanged.
    This is useful for debugging or isolating issues related to the flattened
    collective merging logic.

    The flag can also be set directly::

        torch.distributed.tensor._redistribute._DISABLE_REDISTRIBUTE_TRANSFORM_OPTIMIZATION = True

    Args:
        disabled (bool): If True (default), disables the optimization.
                         If False, explicitly enables it (the normal default).
    N)r   )r!   r   s     r   +disable_redistribute_transform_optimizationr#   t   s(     * =I3;0A7@4y4r    frozenslotsc                   T    e Zd ZU eed<   eeef   ed<   ee   ed<   d Z	de
dz  fdZy)_TransformInfomesh_dimsrc_dst_placementslogical_shapec                     | j                   dk  rt        | j                  d   | j                  d   k(  rt        d      y )Nr      zQTransformInfo should only be created if it is an op with some effect, not a no-op)r)   AssertionErrorr*   selfs    r   __post_init__z_TransformInfo.__post_init__   sH    ==1  ""1%)@)@)CC c  D    returnNc                    | j                   \  }}|j                         r|j                         ry|j                         rt        |      ryt        |      r|j                         ryt        |      rt        |      ryy)z
        Return a key for grouping transforms by communication type.

        Returns None for local ops (no communication needed), or a string
        that identifies the collective type for potential grouping/merging.
        
all_reducereduce_scatter
all_gather
all_to_allN)r*   
is_partialis_replicater   )r0   srcdsts      r   _comm_type_keyz_TransformInfo._comm_type_key   sl     **S>> 0 0 2^^."5#C S%5%5%7C ^C%8 r2   )__name__
__module____qualname__int__annotations__tupler   r   r   r1   strr=    r2   r   r(   r(      s7    Mi233K((d
 r2   r(   c                   L    e Zd ZU dZeed<   eedf   ed<   dZedz  ed<   ddZ	y)	_FlattenedTransformInfoa4  
    Represents a flattened transform that combines multiple mesh dimensions
    into a single collective operation using a flattened DeviceMesh.

    Note: inherits the fields from _TransformInfo. Gets an __init__ with parent fields, followed by child fields,
    and runs parent validation (post_init)
    mesh.original_mesh_dimsN	avg_scalec                     t         j                  |        | j                  (| j                  dkD  st        d| j                         y y )Nr-   z"avg_scale must be > 1 if set, got )r(   r1   rJ   r.   r/   s    r   r1   z%_FlattenedTransformInfo.__post_init__   sL    $$T*>>%>>A%$88HI  & &r2   r3   N)
r>   r?   r@   __doc__r   rB   rC   rA   rJ   r1   rE   r2   r   rG   rG      s2     c3h' !IsTz r2   rG   transform_infocurrent_placementsshard_order_dictr3   c                 z   | j                   \  }}t        | t              r| j                  }n| j                  f}t        |t
        t        z        r|j                  }t               }|D ]L  }t        ||         dk(  rt        d| d|  d| d|       |j                  ||   j                                N t        |      |k(  s#t        dt        |       d| d|  d| d| 
      t        |t
        t        z        r0|j                  }	|	|vrg ||	<   |D ]  }
||	   j                  |
        |D ]  }
|||
<   	 y)	zw
    Update current_placements and shard_order_dict in-place to reflect the
    effect of a single transform step.
    r   z?Invalid shard_order update. No entries left to pop for src_dim z. transform_info=z, current_placements=z, shard_order=z^Mismatch between expected and removed mesh dims during shard_order update. Expected to remove z, but removed N)r*   
isinstancerG   rI   r)   r   r   dimsetlen
ValueErroraddpopappend)rN   rO   rP   src_placementdst_placement	mesh_dimssrc_dimremoved_dim_dst_dimr)   s              r   "_update_shard_order_and_placementsra      s    $2#D#D M=."9:"55	#,,.	-!67##e 	=A#G,-2 Ui00@ A**<)= >##3"46  OO,W599;<	= 9~,.y>". >""0!1 2&&8%9 :/02  -!67##**(*W%! 	7HW%,,X6	7  5'48$5r2   rH   r\   .c                 N   | j                         }| j                  yt        fd|D              }| j                  |      }|j	                         }t        |      dkD  r|j                         }|j                  j                         D ]  }|j                  |k(  s|c S  y)z
    Query for an explicitly created flattened mesh using layout comparison.

    Searches root_mesh._flatten_mapping for a mesh whose layout matches
    the expected flattened layout for the given dims. Pure Python layout math.
    Nc              3   (   K   | ]	  }|     y wNrE   ).0imesh_dim_namess     r   	<genexpr>z5_get_flattened_mesh_by_layout_impl.<locals>.<genexpr>  s     ;AnQ';s   r-   )
_get_root_meshrg   rC   _get_slice_mesh_layoutcoalescerU   nest_flatten_mappingvalues_layout)rH   r\   	root_mesh	dim_namessliced_layoutexpected_layoutflattened_meshrg   s          @r   "_get_flattened_mesh_by_layout_implru     s     ##%I((N ;;;I //	:M#,,.O
?a)..0 $44;;= "!!_4!!" r2   c                    t               rjt        j                  j                  j                  rFt        | |      yddlm} t        j                  j                  j                  | t        |            S t        | |      S )aB  
    Query for an explicitly created flattened mesh using layout comparison.

    When tracing with compile_on_one_rank, delegates to a custom op so the
    flattened mesh appears as a call_function node derived from mesh (a graph
    input) rather than as a get_attr constant holding an unpicklable
    ProcessGroup.
    Nr   )device_mesh)r   torchdistributedconfigcompile_on_one_rankru   torch.distributed._opsrw   ops_get_flattened_submeshlist)rH   r\   r_   s      r   _get_flattened_mesh_by_layoutr   "  sa     U..55II .dI>F;yy$$;;D$y/RR-dI>>r2   _warned_flatten_issuesrw   src_placementsdst_placementsnum_ops	comm_typereasonc                    t        |       ||f}|t        v ryt        j                  |       | j                  }|+|D 	cg c]  }	||	   	 }
}	dj	                  d |
D              }nddj	                  d |D               d|  }d}|dk(  rd	| d
}n"|dk(  rd}n|dk(  rd| d}nt        d|       t        j                  ||||||       yc c}	w )a-  
    Warn once per (mesh, dims, reason) about inability to flatten operations.

    Args:
        device_mesh: The device mesh being used
        mesh_dims: Tuple of mesh dimensions that could not be flattened
        src_placements: Source placements for the redistribution
        dst_placements: Target placements for the redistribution
        num_ops: Number of sequential operations that will be performed
        comm_type: Type of collective operation (e.g., "reduce_scatter")
        reason: Either "no_flattened_mesh" or "uneven_tensor_shape"
    Nz, c              3   (   K   | ]
  }d | d   yw)"NrE   )re   names     r   rh   z:_warn_flatten_optimization_not_possible.<locals>.<genexpr>Z  s     ?Tqa[?s   zdims c              3   2   K   | ]  }t        |        y wrd   )rD   )re   ds     r   rh   z:_warn_flatten_optimization_not_possible.<locals>.<genexpr>\  s     $?SV$?s   z of a%  While redistributing from %s to %s, %d sequential %s operations will be performed. This is suboptimal: multiple collective operations have higher latency (separate kernel launches and synchronization points) and may give inconsistent results between ranks due to different reduction orders. %sno_flattened_meshz&To optimize, flatten mesh dimensions [z0] so DTensor can use a single operation instead.uneven_tensor_shape)z Unfortunately, because the tensor dimension is not evenly divisible by the product of the mesh dim sizes that would need to be flattened for the optimization to work, it can not be optimized.non_ascending_mesh_dimsz0it is not possible to merge non-ascending order z operations.zUnexpected reason: )hashr   rW   rg   joinr.   loggerwarning)rw   r\   r   r   r   r   r   	cache_keyrg   r   rq   dims_strcommon_warning
reason_msgs                 r   '_warn_flatten_optimization_not_possibler   =  s   * k"Iv6I**y) //N!09:1^A&:	:99?Y??499$?Y$??@[MR	`  $$=hZGwx
	(	(

 
,	,>ykV 	 26(;<<
NNJ7 ;s   Ctransform_infosc           	      <   t        |       dk  r| S t        r| S t        h d      dt        dz  dt        ffd}dt
        t        t        f   dt
        t        t        f   dt        fd	d
t        t           dt
        t        dz  t        dz  f   ffd}g }d}|t        |       k  rW| |   }|j                         }	 ||	      s|j                  |       |dz  }C|j                  }
|g}|dz   }|t        |       k  r || |   j                               rm | |   j                  |
      rW|j                  | |          |dz  }|t        |       k  r0 || |   j                               r | |   j                  |
      rW ||      \  }}||j                  |       nJ|j                  |       |dv r5t        t        d |D                    }t        |||t        |      |	|       |}|t        |       k  rWt         j#                  d| |       |S )a  
    Optimize transform infos by merging consecutive same-type collectives into
    a single flattened operation when a matching flattened DeviceMesh exists.

    Merging requirements:
    - Operations must be consecutive in the transform list (no reordering).
      Notably, redistributing from P, P, P -> R, S, R is not optimized here and cannot be optimized due to
      optimization needing to fuse non-contiguous reductions, leaving this pattern vulnerable to numerics issues and
      suboptimal perf
    - Operations must have the same comm type (e.g., all allgather or all reduce_scatter)
    - Operations must have identical src_dst_placements (e.g., can't merge
      Partial->Shard(0) with Partial->Shard(1))
    - A flattened mesh covering the relevant dimensions must exist
    - For reduce_scatter, tensor dim must be evenly divisible by flattened mesh size

    For nested sharding, the merged operation uses the logical_shape from the
    outermost mesh dimension (smallest mesh_dim index) which represents the
    global tensor shape needed for correct padding/unpadding.

    TODO:
    - all_to_all operations are excluded from merging, but it may be possible to merge them in some cases.

       >   r7   r5   r6   keyNr3   c                     | v S )z:Check if a comm type key represents a mergeable operation.rE   )r   MERGEABLE_COMM_TYPESs    r   is_mergeablez/_optimize_transform_infos.<locals>.is_mergeable  s    ***r2   p1p2c                     | |k(  ry| \  }}|\  }}||k7  ry|j                         r|j                         syt        t        |      }t        t        |      }ddh}|j                  |v xr |j                  |v S )z
        Check if two src_dst_placements can be merged.

        Allows merging of Partial("sum") and Partial("avg") since they can be
        combined: perform sum reduction, then scale by avg mesh dims afterward.
        TFsumavg)r9   r   r   	reduce_op)	r   r   src1dst1src2dst2partial1partial2mergeable_reduce_opss	            r   are_placements_mergeablez;_optimize_transform_infos.<locals>.are_placements_mergeable  s     8
d
d 4< !doo&7 && %u~"66 ;""&::	
r2   infosc                 |   t        |       dk  ry| d   j                  | d   j                         }t        fd| D              st	        d      t        d | D              }t        t        |            }|dk(  r||k7  ry|d	k(  r||d
d
d   k7  ryt        |      }|y\  }}|d	k(  r+t        t        |      j                  t        | fd      }n~|dk(  r`t        t        |      j                  t        | fd      }|j                     }t        j                  fd| D              }	||	z  dk7  ry|dk(  r| d   }nt        d|       d
}
|}|j!                         r0t        j                  fd| D              }|dkD  r|}
t#        d      }||f}t%        d||j                  |||
      d
fS )a  
        Try to create a flattened transform from 2+ same-type transforms.

        Returns (result, failure_reason) where:
        - result is the FlattenedTransformInfo if successful, None otherwise
        - failure_reason is None if successful, or one of:
          - "too_few_transforms": Less than 2 transforms provided
          - "no_flattened_mesh": No flattened mesh exists for the required dimensions
          - "uneven_tensor_shape": For reduce_scatter, tensor dim not evenly divisible
        r   )Ntoo_few_transformsr   c              3   D   K   | ]  } |j                           y wrd   )r*   )re   infor   first_placementss     r   rh   zJ_optimize_transform_infos.<locals>.try_create_flattened.<locals>.<genexpr>  s'      
 %T%<%<>NO
s    z5All transforms must have mergeable src_dst_placementsc              3   4   K   | ]  }|j                     y wrd   r)   )re   r   s     r   rh   zJ_optimize_transform_infos.<locals>.try_create_flattened.<locals>.<genexpr>  s     :D$--:   r6   )Nr   r7   N)Nr   c                 "    | j                      S rd   r+   xaffected_dims    r   <lambda>zI_optimize_transform_infos.<locals>.try_create_flattened.<locals>.<lambda>      aool6S r2   )r   c                 "    | j                      S rd   r   r   s    r   r   zI_optimize_transform_infos.<locals>.try_create_flattened.<locals>.<lambda>  r   r2   c              3   T   K   | ]  }j                  |j                         ! y wrd   )sizer)   re   r   rw   s     r   rh   zJ_optimize_transform_infos.<locals>.try_create_flattened.<locals>.<genexpr>  s%      248  /2s   %()Nr   r5   z0Unsupported comm type for try_create_flattened: c              3      K   | ]I  }t        t        |j                  d          j                  dk(  rj	                  |j
                         K yw)r   r   N)r   r   r*   r   r   r)   r   s     r   rh   zJ_optimize_transform_infos.<locals>.try_create_flattened.<locals>.<genexpr>  sH      !8!8!;<FF%O   /s   AAr-   r   )r)   r*   r+   rH   rI   rJ   )rU   r*   r=   allr.   rC   sortedr   r   r   rS   maxr+   mathprodNotImplementedErrorr9   r   rG   )r   r   r\   sorted_mesh_dimsrt   r;   r<   outermost_infotensor_dim_sizeeffective_shard_mesh_sizerJ   
merged_srcscalemerged_placementsr   r   r   rw   s                 @@r   try_create_flattenedz7_optimize_transform_infos.<locals>.try_create_flattened  s    u:>- !866!H++-	 

 
 !G  :E::	 	!23 (( ,,6,& ,TrT2266{DTU!, $S$s+//L ,STN**s+//L ,STN,::<HO(,		 2<A2 )% !::a?2,&"1XN%B9+N 
 	
>>II ! E
 qy!	$U^
'- $#4,::##3# 

 
	
r2   r   r-   )r   r   r   c              3   4   K   | ]  }|j                     y wrd   r   )re   gs     r   rh   z,_optimize_transform_infos.<locals>.<genexpr>Y  s     (C(Cr   z5_optimize_transform_infos original: %s, optimized: %s)rU   r   	frozensetrD   boolrC   r   r   r(   rG   r=   rY   r*   extendr   r   r   debug)r   rw   r   r   r   r   resultrf   r   current_keyrO   groupj	flattenedfailure_reasonr\   r   r   s    `              @@r   _optimize_transform_infosr   y  s]   : ?a3 %%ST+#* + +
)Y&'
-29i3G-H
	
@i
N#i
	&-sTz9	:i
X >@F	A
c/"
"q!))+ K(MM$FA
 "44'+fEO$$_Q/>>@A("557I LL+,FA O$$_Q/>>@A("557I %9$?!	> MM)$ MM%  " 
 "&(CU(C"CD	7""J" e c/"
"f LL?RX Mr2   DTensorRedistributePlanner_planner_cachedtensor_metac                     t               rt        | |      S t        j                  |       |f}|t        vrt        | |      }|t        |<   t        |   S )a  
    Factory function to get or create a DTensorRedistributePlanner instance.
    This function provides transparent caching of planner instances based on
    device mesh and dtensor meta. Multiple calls with the same parameters
    will return the same cached instance for better performance.
    Args:
        device_mesh: The device mesh for the planner
        dtensor_meta: TensorMeta of the DTensor to redistribute
    Returns:
        A DTensorRedistributePlanner instance (potentially cached)
    )r   r   weakrefrefr   )rw   r   r   planners       r   get_redistribute_plannerr   s  sS     )+|DD[)<8I&,[,G$+y!)$$r2   c                  ,    t         j                          y)z8Clear the cache of DTensorRedistributePlanner instances.N)r   clearrE   r2   r    clear_redistribute_planner_cacher     s    r2   c                      e Zd ZdZ ej
                  dd       G d d             Zd Zede	e
ee
   f   defd	       Zedede	e
ee
   f   fd
       Ze	 	 d%dedee   deedf   dedz  dedefd       ZdededdfdZ	 	 d&dZdeedf   dede	def   fdZdededed   fdZddde
dee
df   dee   fd Zd!ed"edee
df   dee   fd#Z d!ed"edee   fd$Z!y)'r   a  
    This class is used to plan the collective calls to transform the local shard
    of the DTensor from its current spec to the target spec.
    Suppose there are N tensor dimensions and M mesh dimensions, the total
    possible state size will be (N+2)*M*M!.
    Note: Use get_redistribute_planner() factory function instead of direct
    instantiation for automatic caching.
    Tr$   c                       e Zd ZU eedf   ed<   eed<    ej                  dddd      Z	e
dz  ed<   d Zd	 Zd
 Zde
fdZde
fdZdedefdZy)$DTensorRedistributePlanner.DistState.
placementstensor_dim_to_mesh_dimNF)defaultinitreprcompare_hashc                 V    t        j                  | j                  | j                        S rd   )r   format_shard_order_strr   r   r/   s    r   __str__z,DTensorRedistributePlanner.DistState.__str__  s%    55++ r2   c                 "    | j                         S rd   )r   r/   s    r   __repr__z-DTensorRedistributePlanner.DistState.__repr__  s    <<>!r2   c                 N    t         j                  | d| j                                y )Nr   )object__setattr___compute_hashr/   s    r   r1   z2DTensorRedistributePlanner.DistState.__post_init__  s"    ""$r2   r3   c                 R    | j                   | j                   S | j                         S rd   )r   r   r/   s    r   __hash__z-DTensorRedistributePlanner.DistState.__hash__  s#    !%!74::QT=O=O=QQr2   c                 D    t        | j                  | j                  f      S rd   )r   r   r   r/   s    r   r   z2DTensorRedistributePlanner.DistState._compute_hash  s$    OO// r2   otherc                     t        |t        j                        sy| j                  |j                  k7  ry| j                  | j
                  f|j                  |j
                  fk(  S NF)rR   r   	DistStater   r   r   )r0   r   s     r   __eq__z+DTensorRedistributePlanner.DistState.__eq__  sa    e%?%I%IJzzU[[(++   ,, r2   )r>   r?   r@   rC   r   rB   r   dataclassesfieldr   rA   r   r   r1   r   r   r   r   r  rE   r2   r   r  r     sy    )S.)) **-K--u5%
sTz 	
		"		Rc 	R	3 		 	4 	r2   r  c                 ^     t        |t        t        z        rt         fd|D              S |S )z<Convert a nested list structure to a nested tuple structure.c              3   @   K   | ]  }j                  |        y wrd   )	_to_tuple)re   itemr0   s     r   rh   z7DTensorRedistributePlanner._to_tuple.<locals>.<genexpr>  s     <$-<s   )rR   r   rC   )r0   r   s   ` r   r	  z$DTensorRedistributePlanner._to_tuple  s'    a&<!<<<r2   r   r3   c                 T    t        d t        | j                               D              S )zConvert dict to ShardOrderc              3   R   K   | ]  \  }}|rt        |t        |              ! yw))
tensor_dimr\   N)r   rC   )re   r   values      r   rh   zADTensorRedistributePlanner._dict_to_ShardOrder.<locals>.<genexpr>  s-      
U seElCC
s   %')rC   r   items)r   s    r   _dict_to_ShardOrderz.DTensorRedistributePlanner._dict_to_ShardOrder  s)      
$QWWY/
 
 	
r2   c                 v    t        t              }| D ]$  }t        |j                        ||j                  <   & |S )z1Convert ShardOrder to dict with tensor dim as key)r   r   r\   r  )r   tensor_mesh_dim_dictentrys      r   _ShardOrder_to_dictz.DTensorRedistributePlanner._ShardOrder_to_dict  s@      +40 	KE59%//5J !1!12	K##r2   NrH   r   rZ   .src_shard_order use_strided_shard_as_shard_orderc                 .   t        |      | j                  k7  r$t        dt        |       d| j                         |rt        j                  || d      \  }}|t        j
                  |      }t        |      }t        j                  |      }t        j                  t        |      |      }|g}g }	|D ]s  }
t        |
t              }t        |
||       t        j                  t        |      t        j                  |            }|j                  |       |	j                  |       u t!        |d         g}t#        |	      D ]<  \  }}|rdnd}|j                  |       |j                  t!        ||dz                   > d	j%                  |      S )
a  
        Generate a string representation of the sequence of state transitions
        (placements and shard orders) as described by the given transform_info.

        Args:
            mesh: The DeviceMesh used for the redistribution.
            transform_infos: A sequence of _TransformInfo objects describing each
                transformation step.
            src_placement: The initial tuple of Placement objects.
            src_shard_order: (Optional) The initial ShardOrder representing
                the mapping of tensor dimensions to mesh dimensions. If None,
                the default shard order is computed from src_placement and mesh.
            use_strided_shard_as_shard_order: If True, normalize _StridedShard
                placements into regular Shard placements with an explicit
                shard_order before stringifying.

        Returns:
            A string showing the sequence of DistState transitions, separated by '->'.
        zdimensions mismatch z vs Tr  r   z-->z->r-    )rU   ndimr.   r   &_normalize_placements_into_shard_ordercompute_default_shard_orderr   r   r  r  rC   rR   rG   ra   r  rY   rD   	enumerater   )rH   r   rZ   r  r  cur_placementrP   	cur_state
state_listis_flattened_listrN   is_flattened	new_statetrace_partsrf   	separators                   r   stringify_transform_infosz4DTensorRedistributePlanner.stringify_transform_infos  s   6 }* &s='9&:$tyykJ  ,BB!4$ +M?
 ")EEmTO]+5II
 /88- /
	 

 )+- 	3N%n6MNL./? 3<<m$*>>?OPI i($$\2	3 :a=)*():; 	7OA|!-4Iy)s:a!e#456	7 ww{##r2   rw   r   c                     || _         |j                         st        |t        || _        t	        |j
                        | _        t               | _        t               | _	        | j                          y)z
        Initialize DTensorRedistributePlanner.

        Args:
            device_mesh: The device mesh for this planner
            dtensor_meta: TensorMeta of the DTensor to redistribute
        N)rw   _is_current_rank_part_of_meshr.   r   rU   shapetensor_dimensionrT   "strided_shard_placements_in_targetpartial_reduce_ops_in_targetsetup_cost_callbacks)r0   rw   r   s      r   __init__z#DTensorRedistributePlanner.__init__.  se     '88:    ( #L$6$6 7FIe/69e)!!#r2   c                 X     dt         j                  dt        f fdfd}| _        y)z
        Set up the cost function for different collective operations.
        Uses communication time estimation based on actual tensor sizes and
        mesh topology for accurate cost modeling.
        stater3   c                 t    t        j                  | j                  j                  | j                  d      S )NF)rH   r   tensor_metashard_orderr  )r   rw   r   r   r   )r0  r0   s    r   state_to_speczFDTensorRedistributePlanner.setup_cost_callbacks.<locals>.state_to_specN  s9     %% ++ --!8816 r2   c                 4    t         |        |            S rd   r   )	src_state	dst_stater4  s     r   cost_functionzFDTensorRedistributePlanner.setup_cost_callbacks.<locals>.cost_functionY  s    -i(-	*B r2   N)r   r  r   r8  )r0   r8  r4  s   ` @r   r-  z/DTensorRedistributePlanner.setup_cost_callbacksE  s-    		-77					
 +r2   r   tensor_mesh_dim_tupler   c           	         i }t         j                  |      }| j                  | j                  |      |      }|D ]  }|j                  }||   d   }t        ||   t              s+t        | j                        D ]  }	||	k(  r	||   j                         }
||	   j                  |
       t        |      }t        |	      ||
<   | j                  | j                  |      t         j                  |            }| j                  ||      ||<   ||   j                  |
       ||	   j                            |D ]  }|j                  }||   d   }t        ||   t              s+||   j                         }
t        |      }t               ||
<   | j                  | j                  |      t         j                  |            }||   j                  |
       | j                  ||      ||<    t        |      D ]d  \  }}t        |t               st        |      }t               ||<   | j                  | j                  |      |      }| j                  ||      ||<   f t        |      D ]  \  }}t        |t              st        | j                        D ]  }	t        |      }t        |	      ||<   ||	   j                  |       | j                  | j                  |      t         j                  |            }| j                  ||      ||<   ||	   j                            t        |      D ]  \  }}t        |t               st        | j                        D ]  }	t        |      }t        |	      ||<   ||	   j                  |       | j                  | j                  |      t         j                  |            }| j                  ||      ||<   ||	   j                            t        |      D ]  \  }}t        |t              s| j"                  D ]  }t        |      }t!        |      ||<   |D ch c]  }t        |t               s|j$                  ! }}t'        |      dkD  r|ddhk7  r\| j                  | j                  |      |      }| j                  ||      ||<     |D ]  }|j                  }||   d   }t        ||   t(              s+||   j                         }
t        |      }t               ||
<   | j                  | j                  |      t         j                  |            }||   j                  |
       | j                  ||      ||<    | j*                  s|S t        |      D ]  \  }}t        |t              s| j*                  D ]  }|j,                  }	t        |      }|||<   ||	   j                  |       | j                  | j                  |      t         j                  |            }| j                  ||      ||<   ||	   j                            |S c c}w )Nr   r-   r   r   )r   r  r  r	  r  rR   r   ranger*  rX   rY   r   r  r8  r   r  r   r,  r   rU   r   r+  rS   )r0   r   r9  all_next_stater  cur_dist_stater  src_tensor_dimsrc_mesh_dimdst_tensor_dimmove_mesh_dimnew_placements
dist_state	placementr)   r   ppartial_reduce_opsstrided_shard_objs                      r   get_next_statez)DTensorRedistributePlanner.get_next_state`  s   H MO9MM! 
 NN:&!
 + 	;E"--N/?CLj6>"'(=(="> ;!^3 !5^ D H H J$^4;;MJ!%j!105n0E}-!^^NN>2.BB,
 .2-?-?".z*
 %^4;;MJ$^488:+;	;B + 	E"--N/?CLj6>0@DDFM!*-N,5KN=)~.*>>?STJ !077F)-););*N:&	* (1'< 	#L)i1!*-N+4;N<(~.0EJ *.););*N:&	 $-Z#8 	;Hii3"'(=(="> ;!%j!1+0+@x($^4;;HE!^^NN>2.BB,
 .2-?-?".z* %^488:;	;, $-Z#8 	;Hii1"'(=(="> ;!%j!1+0+@x($^4;;HE!^^NN>2.BB,
 .2-?-?".z* %^488:;	;0 $-Z#8 	Hii3!>> 	!%j!1+29+=x( *8&$%:a;QAKK&" & )*Q.3E%QV3W!^^NN>24I
 .2-?-?".z*	: + 	E"--N/?CLj6F0@DDFM!*-N,5KN=)~.*>>?STJ !077F)-););*N:&	& 66!! $-Z#8 	;Hii3%)%L%L ;!!2!6!6!%j!1+<x($^4;;HE!^^NN>2.BB,
 .2-?-?".z* %^488:!;	;* Q&s   =X;X;r6  r7  c                    ddl }d}d|||gfg}t               }|r|j                  |      \  }}}	}
|	|k(  r|
S |	|v r$|j                  |	       | j	                  |	j
                  |	j                        }|j                         D ]0  \  }}||vs||z   }|
|gz   }|dz  }|j                  |||||f       2 |rt        d| d|       )aB  
        Find the min cost path from src_state to dst_state using Dijkstra's
        algorithm.

        Args:
            src_state: The source state
            dst_state: The destination state

        Returns:
            A list of states representing the min cost path from src_state to
            dst_state
        r   Nr-   zNo path found from src_state z to dst_state )
heapqrT   heappoprW   rH  r   r   r  heappushr.   )r0   r6  r7  rJ  counterpqvisitedcostr_   current_statepathnext_states
next_statetransition_costnew_costnew_paths                   r   find_min_cost_pathz-DTensorRedistributePlanner.find_min_cost_pathz  s    	  )i[12 	 %+0==+<(D!]D	)'KK&--((-*N*NK 0;/@/@/B R+
OW,#o5H#zl2HqLGNN2':x'PQR " +I;nYKP
 	
r2   r)   full_tensor_shapec           	      l   t        |      }|j                  D ]  }|j                  }|j                  }t	        |      dk  rt
        |D ]  }||k(  r	|j                  |   }	t        |	t              rM|	j                  ||   | j                  j                  |      | j                  j                  |            \  }
}nkt        |	t              rM|	j                  ||   | j                  j                  |      | j                  j                  |            \  }
}nt        d|	       |
||<     |S )Nr   r   zUnsupported placement type: )r   r   r  r\   rU   r.   r   rR   r   local_shard_size_and_offsetrw   r   _sym_get_coordinater   rV   )r0   r6  r)   rY  new_logical_shaper  r  r\   mdimrD  new_sizer_   s               r   get_logical_shapez,DTensorRedistributePlanner.get_logical_shape  sA    044E/F55 	9E))JI9~"$$! 98#%006	i/"+"G"G)*5((--t-<((<<TB#KHa
  	=9"+"G"G)*5((--t-<((<<TB#KHa %'CI;%OPP08!*-%9	90 ! r2   src_specdst_specc           	      d   dt         dt        t        t        df   t        f   fd} ||      \  }} ||      \  }}|D ].  }	t	        |	t
              s| j                  j                  |	       0 t        j                  ||      D ]8  }	t	        |	t              s| j                  j                  |	j                         : | j                  ||      }
| j                  ||      }g }| j                  |
|      }t        j                  |      D ]  \  }}|j                   |j                   k7  s d}t#        t%        |j                   |j                               D ]R  \  }\  }}||k7  s|dk7  rt'        d      |}| j)                  |||      }|j+                  t-        |||f|             T  |S )Nspecr3   .c                     | j                   r3t        j                  | j                  | j                  d      \  }}||fS | j
                  t        d|        | j                  | j
                  fS )NTr  zMissing shard_order field in )r  r   r  r   rH   r3  rV   )rd  rB  r3  s      r   _try_normalize_specz\DTensorRedistributePlanner.generate_graph_based_transform_infos.<locals>._try_normalize_spec  sv     44FF		9= , &{22##+$'DTF%KLL(8(888r2   r   z@Multiple mesh_dims are different between cur_state and nxt_stater)   r*   r+   )r   rC   r   r   rR   r   r+  rW   	itertoolschainr   r,  r   r  rX  pairwiser   r  zipr.   r`  rY   r(   )r0   ra  rb  rY  rf  r   r  r   dst_shard_orderrD  r6  r7  r   
state_pathr  	nxt_stateupdate_mesh_dimr)   r  nxt_placementr+   s                        r   $generate_graph_based_transform_infosz?DTensorRedistributePlanner.generate_graph_based_transform_infos  s   	9	95C(*45	9" +>h*G'*=h*G'
 ( 	GI)]377;;IF	G #H 	KI)W-1155i6I6IJ	K NN>?C	NN>?C	02,,Y	B
$-$6$6z$B 	 Iy##y';';;"$@I	,,i.B.BCA <H<}m %5*b0"0 b#  +3(,(>(>%x1B) (..*)84A=3Q.;		0 r2   c           	         t        |j                        }|g}g }| j                  j                  dk(  rZ|j                  d   |j                  d   k7  r9|j                  t        d|j                  d   |j                  d   f|             |S t        |j                        D ]  \  }}||   }t        |t              r|| j                  j                  dz
  k  s8| j                  j                  |      }	|j                  ||j                     |	| j                  j                  |            \  }
}t        |      }|
||j                  <   |j                  |       |j                  |        t        |j                        }t        |j                        }|j                  dkD  rt        t!        t#        |                  D ]  }||   }||   }t        |t              r|j                  }g g }}t        t%        ||            D ]T  \  }\  }}||k\  r nG|j'                  |      r|j                  |       |j'                  |      sD|j                  |       V ||k7  r
t)               }||k7  s|j                  t        |||f||                |||<    t        t%        ||            D ]5  \  }\  }}||k7  s|j                  t        |||f||                |||<   7 |S )a  
        Generate the transform infos from the source placements to the target placements.

        To transform from source to target placement it might have multiple steps, i.e. it
        might decompose Si -> Sj into Si -> R -> Sj.
        This would detect if there're mis-aligned/nested shardings between src/dst placements.
        E.g. Suppose the redistribution to perform is (Shard(0), Shard(0)) -> (Replicate(), Shard(0)),
        in this case Shard(0) -> Shard(0) for mesh dimension 1 actually needs resharding, because in
        the former is a nested-sharding of a tensor already already sharded dimension 0, whereas
        the latter is the first sharding on tensor dimension 0.
        r-   r   rg  r   )r   r)  rw   r  r   rY   r(   r  rR   r   r   _local_shard_size_and_offsetrS   r\  
num_shardsreversedr;  rU   rk  is_shardr   )r0   ra  rb  initial_logical_shapemesh_dims_to_logical_shaper   rf   r;   current_logical_shapemesh_dim_sizelocal_shard_sizer_   r]  rO   target_placementsr)   currenttarget	shard_dimcurrent_mesh_shardingtarget_mesh_shardingsrE  s                          r   generate_greedy_transform_infosz:DTensorRedistributePlanner.generate_greedy_transform_infos  s2   $ !%X^^ 4&;%<"02  A% ""1%)<)<Q)??&&"!"$//2$//2, '<	 #"
   3 34 	IFAs$>q$A!#u%t'',,q00$($4$4$9$91$9$EM*-*J*J-cgg6%((<<Q?+'$a
 )--B(C%1A%cgg..556GH*112GH	I( "("5"56 !4!45" %U3/A+B%CD (:,X6*84 fe, !'

IBDb+?)%..0AB& ;	6Aq =!::i0188;::i0077:; -0DD
 "+f$#**&%-07/@*DX*N 4:&x0Q(:Z ,5"$56,
 	6'H'w & &&"!),3V+<&@&J 06"8,	6 r2   r  rL   )"r>   r?   r@   rM   r  	dataclassr  r	  staticmethoddictrA   r   r   r  r  r   r   r(   rC   r   r   rD   r&  r   r.  r-  floatrH  rX  r   r`  r   rq  r  rE   r2   r   r   r     s0    [$d3. . 4.` 
tCcN3 

 
 
 $z $d3S	>.B $ $ 
 .216G$G$!.1G$ Y^,G$ $d*	G$
 +/G$ 
G$ G$R$$ !$ 
	$.+	+6V)S.)V  *V 
4e;	<	Vt0
"0
/80
	4	50
d!9! ! !c?	!
 
k	!BGG G !c?	G
 
n	GR~~ ~ 
n		~r2   ra  rb  use_graph_based_transformc                    | j                   }| j                  }|j                  }t        d ||fD               }t        d g | j                  |j                  D              }|s|rd}nt
        t
        }n|d}| j                  t        t        || j                        }|r 	 |j                  | || j                        }	|	S |j                  | |      }	|	S # t        $ r |j                  | |      }	Y |	S w xY w)Nc              3   F   K   | ]  }t        j                  |        y wrd   )r   is_default_device_order)re   orders     r   rh   z2_gen_transform_infos_non_cached.<locals>.<genexpr>  s#      $ 	++E2$s   !c              3   <   K   | ]  }t        |t                y wrd   )rR   r   )re   rE  s     r   rh   z2_gen_transform_infos_non_cached.<locals>.<genexpr>  s       	1m$s   TF)rw   r3  r   anyr   r   r2  r.   r   rq  r)  r
   r  )
ra  rb  r  rw   r  rl  has_non_default_orderhas_strided_sharddrpr   s
             r   _gen_transform_infos_non_cachedr    s<   
 &&K**O**O !$ $%7$ !   =8&&=)<)<=   1$(!	,	8$G!	"	*$)!#
"C !	V!FF(HNNO  ==hQ	 . 	V!AA(HUO 		Vs   (C C:9C:c                     t        | ||      S rd   )r  )ra  rb  r  s      r   _gen_transform_infosr    s     +(5 r2   )async_opr  is_explicitlocal_tensorcurrent_spectarget_specr  r  c                   |j                   |j                   k7  rt        d      |j                  t        d      t	        |j
                         t	        |j
                         | }|j                   }|j                         s| S t               rt        |||      }nt        |||      }t        |||j
                  |j
                        }	t               }
|
_|
j                  | |j
                  |j
                  t        j                  ||	|j
                  |j                  |j                        |      nt!        j"                         }|5  |	D ]  }t%        |t&              r|j                   }n|}|j(                  }|j*                  \  }}t%        |t,              st%        |t,              r||u sJ d       |j/                  |      }||k(  r| }|dk(  r| }|j1                         r|j3                         rRt5        t6        |      }|j9                  | ||      }t%        |t&              r |j:                  ||j:                  z  }n|j=                         r0t5        t>        |      }|jA                  | |||jB                        }nt%        |t,              r |jA                  | |||jB                        }nrtE        d| d	| d
      |j=                         rqt5        t>        |      }|j3                         rSt5        t6        |      }|jG                  | |||      }t%        |t&              r|j:                  ||j:                  z  }n|j1                         r%|jI                  | |||jK                  |            }n|j=                         rUt5        t>        |      }|jL                  |jL                  k7  rm|jO                  | |||jB                  |jL                        }nBt%        |t,              rC|jA                  | |||jB                        }|jI                  ||||jK                  |            }nt        d| d|       |j3                         rm|j1                         r%t5        t6        |      }|jQ                  | ||      }ntS        |      rtE        d| d	| d
      ||k7  rtU        d| d| d      | }nat%        |t,              rP|j3                         rGt5        t6        |      }|j9                  | ||      }|jW                  ||||jK                  |            }n|j1                         r$|jW                  | |||jK                  |            }n|j=                         rRt5        t>        |      }|jA                  | |||jB                        }|jW                  ||||jK                  |            }nct%        |t,              rB|jA                  | |||jB                        }|jW                  ||||jK                  |            }nt        d| d|       |s*t%        |tX        jZ                        r|j]                         }|}  	 ddd       |S # 1 sw Y   |S xY w)z
    This redistribute the local tensor (torch.Tensor) from the current DTensorSpec to
    the target DTensorSpec, which involves the necessary collective calls to transform
    the local shard of the DTensor from its current spec to the target spec.
    z)Cross device mesh comm not supported yet!NzUuse_strided_shard_as_shard_order should be initialized in DTensorSpec.__post_init__())r  z:_StridedShard redistribute assumes no flattened transformsr   r-   zredistribute from z to z not supported yetzUnexpected placement z& for redistribute to target placement z&Redistribution from one partial type (z) to another (z) is unsupported.)/rH   r   r  rV   r   r   r(  r   r  r  r   r   record_redistribute_callsr   r&  r3  
contextlibnullcontextrR   rG   r)   r*   r   r   r:   r9   r   r   _reduce_valuerJ   rv  r   _to_replicate_tensorr+   RuntimeError_reduce_shard_value_replicate_to_shardr\  rS   _to_new_shard_dim_partition_valuer   r.   _replicate_to_strided_shardfuncolAsyncCollectiveTensorwait)r  r  r  r  r  r  new_local_tensorrw   r   optimized_transform_infos
debug_moderedistribute_contextrN   mesh_to_userf   r}  r~  
num_chunkspartial_speccurrent_placementtarget_placement
shard_spec
replicateds                          r   redistribute_local_tensorr    s     K,,,!"MNN44<c
 	
 ","9"9:!+"8"89###K446 9+'@
 /+'@

 !:	! '(J  ! 	,,##""&@@)''((== $ 	- 	
 ##% $ 
 b,7 a	,N.*AB,11)''A,??OGV
 '=1Z5V"k1 P1 %))1)5J& #/ Q $0 ""$%%'#'#9L'3'A'A$k1($
 #>3JK*44@+;n>V>V+V(%%'(,UG(<%'8'M'M$k1n6R6R($  7'.'C'C$k1n6R6R($ ',WIT&AST  "#'v#6 %%'#'#9L'3'G'G$k16F($
 #>3JK*44@+;n>V>V+V())+'7'K'K$k1k6U6UVW6X($ %%'!%eW!5J!~~)9)=)==+5+G+G('*88,00,(  7!(!=!=$k1n6R6R"J (8'K'K"##77:	($ %/y8^_e^fg  ""$'')#'#8L'3'D'D$k1($ $G,&,WIT&AST  &(,DWI^\b[cctu  (4$FM2%%'#'#9L!-!;!;$k1"J (.'I'I"KK4S4STU4V($ ))+'-'I'I$k1k6U6UVW6X($ %%'(,UG(<%!2!G!G$k1n6R6R"J (.'I'I"KK4S4STU4V($  7 ")!=!=$k1n6R6R"J (.'I'I"KK4S4STU4V($ %/y8^_e^fg  
 &">">! $4#8#8#: +LCa	,b,F Gb,F s   TY,,Y6grad_outputdtensor.DTensorprevious_specoriginal_dtypebackward_dtypec           	      t   ||| j                   j                  k7  r| j                   j                  |      }t        | j                  j
                  | j                  j                  t        | j                  | j                         |      | j                  j                        }t        |j
                  |j                  |j                  |j                        }n| j                   }| j                  }g }t        |j                  |j                        D ][  \  }}	t        |      s|j                         r*|	j                         r|j!                  t#                      K|j!                  |	       ] t        |j
                  t%        |      |j                  |j                        }t'        ||||      }
|
j                  |k7  r|
j                  |      }
t        |j
                  t%        |      t        | j                  | j                         |
j                        |j                        }|
|fS )am  
    Common function for redistributing a distributed tensor during backward
    and twice-backward backpropagation steps.

    Args:
        grad_output: The output gradient tensor.
        previous_spec: DTensorSpec prior to redistribution.
        original_dtype: Original output tensor dtype from forward pass (for type checking)
        backward_dtype: Desired data type for backwards output.
        async_op: whether to perform the DTensor redistribute operation
                asynchronously or not. Default: False

    Returns:
        A :class:`torch.Tensor` object.
        A :class:`DTensorSpec` object.
    dtyper)  strider  rH   r   r2  r  )r   r2  r  )r  )r2  r  )_local_tensorr  tor   _specrw   r   r   r)  r  r  r2  rk  r   r:   r9   rY   r   rC   r  )r  r  r  r  r  r  r  normalized_placementsr}  r~  outputrd  s               r   _redistribute_backwardr    s   . !n8Q8Q8W8W&W"0033.3I""".."((33"!''"))+$	 .9->->-_-_

 $**$//$00-:-[-[	
 #00"(( .0|668P8PQ 17#w';';'=6CTCTCV!((5!((0	1  !!./!--)6)W)W	M '	F ||~%>*!!#$##%%',,

 *7)W)W	D 4<r2   c                       e Zd Ze	 	 	 ddddedeedf   dedej                  dz  d	ej                  dz  fd
       Z
edd       Zy)RedistributeNinputr  rw   r   .r  forward_dtyper  c           	      t   || _         || _        |j                  j                  | _        |||j                  j                  k7  rx|j                  j                  |      }t        ||j                  j                  t        |j                  |j                         |      |j                  j                        }n|j                  }|j                  }|| _        |j                  |k7  r)t        |||j                        }	t        |||	|d      }
n|}
|}	t!        j"                  |
|	|j$                        S )Nr  r  r  )r2  T)r  r  requires_grad)r  r  r  r  r  r  r   r  r   r   r)  r  r  r  r2  r  dtensorDTensorr  )ctxr  rw   r   r  r  r  r  r  r  r  s              r   forwardzRedistribute.forward+  s)     +"0066$%:M:M:S:S)S ..111FL&  ;;11&++ <<>'
 271]1]	L !..L ;;L'""j0%Z\5M5MK /! F "F&K --
 	
r2   c                     | j                   }t        j                  ||| j                  | j                  | j
                        }|d d d d d fS rd   )r  NestedRedistributeapplyr  r  r  )r  r  r  output_dtensors       r   backwardzRedistribute.backwardf  sZ    ((+11LL
 
 	
r2   FNN)r  r  )r>   r?   r@   r  r   rC   r   r   rx   r  r  r  rE   r2   r   r  r  *  s     ,0-18
 !8
  	8

 )S.)8
 8
 {{T)8
 d*8
 8
t 
 
r2   r  c                       e Zd ZdZe	 	 	 ddddededej                  dz  dej                  dz  f
d	       Z	edd
       Z
y)r  a  
    This class is used to make the redistribution of a DTensor twice-differentiable.
    This is called during the `Redistribute.forward`.
    Therefore, `NestedRedistribute.forward` is called during the first backward pass,
    and `NestedRedistribute.backward` is called during the second backward pass.

    Note: `NestedRedistribute.backward` is not differentiable, and therefore triple
    backward is not yet supported.
    Nr  r  r  r  r  r  c                     || _         |j                  j                  | _        |xs | j                  | _        t        ||| j                  ||      \  }}|| _        t        j                  |||j                        S )Nr  )
r  r  r  r  r  r  r  r  r  r  )r  r  r  r  r  r  r  rd  s           r   r  zNestedRedistribute.forward  s      (66<<+As/A/A-
   %33
 	
r2   c                     | j                   }| j                  }| j                  xs | j                  }t        j                  ||||| j                        }|d d d d d fS rd   )r  r  r  r  r  r  )r  grad2_outputr  r  r  r  s         r   r  zNestedRedistribute.backward  so    ((<<++As/A/A+11
 
 	
r2   r  )r  r  )r>   r?   r@   rM   r  r   r   rx   r  r  r  rE   r2   r   r  r  z  s      ,0-1
 '
 #	

 
 {{T)
 d*
 
@ 
 
r2   r  )TrL   rd   )NNF)Wr  r  rh  loggingr   r   collectionsr   collections.abcr   	functoolsr   typingr   rx   )torch.distributed._functional_collectivesry   _functional_collectivesr  torch.distributed.tensor._apitensor_apir  r   *torch.distributed.tensor._collective_utilsr	   &torch.distributed.tensor._dtensor_specr
   r   r   r   r   torch.distributed.tensor._utilsr   $torch.distributed.tensor.device_meshr   (torch.distributed.tensor.placement_typesr   r   r   r   r   r   torch.typesr   torch.utils._debug_moder   	getLoggerr>   r   r   r   rB   r   contextmanagerr   r#   r  r(   rG   r   r  rA   ra   rC   ru   r   rT   r   rD   r   r   r   ReferenceTyper   r   r   r  r  Tensorr  r  r  autogradFunctionr  r  rE   r2   r   <module>r     s         # $    : : / / E Q  J ;  $ 9 
		8	$ 48 #TD[ 7 6; ,d : 98d 98 98x A$ A A8 d$/    0 F d$/n  04.5".5Y.5 3S	>*.5 
	.5b
!&sCx$B?
?!&sCx?$?0 AD E#uS#X";<= E99S#X9 )S.)9 )S.)	9
 9 9 9 
9xp.)pp )S.)p )S.)	p
 
.2
23pn  	'



+Z
78 " %%% "%4
B BP .2666  $d{6 
.	6r  .2  $d{ 
.	  -1o,,oo o
 o  $d{o o \\oj *.)-Z"ZZ KK$&Z KK$&	Z
 ZzM
5>>** M
`A
00 A
r2   