
    9j              	          d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZm	Z	 d dl
mZ d dl mZ d dlmZ d dlmZmZ d dlmZ d dlmZmZmZ d d	lmZ d dlZd dlmZ d dlmZ d dlmc m Z! d d
l"m#Z# d dl$m%Z% d dl&m'Z'm(Z(m)Z* d dl+m,Z, d dl-m.Z.m/Z/ d dl0m1Z1 d dl2m3Z3m4Z4m5Z5 d dl6m7Z7 d dl8m9Z9m:Z:m;Z; d dl<m=Z=m>Z>m?Z? d dl@mAZAmBZBmCZCmDZD d dlmEZEmFZF d dlGmHZI d dlJmKZKmLZLmMZMmNZNmOZO d dlPmQZQmRZRmSZSmTZTmUZUmVZVmWZW d dlXmYZY eVr) eZd e[dej                  j                                     Z^ndZ^eTrdZ_dZ`ej                  j                         Z^n.eUrdZ_dZ`n'eWrd Z_d!Z`ej                  j                         Z^nd"Z_d#Z`d$Z^ G d% d&e      Zb G d' d(e      Zc G d) d*ej                  e      Zed+ej                  d,ej                  d-efd.Zgd/ Zh	 	 dd+ej                  d0eifd1Zjdd2Zkd3 Zld4 Zmdd+ej                  d5eifd6Znd+ej                  d7eifd8Zod+ej                  d9eifd:Zp G d; d<      Zq G d= d>ee      Zr G d? d@ee      Zs G dA dBes      Zt G dC dDes      Zu G dE dFee      Zv G dG dHev      Zw G dI dJej                        Zx G dK dLes      Zy G dM dNej                        Zz G dO dPej                        Z| G dQ dRej                        Z}e j                  dSefdT       Ze j                  dUefdV       Ze j                  dWefdX       Ze j                  dYefdZ       Ze j                  d[efd\       Zee j                  d]efd^              Zee j                  d_efd`              Zee j                  daefdb              Zee j                  dcefdd              Zdeed-edfedgefdhZ	 ddiej                  djej                  dkeedlf   fdmZ ej                  eWdn       G do dpeM             Z G dq dr      Z G ds dteeL      Z G du dveeK      Zddwedz  fdxZ G dy dzej                        Z G d{ d|ej                        Z G d} d~ej                        Zy)    N)ABCabstractmethod)Callable)nullcontext)deepcopy)autoEnumwraps)Anycastno_type_check)mock)
checkpoint)
DeviceMesh)
CPUOffloadfully_shardFullyShardedDataParallel)TrainingState)FSDPParamGroupRegisterPostBackwardFunction)#NO_RESHARD_AFTER_FORWARD_STRATEGIES)BackwardPrefetchMixedPrecisionShardingStrategy)ShardedGradScaler)always_wrap_policyModuleWrapPolicywrap)distribute_tensorDTensorShard)ColwiseParallelparallelize_moduleRowwiseParallelSequenceParallel)TransformerDecoderLayerTransformerEncoderLayer)DistributedDataParallel)MultiProcContinuousTestMultiProcessTestCaseMultiThreadedTestCaserun_subtests
TEST_SKIPS)FILE_SCHEMAget_cycles_per_msset_rng_seed	TEST_CUDATEST_HPUTEST_WITH_ROCMTEST_XPU)
has_triton      cudancclzhpu:0hcclxpuxcclcpugloo   c                   (    e Zd Z e       Z e       Zy)FSDPInitModeN)__name__
__module____qualname__r   NO_FSDP	RECURSIVE     c/media/conek/DATA/Code/OCR/venv/lib/python3.12/site-packages/torch/testing/_internal/common_fsdp.pyrB   rB   ^   s    fGIrI   rB   c                   6    e Zd Z e       Z e       Z e       Zy)DEVICEInitModeN)rC   rD   rE   r   DEVICE_BEFOREDEVICE_AFTERDEVICE_NEVERrH   rI   rJ   rL   rL   g   s    FM6L6LrI   rL   c                       e Zd ZdZedeej                  df   fd       Zedej                  fd       Z	edd       Z
eeded	edej                  fd
              Zy)FSDPTestModelzZThis defines the interface expected from all models used commonly for
    FSDP unit tests.return.c                      y)z+Returns an input for the model as as tuple.NrH   selfdevices     rJ   	get_inputzFSDPTestModel.get_inputt        	rI   c                      y)z,Returns the loss given the input and output.NrH   )rU   inputoutputs      rJ   get_losszFSDPTestModel.get_lossy   rX   rI   Nc                      y)z<Runs the backward pass (e.g. including ``loss.backward()``).NrH   rU   losss     rJ   run_backwardzFSDPTestModel.run_backward~   rX   rI   argskwargsc                       y)z&Initializes an instance of this model.NrH   )ra   rb   s     rJ   initzFSDPTestModel.init   s     	rI   rR   N)rC   rD   rE   __doc__r   tupletorchTensorrW   r\   r`   staticmethodr   nnModulerd   rH   rI   rJ   rQ   rQ   p   s     5s):#;        C 3 299   rI   rQ   modelprocess_group	assert_fnc                 T   | j                         D cg c]%  \  }}||j                         j                         f' }}}|| j                         D cg c]%  \  }}||j                         j                         f' c}}z  }t	        j
                  |      }t        |      D 	cg c]  }	d }
}	t	        j                  |
||       |
d   }|t        d      |
dd D ]4  }|t        d      t        ||d      D ]  \  \  }	}\  }	} |||        6 yc c}}w c c}}w c c}	w )	a  
    All-gathers module states across ranks and calls ``assert_fn`` on each pair
    of corresponding states from rank 0 and a nonzero rank. For example, if
    ``assert_fn`` is ``self.assertEqual()``, then this checks that all module
    states are equal across ranks.
    Ngroupr   z$Expected rank0_states to not be Noner@   zExpected state to not be NoneTstrict)
named_parametersdetachr>   named_buffersdistget_world_sizerangeall_gather_objectAssertionErrorzip)rm   rn   ro   
param_nameparamnamed_module_statesbuffer_namebuffer
world_size_olistrank0_statesstatep1p2s                  rJ   _assert_module_statesr      sC    "'!7!7!9J 
U\\^'')*  #(#6#6#8K 
fmmo))+,  $$]3J ,-aT-E-5"5]K8LCDDqr = !@AA #L% E 	GQWab"	
 .s   *D*D'	D%c                  4    t        j                  t              S N)rh   rV   DEVICE_TYPErH   rI   rJ   get_devtyper      s    <<$$rI   zero_buffersc                    |rt        j                  |       n	t               }|5  | j                         D ]/  }t	        j
                         5  |j                          ddd       1 |rB| j                         D ]/  }t	        j
                         5  |j                          ddd       1 ddd       y# 1 sw Y   xY w# 1 sw Y   PxY w# 1 sw Y   yxY w)zBZeros the parameters and optionally buffers of ``model`` in place.N)FSDPsummon_full_paramsr   
parametersrh   no_gradzero_buffers)rm   r   summon_fullctxr   r   s         rJ   _zero_modelr      s     -8$
!
!%
([]C	 #%%' 	E  	 --/ #]]_ #LLN# ### # # ## #s;   (CB43CC !
C4B=9C C	CCc                 t    |s| j                  t              } |r| j                          | j                         S r   )tor   half
state_dict)rm   cpu_offloadr   s      rJ   _get_state_dictr      s.    %

rI   c           	      j    dj                  |D cg c]  }|| t        |         nd c}      S c c}w )Nr   none)joinstr)test_name_mappingra   ss      rJ   subtest_namer      s7    88IMNAam	3q6	"	?N Ns   0c                 x   |j                         D ];  \  }}|j                  t        j                  d      k7  s)|j                         ||<   = | dk(  r|nd g}t	        j
                  |       t        t        t        t        j                  f   |d         }|D ]  }||   j                  t              ||<    |S )Nr>   r   )itemsrV   rh   r>   rx   broadcast_object_listr   dictr   ri   r   r   )rankr   r~   r   r   s        rJ   _broadcast_state_dictr      s     (--/ 1
E<<5<<..%*YY[Jz"1  19Z$/Eu%d3,-uQx8J  H
!+J!7!:!:;!G
:HrI   recursec                     t        j                  | |      5  t        t        | j	                                     cddd       S # 1 sw Y   yxY w)a[  
    Returns the full unsharded parameters of ``model``. Any FSDP-managed
    parameters offloaded to CPU are moved to GPU in the returned list.

    Args:
        recurse (bool): If ``False``, only unshards the parameters immediate to
            ``model``; if ``True``, recurses through the module hierarchy
            rooted at ``model``.
    )r   N)r   r   r   listr   )rm   r   s     rJ   get_full_paramsr      s?     
	 	 	8 2U--/012 2 2s   "AAmove_to_devicec                 4    |r| j                  t              S | S r   )r   r   )rm   r   s     rJ   _move_to_devicer      s    $2588K ==rI   	wrap_fsdpc                 (    |s| S t        | g|i |S r   r   )rm   r   ra   rb   s       rJ   _maybe_wrap_fsdpr      s    !5CtE'CD'CF'CCrI   c                   :    e Zd ZdedefdZdefdZdefdZd Zy)	DummyProcessGroupr   sizec                      || _         || _        y r   )_rank_size)rU   r   r   s      rJ   __init__zDummyProcessGroup.__init__   s    

rI   rR   c                     | j                   S r   )r   rU   s    rJ   r   zDummyProcessGroup.rank       zzrI   c                     | j                   S r   )r   r   s    rJ   r   zDummyProcessGroup.size   r   rI   c                 B    t        j                         }d }||_        |S )Nc                  d    t         j                  j                         } | j                  d       | S )Nr@   )rh   futuresFuture
set_result)futures    rJ   
get_futurez/DummyProcessGroup.allreduce.<locals>.get_future  s'    +0==+?+?+AFa MrI   )r   Mockr   )rU   ra   rb   	dist_waitr   s        rJ   	allreducezDummyProcessGroup.allreduce  s"    IIK		
  *	rI   N)rC   rD   rE   intr   r   r   r   rH   rI   rJ   r   r      s2    S  c c 	rI   r   c                        e Zd Zdej                  dededef fdZd Zd Z	d Z
d	 Ze	 	 	 ddej                  dededeeef   d
z  dededej$                  ez  fd       Zd Z xZS )TransformerWithSharedParamsrr   device_init_modeadd_bndeterministicc                    t         |           |j                         | _        |j                         | _        |rt        j                  d       d}d}t        j                  ||      | _	        t        j                  |dddd      | _        t        j                  ||      | _        | j                  j                  | j                  _        | j                  d| j                  j                  j!                  |f             | j                  d	t        j"                  | j$                  t
        j&                  
             d| _        |r)t
        j                  j+                  | j(                        nt
        j                  j-                         | _        |t0        j2                  k(  r| j5                  t6              } |r| j9                          y y )Nr         r8      g?)d_modelnum_encoder_layersnum_decoder_layersdim_feedforwarddropout
vocab_biaslong_buffer)dtype)superr   r   r   r   rh   manual_seedrk   	Embeddingembed_tokensTransformertransformerLinearoutput_projweightregister_buffernew_ones
zeros_liker   longbsBatchNorm1dIdentitybnrL   rM   r   r   eval)rU   rr   r   r   r   d_vocabr   	__class__s          rJ   r   z$TransformerWithSharedParams.__init__  sa    	JJL	**,a LL':>>  
 99Wg6 #'"3"3":":$++22;;WJG	
 	T__EJJ?	

 39%((&&tww/uxx?P?P?R~;;;77;'DIIK rI   c                 ,   t        j                  d| j                  z          t        j                  d|      j	                  d| j
                        }t        j                  | j
                  dz  |      j	                  d| j
                        }||fS )Nr@      rV      r7   )rh   r   r   arangeviewr   )rU   rV   srctgts       rJ   rW   z%TransformerWithSharedParams.get_input8  sl    !dii-(ll2f-221dgg>ll477Q;v6;;AtwwGSzrI   c                    | j                  |      }|| j                  z   | j                  j                  |      z   }| j                  |      }| j	                  |      }| j                  ||      }| j                  |      S r   )r   r   r   type_asr   r   r   )rU   src_idstgt_idsr   r   xs         rJ   forwardz#TransformerWithSharedParams.forward>  sv    (DOO#d&6&6&>&>s&CC(ggclS#&""rI   c                     |\  }}t         j                  j                  |j                  d|j	                  d            |j                  d      d      S )Nsum)	reduction)rk   
functionalcross_entropyr   r   )rU   rZ   r[   r   r   s        rJ   r\   z$TransformerWithSharedParams.get_lossF  sI    3}}**KKFKKO,chhrle + 
 	
rI   c                 $    |j                          y r   backwardr^   s     rJ   r`   z(TransformerWithSharedParams.run_backwardL      rI   Nfsdp_init_modefsdp_kwargsrR   c                 N   |i }|t         j                  k(  r&t        | t              r| d   }n| }t	        ||||      S |t         j
                  k(  rd|vrt        t        t        h      }n|j                  d      }d|v r8|d   t        j                  t        j                  hv rt        | t              sd}n| }t        | t              r| d   }	n| }	t	        |	|||      }
t        |
|fd|i|}|t        j                  k(  r|j!                  t"              }|S t%        d|       )au  
        Initializes a :class:`TransformerWithSharedParams` instance.

        Args:
            fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
                any modules with FSDP. If ``RECURSIVE``, then wraps with
                top-level FSDP. By default, the top-level FSDP uses the
                ``ModuleWrapPolicy`` for encoder and decoder layers, but a
                different auto wrap policy may be specified via
                ``fsdp_kwargs``.
            device_init_mode (DEVICEInitMode): Determines model movement to DEVICE.
            fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
                forwarded to the FSDP constructor.
            deterministic (bool): Whether to make the model deterministic
                across constructions.
            add_bn (bool): Whether to include batch norm in the model.
        Nr   auto_wrap_policysharding_strategyUnsupported FSDP init mode: )rB   rF   
isinstancerg   r   rG   r   r(   r'   popr   HYBRID_SHARD_HYBRID_SHARD_ZERO2r   rL   rN   r   r   
ValueError)rr   r  r   r  r   r   pgr  fsdp_pg
tformer_pgm
fsdp_models               rJ   rd   z TransformerWithSharedParams.initO  sX   6 K\111%'1X.$fm  |555!4#3//$  $/??3E#F  ${2 34$113C3W3WXY"5%0%'"1X
"
+,fmA  "2 	J  >#>#>>']];7
77GHIIrI   c                     | j                   gS r   )r   r   s    rJ   get_ignored_modulesz/TransformerWithSharedParams.get_ignored_modules  s      !!rI   )NFT)rC   rD   rE   rx   ProcessGrouprL   boolr   rW   r   r\   r`   rj   rB   r   r   r   rk   rl   r   rd   r  __classcell__r   s   @rJ   r   r     s    (  ( )( 	(
 (T#
 
 .2#KJ  KJ$KJ )KJ #s(^d*	KJ
 KJ KJ 
T	KJ KJZ"rI   r   c                        e Zd Zdej                  dededef fdZd Zd Z	d Z
d	 Ze	 	 ddej                  dededeeef   d
z  dedej$                  fd       Z xZS )NestedWrappedModulerr   r   r   r   c                    t         |           j                         | _        j                         | _        |t
        j                  k(  }fd}|rt        j                  d       t        j                  t        t        j                  dd      |       |t        j                   |t        t        j                  dd      |            t        t        j                  dd      |                   |t        t        j                  dd      |            t        t        j                  dd      |            | _        y )Nc                 &    rt        | fi S | S r   r   layerr  rr   r   s    rJ   _maybe_wrapz1NestedWrappedModule.__init__.<locals>._maybe_wrap      E58K88LrI   r   r   r7   r   )r   r   r   r   r   rL   rM   rh   r   rk   
Sequentialr   r   module	rU   rr   r   r   r   r  r   r'  r   s	    ``  `  rJ   r   zNestedWrappedModule.__init__  s     	JJL	**,)^-I-II	
 a mmBIIaO^<		!R0@. QR#BIIb"$5~F 		"a(8.IJBIIaO^<

rI   c                 x    t        j                  d| j                  z          t        j                  dd|      fS )Nr@   r7   r   r   )rh   r   r   randrT   s     rJ   rW   zNestedWrappedModule.get_input  s.    !dii-(

1a/11rI   c                 $    | j                  |      S r   r*  rU   r   s     rJ   r   zNestedWrappedModule.forward      {{1~rI   c                 &    |j                         }|S r   )r  rU   rZ   r[   r_   s       rJ   r\   zNestedWrappedModule.get_loss  s    zz|rI   c                 $    |j                          y r   r  r^   s     rJ   r`   z NestedWrappedModule.run_backward  r
  rI   Nr  r  rR   c                    |i }|t         j                  k(  rt        | d||      S |t         j                  k(  r:t        | fd||d|}|t        j
                  k(  r|j                  t              }|S t        d|       )a  
        Initializes a :class:`NestedWrappedModule` instance.

        Args:
            fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
                any modules with FSDP. If ``RECURSIVE``, then wraps some nested
                modules with FSDP but not the top-level module. The model may
                later be wrapped with a top-level FSDP external to this method
                if desired.
            device_init_mode (DEVICEInitMode): Determines model movement to DEVICE.
            fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
                forwarded to the FSDP constructor.
            deterministic (bool): Whether to make the model deterministic
                across constructions.
        Fr   r   r   Tr  )	rB   rF   r"  rG   rL   rN   r   r   r  )rr   r  r   r  r   r  s         rJ   rd   zNestedWrappedModule.init  s    . K\111&!1+	  |555,!1+	
 J  >#>#>>']];7
77GHIIrI   NF)rC   rD   rE   rx   r  r  rL   r   rW   r   r\   r`   rj   rB   r   r   r   rk   rl   rd   r  r   s   @rJ   r"  r"    s    
  
 
 )	

 
@2 
 .2#+J  +J$+J )+J #s(^d*	+J
 +J 
+J +JrI   r"  c                   h     e Zd Ze	 	 ddej
                  dededee	e
f   dz  def
 fd       Z xZS )	AlwaysWrapNestedWrappedModuleNrr   r  r   r  r   c                 0   t         t        t          	 | t        j                  |||      }|t        j                  k(  r|S |t        j
                  k(  rB|xs i }t        |fdt        i|}|t        j                  k(  r|j                  t              }|S y)z
        Initializes a :class:`NestedWrappedModule` instance, but unlike
        :meth:`NestedWrappedModule.init`, for the ``RECURSIVE`` init mode, this
        wraps with top-level FSDP and the ``always_wrap_policy()`` auto wrap
        policy.
        )rr   r  r   r  r   r  N)r   r9  rd   rB   rF   rG   r   r   rL   rN   r   r   )rr   r  r   r  r   rm   r  r   s          rJ   rd   z"AlwaysWrapNestedWrappedModule.init   s     )+H
'//-#'  
 	 \111L|555%+KeX6HXKXJ>#>#>>']];7
 6rI   r7  )rC   rD   rE   rj   rx   r  rB   rL   r   r   r   r  rd   r  r   s   @rJ   r9  r9    s^    
 .2#  $ ) #s(^d*	
  rI   r9  c                        e Zd Zdej                  dededef fdZedd       Z	e	 	 ddej                  de
ded	eeef   dz  def
d
       Z xZS )NonUniformReqGradNWMrr   r   r   r   c                    t         t        |           j                         | _        j	                         | _        |t        j                  k(  }fd}|rt        j                  d       t        j                  t        t        j                  dd      |       |t        j                   |t        t        j                  dd      |            t        t        j                  dd      |                   |t        j                  t        t        j                  dd      |      t        t        j                  dd      |                        | _        y )Nc                 &    rt        | fi S | S r   r   r%  s    rJ   r'  z2NonUniformReqGradNWM.__init__.<locals>._maybe_wrap5  r(  rI   r   r   r7   r   )r   r"  r   r   r   r   rL   rM   rh   r   rk   r)  r   r   r*  r+  s	    ``  `  rJ   r   zNonUniformReqGradNWM.__init__"  s     	!413 JJL	**,)^-I-II	
 a mmBIIaO^<		!R0@. QR#BIIb"$5~F #BIIb!$4nE#BIIaO^D
rI   Nc                     | j                         D ]-  \  }}t        j                  ||      r|j                  d       / y r7  )ru   rematchrequires_grad_)rm   req_grad_masknps       rJ   _set_nonuniform_req_gradz-NonUniformReqGradNWM._set_nonuniform_req_gradL  s:    **, 	(DAq88M1-  '	(rI   r  r  c                    t        j                  d      }|t        j                  k(  r't	        | d||      }t        j                  ||       |S |t        j                  k(  rT|i }t	        | fd||d|}|t        j                  k(  r|j                  t              }t        j                  ||       |S t        d|       )a  
        Initializes a :class:`NestedWrappedModule` instance, but unlike
        :meth:`NestedWrappedModule.init`, it wraps a second :class:`torch.nn.Sequential`
        container to enable the desired non-uniform ``requires_grad``
        ``use_orig_params=True`` tests. For both ``RECURSIVE`` and ``NO_FSDP``
        init modes, freezes all parameters except the last two to validate
        ``ShardedGradScaler`` support for ranks with no (non-zero sized) local shards in
        FSDP ``use_orig_params=True`` mode.
        zmodule\.2.*\.1.*Fr6  Tr  )r@  compilerB   rF   r<  rF  rG   rL   rN   r   r   r  )rr   r  r   r  r   req_grad_pattern	ddp_modelr  s           rJ   rd   zNonUniformReqGradNWM.initR  s    ( ::&9:\111,!1+	I !99)EUV|555" -!1+	
 J  >#>#>>']];7
 99*FVW77GHIIrI   re   r7  )rC   rD   rE   rx   r  r  rL   r   rj   rF  rB   r   r   r   rd   r  r   s   @rJ   r<  r<  !  s    (
  (
 (
 )	(

 (
T ( (
 
 .2#+J  +J$+J )+J #s(^d*	+J
 +J +JrI   r<  c                        e Zd ZdZdej
                  dedef fdZd Zd Z	d Z
d	 Zed
ee   dedededef
d       Z xZS )ModuleWithDelayzThis class wraps a :class:`FSDPTestModel` to optionally add a delay
    after computing the loss and/or before the gradient reduction.r*  delay_after_loss_msdelay_before_reduction_msc                 L    t         |           || _        || _        || _        y r   )r   r   rM  rN  r*  )rU   r*  rM  rN  r   s       rJ   r   zModuleWithDelay.__init__  s'     	#6 )B&rI   c                 8    | j                   j                  |      S r   )r*  rW   rT   s     rJ   rW   zModuleWithDelay.get_input  s    {{$$V,,rI   c                 $    | j                  |      S r   r/  r0  s     rJ   r   zModuleWithDelay.forward  r1  rI   c                 B   | j                   j                  ||      }| j                  dkD  rst        st        r$t        j                  | j                  dz         |S t        r=t        j                  j                  t        | j                  t               z               |S Nr     )r*  r\   rM  r3   r5   timesleepr2   rh   r9   _sleepr   r0   r3  s       rJ   r\   zModuleWithDelay.get_loss  sy    {{##E62##a'8

433d:;  

!!#d&>&>ARAT&T"UVrI   c                      t         j                  j                   fd}t        j                  d|      5   j
                  j                  |       d d d        y # 1 sw Y   y xY w)Nc                     j                   dkD  rrt        r>t        j                  j	                  t        j                   t               z               n.t        st        r"t        j                  j                   dz          | i |S rS  )rN  r2   rh   r9   rW  r   r0   r3   r5   rU  rV  )ra   rb   orig_reduce_scatterrU   s     rJ   _delayed_reduce_scatterz=ModuleWithDelay.run_backward.<locals>._delayed_reduce_scatter  sk    --1JJ%%D::=N=PPQ JJt==DE&777rI   z'torch.distributed.reduce_scatter_tensor)rh   distributedreduce_scatter_tensorr   patchr*  r`   )rU   r_   r[  rZ  s   `  @rJ   r`   zModuleWithDelay.run_backward  sW    #//EE	8 ZZ57N
 	+ KK$$T*	+ 	+ 	+s   AA'module_class
model_argsmodel_kwargsc                <    t         | j                  |i |||      S )aA  
        Args:
            module_class (Type[FSDPTestModel]): Wrapped module class to which
                to add delays.
            model_args: Positional arguments forwarded to the ``module_class``
                ``init()``.
            delay_after_loss_ms (int): Delay after computing the loss/before
                the optimizer step (in ms).
            delay_before_reduction_ms (int): Delay before reduce-scattering
                gradients (in ms).
            model_kwargs: Keyword arguments forwarded to the ``module_class``
                ``init()``.
        )rL  rd   )r_  rM  rN  r`  ra  s        rJ   rd   zModuleWithDelay.init  s,    * Lz:\:%
 	
rI   )rC   rD   rE   rf   rk   rl   r   r   rW   r   r\   r`   rj   typerQ   r   rd   r  r   s   @rJ   rL  rL    s    F				 !	 $'		-+$ 
=)

 !
 $'	

 
 
rI   rL  c                   ~    e Zd Zeej
                  ddddfdej                  dedede	e
ef   dz  ded	ed
efd       Zy)NestedWrappedModuleWithDelayNFr   rr   r  r   r  r   rM  rN  c           
      D    t         j                  t        | ||||||      S )Nrr   r  r   r  r   rM  rN  )rL  rd   r"  rg  s          rJ   rd   z!NestedWrappedModuleWithDelay.init  s4     ##)-#' 3&? $ 	
 		
rI   )rC   rD   rE   rj   rL   rN   rx   r  rB   r   r   r   r  r   rd   rH   rI   rJ   re  re    s     ,:+F+F-1##$)*
  
$
 )
 #s(^d*	

 
 !
 $'
 
rI   re  c                   $     e Zd Z fdZd Z xZS )DummyDDPc                 0    t         |           || _        y r   )r   r   r*  )rU   r*  r   s     rJ   r   zDummyDDP.__init__  s    rI   c                 &     | j                   |i |S r   r/  rU   ra   rb   s      rJ   r   zDummyDDP.forward  s    t{{D+F++rI   rC   rD   rE   r   r   r  r   s   @rJ   ri  ri    s    ,rI   ri  c                        e Zd Zdej                  dedededef
 fdZd Z	d Z
e	 	 	 ddej                  d
ededeeef   d	z  dedefd       Z xZS )MixtureOfExpertsrr   r   r   delay_before_free_msr   c                    t         |   ||||       || _        || _        || _        |t
        j                  k(  | _        |r"t        j                  d| j                  z          d}d}d}	t        t        j                  ||      | j                        }
t        d |
j                         D              | _        |
j                         D ]	  }d|_         |rt        j                  d       t        t        j                  ||      | j                        }|rHt        j$                  j'                  |j                         g      }t)        |
|fi |}
t)        ||fi |}t        j*                  t        t        j                  |	|      | j                        ||
t        t        j                  ||	      | j                              | _        y )	N)rr   r   r   r   *   r   r   r   c              3   <   K   | ]  }|j                           y wr   )numel).0rE  s     rJ   	<genexpr>z,MixtureOfExperts.__init__.<locals>.<genexpr>
  s     $L1QWWY$L   Tr   )r   r   rr   rp  r   rL   rM   r   rh   r   r   r   rk   r   r  r   num_expert_paramsexpertr\  	new_groupr   r)  r*  )rU   rr   r   r   rp  r   r  d_expertd_sharedd_inputry  rE  sharedexpert_groupr   s                 rJ   r   zMixtureOfExperts.__init__  s    	-'	 	 	
 
$8!"..2N2NNb499n- 8X!>@S@ST!$$L8I8I8K$L!L""$ 	AAH	 a  8X!>@S@ST ,,66L &,>+>F&%7;7FmmBIIgx8$:M:MNBIIh8$:M:MN	
rI   c                 f     j                   dkD  r j                  d   }t        |t              ret        j
                  j                  j                  j                   fd}t        j                  d|      5   j                  |      cd d d        S  j                  |      S # 1 sw Y   xY w)Nr   r8   c                      t         r>t        j                  j                  t	        j
                  t               z               n.t        st        r"t        j                  j
                  dz          | i |S )NrT  )r2   rh   r9   rW  r   rp  r0   r3   r5   rU  rV  )ra   rb   orig_reshardrU   s     rJ   _delayed_reshardz2MixtureOfExperts.forward.<locals>._delayed_reshard)  s]     

)) 9 9<M<O OP "X

4#<#<t#CD'888rI   z.torch.distributed.fsdp._runtime_utils._reshard)rp  r*  r  r   rh   r\  fsdp_runtime_utils_reshardr   r^  )rU   r   ry  r  r  s   `   @rJ   r   zMixtureOfExperts.forward#  s    $$q([[^F&$'$0055DDMM9 ZZDFV *  ;;q>* *
 {{1~* *s   ;B''B0c                    |j                          | j                  st        j                         5  | j	                         D ]v  }t        |d      r|j                  |j                  j                  | j                         t        j                  j                  |j                  | j                         x 	 d d d        y y # 1 sw Y   y xY w)Nry  rq   )r	  r   rh   r   r   hasattrgraddiv_r   r\  
all_reducerr   )rU   r_   rE  s      rJ   r`   zMixtureOfExperts.run_backward;  s    ~~ O* OAq(+ vv)DOO4))44QVV4::4NOO O O Os   -CACCNr  r  c                    |i }|t         j                  k(  rt        | d|||      S |t         j                  k(  r;t        | fd|||d|}|t        j
                  k(  r|j                  t              }|S t        d|       )a  
        Initializes a :class:`MixtureOfExperts` instance.

        Args:
            fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
                any modules with FSDP. If ``RECURSIVE``, then wraps some nested
                modules with FSDP, including the expert and shared layers, but
                not the top-level module. The model may later be wrapped with a
                top-level FSDP external to this method if desired.
            device_init_mode (DEVICEInitMode): Determines model movement to DEVICE.
            fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
                forwarded to the FSDP constructor.
            deterministic (bool): Whether to make the model deterministic
                across constructions.
            delay_before_free_ms (int): Delay before resharding expert
                parameters in the forward pass (in ms).
        F)r   r   rp  r   Tr  )	rB   rF   ro  rG   rL   rN   r   r   r  )rr   r  r   r  r   rp  r  s          rJ   rd   zMixtureOfExperts.initG  s    4 K\111#!1%9+  |555)!1%9+ J  >#>#>>']];7
77GHIIrI   )NFr   )rC   rD   rE   rx   r  r  rL   r   r   r   r`   rj   rB   r   r   r   rd   r  r   s   @rJ   ro  ro    s    2
  2
 2
 )	2

 "2
 2
h0
O 
 .2#$%0J  0J$0J )0J #s(^d*	0J
 0J "0J 0JrI   ro  c                        e Zd Z	 ddddddedej
                  dz  ded	ed
ef
 fdZdej                  dej                  fdZ	d Z
 xZS )MLPNTFr7   )biaswith_bufferdim_multiplierdimrV   r  r  r  c                
   t         |           t        j                  |||z  ||      | _        t        j                  ||z  |||      | _        |r)| j                  dt        j                  |f|             y d | _	        y )N)rV   r  r   r   )
r   r   rk   r   in_projout_projr   rh   randnr   )rU   r  rV   r  r  r  r   s         rJ   r   zMLP.__init__|  so     	yyns&:6PTU		.3"6FQUV  5;;vf+MNDKrI   r   rR   c                     | j                  |      }t        j                  |      }| j                  |      }t        j                  |      }| j                  || j                  z   }|S r   )r  Frelur  r   )rU   r   zs      rJ   r   zMLP.forward  sS    LLOFF1IMM!FF1I;;"DKKArI   c                     | j                   4t        j                  j                  j	                  | j                          y y r   )r   rh   rk   rd   normal_r   s    rJ   reset_parameterszMLP.reset_parameters  s+    ;;"HHMM!!$++. #rI   r   )rC   rD   rE   r   rh   rV   r  r   ri   r   r  r  r   s   @rJ   r  r  {  sv     '+
 ! t#
   " %,, /rI   r  c                   F     e Zd Zdddedef fdZdededed	d fd
Z xZS )MLPStackF)with_seq_parallelmlp_dimr  c                    t        |d      t        |      t        |d      g}|r&|j                  t        j                  |d             t	        |   |  || _        y )N   )r  Fr  )r  appendrk   	LayerNormr   r   r  )rU   r  r  modulesr   s       rJ   r   zMLPStack.__init__  sX     *L*	$
 NN2<<e<='"!2rI   tp_meshdp_meshuse_activation_checkpointingrR   c           
         t        d      t        d      t        d      t        d      t        d      | j                  rt        t        d            n	t               d}| j                  rt	        d      |d<   t        | ||       | D ]8  }t        |t        j                        r|rt        |       t        |fd	|i| : t        | fd	|i| | S )
NF)use_local_outputr@   )output_layouts)z	0.in_projz
0.out_projz	1.in_projz
1.out_projz	2.in_projz
2.out_proj)sequence_dim3)device_meshparallelize_planmesh)r#   r%   r  r"   r&   r$   r  rk   r  r   r   )rU   r  r  r  r  r  r*  s          rJ   parallelizezMLPStack.parallelize  s     )%@)5A(%@)5A(%@%% *qB "
 !!$4!$DS!4WGWX 	=F&",,/+6"<W<<	= 	D6w6+6rI   )	rC   rD   rE   r   r  r   r   r  r  r   s   @rJ   r  r    sD    BG 
3 
34 
3  '+	 
rI   r  c                        e Zd ZdZddedef fdZdej                  de	ej                  ej                  f   ej                  z  fdZ
 xZS )	DoubleLinearz
    This can be used for returning multiple outputs from a module
    (``use_second_linear=True``) or for having an unused module (``False``).
    r  use_second_linearc                     t         |           t        j                  ||      | _        t        j                  ||      | _        t        j                         | _        || _        y r   )	r   r   rk   r   lin1lin2ReLUr  r  )rU   r  r  r   s      rJ   r   zDoubleLinear.__init__  sG    IIc3'	IIc3'	GGI	!2rI   r   rR   c                     | j                   r@| j                  | j                  |            | j                  | j                  |            fS | j                  | j                  |            S r   )r  r  r  r  r0  s     rJ   r   zDoubleLinear.forward  sQ     !!99TYYq\*DIIdiil,CCCyy1&&rI   T)rC   rD   rE   rf   r   r  r   rh   ri   rg   r   r  r   s   @rJ   r  r    sO    
3C 3D 3''	u||U\\)	*U\\	9'rI   r  new_all_gather_into_tensorc              #      K   t         j                  }t        j                          | t         _        	 d  t        j                          |t         _        y # t        j                          |t         _        w xY wwr   )rx   all_gather_into_tensorbarrier)r  orig_all_gathers     rJ   patch_all_gatherr    sO     11OLLN"<D6&5# 	&5#   0A;A  A;!A88A;new_foreach_all_gatherc              #   :  K   t         j                  j                  j                  j                  j
                  }t        j                          | t         j                  j                  j                  j                  _        	 d  t        j                          |t         j                  j                  j                  j                  _        y # t        j                          |t         j                  j                  j                  j                  _        w xY wwr   )rh   r\  r  _fully_shard_fsdp_param_groupforeach_all_gatherrx   r  )r  orig_foreach_all_gathers     rJ   patch_foreach_all_gatherr    s      	++==PP  	LLN 
''99L
# 	++==P 	# 	++==P    B DC ADA	DDnew_foreach_reducec              #   :  K   t         j                  j                  j                  j                  j
                  }t        j                          | t         j                  j                  j                  j                  _        	 d  t        j                          |t         j                  j                  j                  j                  _        y # t        j                          |t         j                  j                  j                  j                  _        w xY wwr   )rh   r\  r  r  r  foreach_reducerx   r  )r  orig_foreach_foreach_reduces     rJ   patch_foreach_reducer    s      	++==LL   	LLN 
''99H
' 	++==L 	' 	++==Lr  new_reduce_scatter_tensorc              #      K   t         j                  }t        j                          | t         _        	 d  t        j                          |t         _        y # t        j                          |t         _        w xY wwr   )rx   r]  r  )r  rZ  s     rJ   patch_reduce_scatterr    sP     44LLN!:D9%8" 	%8"r  new_all_reducec              #      K   t         j                  }t        j                          | t         _        	 d  t        j                          |t         _        y # t        j                          |t         _        w xY wwr   )rx   r  r  )r  orig_all_reduces     rJ   patch_all_reducer    sJ     ooOLLN$DO*) 	)r  new_unshardc              #      K   t         j                  }t        j                          | t         _        	 d  t        j                          |t         _        y # t        j                          |t         _        w xY wwr   )r   unshardrx   r  )r  orig_unshards     rJ   patch_unshardr  +  Q      "))LLLN(N.!- 	!-r  new_reshardc              #      K   t         j                  }t        j                          | t         _        	 d  t        j                          |t         _        y # t        j                          |t         _        w xY wwr   )r   reshardrx   r  )r  r  s     rJ   patch_reshardr  8  r  r  new_post_backwardc              #      K   t         j                  }t        j                          | t         _        	 d  t        j                          |t         _        y # t        j                          |t         _        w xY wwr   )r   post_backwardrx   r  )r  orig_post_backwards     rJ   patch_post_backwardr  E  sR      (55LLN#4N :'9$ 	'9$r  new_backwardc              #      K   t         j                  }t        j                          | t         _        	 d  t        j                          |t         _        y # t        j                          |t         _        w xY wwr   )r   r	  rx   r  )r  orig_backwards     rJ   *patch_register_post_backward_hook_backwardr  R  sT      199MLLN,8 )>0=$- 	0=$-r  rZ  ra   rb   c                     t        |      dkD  r|d   }nd|v r|d   }nt        d| d|        ||        ||i |S )Nr   r[   z,Cannot get reduce-scatter output from
args: z	
kwargs: )lenr|   )clsrZ  ro   ra   rb   r[   s         rJ   reduce_scatter_with_assertr  _  sa     4y1}a	V	!;D6F8T
 	
 f///rI   replicated_modulesharded_moduleprefixes_to_ignore.c                    t        |j                         |j                         d      D ]  \  \  }}\  }}|}|D ]  }	|j                  |	d      } | j                  ||       | j	                  |t
               t        |t
              st        d      |j                  |j                  }}
t        |      t        d      t        d      fk(  rt        d      t        ||
|      }| j                  |j                         |j                                |j                  | j                  |j                         | j!                  |j                         t        |j                  |
|      }| j	                  |j                  t
               t        |j                  t
              st        d      | j                  |j                  j                         |j                                 y )NTrs    z&Expected sharded_param to be a DTensorr   zmFSDP's (Shard(0), Shard(0)) layout differs from distribute_tensor(), so we cannot check for equality using itz+Expected sharded_param.grad to be a DTensor)r}   ru   replaceassertEqualassertIsInstancer!   r  r|   r  
placementsrg   r"   r    to_localr  assertIsNoneassertIsNotNone)r  r  r  r  replicated_namereplicated_paramsharded_namesharded_paramclean_sharded_nameprefixr  r  sharded_ref_paramsharded_ref_grads                 rJ   check_sharded_parityr  r  s    OR**,'')O TJ+*-JlM
 *( 	HF!3!;!;FB!G	H);<]G4-1 !IJJ(44m6N6Njq58 44 ;  ..>jQ..02C2L2L2NO  (]//0M../,-=-B-BD*U]//9-,,g6 !NOO**3357G7P7P7RS9TrI   znot-support-multithreadc                   @     e Zd Zed        Z fdZd Zd Zd Z xZ	S )FSDPTestMultiThreadc                     t         S r   DEVICE_COUNTr   s    rJ   r   zFSDPTestMultiThread.world_size      rI   c                 B    t         |           | j                          y r   )r   setUp_spawn_threadsrU   r   s    rJ   r  zFSDPTestMultiThread.setUp  s    rI   c                      t        | g|i |S r   r-   rl  s      rJ   r-   z FSDPTestMultiThread.run_subtests      D242622rI   c                 @    t         j                  j                          y r   rh   _dynamoresetr   s    rJ   perThreadSetUpz"FSDPTestMultiThread.perThreadSetUp      rI   c                 @    t         j                  j                          y r   r  r   s    rJ   perThreadTearDownz%FSDPTestMultiThread.perThreadTearDown  r  rI   )
rC   rD   rE   propertyr   r  r-   r  r  r  r   s   @rJ   r  r    s)     3rI   r  c            #       T   e Zd ZdZd Zd Zd Zd Zed        Z		 	 	 	 	 	 	 d"d	e
j                  d
ededededz  dededz  dededeeef   dz  fdZddd e       dddddddddfdee   dedededz  dededededz  dedz  dedz  dedededed eeef   dz  deeef   dz  f d!Zy)#FSDPTestMixinz
    Mixin class containing shared test utilities for FSDP tests.
    Provides common helper methods for both FSDPTest and FSDPTestContinuous.
    c                 <    | j                  ||j                         y r   )r  r   )rU   r  r   s      rJ   _check_cpu_offloadz FSDPTestMixin._check_cpu_offload  s    j&<&<=rI   c                 <    | j                  ||j                         y r   )r  backward_prefetch)rU   r  r  s      rJ   _check_backward_prefetchz&FSDPTestMixin._check_backward_prefetch  s    *J,H,HIrI   c                 <    | j                  ||j                         y r   )r  forward_prefetch)rU   r  r"  s      rJ   _check_forward_prefetchz%FSDPTestMixin._check_forward_prefetch  s    ):+F+FGrI   c                      t        | g|i |S r   r  rl  s      rJ   r-   zFSDPTestMixin.run_subtests  r  rI   c                     | |      }||_         ||_        |j                  dd      }t        d|j                    d|j                          t
        dk7  r^t        j                  j                         |j                  k  r3t        j                  t        d|j                      j                         	 |r`t        j                  j                  j                  j                   j#                         }t%        j&                  d|j                  ||       nDt%        j&                  |j(                  t*        t-        |j                        |j                   	       d }
|j                   t2        z  }t4        st6        rt        j                  j9                  |       |g}
t%        j:                  |
       t        j<                  j?                          tA                |jC                  ||       t        j<                  j?                          t%        j:                  |
       t%        jD                          y # t.        $ r=}	d
|	j0                  d   v r&t        j                  t        d   j                          d }	~	ww xY w)Nfake_pgFdist init r=, world=r>   
multi-gpu-fakebackendr   r   storeinit_methodr,  r   r   	recompiler   backend_unavailable
device_ids)#r   	file_namegetprintr   r   rh   acceleratordevice_countsysexitr.   	exit_codetesting	_internalr\  r&  	FakeStorerx   init_process_groupr/  DISTRIBUTED_BACKENDr   RuntimeErrorra   r  r2   r5   set_device_indexr  r  r  r1   run_testdestroy_process_groupr  r   	test_namer4  piperb   rU   r&  r-  er3  	device_ids               rJ   _runzFSDPTestMixin._run  s   9~	"**Y.TYYKx/@AB%E$5$5$B$B$Dt$VHHZ*T__,= >?IIJ	//;;CCMMO''"#	 '' $ 0 0/"4??3	 
II,	..y9[

 	
+i&
+""$1  	affQi'$9:DDE		s   6B&H8 8	I>8I99I>NFrm   	num_stepsautocastlrfsdp_cpu_offload
save_modelmixed_precisionenable_sharded_grad_scaleruse_pure_fp16sharded_grad_scaler_kwargsc           	         |xr |j                   }t        |j                               j                  }|
i }
t	        d	d|i|
}t
        j                  j                  |j                         |d      }t        |      D ]%  }|j                          t
        j                  j                  t        |      5  |j                  j                  t        j                  t                    }|	s|rMt        |t               s=t        |t
        j"                        r|j%                         }nt'        d |D              } || }|rft        |t               rV|j(                  t*        vrD|j                         D ]1  }| j-                  |j                  t        j                  d             3 |j                  j/                  ||      j1                  |      }d d d        |j3                        }|s*|	s(|j4                  t
        j6                  k7  rt9        d      |	r+| j-                  |j4                  t
        j:                         nnt        |t               r4|t9        d      | j-                  |j4                  |j<                         n*| j-                  |j4                  t
        j6                         |j                  j?                  |       |rTt        |t               rD|j                         D ]1  }| j-                  |j                  t        j                  d             3 |jA                  |       |jC                          |s|jE                         jG                         D ci c]  \  }}||jI                          }}}tK        |       |jM                  |       ( t        |t               r|jO                  tP        jR                         jU                         S # 1 sw Y   xY wc c}}w )
Nenabledg?)rM  momentum)rU  c              3   <   K   | ]  }|j                           y wr   )r   )ru  r   s     rJ   rv  z9FSDPTestMixin._train_for_several_steps.<locals>.<genexpr>  s     %>1affh%>rw  r>   zQloss data type should be float32, as the original parameter data type is float32.z'Expected mixed_precision to not be NonerH   )+offload_paramsnextr   rV   r   rh   optimSGDrz   	zero_gradamprL  r   r*  rW   r  r   ri   r   rg   r  r   r  r\   r   scaler   float32r|   float16param_dtyper`   stepupdater   r   cloner   load_state_dict_assert_stater   IDLErv   )rU   rm   rK  rL  rM  rN  rO  rP  rQ  rR  rS  cpu_offload_paramsmodel_devicesharded_grad_scalerrZ  r   rZ   r[   rE  r_   kvr   s                          rJ   _train_for_several_stepsz&FSDPTestMixin._train_for_several_steps  s@    .Q2B2Q2QE,,./66%-)+&/ 
.
2L

  0 0 2rCHy! =	2AOO##K#B M..u||K/HI _Zt=T!%6 %

 %%>%> > '"5$/ //>? #--/ H((5<<3FGH ||,,UF;>>|L-M. ',,T2D"=::.(: 
 !$$TZZ?t,&.,E  $$TZZ1L1LM$$TZZ?LL%%d+!j&=))+ DA$$QXXu||E/BCD  $$U+&&(7<7G7G7I7O7O7QRtq!alR
R E"%%j1{=	2~ eT" 2 23{{}M Mn Ss   9DO(,O5(O2	r8   Tmodel_classr  r   ref_init_fn	num_itersr   r  r  r"  use_orig_paramsinit_kwargsc                    |t         j                  k(  rt        d      |i }d}| j                  j	                         } |j
                  | j                  t         j                  t        j                  fddi|}|Ct        rt        |t        gt              }n-t        dk(  rt        |      }nt        ||g|      }n ||      }|r|j                         }| j                  |||
du|||
|||	      }t        |j                               }|j                  |||	|
||d	       	  |j
                  | j                  |||fddi|}t'        |t(              st)        || j                  fi |}|r|j                         }|t        j*                  k(  r|j-                  t              }|duxr |j.                  }|xr |t        j*                  k(  }|xr |t        j*                  k7  }|rFt1        j2                  d      }|j                         D ]  }| j5                  |j2                  |         |r| j7                  t8        dt               n	t;               }|5  | j                  ||d||||
|||
      } ddd       |ry|r[t1        j2                  d      }|j                         D ]  }| j5                  |j2                  |          j-                  t              } t=        |      }!t0        j>                  jA                  | d       |
|s| j5                  ||!dd       yyy# t         $ r }t#        d
| dt%        |             |d}~ww xY w# 1 sw Y   xY w)a  
        Tests FSDP training against a reference, which defaults to DDP but
        may be customized with ``ref_init_fn``.

        Args:
            model_class (Type[FSDPTestModel]): A model class that inherits from
                ``FSDPTestModel``, which defines the expected interface.
            fsdp_init_mode (FSDPInitMode): The mode to initialize the
                FSDP-wrapped model. This should not be ``NO_FSDP``.
            ref_init_fn (Optional[Callable]): A callable to invoke that wraps a
                non-wrapped model to construct the reference model, where this
                wrapper should provide data parallel semantics. If ``None``,
                then the callable defaults to the DDP constructor.
        z.Expects an FSDP init mode that wraps with FSDPN{Gz?r   T)r3  output_devicer>   )rL  rM  rN  rP  rQ  rR  rS  )r   r  r  rP  r"  rq  zInitializing z raised error zOAn FSDP-managed module with parameter CPU offloading enabled has parameters on F)rL  rM  rN  rO  rP  rQ  rR  rS  )check_dtypezFSDP did not match DDP)exact_devicemsg)!rB   rF   r|   rn   r   rd   rL   rM   r3   DDPr   r   rm  r   r   rc  	Exceptionr  r   r  r   rN   r   rX  rh   rV   r  assertRaisesRegexrA  r   r   r<  assert_close)"rU   rn  r  r   ro  rp  rO  r   r  r  rP  r"  rq  rQ  rR  rr  rS  r  rM  r   rm   	ref_modelref_loss
ddp_paramsr  rH  rX  expects_device_errorexpects_cpu_device
cpu_devicer   context	fsdp_lossfsdp_unsharded_paramss"                                     rJ   _test_fsdp_parityzFSDPTestMixin._test_fsdp_parityO  s   D \111 !QRRK!!&&(     ((
 	

 
 {m;	 %J	4&M	#E*I!(I00$D0(+'A''A 1 

 )..01
*%6%6#2$4#2		

	Y)))"" 	
 # J *d+ j$*<*<LLJ#*J~:::#{3J$D0O[5O5O
 N/>3N3NN 	 N/>3N3NN 	 e,J#..0 ;  z:; $ ""%%0M3  	  	55!,% /+E++E 6 I	   e,J#..0 ;  z:;![1I /
 ; 	""8YE"J "=%!,	   ,9"K  	Y}[MAxPQWXX	YF	 	s$   )"L, M,	M5MMM!)rt  NFNFFN)rC   rD   rE   rf   r  r   r#  r-   classmethodrJ  rk   rl   r   r  floatr   r   r   r   r   rm  rc  rQ   rB   rL   r   r   r   r  rH   rI   rJ   r  r    s   
>JH3 4% 4%v .2 15+0#<@YyyY Y 	Y
 Y %t+Y Y ($.Y %)Y Y %)cNT$9Y@ (,",,595915!& %+0#-1<@#h-(h %h )	h
 _h h h  h ,d2h ,d2h ($.h h h %)h h  #s(^d*!h" %)cNT$9#hrI   r  c                   t     e Zd Z fdZed        Zed        Zedefd       Zed        Z	e
d        Z xZS )FSDPTestc                 h    t         |           dt        j                  d<   | j	                          y )N0TORCH_NCCL_DESYNC_DEBUG)r   r  osenviron_spawn_processesr  s    rJ   r  zFSDPTest.setUp  s)     14

,-rI   c                     t         S r   r  r   s    rJ   r   zFSDPTest.world_size  r	  rI   c                 >    t         j                  j                         S r   )rx   distributed_c10d_get_default_groupr   s    rJ   rn   zFSDPTest.process_group  s    $$7799rI   rR   c                      yr7  rH   r   s    rJ   destroy_pg_upon_exitzFSDPTest.destroy_pg_upon_exit  s     rI   c                 *    t          | j                   S r   )r/   r4  r   s    rJ   r/  zFSDPTest.init_method  s    t~~.//rI   c                     | |      }||_         ||_        |j                  dd      }t        d|j                    d|j                          t
        j                  j                         |j                  k  r3t        j                  t        d|j                      j                         	 |r`t
        j                  j                  j                  j                  j!                         }t#        j$                  d|j                  ||       nDt#        j$                  |j&                  t(        t+        |j                        |j                          d }
|j                   t0        z  }t2        st4        rt
        j                  j7                  |       |g}
t#        j8                  |
       t
        j:                  j=                          t?                |jA                  ||       t
        j:                  j=                          t#        j8                  |
       t#        jB                          y # t,        $ r=}	d	|	j.                  d
   v r&t        j                  t        d   j                          d }	~	ww xY w)Nr&  Fr'  r(  r)  r*  r+  r.  r0  r   r1  r2  )"r   r4  r5  r6  r   rh   r7  r8  r9  r:  r.   r;  r<  r=  r\  r&  r>  rx   r?  r/  r@  r   rA  ra   r  r2   r5   rB  r  r  r  r1   rC  rD  rE  s               rJ   rJ  zFSDPTest._run  s   9~	"**Y.TYYKx/@AB))+doo=HHZ*T__,= >?IIJ	//;;CCMMO''"#	 '' $ 0 0/"4??3	 
II,	..y9[

 	
+i&
+""$1  	affQi'$9:DDE		s   -B&H/ /	I588I00I5)rC   rD   rE   r  r  r   rn   r  r  r/  r  rJ  r  r   s   @rJ   r  r    ss        : : d   0 0 4% 4%rI   r  c                        e Zd ZU dZeZeed<   ede	fd       Z
ede	fd       Ze fd       Z fdZ fdZed	        Z xZS )
FSDPTestContinuousz
    FSDP test base class using MultiProcContinuousTest for faster test execution.
    This class reuses worker processes across tests, reducing process spawn overhead.
    Use this for tests that don't require fresh process state between tests.
    r   rR   c                     t         S r   )r@  r  s    rJ   backend_strzFSDPTestContinuous.backend_strU  s    ""rI   c                     t         S r   )r   r  s    rJ   device_typezFSDPTestContinuous.device_typeY  s    rI   c                 J   dt         j                  d<   t        j                  j	                         |k  r)t        j                  t        d|    j                         |t        z  }t        st        rt        j                  j                  |       t        | 9  |||       y )Nr  r  r)  )r  r  rh   r7  r8  r9  r:  r.   r;  r  r2   r5   rB  r   _init_pg)r  r   r   	rdvz_filerI  r   s        rJ   r  zFSDPTestContinuous._init_pg]  s}    
 14

,-))+j8HHZ*ZL 9:DDE<'	..y9z95rI   c                     t         |           | j                  | j                  k7  rt	        j
                          t        j                  j                          t                y r   )
r   r  r   MAIN_PROCESS_RANKrx   r  rh   r  r  r1   r  s    rJ   r  zFSDPTestContinuous.setUpm  s>     99...LLNrI   c                     | j                   | j                  k7  rt        j                          t        |           t        j                  j                          y r   )	r   r  rx   r  r   tearDownrh   r  r  r  s    rJ   r  zFSDPTestContinuous.tearDownw  s9    99...LLNrI   c                 .    | j                   j                  S r   )r   r  r   s    rJ   rn   z FSDPTestContinuous.process_group~  s    ~~   rI   )rC   rD   rE   rf   r  r   r   __annotations__r  r   r  r  r  r  r  r  rn   r  r   s   @rJ   r  r  L  sz     #J"#C # # C   6 6 ! !rI   r  compile_compute_on_modulec                 @      fd G d dt               fd}|S )Nc                      t        j                  j                  j                  | i | t	        | d         r| d   j                          y y )Nr   )rh   r\  r  r   r  rH  )ra   rb   r  s     rJ   !fully_shard_with_compiled_computez=compiled_fsdp_test.<locals>.fully_shard_with_compiled_compute  sN    **D;F;$,
G.1
 GOO1
rI   c                   (    e Zd Z e       Z e       Zy)*compiled_fsdp_test.<locals>.FullyShardModeN)rC   rD   rE   r   EAGERCOMPILED_COMPUTErH   rI   rJ   FullyShardModer    s    6rI   r  c                 4     t                fd       }|S )Nc                     t         j                  j                  j                  }D ]>  }|j                  k7  r"t               st        j                  dd       5t         j                  j                  j                  }t         j                  j                          |j                  k(  r|}n?|j                  k(  r"dt         j                  j                  _
        }nt        d|       |j                  |j                  <    | i | t         j                  j                          |j                  |j                  <   |t         j                  j                  _
        A y )Nz0Inductor on GPU needs Triton and recent GPU archr8   )
stacklevelr@   z!Need to implement FullyShardMode=)rh   r\  r  r   r  r6   warningswarn	_inductorconfigcompile_threadsr  r  NotImplementedError__globals__rC   )	ra   rb   original_fully_shardmodeoriginal_compile_threadsfully_shard_patchr  r  funcs	         rJ   wrapperz6compiled_fsdp_test.<locals>.decorator.<locals>.wrapper  s1   (-(9(9(>(>(J(J & R>///
MMJWX +0??+A+A+Q+Q(!!))+>///(<%^<<<=>EOO**:(I%-;D6B  CT  !5!>!>?d%f%!!))+BV  !5!>!>?9Q&&67RrI   r
   )r  r  r  r  s   ` rJ   	decoratorz%compiled_fsdp_test.<locals>.decorator  s"    	t	R 
	R> rI   )r	   )r  r  r  r  s   ` @@rJ   compiled_fsdp_testr    s"    " "!F rI   c                   &     e Zd Zd fdZd Z xZS )
SkipModulec                 \    t         |           t        j                  ddd      | _        y N
   Fr  )r   r   rk   r   linr  s    rJ   r   zSkipModule.__init__  s"    99R%0rI   c                 $    | j                  |      S r   )r  r0  s     rJ   r   zSkipModule.forward  s    xx{rI   re   rm  r   s   @rJ   r  r    s    1rI   r  c                   $     e Zd Z fdZd Z xZS )NestedLinearc                     t         |           |r:t        t        j                  ddd      j                  t                    | _        y t        j                  ddd      j                  t              | _        y r  )r   r   r   rk   r   r   r   nested_linear)rU   	fsdp_wrapr   s     rJ   r   zNestedLinear.__init__  sV    !%biiBU&C&F&F{&S!TD!#2r!>!A!A+!NDrI   c                 $    | j                  |      S r   )r  r0  s     rJ   r   zNestedLinear.forward  s    !!!$$rI   rm  r   s   @rJ   r  r    s    O%rI   r  c                   $     e Zd Z fdZd Z xZS )	SkipModelc                    t         |           t        j                  ddd      j	                  t
              | _        t               j	                  t
              | _        t        t        |      t
              | _        y )Nr  Fr  )r  )rI  )r   r   rk   r   r   r   linearr  linear_skipr   r  r  )rU   double_nestr   s     rJ   r   zSkipModel.__init__  sW    iiBU366{C%<??;7!;/;
rI   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r  r  r  r0  s     rJ   r   zSkipModel.forward  s4    KKNQq!rI   rm  r   s   @rJ   r  r    s    
rI   r  )FT)FFr  )rH   r   )
contextlibr  r@  r9  rU  unittestr  abcr   r   collections.abcr   r   copyr   enumr   r	   	functoolsr   typingr   r   r   r   rh   torch.distributedr\  rx   torch.nnrk   torch.nn.functionalr  r  torch.distributed._composabler   torch.distributed.device_meshr   torch.distributed.fsdpr   r   r   r   $torch.distributed.fsdp._common_utilsr   5torch.distributed.fsdp._fully_shard._fsdp_param_groupr   r   "torch.distributed.fsdp._init_utilsr   2torch.distributed.fsdp.fully_sharded_data_parallelr   r   r   *torch.distributed.fsdp.sharded_grad_scalerr   torch.distributed.fsdp.wrapr   r   r   torch.distributed.tensorr    r!   r"   !torch.distributed.tensor.parallelr#   r$   r%   r&   r'   r(   torch.nn.parallel.distributedr)   ry  *torch.testing._internal.common_distributedr*   r+   r,   r-   r.   $torch.testing._internal.common_utilsr/   r0   r1   r2   r3   r4   r5   torch.utils._tritonr6   minmaxr9   r8  r  r   r@  r<   rB   rL   rl   rQ   r  r   r   r  r   r   r   r   r   r   r   r   r   r"  r9  r<  rL  re  ri  ro  r  r)  r  r  contextmanagerr  r  r  r  r  r  r  r  r  r  rg   r   r  skipIfr  r  r  r  rc  r  r  r  r  rH   rI   rJ   <module>r     s    	 	 
    # $ "    + +        4 4 
 ? S 
 I R R F F  F H    + q#a!8!8!:;<LLK ::**,LK K 99))+LK L4 T BIIs 499$$ B% #99##""2299 2t 2>299 >d >DBII D$ D .Q"- Q"h[J- [J|$7 D]J. ]J@J
m J
Z
? 
.,ryy ,JJ* JJZ/")) /@*r}} *Z'299 '6 6 6 6 
X 
 
" 
X 
 
" 9H 9 9 *X * * .x .  . .x .  . :8 :  : >X >  >0!0 0 	0
 0. +-	"Tyy"T II"T c3h	"TJ 45/  6&L L^
O%}2 O%d4!(? 4!n/$+ /d 	%299 	%		 rI   