
    9jh                        d dl Z d dlZd dlZd dlZd dlmZmZmZmZ d dl	m
Z
mZmZ d dlZd dlmZ d dlmc mc mZ d dlmc mc mZ d dlmc mc mZ d dlmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z, d d	l-m.Z.m/Z/m0Z0m1Z1 d d
l2m3Z3 d dl4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z< d dl=m>Z> d dl?m@Z@ d dlAmBZB d dlCmDZD erd dlEmFZF dZG	 d dlHmIZImJZJ dZLdZMeNej                  ej                  f   ZPej                  ePz  dz  ZQe:j                  e1j                  e:j                  e1j                  e:j                  e1j                  e:j                  e1j                  e:j                  e1j                  iZWe:j                  e:j                  gZXe:j                  e:j                  fZYe	 d`de'deQde:de>dz  de"dz  de'fd       ZZede'deQde"de'fd       Z[ede
de\fd       Z]ede"de\fd       Z^ed e_dej                  fd!       Z`ed"ej                  d e_dej                  fd#       Zad"ej                  d e_deNej                  ej                  f   fd$Zbe	 d`de'd%ej                  d&eej8                  j                     dz  d'eej8                  j                     eej8                  j                     z  dz  de'f
d(       Zed'efe
   d)e\ddfd*Zgede'd%ej                  d+ehej                     d,e_ej                  z  dz  de'f
d-       Zjede'd%ej                  de'fd.       Zkede'de:dz  d/e9dz  d0e6dz  d1e\d2e\d3e_d4e_de'fd5       Zlede'de'fd6       Zmede'd7e5d8e\de'fd9       Zned`de'de"de'fd:       Zoede'de'fd;       Zpd%ej                  d<efej                     ddfd=Zqede'd>ej                  d,e_ej                  z  dz  d?eej                  gdf   dz  d@e\de'fdA       Zrede'd<efej                     d>ej                  fdB       ZsdCej                  dDeej8                  j                     dz  dehej                     fdEZt	 d`dCej8                  j                  d&ehej8                  j                     dFeej8                  j                     dz  dehej8                  j                     fdGZudCej8                  j                  d&ehej8                  j                     dehev   fdHZwdCej                  dehev   fdIZxd%ej                  d+ehej                     d,e_ej                  z  dz  ddfdJZyd,e_ej                  z  dz  dKe_dLe&dej                  dz  fdMZzd%ej                  d+ehej                     d&ehej                     deNe\e\f   fdNZ{dCej                  d?eej                  gdf   d&ehej                     ddfdOZ|dCej                  dPej                  dz  d&ehej                     dLe&fdQZ}dCej                  d&ehej                     defej                     fdRZ~d%ej                  d+ehej                     dSehej                     dPej                  dz  ddf
dTZd<efej                     dUefej                     dPej                  dz  ddfdVZdW Zd%ej                  d+ehej                     dPej                  dz  dKe_dLe&dej                  fdXZd%ej                  d<efej                     dej                  ddfdYZdZefej                     ddfd[Zd%ej                  d+ehej                     deej                     fd\Zd+ehej                     ddfd]Zde:fd^Zdej                  de j                  fd_Zy# eK$ r dZGY w xY w)a    N)Callable	GeneratorIterableIterator)Anyno_type_checkTYPE_CHECKING)
OpaqueBase)default_hooks)
DeviceMesh)_get_default_group)_FSDPDeviceHandle
_FSDPState_get_module_fsdp_state_is_fsdp_flattened!_named_parameters_with_duplicatesclean_tensor_nameTrainingState)_FSDP_USE_FULL_PREC_IN_EVALFlatParameterFlatParamHandleHandleShardingStrategy)_FreeEventQueue)BackwardPrefetch
CPUOffloadFullOptimStateDictConfigFullStateDictConfigMixedPrecisionShardingStrategyStateDictConfigStateDictType)_Policy)DTensorExtensions)_sync_params_and_buffers)is_traceable_wrapper_subclass)RemovableHandleT)deferred_initfakeFi  _fsdp_syncedstateprocess_groupsharding_strategypolicydevice_meshreturnc                 (   ||t        d      |t        v }|r#|||t        d| d      t        | ||      } n4|r|| _        |j	                  d      | _        n||n	t               | _        | j
                  j                         | _        | j
                  j                         | _	        | j                  }|r|| j                  j                         z  }t        j                  j                  |      | _        || j                  z  | _        | S )NzcCannot pass both process_group and device_mesh at the same time. Please just pass only one of them.zManual wrapping with zA requires explicit specification of process group or device_mesh.r   mesh_dim)
ValueErrorHYBRID_SHARDING_STRATEGIES*_init_process_group_state_for_hybrid_shard_device_mesh	get_groupr+   r   ranksize
world_size_inter_node_pgr   DefaultState_get_gradient_predivide_factor_gradient_predivide_factor_gradient_postdivide_factor)r*   r+   r,   r-   r.   is_hybrid_strategydata_parallel_world_sizes          b/media/conek/DATA/Code/OCR/venv/lib/python3.12/site-packages/torch/distributed/fsdp/_init_utils.py_init_process_group_staterC   Z   sI     [%<<
 	
 +.HH V^8K '(9': ;S S 
 ?}kE !,E"-"7"7"7"CE "/!:@R@T  $$))+EJ**//1E$// E$8$8$=$=$?? ""AA$	
 
$ 	!5#C#CC 
% L    c                    |rYt        |      r6|| _        |j                  d      | _        |j                  d      | _        nt        d|j                         |@t               }t        || j                  j                               \  }}|| _        || _        n2t        |      r|\  | _        | _        nt        dt        |             t        | j                        | _        | S )Nr   r1      z,Expected device_mesh to have ndim=2 but got zmExpected process_group to be passed in as either None or Tuple[dist.ProcessGroup, dist.ProcessGroup] but got r+   )"_is_valid_hybrid_shard_device_meshr6   r7   r;   r+   r3   ndimr   !_init_intra_and_inter_node_groups_device_handledevice_count_is_valid_hybrid_shard_pg_typetype_get_default_comm_hook_state_inter_node_state)r*   r+   r.   default_groupintra_node_groupinter_node_groups         rB   r5   r5      s
    -k:!,E $/#8#8!#8#DE "-"7"7"7"CE>{?O?O>PQ  
	*,-N5//<<>.
** // *-8 9F5E!5GGKMGZF[] 
 ;**E LrD   c                 j    t        | t              xr" t        |       dk(  xr t        d | D              S )N   c              3   P   K   | ]  }t        |t        j                           y wN)
isinstancedistProcessGroup).0pgs     rB   	<genexpr>z1_is_valid_hybrid_shard_pg_type.<locals>.<genexpr>   s     Jb
2t001J   $&)rX   tuplelenallrG   s    rB   rM   rM      s:     	=%( 	K!#	KJMJJrD   c                 D    t        | t              xr | j                  dk(  S )NrU   )rX   r   rI   )r.   s    rB   rH   rH      s    k:.H;3C3Cq3HHrD   num_devices_per_nodec                 6    t        j                  |       \  }}|S )aU  
    Return a process group across the current node.

    For example, given each row is a distinct node:
    0  1  2  3  4  5  6  7
    8  9 10 11 12 13 14 15
    This API would return an intra-node subgroup across
    [0, 1, ..., 7] or [8, 9, ..., 15] depending on the process's rank.
    For example, rank 3 would get [0, 1, ..., 7].
    )rY   new_subgroups)rc   intra_node_subgroup_s      rB   _init_intra_node_process_grouprh      s!     "//0DErD   global_process_groupc                 \   d}t        j                  |       }t        j                  |       }||z  }t        j                  |       |z  }t	        |      D ]?  }t	        |      D cg c]
  }|||z  z    }	}t        j
                  |	|      }
||k(  s>|
}A |t        | d      |S c c}w )a  
    Return an inter-node process group where each contained rank has the same local rank.

    For example, given each row is a distinct node:
    0  1  2  3  4  5  6  7
    8  9 10 11 12 13 14 15
    This API would return inter-node process group [0, 8], [1, 9], [2, 10], and so forth
    depending on the process's rank. For example, rank 1 would get [1, 9], rank 5
    would get [5, 13].
    N)ranksbackendz. expected to assign inter-node pg, but did not)rY   get_backendget_world_sizeget_rankrange	new_groupAssertionError)ri   rc   inter_node_pgsharding_backendr:   	num_nodesmy_local_rank
local_rankiranks_for_inter_groupgrps              rB   _init_inter_node_process_groupr{      s      M''(<=$$%9:J22IMM"67:NNM01  
=B9=M!
89J!223!
 !
 nn#8BRS&M  oKL
 	
 !
s   %B)c                 0    t        |      t        | |      fS )a  
    Initialize intra and inter-node process groups and return the ones corresponding to this process's rank.

    This function can be used to initialize process groups for ``HYBRID_SHARD`` or
    ``_HYBRID_SHARD_ZERO2`` in FSDP.
    This function assumes each node has an equal number of CUDA-enabled devices.
    Returns:
        Tuple[dist.ProcessGroup, dist.ProcessGroup]: Intra and inter-node process group.
    )rh   r{   )ri   rc   s     rB   rJ   rJ      s#     	'';<&';=QR rD   moduleignored_modulesignored_statesc                    ||t        d      d }|d u}|rt        |      }t        |d       ng }t        |t        |      ng d       t        |      dkD  r"t	        |d   t
        j                        r|}n|}t        ||      | _        t        || j                  |      | _
        t        || j                        | _        | S )NzfCannot pass both ignored_modules and ignored_states at the same time. Please just pass ignored_states.TFr   )r3   list_check_ignored_statesr`   rX   nn	Parameter_get_ignored_modules_ignored_modules_get_ignored_params_ignored_params_get_ignored_buffer_names_ignored_buffer_names)r*   r}   r~   r   ignored_parameterspassed_as_ignored_statesignored_states_lists          rB   _init_ignored_module_statesr     s     "~'A:
 	
 -T9">2148 %4%@D!b%	
 !#)!,bll;!41O1&/JE/E
 #<#E LrD   r   c                    t        |       dk(  ry|r`t        d | D              }t        d | D              }|s9|s6t        | D ch c]  }t        |       c}t              }t        d|       yyt        d | D              s6t        | D ch c]  }t        |       c}t              }t        d|       yc c}w c c}w )	z
    Check that the ignored states are uniformly parameters or uniformly modules.

    We may remove this check in the future if we permit mixing.
    r   Nc              3   P   K   | ]  }t        |t        j                           y wrW   )rX   r   r   r[   r*   s     rB   r]   z(_check_ignored_states.<locals>.<genexpr>K  s     UUE2<<8Ur^   c              3   P   K   | ]  }t        |t        j                           y wrW   rX   r   Moduler   s     rB   r]   z(_check_ignored_states.<locals>.<genexpr>L  s     S5*UBII6Sr^   )keyzUignored_states expects all nn.Parameter or all nn.Module list elements but got types c              3   P   K   | ]  }t        |t        j                           y wrW   r   r   s     rB   r]   z(_check_ignored_states.<locals>.<genexpr>U  s     LE:eRYY/Lr^   z>ignored_modules expects nn.Module list elements but got types )r`   ra   sortedrN   reprr3   )r   r   
all_paramsall_modulesr*   sorted_typess         rB   r   r   @  s     >aUnUU
SNSS+!N"K54;"KQUVL**69  #.z L^LL!N"K54;"KQUVL%(  M #L #Ls   B;C ignored_params	device_idc                 6   d}|1t        |t        j                        r|nt        j                  |      }|t        ||      D ]|  }|j                  j                  dv r||j                  }+|j                  j                  |j                  k7  sOt        d|j                   d|j                  j                          |xs t        j                  j                         }|j                  dk(  rt        d      t        j                  |      | _
        | S )a=  
    Determine device handle used for initializing FSDP.

    If a device is specified by ``device_id``,
    then returns device handle corresponds to that device type. Otherwise, If the
    module is already on a non-CPU device, then the device type is that non-CPU device type.
    If the module is on CPU or meta, then the device type is the current accelerator device.
    See the :ref:`Accelerators<accelerators>` for details.


    This method will be called once ignored parameters was determined, as the device handle maybe needed
    for other initialization.
    N>   cpumetazLFSDP does not support modules with different device types but got params on z and r   zOFSDP needs a non-CPU accelerator device, but no accelerator device is detected.)rX   torchdevice_get_orig_paramsrN   RuntimeError_C_get_acceleratorr   from_devicerK   )r*   r}   r   r   determined_deviceparams         rB   _init_device_handler   ]  s   (  )U\\2 i( 	
  %fn= 
	E||  O3 ($)LL!<<$$(9(>(>>&-->-C-C,DE%,,J[J[I\^ 
	 .L1J1J1L!!U*a  -889JKELrD   c                     t        |      | _        i }|j                         D ]  \  }}t        |      }|j                  ||<   ! || _        | S rW   )_get_buffer_names_buffer_namesnamed_buffersr   dtype_buffer_name_to_orig_dtype)r*   r}   r   buffer_namebuffers        rB   _init_buffer_stater     s_    
 ,F3E
 :<%335 ?V'428,,";/? (BE$LrD   mixed_precisioncpu_offloadlimit_all_gathersuse_orig_paramsbackward_prefetch_limitforward_prefetch_limitc                    | j                   dk(  rQ|t        j                  k7  r-t        j                  d|xs t        j
                   dd       t        j                  }n/|t        j                  k(  rt        j                  dt        d       |xs t        j
                  | _        |xs
 t               | _	        |5t        j                  j                  dt        | j                                t        j                  j!                  t"        d	      d
k(  | _        |xs
 t'               | _        || _        || _        t.        j0                  | _        d | _        t7               | _        t;        j<                         | _        tA        jB                  | j>                  ||      | _"        d | _#        i }|| _$        d }	|	| _%        g }
|
| _&        | S )NrF   z/FSDP is switching to use `NO_SHARD` instead of z since the world size is 1.rU   
stacklevelzoThe `NO_SHARD` sharding strategy is deprecated. If having issues, please use `DistributedDataParallel` instead.   z'torch.distributed.fsdp.mixed_precision. 1)'r:   r   NO_SHARDwarningswarn
FULL_SHARDFutureWarningr,   r   r   r   r   _log_api_usage_oncestrosenvirongetr   _use_full_prec_in_evalr   r   r   _use_orig_paramsr   IDLEtraining_state_is_rootr   _free_event_queuerY   get_debug_level_debug_levelexec_order_utils_ExecOrderData_exec_order_data_unshard_event_fully_sharded_module_to_handle_handleparams)r*   r,   r   r   r   r   r   r   r   r   r   s              rB   _init_core_stater     s    1 0 9 99MMA$C(8(C(CD E'' 	 -55	.77	7< 	
 0N3C3N3NE+?~/?E"$$5c%:O:O6P5QR	
 	

2B73> 
  $3z|E/E,E(--EEN-/E--/E-<<E
  E IK#,KE) '+GEM"$FELLrD   c                 f    g }|| _         g }|| _        g }|| _        d| _        d | _        d | _        | S )NT)_root_pre_forward_handles_pre_forward_handles_post_forward_handles_sync_gradients
_comm_hook_comm_hook_state)r*   r   r   r   s       rB   _init_runtime_stater     sK     8:&?E#24!5E35"7E EE!ELrD   backward_prefetchforward_prefetchc                 "    || _         || _        | S rW   )r   r   )r*   r   r   s      rB   _init_prefetching_stater     s     0E-E LrD   c                     ||j                         nd }|r+|| j                  k7  rt        | j                        | _        | S d | _        | S rW   )_get_root_meshr6   r#   rK   _fsdp_extension)r*   r.   	root_meshs      rB   _init_extensionr     sT    
 1<0G**,TI yE$6$66 1%2F2F G
 L !%LrD   c                     t         j                  | _        t               }t	               | _        || _        i }|| _        | S rW   )r!   FULL_STATE_DICT_state_dict_typer   r   _optim_state_dict_config_state_dict_config_unshard_params_ctx)r*   state_dict_configunshard_params_ctxs      rB   _init_state_dict_stater     s?    *::E)<)>%=%?E"0E57 2ELrD   r   c                     |D ]X  }t        |j                        dk(  sd}| j                         D ]  \  }}||u s|} n |st        d      t	        d| d       y)z
    Verify if the parameters are accepted by FSDP. The only restriction now
    is that the parameter cannot be a scalar tensor (param.shape == []).
    r   r   zExpected param_name to be setz/FSDP doesn't support scalar parameters. Change z& to a 1D tensor with numel equal to 1.N)r`   shapenamed_parametersrr   r3   )r}   r   r   
param_namenameparam_s         rB   _verify_managed_paramsr     s    
  u{{q J & 7 7 9 fF?!%J $%DEE$%KM rD   fully_sharded_moduleparam_init_fnsync_module_statesc                     t        | j                  |       t        | j                   j                        }t        | j                   j                        \  }}|s|r|t        || j                         nA|r#t        || j                   j                         n|rt        j                  | fd        j                  D 	ch c]  }|j                         D ]  }	|	  }
}}	t        | j                  |
|       t        | j                  | j                   j                         _        t        t!        | j                              }t#        ||       |rw j$                  t&        v rNt)        || j*                         |j                         D ]$  }	t-        |	t.              st1        |	t.        d       & t)        || j2                         t5         ||        S c c}	}w )zHInitialize a ``FlatParamHandle`` from a module ``fully_sharded_module``.c                 >    t        |       d u xr | j                  vS rW   )r   r   )	submoduler*   s    rB   <lambda>z0_init_param_handle_from_module.<locals>.<lambda>Q  s(    '=i'HD'P (8!7!77 rD   )check_fnF)_check_single_device_moduler   _get_device_from_device_idr8   rK   _need_to_materialize_moduler   _materialize_with_param_init_fn_materialize_meta_moduler'   materialize_modulebuffers_move_module_to_device_get_compute_devicecompute_devicer   r   r   r,   r4   _sync_module_params_and_buffersr;   hasattrFSDP_SYNCEDsetattrr+   _init_param_handle_from_params)r*   r   r   r   r   device_from_device_idis_meta_moduleis_torchdistX_deferred_initignored_moduler   ignored_buffersmanaged_paramss   `           rB   _init_param_handle_from_moduler  2  s      4e6K6KYW65::u33 3Ne33U5K5K3/N/ 	5=;T' -1G1G	
 
  ""  		
 
%(( 8	
 $44$,,.  	O  	 /

E *+?AVAVWXN/@""&@@ ,$ne6J6J /668 86;/FK78 	( .%2E2E	
 #5.:NOLYs   G*c                    t        |      dk(  ry t        ||| j                  t        | j                     | j
                  j                  | j                  j                  | j                  j                  | j                  j                  | j                  | j                  | j                        }|j                          | j                  rt!        d      | j"                  j%                  |j&                         || _        || j(                  |j*                  <   t-        j.                  d      }| j
                  j                  r,|j&                  j.                  |k7  r|j1                  |       y y y )Nr   )fsdp_extensionz!Expected state._handle to be Noner   )r`   r   r  SHARDING_STRATEGY_MAPr,   r   offload_paramsr   param_dtypereduce_dtypekeep_low_precision_gradsr+   r   r   shardr   rr   r   append
flat_paramr   _fully_sharded_moduler   r   flat_param_to)r*   r   r   handle
cpu_devices        rB   r  r    s+    6{ae556(())**66,,F LLN}}@AA	LL))*EMJPE))&*F*FGe$J''F,=,=,D,D
,RZ( -S'rD   root_moduler   c           	      2   d}	 |t        |      n	t               }|D ]V  }t        |t        j
                  j                        st        |dt        |       z         t        |      sMt        d       | j                         D ])  }t        j                  |      r|j                  |       + |D ch c]3  }|j                         D ]  }t        |t        j                        s|  5 }}}| |v rt        j                   d d       | j                         D ]B  }t        |      }	|	t#        |	d	      st%        d
      |j'                  |	j(                         D |S # t        $ r }t        |dt        |       z         |d}~ww xY wc c}}w )ah  
    Check that ``_ignored_modules`` is an iterable of ``nn.Module`` s without any FSDP instances.

    Return the modules contained in their module
    subtrees as a :class:`set`. Nested FSDP instances are excluded, but their
    already-computed ignored modules are included.

    ``_ignored_modules`` represents the argument passed by the user to FSDP.
    z>`ignored_modules` should be an iterable of `torch.nn.Module`s Nzbut got zbut got an iterable with z1`ignored_modules` should not include FSDP moduleszTrying to ignore the top-level module passed into the FSDP constructor itself will result in all parameters being ignored and is not well-supported: rU   r   r   z?Expected optional_fsdp_state to have _ignored_modules attribute)set	TypeErrorrN   rX   r   r   r   r   r3   modulestraversal_utils_composableadd	fsdp_fileFullyShardedDataParallelr   r   r  rr   updater   )
r'  r   
msg_prefixignored_root_moduleser}   childr~   r   optional_fsdp_states
             rB   r   r     s    RJQ%5%AC !su 	
 ' R&%((//2J+DT&\N)SSTT!&) PQQR %%' -**62 $$V,- +^^% %!C!CD 	O  o%228; 		
 !((* I	4Y?*.0BC$U  ""#6#G#GHI Q  Q
x5E0F/G%HHIqPQ$s   E' 78F'	F0FFr   c                    t               }|D ch c]%  }|j                         D ]  }t        |      r| ' }}}|j                  |       |,|D ch c]  }t        |      r| }}|j                  |       | j	                         D ]B  }t        |      }	|	t        |	d      st        d      |j                  |	j                         D |S c c}}w c c}w )z
    Return the parameters of the modules in ``ignored_modules`` and the parameters in ``ignored_parameters``.

    :class:`FlatParameter` s are excluded from the result.
    r   z>Expected optional_fsdp_state to have _ignored_params attribute)	r)  
parametersr   r1  r+  r   r  rr   r   )
r'  r~   r   all_ignored_paramsmpparams_in_ignored_modulesparams_in_ignored_parametersr   r6  s
             rB   r   r     s    36% #!ALLN!'(BTUVBW!	! ! 78%)(
1CA1FA(
$ (
 	!!">? !((* K	4Y?*.0AB$T  %%&9&I&IJK -!(
s   #CCC%Cc           	         t               }|D ch c]  }|j                         D ]  }|  }}}|j                  | j                         D ch c]  \  }}||v rt	        |       c}}       | j                         D ]B  }t        |      }|t        |d      st        d      |j                  |j                         D |S c c}}w c c}}w )z6Return the cleaned buffer FQNs in ``ignored_modules``.r   zDExpected optional_fsdp_state to have _ignored_buffer_names attribute)
r)  r	  r1  r   r   r+  r   r  rr   r   )	r'  r~   all_ignored_buffer_namesr:  r   buffers_in_ignored_modulesr   r   r6  s	            rB   r   r     s    
 *- ("aiik",2"" " ## (3'@'@'B	
#V33 k*	
 !((* W	4Y?*.0GH$Z  %++,?,U,UVW $#-"
	
s   CC
c                 f    | j                         D ch c]  \  }}t        |       c}}S c c}}w )zrReturn the fully prefixed names of all buffers in the module hierarchy rooted at ``root_module`` as a class:`set`.)r   r   )r'  r   rg   s      rB   r   r   &  s5     >I=V=V=X+9;+&  s   -c                     t        | |      D ch c]  }|j                   }}t        |      dk(  r%t        j                  d      |v r|t	        d      yt        |      dkD  rt	        d|       yc c}w )z
    Raise an error if ``module`` has original parameters on multiple devices, ignoring the parameters in ``ignored_params``.

    Thus, after this method, the
    module must be either fully on the CPU or fully on a non-CPU device.
    rU   r   NzTTo support a module with both CPU and GPU params, please pass in device_id argument.rF   z;FSDP only supports single device modules but got params on )r   r   r`   r   r   )r}   r   r   r   devicess        rB   r  r  -  s     *:&.)QRu||RGR 7|qU\\%0G;5  
 
W	I'S
 	
 
 Ss   A4r8   device_handlec                 b   | yt        | t        j                        r| nt        j                  |       }|j                  dk7  rk|j                  _t        j                  d|  d| d|j                          d|j                   d	d	       t        j                  |j                               }|S )
z
    Return a ``torch.device`` for the specified ``device_id``.

    Processes ``device_id`` and returns either the corresponding device or
    ``None`` if ``device_id`` is ``None``.
    Nr   z"FSDP got the argument `device_id` z	 on rank zJ, which does not have an explicit index. FSDP will use the current device z6. If this is incorrect, please explicitly call `torch.zk.set_device()` before FSDP initialization or pass in the explicit device index as the `device_id` argument.rU   r   )rX   r   r   rN   indexr   r   current_device)r   r8   rD  r   s       rB   r  r  K  s     	5<<8	ell9>U  {{e 409f 00=0L0L0N/O PCCI;;- P11 	
 m::<=MrD   c                    t        t        | |            }t        d |D              }| j                         D ]-  }||v r|j	                  d      D ]  }||j
                  z  } / | xr t        xr t        d |D              }||fS )z
    Return if ``module`` has parameters on meta device and if ``module`` is using torchdistX deferred initialization.

    At most of the returned bools can
    be ``True``. If either is ``True``, then ``module`` needs to be
    materialized.
    c              3   4   K   | ]  }|j                     y wrW   )is_metar[   r   s     rB   r]   z._need_to_materialize_module.<locals>.<genexpr>v  s     C5Cs   Frecursec              3   F   K   | ]  }t        j                  |        y wrW   )r(   is_fakerK  s     rB   r]   z._need_to_materialize_module.<locals>.<genexpr>  s     @U#@s   !)r   r   anyr+  r	  rJ  _TORCHDISTX_AVAIL)r}   r   r~   r  r  r   bufr  s           rB   r  r  i  s     *6>BCNCNCCN ^^% *	'$$U$3 	*Cckk)N	**  	A	A@@@  
 666rD   c                     t        |      st        d| dt        |             t        | |      }|D ]
  } ||        y )Nz	Expected z to be callable but got )callabler3   rN   _get_modules_to_materialize)r'  r   r~   modules_to_materializer}   s        rB   r  r    sV    
 M"&>tM?R>ST
 	
 9oV( frD   r  c           	      >   |xs# t        j                  |j                               }t        | |      }d }	 t        j                         5  |D ]u  }t        j                  |j                  d      |j                  d            }t        t        |            dkD  }|sS|j                  |d       |j                          w 	 d d d        y # 1 sw Y   y xY w# t        $ r7}	t        j                  dt!        |	       dt#        |       dd	       |	d }	~	ww xY w)
NFrL  r   )r   rM  zIUnable to call `reset_parameters()` for module on meta device with error z(. Please ensure that your module oftype z* implements a `reset_parameters()` method.rU   r   )r   r   rG  rU  no_grad	itertoolschainr8  r	  r`   r   to_emptyreset_parametersBaseExceptionr   r   r   rN   )
r'  r  r~   rD  materialization_devicerV  r}   module_state_iterhas_module_statesr4  s
             rB   r  r    s$    3 ell$$&7 9oVF ]]_ 	.0 . %.OO%%e%4NN5N1%!
 %(->(?$@1$D!$OO+A5OQ++-.	. 	. 	.  !!$Q )L>!KM 		
 s<   C AC!%CC CC C 	D%2DDc                 "   g }t        j                  | g      }| h}|rq|j                         }|j                  |       |j	                         D ]:  }||vst        |      ||vs|j                  |       |j                  |       < |rq|S rW   )collectionsdequepopleftr!  childrenr   r.  )r'  r~   rV  queuevisited_modulesr}   child_modules          rB   rU  rU    s    
 /1{m,E'2mO
%%f-"OO- 	+LO3*<8@ 7##L1\*	+  "!rD   r  c                    t        j                  d      |	t        j                         }|j	                  |        g }g }|r|j                         }|j                  fd|j                  d      D               |j                  fd|j                  d      D               |j                         D ].  }t        |t        j                        r|j	                  |       0 |r|D 	cg c]	  }	|	|vs|	 }
}	|D 	cg c]	  }	|	|vs|	 }}	t        |
||       yt        t        | |      d      }||j                  k(  rt!                yyyc c}	w c c}	w )a  
    Move ``module`` depending on ``device_from_device_id`` and its current device.

    This includes moving ignored modules' parameters.

    - If ``device_from_device_id`` is not ``None``, then this moves
    ``module`` to the device.
    - If ``device_from_device_id`` is ``None``, then this does not move
    ``module`` but warns the user if it is on CPU.

    Precondition: ``_check_single_device_module()``.
    r   Nc              3   @   K   | ]  }|j                   k(  r|  y wrW   r   )r[   r   r&  s     rB   r]   z)_move_module_to_device.<locals>.<genexpr>  s%      <<:-    FrL  c              3   @   K   | ]  }|j                   k(  r|  y wrW   rk  )r[   r   r&  s     rB   r]   z)_move_module_to_device.<locals>.<genexpr>  s%      ==J. rl  )r   r   rb  rc  r!  rd  extendr8  r	  re  rX   r/  r0  _move_states_to_devicenextr   _warn_cpu_init)r}   r   r  r  rf  r   r	  curr_moduler   r;  params_to_movebufs_to_mover   r&  s                @rB   r
  r
    s\   $ e$J( /:.?.?.AV%'&(--/K
 MM (33E3B 
 NN )11%1@ 
 )113 ,	!)Y-O-OPLL+,! & &,Gq/F!GG#*Gaa.FGG~|=RS!&.94@EU\\Z7 8 HGs   1	E;E	EEr	  c                 6   t        |       dk(  rt        |      dk(  ryt        |       dkD  r| d   j                  }nt        |      dkD  r|d   j                  }t        j                  d      }|| D ]k  }t        j                         5  |j	                  |      |_        |j                  *|j                  j	                  |      |j                  _        ddd       m |D ]  }|j	                  |      |_         y|k(  rt                yy# 1 sw Y   xY w)z
    Move states to the specified device.

    Precondition: ``_check_single_device_module()`` and module's parameters and
    buffers have been materialized if needed.
    r   Nr   )r`   r   r   rX  todatagradrq  )r   r	  r  rG  r&  r   r   s          rB   ro  ro    s    6{aCLA-
6{Q))	W	 **e$J(  	KE K"XX&;<
::)&+jjmm4I&JEJJOK K	K
  	;F ))$9:FK	;	:	% 
&K Ks   	ADD	c                  2    t        j                  dd       y )Nam  The passed-in `module` is on CPU and will thus have FSDP's sharding initialization run on CPU, which may be slower than on GPU. We recommend passing in the `device_id` argument for FSDP to move `module` to GPU for the sharding initialization. `module` must also be on GPU device to work with the `sync_module_states=True` flag since that requires GPU communication.rU   r   )r   r    rD   rB   rq  rq  '  s    MM	1 rD   c                     t        t        | |      d      }|&|j                  j                  dk7  r|j                  }n#t	        j                  |j                               }|||k7  rt        d| d| d|       |S )a)  
    Determine and return this FSDP instance's compute device.

    If the module is already on a non-CPU device, then the compute device is that non-CPU
    device. If the module is on CPU, then the compute device is the current
    device.

    Since this method should be called after materializing the module, any
    non-CPU device should not be meta device. For now, the compute device is
    always a CUDA or CUDA-like device with its explicit index.

    Precondition: ``_check_single_device_module()`` and
    ``_move_module_to_device()``.
    Nr   z4Inconsistent compute device and `device_id` on rank z: z vs )rp  r   r   rN   r   rG  r3   )r}   r   r  r8   rD  r   r  s          rB   r  r  3  s    * !&.94@EU\\..%7m&B&B&DE(^?T-TB4&d#8"9;
 	
 rD   c                 V   g }| j                         D ]  }t        |t        d      rt        |t        d       |j	                         }t        |      ry|j                         \  }}|D ]`  }t        ||      xxt        j                  d x\   }	 |j                  |	       9  xt        d x\    I }
t        dt        |
              |j                  |        |D ]  }|j	                         }t        |      ry|j                         \  }}|D ]`  }t        ||      xxt        j                  d x\   }	 |j                  |	       9  xt        d x\    I }
t        dt        |
              |j                  |        t        |       t        ||t        d       y)z
    Synchronize module states (i.e. parameters ``params`` and all not-yet-synced buffers) by broadcasting from rank 0 to all ranks.

    Precondition: ``sync_module_states == True`` and ``self.process_group`` has
    been set.
    FTrz  Nz#expected Tensor or OpaqueBase, got r   )src)r	  getattrr  r  detachr%   __tensor_flatten__r   Tensorr!  r
   rr   rN   +_check_module_states_for_sync_module_statesr$   PARAM_BROADCAST_BUCKET_SIZE)r}   r   r+   module_statesr   detached_bufferattrsrg   attrv
unexpectedr   detached_params                rB   r  r  V  s    )+M.." 6v{E2FK.$mmoO,_= +==?q! 	D!/480U\\^0)003 1)Z\  *'"0"Ed:FVEW X# 	 $$_5)6,  1(8%88:HE1 	nd3,,%,,Q/ -% &#,A$zBRAST 	   01" 0>#	rD   r  c                 D    | rt        d | D              rt        d      y y )Nc              3   `   K   | ]&  }|j                   t        j                   d       k(   ( yw)r   N)r   r   )r[   tensors     rB   r]   z>_check_module_states_for_sync_module_states.<locals>.<genexpr>  s'      17e,,s   ,.zThe module has CPU parameters or buffers when `sync_module_states=True`, which requires them to be on GPU. Please specify the `device_id` argument or move the module to GPU before passing it to FSDP.)rP  r3   )r  s    rB   r  r    s7      ;H  C
 	
}rD   c              #      K   | j                         }	 	 t        |      }||vrt        |      s| # t        $ r Y yw xY ww)aD  
    Return an iterator over the original parameters in ``module``.

    The iterator does not return
    the parameters in ``ignored_params``, any ``FlatParameter`` s (which may be
    present due to nested FSDP wrapping), or any original parameters already
    flattened (only relevant when ``use_orig_params=True``).
    N)r8  rp  r   StopIteration)r}   r   	param_genr   s       rB   r   r     sT      !!#IOEN*3Ee3L   s   A 4 	A AA  Ac           	          t        |       D ]A  \  }}||vst        |      rt        d| d|j                          d|j                          y)a5  
    Check that original parameters in ``fsdp_module`` have been flattened.

    The flattened parameters are made
    invisible to ``named_parameters()`` for the module hierarchy rooted at
    ``fsdp_module``. This should be called as a sanity check after flattening
    the wrapped module's parameters.
    z Found an unflattened parameter: z;  N)r   r   r   r9   	__class__)fsdp_moduler   r   r   s       rB   _check_orig_params_flattenedr    s^     ?{K 
E&/A%/H2:,b::<.%//!24 rD   c                 h    | t         j                  k(  rt        j                  S t        j                  S rW   )r   r   r   allreduce_hookreduce_scatter_hook)r,   s    rB   _get_default_comm_hookr    s3      0 9 99 	$$ ..rD   c                 .    t        j                  |       S )NrG   )r   r<   rG   s    rB   rO   rO     s     %%MBBrD   rW   )rb  rY  r   r   collections.abcr   r   r   r   typingr   r   r	   r   torch.distributeddistributedrY   (torch.distributed.fsdp._exec_order_utilsfsdp_exec_order_utilsr   'torch.distributed.fsdp._traversal_utils_traversal_utilsr,  2torch.distributed.fsdp.fully_sharded_data_parallelfully_sharded_data_parallelr/  torch.nnr   torch._opaque_baser
   (torch.distributed.algorithms._comm_hooksr   torch.distributed.device_meshr   "torch.distributed.distributed_c10dr   $torch.distributed.fsdp._common_utilsr   r   r   r   r   r   r   "torch.distributed.fsdp._flat_paramr   r   r   r   %torch.distributed.fsdp._limiter_utilsr   torch.distributed.fsdp.apir   r   r   r   r   r   r    r!   torch.distributed.fsdp.wrapr"   &torch.distributed.tensor.parallel.fsdpr#   torch.distributed.utilsr$   torch.utils._python_dispatchr%   torch.utils.hooksr&   rQ  
torchdistxr'   r(   ImportErrorr  r  r_   rZ   HybridShardProcessGroupTypeProcessGroupTyper   r   SHARD_GRAD_OPHYBRID_SHARD_HYBRID_SHARD_ZERO2r  r4   #NO_RESHARD_AFTER_FORWARD_STRATEGIESrC   r5   boolrM   rH   intrh   r{   rJ   r   r   r   r   r   r)  r   r   r   r   r   r   r   r   r   r  r  r   r   r   r   r   r  r  r  r  r  rU  r  r
  ro  rq  r  r  r  r   r  r  r<   rO   rz  rD   rB   <module>r     s
     	  C C 4 4    C C A A F F  ) B 4 A    B	 	 	 0 D < F 1 . 0 #D$5$5t7H7H$HI $$'BBTI 
 5>>!7!B!B""$:$H$H!!#9#F#F((*@*T*T  !!(( 
 ""((' #  &*00#0 (0 dN	0
 d"0 0 0f ((#( ( 	( (V # $   IJ I4 I I  ARAR   "++"" 
" "J++ 4d///0&  ++II+ ehhoo.5+ UXX//0uxx 
	+ + +\I9=	: --II- %- U\\!D(	-
 - -` II  " ??'$.? $d*? d"	?
 ? ? !?  ? ? ?D    		'	 	 		 	 : J *   *   299 d2<<6H T ( NN))N U\\!D(N RYYK-.5	N
 N N Nb ))) ))) )>::uxx/$6: 	^:@ ?C"")" !!3!34t;" 				"J$$)$ 	X$@299 S 
II
%
 U\\!D(
 
	
<U\\!D(
 % \\D	<7II7%7 ^7 4:	7<RYYK-. ^ 
	## <<$.# ^# %	#L""-0^"	"))_",3II3%3 &3 !<<$.	3
 
3l%,, !<<$. 
	@	II% !<<$. 	
 % \\F9II99 $$9 
	9x

%

	

II% bll,% 
(.> C$$CCq$  s   ]; ;^^