
    9j4                     j   d dl Z d dlmZ d dlmZ d dlZd dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZmZmZmZmZmZ d dlm Z m!Z! d dl"m#Z#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z*m+Z+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4 e5e6e7ee8   dz  ee8   f   f   Z9dgZ:d*de8de6de6fdZ;	 d+dejx                  dz  defdZ=dej|                  de?fdZ@	 d*dedee8   de6dej|                  fd ZAd!ede7e9ejx                  dz  f   fd"ZB G d# d$e      ZC	 d+d%ed&e6d'e(d(e!dz  def
d)ZDy),    N)Sequence)cast)_get_device_module)ShardedTensor)TensorProperties)Shard)ChunkShardingSpec)unflatten_state_dict)DefaultLoadPlanner)BytesStorageMetadataChunkStorageMetadataMetadataMetadataIndexSTATE_DICT_TYPEr   TensorStorageMetadata)LoadPlanLoadPlanner)_create_read_items create_read_items_for_chunk_list)load_state_dict)StorageReader)_element_wise_add_element_wise_sub_normalize_device_info)_get_default_group)_create_chunk_sharded_tensor)_remote_device)DTensor!load_sharded_optimizer_state_dictglobal_rankdevice_typereturnc                     |dk(  ryt        |      }|j                         rt        || |j                         z        S y)Ncpu)r   is_availabler   device_count)r    r!   device_modules      f/media/conek/DATA/Code/OCR/venv/lib/python3.12/site-packages/torch/distributed/checkpoint/optimizer.py_gen_rank_devicer)   8   sH    e&{3M!!#%}'A'A'CC
 	
     pgc                    t         j                  j                  |       j                  }| ;t	        t        j
                               D cg c]  }d| dt        ||        }}nJt	        | j                               D cg c](  }d| dt        t        j                  | |      |       * }}t        dt        t        t        t        z     |            S c c}w c c}w )Nrank:/r   dim
placements)distdistributed_c10d_get_pg_default_devicetyperangeget_world_sizer)   sizeget_global_rankr	   r   listr   str)r+   pg_device_typeidxr1   s       r(   _create_colwise_specr>   C   s     **AA"EJJN	z T0023
 C5*3?@A

 
 RWWY'
 C5*4+?+?C+H.YZ[

 
 ^c12J? 


s   C-Cvalc                    t        |       t        u rt        | j                               dk(  ryt        | j                         d   j                        t        u ryt        | j                         d   j                        t
        u rt        d      yt        |       t
        u rAt        | j                        t
        u st        | j                        t        u rt        d      y)Nr   FTz1Cannot handle DTensor nested inside ShardedTensorzCannot handle nested DTensor)r5   r   lenlocal_shardstensorr   
ValueError_local_tensor)r?   s    r(   _is_nested_tensorrF   W   s    CyM!s!"a'  "1%,,->  "1%,,-8PQQ
 	 
cg	S7*d33D3D.E.V788r*   propsr8   c                 P   |dk(  r2t        t        j                  t        |      j	                               }n-t        j                  |t        |      j	                               }t        j
                  || j                  | j                  | j                  | j                  |      S )Nr$   )r8   dtypelayoutrequires_grad
pin_memorydevice)
r   torchrM   r   current_deviceemptyrI   rJ   rK   rL   )rG   r8   r!   rM   s       r(   _alloc_tensorrQ   f   s     eell$6{$C$R$R$TU+K8GGI
 ;;kk||))## r*   
state_dictc                    i }d}| j                         D ]  \  }}d|j                         f||<   t        |      s't        |j	                               dk(  st        d      t        |t              st        d      |j	                         d   }|j                  j                  |j                  j                  f||<   |j                  j                  } ||fS )a+  
    Load the right TP slice of the optimizer state.

    This is not easy since the per-tensor slicing can't be inferred from checkpoint metadata.
    We take advantage of the model state_dict producing a sliced ST to figure out what we need to load.
    This is pretty fragile and it might be easier for FSDP to compute this info for us.
    Returns a dictionary where keys are the same of the state_dict and the value is a tuple of
    (offset, size) for the current rank TP slice.
    N.B. The state_dict *MUST* come from FSDP.sharded_state_dict.
    N   z%Cannot handle ST with multiple shardsz$Can only handle nested ShardedTensorr   )itemsr8   rF   rA   rB   AssertionError
isinstancer   metadatashard_offsetsshard_sizesrC   _process_group)rR   specsdp_pgkeyvalueshards         r(   _get_state_dict_2d_layoutra   z   s     #%E&*E &&( 0
UEJJL)c
U#u))+,1$%LMMe]3$%KLL&&(+E,,**E#J LL//E0 	 r*   c                        e Zd ZU eeef   ed<   eed<   eed<   deee	e
   f   ddf fdZdefdZd	edej                  f fd
Z xZS )_ReaderWithOffsettranslationrR   rX   fqn_to_offsetr"   Nc                 l    t         |           || _        t        i       | _        i | _        i | _        y N)super__init__re   r   rX   rR   rd   )selfre   	__class__s     r(   ri   z_ReaderWithOffset.__init__   s0    * r*   c           	         g }i | _         | j                  j                         D ]  \  }}| j                  j                  |   }t        |t              s|t        |||      z  }A|| j                  vr|t        |||      z  }`| j                  |   }t        |j                               dk(  st        d      |j                         d   }t        t        j                  t        |j                  j                   |            t        j                  |j                  j"                              g}t%        |t'        t(        |      |      }|D ]  }	|	j*                  j,                  t        d      t/        |	j*                  j,                  |      }
t1        j2                  |	j*                  t        j                  |
            }|| j                   |	j*                  <    ||z  } t5        |      S )NrT   z Expected exactly one local shardr   )offsetssizesz"dest_index.offset must not be None)offset)rd   rR   rU   rX   state_dict_metadatarW   r   r   re   rA   rB   rV   r   rN   Sizer   rY   rZ   r   r   r   
dest_indexro   r   dataclassesreplacer   )rj   requestsfqnobjmdro   original_shardlocal_chunksreqsrioriginal_offsetoriginal_indexs               r(   create_local_planz#_ReaderWithOffset.create_local_plan   s   --/ &	HC2237Bc=1.sB<<$,,,.sB<<'',Fs'')*a/$%GHH --/2N$!JJ).*A*A*O*OQWX  **^%<%<%H%HI	L 4T/4lD
  A==''/()MNN"3BMM4H4H&"Q!,!4!4MM%**_*E" 3A  /A HM&	N !!r*   indexc                 V    t         |   | j                  j                  ||            S rg   )rh   lookup_tensorrd   get)rj   r   rk   s     r(   r   z_ReaderWithOffset.lookup_tensor   s&    w$T%5%5%9%9%%GHHr*   )__name__
__module____qualname__dictr   __annotations__r   r   r;   r   intri   r   r   rN   Tensorr   __classcell__)rk   s   @r(   rc   rc      sm    m]233d3+=&> 4 *"8 *"XI= IU\\ I Ir*   rc   model_state_dictoptimizer_keystorage_readerplannerc                 .   |j                         }t        |       \  }}t        j                  j	                  |      j
                  }t        |      }|fg }	t        t        j                               D ]6  }
t        ||
|j                         z        }|	j                  d|
 d|        8 t        d|	      }nt        |      }i }i }|j                  j                         D ]|  \  }}|j                   |   }|d   |k7  rt#        |t$              rd||<   5|j&                  j)                         dk(  r%t+        |j,                  |j&                  |      ||<   w|mt/        t+        |j,                  |j&                  |      t        j0                         t        j                         |j                         t3                     ||<   |d	   }|j5                  |d|j&                  f      d   }t7        |j,                  j8                  |j,                  j:                  |j,                  j<                  |j,                  j>                  |j,                  j@                  
      }|jC                  tE        jF                  |      |      }g }t        j0                  |      }|jH                  D ]i  }tK        tL        |jN                        jQ                         |k7  r/|j                  tS        t+        |j,                  |jT                  |      |             k tW        jX                  |||      }||v r(||   d    tK        tZ        t\           ||   d         ||<   |||<    t_        |||ta        |      n|       tc        ||j                         }|S )a  
    Load a state_dict in conjunction with FSDP sharded optimizer state.

    This is the current recommended way to checkpoint FSDP.
    >>> # xdoctest: +SKIP
    >>> import torch.distributed.checkpoint as dist_cp
    >>> # Save
    >>> model: torch.nn.Model
    >>> optim_params = model.parameters()
    >>> optim = torch.optim.SGD(optim_params, lr=0.01)
    >>> # Save
    >>> with FSDP.state_dict_type(model, StateDictType.SHARDED_STATE_DICT):
    >>>     state_dict = {
    >>>         "optimizer": FSDP.optim_state_dict(model, optim),
    >>>         "model": model.state_dict()
    >>>     }
    >>>     dist_cp.save_state_dict(
    >>>         state_dict=optim_state,
    >>>         storage_writer=dist_cp.FileSystemWriter("checkpoint"),
    >>>         planner=dist_cp.DefaultSavePlanner(),
    >>>     )
    >>>
    >>> # Load
    >>> with FSDP.state_dict_type(model_tp, StateDictType.SHARDED_STATE_DICT):
    >>>     model_state_dict = model_tp.state_dict()
    >>>     checkpoint = {
    >>>         "model": model_state_dict
    >>>     }
    >>>     dist_cp.load_state_dict(
    >>>         state_dict=checkpoint,
    >>>         storage_reader=dist_cp.FileSystemReader(checkpoint_file),
    >>>         planner=dist_cp.DefaultLoadPlanner(),
    >>>     )
    >>>     model.load_state_dict(checkpoint["model_state"])
    >>>
    >>>     optim_state = dist_cp.load_sharded_optimizer_state_dict(
    >>>         model_state_dict,
    >>>         optimizer_key="optimizer",
    >>>         storage_reader=dist_cp.FileSystemReader("checkpoint"),
    >>>     )
    >>>
    >>>     flattened_osd = FSDP.optim_state_dict_to_load(
    >>>        model, optim, optim_state["optimizer"]
    >>>     )
    >>>
    >>>     optim.load_state_dict(flattened_osd)
    Nr-   r.   r   r/   z
<bytes_io>rT   )rank
world_sizenum_devices_per_noder+      )rI   rJ   rK   memory_formatrL   )rC   rX   )process_group)rR   r   r   )2read_metadatara   r2   r3   r4   r5   r   r6   r7   r   r&   appendr	   r>   rp   rU   planner_datarW   r   r8   numelrQ   
propertiesr   get_rankr   r   ShardTensorPropertiesrI   rJ   rK   r   rL   build_metadatarN   rq   shards_metadatar   r   	placementr   r   rZ   r   +_init_from_local_shards_and_global_metadatar   r   r   rc   r
   )r   r   r   r   rX   layout_specsr]   dp_pg_device_typer'   r1   idevice_infosharding_specrR   re   r^   r_   key_pathspec_key
alloc_sizer   st_mdrB   current_rankshard_mdsts                             r(   r   r      sf   j ++-H34DEL%--DDUKPP&'89M}
t**,- 	9A0!1}'A'A'C#CK aS+78		9
 *aJG,U3 #%J.0M2288: 8!
U((-A;-'e12*JsO ::"+  %**.?JsO ]:e..

<MN]]_..0%2%?%?%A%'JsO  {H%))(T5::4FGJJ.&&,,''..#..<<#..<< ++66J "00J1GTEL==/L!11 
(:(:;@@BlR##,!,,h.B.BDU  "*	
 JJe5B <'L,B1,E,Q%)(3-h9OPQ9R%Sc" JsOq8!v %494E!-07	 &j(2G2GHJr*   )cudarg   )Ers   collections.abcr   typingr   rN   torch.distributeddistributedr2   torch._utilsr   +torch.distributed._shard.sharded_tensor.apir   0torch.distributed._shard.sharded_tensor.metadatar   r   -torch.distributed._shard.sharded_tensor.shardr   :torch.distributed._shard.sharding_spec.chunk_sharding_specr	   )torch.distributed.checkpoint._nested_dictr
   ,torch.distributed.checkpoint.default_plannerr   %torch.distributed.checkpoint.metadatar   r   r   r   r   r   $torch.distributed.checkpoint.plannerr   r   ,torch.distributed.checkpoint.planner_helpersr   r   .torch.distributed.checkpoint.state_dict_loaderr   $torch.distributed.checkpoint.storager   "torch.distributed.checkpoint.utilsr   r   r   "torch.distributed.distributed_c10dr   #torch.distributed.fsdp._shard_utilsr   torch.distributed.remote_devicer   torch.distributed.tensorr   r   r;   tupler   STATE_DICT_2D_LAYOUT__all__r)   ProcessGroupr>   r   boolrF   rQ   ra   rc   r    r*   r(   <module>r      s    $     + E @ X J K   G K > 
 B L : , Cx}t';Xc]'J!KKL 
 (
# C S  $(D (5<< D   FL#+C=?B
\\(  
!2!2T!99: F:I* :IB #'	N%NN "N 4	N
 Nr*   