
    9jG                        d dl Z d dlZd dlmZ d dlmZ d dlmZmZ dddZ	d Z
ej                  fdZd ej                  fdZd ej                  fd	Zej                  ej                  fd
Zej                  ej                  fdZej                  fdZej                  fdZej                  fdZddej                  fdZej                  ej                  fdZ G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d d e      Z G d! d"e      Z G d# d$e      Z y)%    N)Function)groupReduceOp
suggestionc                :    d|  d}|r	|d| dz  }t        |      )N torch.distributed.nn.functional.z& is not supported under torch.compile.z Use 	 instead.)RuntimeError)namer   msgs      _/media/conek/DATA/Code/OCR/venv/lib/python3.12/site-packages/torch/distributed/nn/functional.py_not_supported_under_compiler      s7    
*4&0VW  zl),,
s
    c                 J    t        j                  d|  d| dt        d       y )Nr	   z is deprecated, use r
      )category
stacklevel)warningswarnFutureWarning)r   r   s     r   _deprecatedr      s,    MM
*4& 1l)	%	r   c                     t         j                  j                         rt        dd       t	        dd       t
        j                  |||       S )a  
    Broadcasts the tensor to the whole group.

    ``tensor`` must have the same number of elements in all processes
    participating in the collective.

    Arguments:
        tensor (Tensor): Data to be sent if ``src`` is the rank of current
            process.
        src (int): Source rank.
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        Tensor: Received tensor from the broadcast op.

    	broadcastz3torch.distributed._functional_collectives.broadcastr   )torchcompileris_compilingr   r   
_Broadcastapply)tensorsrcr   s      r   r   r       sD    " ~~""$$L	
 RSC//r   c                     t         j                  j                         rt        d       t        j                  |||       S )aT  
    Gathers a list of tensors in a single process.

    Arguments:
        tensor (Tensor): Input tensor.
        dst (int, optional): Destination rank (default is 0).
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        tuple[Tensor]: List of appropriately-sized tensors with the gathered data.
    gather)r   r   r   r   _Gatherr   )r    dstr   s      r   r#   r#   :   s/     ~~""$$X.==eV,,r   c                     t         j                  j                         rt        d       t	        j
                  ||g|  S )a  
    Scatters a list of tensors to all processes in a group.

    Each process will receive exactly one tensor and store its data in the
    ``tensor`` argument.

    Arguments:
        tensors (list[Tensor]): List of tensors to scatter on the source rank.
            Receivers must pass ``None`.
        src (int, optional): Source rank (default is 0).
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        Tensor: Output tensor from the scatter operation.

    scatter)r   r   r   r   _Scatterr   )tensorsr!   r   s      r   r'   r'   K   s2    " ~~""$$Y/>>#u/w//r   c                     t         j                  j                         rt        d       t        j                  ||||       S )a  
    Reduces the tensor data across all machines.

    Only the process with rank ``dst`` is going to receive the final result.

    Arguments:
        tensor (Tensor): Input of the collective.
        dst (int): Destination rank.
        op (optional): One of the values from
            ``torch.distributed.ReduceOp``
            enum.  Specifies an operation used for element-wise reductions.
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        Tensor: Output of the collective.

    reduce)r   r   r   r   _Reducer   )r    r%   opr   s       r   r+   r+   a   s1    $ ~~""$$X.==b%00r   c                     t         j                  j                         rt        dd       t	        dd       t        j                  ||| g| S )a  
    Reduces, then scatters a list of tensors to all processes in a group.

    Arguments:
        output (Tensor): Output tensor.
        input_list (list[Tensor]): List of tensors to reduce and scatter.
        op (optional): One of the values from
            ``torch.distributed.ReduceOp``
            enum.  Specifies an operation used for element-wise reductions.
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        Tensor: Output of the collective.

    reduce_scatterz?torch.distributed._functional_collectives.reduce_scatter_tensorr   )r   r   r   r   r   _Reduce_Scatterr   )output
input_listr-   r   s       r   r/   r/   x   sN      ~~""$$X	
 I   UF@Z@@r   c                     t         j                  j                         rt        dd       t	        dd       t
        j                  ||       S )a  
    Gathers tensors from the whole group in a list.

    Arguments:
        tensor (Tensor): Tensor to be broadcast from current process.
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        tuple([Tensor]): Output of the collective.

    
all_gatherz;torch.distributed._functional_collectives.all_gather_tensorr   )r   r   r   r   r   
_AllGatherr   )r    r   s     r   r4   r4      sG     ~~""$$T	
 S E6**r   c                     t         j                  j                         rt        d       t        j                  | ||      S )a  
    Single tensor all gather. Gathers a single tensor from all ranks, and puts them in a single output tensor.

    Args:
        output_tensor (Tensor): Output tensor. It should contain
            correctly-sized tensors to be used for output of the collective.
        input_tensor (Tensor): Tensor to be broadcast from current process.
        group (ProcessGroup, optional): The process group to work on. If None,
            the default process group will be used.

    Examples:
        >>> # All tensors below are of torch.int64 dtype.
        >>> # We have 2 process groups, 2 ranks.
        >>> # xdoctest: +SKIP("incorrect want text")
        >>> output_tensor = torch.zeros(2, dtype=torch.int64)
        >>> output_tensor
        [tensor([0, 0])] # Rank 0 and 1
        >>> tensor = torch.arange(1, dtype=torch.int64) + 1 + rank
        >>> tensor
        tensor([1]) # Rank 0
        tensor([2]) # Rank 1
        >>> dist.all_gather_base(output_tensor, tensor)
        >>> output_tensor
        tensor([1,2]) # Rank 0
        tensor([1,2]) # Rank 1

    .. warning::
        `_all_gather_base` is experimental and subject to change.
        It is the caller's responsibility to ensure the output_tensor
        is correctly sized.

    _all_gather_base)r   r   r   r   _AllGatherBaser   )output_tensorinput_tensorr   s      r   r7   r7      s3    B ~~""$$%78|UCCr   c                     t         j                  j                         rt        d       t	        j
                  || g| S )a  
    Each process scatters list of input tensors to all processes in a group and return gathered list of tensors in output list.

    Arguments:
        output_tensor_list (list[Tensor]): list of tensors to gather one per rank.
        input_tensor_list (list[Tensor]): List of tensors to scatter one per rank.
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        tuple([Tensor]): Output of the collective.

    
all_to_all)r   r   r   r   	_AlltoAllr   )output_tensor_listinput_tensor_listr   s      r   r<   r<      s4     ~~""$$\2??5"4I7HIIr   c                     t         j                  j                         rt        dd       t	        dd       t
        j                  || |||      S )a  
    Each process splits input tensor and then scatters the split list to all processes in a group.

    Then concatenate the received tensors from all the processes in the group and return single output tensor.

    Arguments:
        output (Tensor): Gathered concatenated output tensor.
        input (Tensor): Input tensor to scatter.
        output_split_sizes: (list[Int], optional): Output split sizes for dim 0
            if specified None or empty, dim 0 of ``output`` tensor must divide
            equally by ``world_size``.
        input_split_sizes: (list[Int], optional): Input split sizes for dim 0
            if specified None or empty, dim 0 of ``input`` tensor must divide
            equally by ``world_size``.

    Returns:
        Tensor: Output of the collective.

    all_to_all_singlez;torch.distributed._functional_collectives.all_to_all_singler   )r   r   r   r   r   _AlltoAllSingler   )r1   inputoutput_split_sizesinput_split_sizesr   s        r   rA   rA      sT    4 ~~""$$T	
 E   v)+<e r   c                     t         j                  j                         rt        dd       t	        dd       t
        j                  |||       S )a&  
    Reduces the tensor data across all machines in such a way that all get the final result.

    After the call the returned tensor is going to be bitwise
    identical in all processes.

    Arguments:
        tensor (Tensor): Input of the collective.
        op (optional): One of the values from
            ``torch.distributed.ReduceOp``
            enum.  Specifies an operation used for element-wise reductions.
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        Tensor: Output of the collective

    
all_reducez4torch.distributed._functional_collectives.all_reducer   )r   r   r   r   r   
_AllReducer   )r    r-   r   s      r   rG   rG     sD    $ ~~""$$M	
 TUBv..r   c                   ,    e Zd Zed        Zed        Zy)r   c                     || _         || _        t        j                  |      | _        |j                         }t        j                  |||       |S Nr   )r!   r   distget_rankrankcloner   )ctxr!   r   r    s       r   forwardz_Broadcast.forward'  sD     	==u- vs%0r   c                     t         j                  | j                  t        j                  | j
                  |      }| j                  | j                  k7  r|j                          d d |fS N)r,   r   r!   r   SUMr   rO   zero_)rQ   grad_outputgxs      r   backwardz_Broadcast.backward3  sJ     ]]377HLL#))[I77chhHHJdBr   N__name__
__module____qualname__staticmethodrR   rY    r   r   r   r   &  s(         r   r   c                   ,    e Zd Zed        Zed        Zy)r$   c                    || _         || _        t        t        j                  |            D cg c]  }t        j                  |       }}|j                         }t        j                  |      |k(  r$t        j                  ||||       t        |      S t        j                  |d ||       t        |      S c c}w rK   )r%   r   rangerM   get_world_sizer   
zeros_like
contiguousrN   r#   tuple)rQ   r%   r   r    itensor_lists         r   rR   z_Gather.forward=  s     	 /4D4G4Ge4T.U
)*EV$
 
 ""$==u%,KKS> [!! KKc7[!!
s   B?c                 `    dt        j                  | j                  | j                  g| fz   S NNN)r(   r   r%   r   )rQ   grad_outputss     r   rY   z_Gather.backwardQ  s(    x~~cggsyyP<PRRRr   NrZ   r_   r   r   r$   r$   <  s*    " "$ S Sr   r$   c                   ,    e Zd Zed        Zed        Zy)r(   c                 4   || _         || _        t        fdD              st        t	        j
                  d         }t        j                  |      |k(  r$t        j                  |t              ||       |S t        j                  |d ||       |S )Nc              3   f   K   | ](  }|j                         d    j                         k(   * yw)r   N)size).0tr)   s     r   	<genexpr>z#_Scatter.forward.<locals>.<genexpr>\  s'     BQ1668wqz00Bs   .1r   rL   )
r!   r   allAssertionErrorr   rd   rM   rN   r'   list)rQ   r!   r   r)   r1   s      ` r   rR   z_Scatter.forwardW  s     	B'BB  !!'!*-==u%,LLg5A  LLs%8r   c                 ^    dt         j                  | j                  | j                  |      z   S rj   )r$   r   r!   r   rQ   rW   s     r   rY   z_Scatter.backwarde  s$     gmmCGGSYYLLLr   NrZ   r_   r   r   r(   r(   V  s*    
 
 M Mr   r(   c                   ,    e Zd Zed        Zed        Zy)r,   c                 t    || _         || _        |j                         }t        j                  ||||       |S )Nr-   r   )r!   r   rP   rM   r+   )rQ   r!   r-   r   r    s        r   rR   z_Reduce.forwardl  s4     	FCBe4r   c                 `    dt         j                  | j                  | j                  |      fz   S N)NNN)r   r   r!   r   rx   s     r   rY   z_Reduce.backwardu  s)     "Z%5%5cggsyy+%V$XXXr   NrZ   r_   r   r   r,   r,   k  s*      Y Yr   r,   c                   ,    e Zd Zed        Zed        Zy)r0   c                     || _         |j                         }t        d |D              }t        j                  |t        |      ||       |S )Nc              3   <   K   | ]  }|j                           y wrT   re   rq   rr   s     r   rs   z*_Reduce_Scatter.forward.<locals>.<genexpr>  s     !LQ!,,.!L   r{   )r   re   rf   rM   r/   rv   )rQ   r-   r   r    r?   s        r   rR   z_Reduce_Scatter.forward|  sJ     	""$!!L:K!LLFD):$;%Pr   c                 H    dt         j                  | j                  |      z   S r}   )r5   r   r   rx   s     r   rY   z_Reduce_Scatter.backward  s      "J$4$4SYY$LLLr   NrZ   r_   r   r   r0   r0   {  s*      M Mr   r0   c                   ,    e Zd Zed        Zed        Zy)r5   c                     |j                         }|| _        t        t        j                  |            D cg c]  }t        j                  |       }}t        j                  |||       t        |      S c c}w rK   )	re   r   rb   rM   rc   r   
empty_liker4   rf   )rQ   r   r    _out_tensor_lists        r   rR   z_AllGather.forward  sp     ""$	.3D4G4Ge4T.U
)*EV$
 
 	u=_%%
s   A:c                 n   t        j                  | j                        t         j                  j                  t         j                  j
                  fv rlt        j                  | j                        }t        j                  ||         }t        j                  t        j                  | j                  |g| }d |fS |D cg c]  }t        j                  |       }}t        j                  | j                  |g| }t        j                  t        j                  |      d      }d |fS c c}w )NrL   r   )dim)rM   get_backendr   BackendNCCLXCCLrN   r   r   r0   r   r   rU   r=   sumstack)rQ   rl   rO   rX   r    rh   gxss          r   rY   z_AllGather.backward  s    #)),1B1BDLLDUDU0VV==syy1D!!,t"45B &&x||SYYR\RB bz COO5++F3OKO//#))[H<HC5;;s+3Bbz Ps   D2NrZ   r_   r   r   r5   r5     s(    
& 
&  r   r5   c                   ,    e Zd Zed        Zed        Zy)r8   c                 `    || _         t        j                  ||j                         |       |S rK   )r   rM   r7   re   )rQ   r9   r:   r   s       r   rR   z_AllGatherBase.forward  s,     	m\-D-D-FeTr   c                    t        j                  | j                        t         j                  j                  t         j                  j
                  fv rt        j                  | j                        }t        |j                               }|d   |z  dk7  rt        d| d|       |d   t        j                  | j                        z  |d<   t        j                  ||j                  |j                        }t        j                  ||t        j                   | j                         nt        d      d |d fS )NrL   r   zTensor with dimensions: z8 does not have first dimension divisible by world_size: devicedtypezBackend not supported!)rM   r   r   r   r   r   rc   rv   rp   r   r   emptyr   r   _reduce_scatter_baser   rU   )rQ   rW   
world_sizeout_sizerX   s        r   rY   z_AllGatherBase.backward  s    #)),1B1BDLLDUDU0VV,,399=JK,,./H{Z'1,".xj 9IISV  #1+)<)<399)MMHQK!3!3;;L;LB %%b+x||SYYO788b$r   NrZ   r_   r   r   r8   r8     s(     
    r   r8   c                   ,    e Zd Zed        Zed        Zy)r=   c                 b   || _         t        t        j                  |            D cg c]  }||   j	                          c}| _        t        j                  |      }t        d |D              }t        j                  |      t        j                  j                  u r]t        t        j                  |            D ]0  }d }||k(  rt        |      }t        j                  ||   |||       2 t        |      S t        j                  |t        |      |       t        |      S c c}w )NrL   c              3   <   K   | ]  }|j                           y wrT   r   r   s     r   rs   z$_AlltoAll.forward.<locals>.<genexpr>  s     818r   )r   rb   rM   rc   rp   input_tensor_size_listrN   rf   r   r   GLOOrv   r'   r<   )rQ   r   r   r)   rg   my_rankto_sends          r   rR   z_AlltoAll.forward  s    	',T-@-@u-M'N&
"#GAJOO&
" --e,888%(DLL,=,==4..U;< J<"7mG_Q/!5I	J _%% OOW
 _%%%&
s   D,c           	          | j                   D cg c]4  }t        j                  ||d   j                  |d   j                        6 }}dt        j                  | j                  |g| z   S c c}w )Nr   r   rk   )r   r   r   r   r   r=   r   r   )rQ   rl   rp   rh   s       r   rY   z_AlltoAll.backward  ss     22	
  KK\!_33<?;P;P
 
 ioociiT|TTT
s   9A.NrZ   r_   r   r   r=   r=     s*    & &, U Ur   r=   c                   ,    e Zd Zed        Zed        Zy)rB   c                     || _         |j                         | _        || _        || _        t        j                  |||||       |S )N)rD   rE   r   )r   rp   
input_sizerD   rE   rM   rA   )rQ   r   r1   rD   rE   rC   s         r   rR   z_AlltoAllSingle.forward  sL     	!2 21/	
 r   c           	          t        j                  | j                  |j                  |j                        }dt
        j                  | j                  || j                  | j                  |j                               fz   S )Nr   )NNNN)r   r   r   r   r   rB   r   r   rD   rE   re   )rQ   rW   r    s      r   rY   z_AlltoAllSingle.backward  ss     NN;#5#5[=N=N
 (!!		&&%%&&(+
 
 	
r   NrZ   r_   r   r   rB   rB     s(      
 
r   rB   c                   ,    e Zd Zed        Zed        Zy)rH   c                     || _         || _        |j                  t        j                        }t        j                  |||       |S )N)memory_formatr{   )r   r-   rP   r   contiguous_formatrM   rG   )rQ   r-   r   r    s       r   rR   z_AllReduce.forward  s=     	E,C,CD2U3r   c                 `    dt         j                  | j                  | j                  |      fz   S rj   )rH   r   r-   r   rx   s     r   rY   z_AllReduce.backward  s)     z//		;OQQQr   NrZ   r_   r   r   rH   rH     s*      R Rr   rH   )!r   r   torch.distributeddistributedrM   torch.autogradr   r   r   r   r   WORLDr   r#   r'   rU   r+   r/   r4   r7   r<   rA   rG   r   r$   r(   r,   r0   r5   r8   r=   rB   rH   r_   r   r   <module>r      sY       #
 . 6:  "' 04  -" %++ 0, $<<u{{ 1. +3,,ekk A8 #[[ +. 9> #DL =BKK J* 
++%P #,,ekk /6   ,Sh S4Mx M*Yh Y Mh M" < X  :!U !UH
h 
DR Rr   