
    9jF              	          d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlZd dl	Z	d dl
mc mZ d dlmZ d dlmZ ddlmZ ddlmZmZmZ dd	lmZ  ej2                  e      Z G d
 de      Z G d de      Ze j<                  defd       Zde defdZ!dejD                  defdZ#d6de	jH                  de%de%fdZ&d6de	jH                  de%de%fdZ'dejD                  de%fdZ(dejD                  de%fdZ) G d de      Z* G d de      Z+ G d de      Z,d gd!ggZ-d"gd"ggd#gd#ggd$gd%gggZ.g d&g d'g d(g d)gZ/de0dz  fd*Z1d+e%d,e%d-ede0fd.Z2dejD                  de0fd/Z3d0e	jh                  jj                  de%fd1Z6d0e	jh                  jj                  de%fd2Z7	 	 d7d0e	jh                  jj                  d3e%dz  d4e8de0fd5Z9y)8    N)IntEnum)Any)optimization_hint)normalize_function   )ir)get_dtype_sizesnode_args_kwargssympy_product)Vc                   $    e Zd ZdZdZdZdZdZdZy)	NCCL_COLLr   r               N)	__name__
__module____qualname__
ALL_REDUCE
ALL_GATHERREDUCE_SCATTER
ALL_TO_ALLUNSUPPORTEDP2P     ]/media/conek/DATA/Code/OCR/venv/lib/python3.12/site-packages/torch/_inductor/comm_analysis.pyr   r      s     JJNJK
Cr   r   c                       e Zd ZdZdZdZdZy)NVIDIA_GPU_TYPEr   r   r   r   N)r   r   r   VOLTAAMPEREHOPPER	BLACKWELLr   r   r   r    r        s    EFFIr   r    returnc                      t         j                  j                  j                  t         j                  j                  j                        xs d d v rt
        j                  S d v rt
        j                  S d v rt
        j                  S t         fddD              rt
        j                  S t
        j                  S )N V100A100H100c              3   &   K   | ]  }|v  
 y wNr   ).0gpugpu_infos     r   	<genexpr>zget_gpu_type.<locals>.<genexpr>0   s     ASH_A   )B100B200B300)torchutilscollect_envget_gpu_inforunr    r!   r"   r#   anyr$   )r/   s   @r   get_gpu_typer;   '   s    {{&&33EKK4K4K4O4OPVTVH$$$	8	%%%	8	%%%	A(@A	A((( %%%r   kernel_namec                 4     J d v rt         j                  S d v rt         j                  S d v rt         j                  S t	         fddD              rt         j
                  S t	         fddD              rt         j                  S t         j                  S )N
all_reduce
all_gatherreduce_scatterc              3   &   K   | ]  }|v  
 y wr,   r   r-   commr<   s     r   r0   z7get_collective_type_from_kernel_name.<locals>.<genexpr>?   s     HTT[ Hr1   )
all_to_allalltoallc              3   &   K   | ]  }|v  
 y wr,   r   rB   s     r   r0   z7get_collective_type_from_kernel_name.<locals>.<genexpr>A   s     MTT[ Mr1   )isendirecv	batch_p2p)r   r   r   r   r:   r   r   r   )r<   s   `r   $get_collective_type_from_kernel_namerJ   7   s    """{"###		$###	[	('''	H-GH	H###	M-LM	M}}$$$r   nodec                     t        | t        j                        st        d|        | j                  }|J t        |      S )Nz!node is not a collective kernel: )
isinstancer   _CollectiveKernel
ValueErrorpython_kernel_namerJ   )rK   names     r   get_collective_typerR   G   sG    dB001<TFCDD""D/55r   sizefallbackc                     t        |       }t        |t        j                        rt	        |      S t
        j                  j                  j                  ||      S )NrT   )	r   rM   sympyIntegerintr   graphsizevarsr   )rS   rT   numels      r   get_ir_node_size_numelr]   P   sD    $E%'5z77--eh-GGr   c                 j    t        j                  t        j                  | d      }t	        ||      }|S )Nr   rV   )	functoolsreduceoperatormulr   )rS   rT   r\   results       r   get_fx_node_size_numelrd   W   s+    X\\43Eux8FMr   c                     d}| j                   D ]F  }t        |j                  j                        }||t	        |j                  j
                        z  z  }H |S )Nr   )inputsr]   layoutrS   r	   dtype)rK   sz_bytesinpr\   s       r   get_collective_input_size_bytesrk   ]   sQ    H{{ =&szz7EN3::+;+;<<<= Or   c                     t        | t        j                        r5t        | t        j                        sddlm}  || j                  d         S t        d|        )Nr   _get_group_size_by_namezUnsupported collective type: )rM   r   rN   _WaitKernel"torch.distributed.distributed_c10drn   constant_args	TypeError)rK   rn   s     r   get_collective_group_sizert   e   sK    $,,-jr~~6VN&t'9'9"'=>>7v>??r   c                       e Zd ZdZdZdZy)NCCL_HWr   r   r   N)r   r   r   NVLINKPCINETr   r   r   rv   rv   s   s    F
C
Cr   rv   c                       e Zd ZdZdZy)	NCCL_ALGOr   r   N)r   r   r   TREERINGr   r   r   r{   r{   y   s    DDr   r{   c                       e Zd ZdZy)
NCCL_PROTOr   N)r   r   r   LLr   r   r   r   r   ~   s	     
Br   r   g333333@gffffff@g333333?      ?g      @g@)     C@r   gffffff4@)gU@g     6@g      3@)g     a@g     F@g     A@)g     q@g     V@g     Q@c                 H   | j                   }|J t        |dd      }|j                  d   }ddlm}  ||      }t
        j                  j                  |      }t        j                  d|       }t        |      }t        |       \  }	}
d|v r|	dd  |	d   z   }	t
        j                  j                  ||	      5 } ||	i |
}t
        j                  j                  j                  j                  |       d d d        j                   }|dk  ry |d
z  }|S # 1 sw Y   "xY w)NrP   r'   ro   r   )_resolve_process_groupzcuda:all_gather_into_tensor_outr   groupdevice     @@)rK   getattrrr   rq   r   r5   distributedget_rankr   evalr
   _time_estimatorops_c10d_functionalwait_tensordefaultestimated_time)snodekernelpy_kernel_namepg_namer   pgrankr   fnargskwargstime_estimatorwest_time_usest_time_mss                  r   /estimate_nccl_collective_runtime_nccl_estimatorr      s+   ZZFV%92>N""2&GI		(B!!**2.D \\E$.)F	n	B$U+LD& $~5ABx$q'!				*	*F	*	C :~		""..66q9: !//K Q#K: :s   ;<DD!tensor_storage_size_bytes
group_sizecollc                    | dz  dz  dz  }d}t        j                  ||z        }|}|dk  ry|t        j                  k(  ryt        j
                  }t        j                  }t        j                  j                  j                  }	t        j                  j                  j                  }
t               }|dk  r|dz
  nd}|dk(  r|nd}t        |   |   }|dk(  r|	n|
}d}||z  }t        |||dkD  s|t        j                   k(  rdndz        }|t        j                   k(  r	d|dz
  z  }nY|t        j"                  k(  r	d|dz
  z  }n=|t        j$                  t        j&                  fv r|dz
  }n|t        j(                  k(  rd}d|z  z  }||z  }|d	z  }t*        j,                  }|t        j                   k(  r|dkD  rd|z  }nVd}nS|t        j$                  t        j&                  t        j"                  fv r|dz
  }n|t        j(                  k(  r	|dkD  rdnd}t.        |   |   }t0        |   |   |   }t0        t*        j2                     |   |   }d
}|dkD  rd}t5        ||      }||z
  |z  ||z  z   z  }|dz  }||z  }||z   }|dz  }|S )a:  
    Returns estimated NCCL collective runtime in milliseconds (ms).

    The following heuristics are copied from https://github.com/NVIDIA/nccl/blob/master/src/graph/tuning.cc.
    We aim to estimate the runtime as accurately as possible.

    Assumptions:
    - only ring algorithm (NCCL_ALGO_RING) is used
    - only Low-Latency protocol (NCCL_PROTO_LL) is used, i.e. Simple or LL128 is not used
    - 8 gpus per node  # TODO: Need to find a way to get accurate "gpus per node" and "# nodes" info.
    - collective is one of: allreduce, reducescatter, allgather
    i      r   r   r   g      ?gUUUUUU?r   g    eAg        r   g    .A)mathceilr   r   r{   r}   r   r   r5   	_inductorconfigintra_node_bwinter_node_bwr;   llMaxBwsminr   r   r   r   r   rv   rw   baseLathwLatry   max) r   r   r   tensor_storage_size_GBnum_gpus_per_nodenNodesnRanks	nccl_algo
nccl_protobwIntrabwIntercompCapIndexindex2index1llMaxBwbw	nChannelsbusBwnstepsratio	bandwidthbandwidth_GB_per_nsintraHwnInterStepslatencyintraLatinterLatnetOverhead
latency_nstransport_nsnsmss                                    r   %estimate_nccl_collective_runtime_implr      s     7=DtK YYz$556FF{y$$$ IJ
 oo$$22Goo$$22G>L!Q;VaZAF#q[\aFvv&G aKWBINE !ty/C/C'C9)	UE y###fqj!	%%	%fqj!	)**I,@,@A	A!		 6\V#EI#c/ nnGy###A:f*KK	)**I,@,@)BVBVW	Wqj		!A:a1 i ,GW~i(4HW[[!),Z8H Kz8[)H$0;3IIIG3J *,??L	
	"B	cBIr   c                 ^    t        |       }t        |       }t        |       }t        |||      S )9  
    Returns estimated NCCL collective runtime in nanoseconds (ms).

    The following heuristics are copied from https://github.com/NVIDIA/nccl/blob/master/src/graph/tuning.cc.
    We aim to estimate the runtime as accurately as possible.

    Assumptions:
    - only ring algorithm (NCCL_ALGO_RING) is used
    - only Low-Latency protocol (NCCL_PROTO_LL) is used, i.e. Simple or LL128 is not used
    - 8 gpus per node  # TODO: Need to find a way to get accurate "gpus per node" and "# nodes" info.
    - collective is one of: allreduce, reducescatter, allgather
    )rk   rt   rR   r   )rK   r   r   r   s       r    estimate_nccl_collective_runtimer   ^  s8     !@ E*40Jt$D0!:t r   fx_nodec                 <   d| j                   | j                  }}t        |      }|j                  dd       dt        j
                  dt        fddt        j                  j                  ffd}t        j                  t        j                  j                  |||f       | j                  j                  dd      }|y	t        |t        t        f      rt!        fd
|D              }|z   S t        |t        j
                        r |      }|z   S y	)zSEstimate the size of a collective operation in bytes, including inputs and outputs.Nouttr%   c                 `    t        | j                               t        | j                        z  S r,   )rd   rS   r	   rh   )r   s    r   tensor_bytesz1estimate_fx_collective_size.<locals>.tensor_bytes}  s!    %affh/.2IIIr   rj   c                     | j                   j                  dd       }t        |t        j                        sy d |      z  y )Nvalr   )metagetrM   r5   Tensor)rj   inp_valinput_bytesr   s     r   add_inp_bytesz2estimate_fx_collective_size.<locals>.add_inp_bytes  sD    ((,,ud+'5<<0 K|G,,r   r   r   c              3   d   K   | ]'  }t        |t        j                        s |       ) y wr,   )rM   r5   r   )r-   r   r   s     r   r0   z.estimate_fx_collective_size.<locals>.<genexpr>  s'      
 !:a3NLO
s   00)r   r   dictpopr5   r   rY   fxNodepytreetree_map_onlyr   r   rM   listtuplesum)r   r   r   r   
output_valoutput_bytesr   r   s         @@r   estimate_fx_collective_sizer   s  s   K<<&D&\F JJudJ J J-588== - 	v !!%.Jj0 *tUm, 
%/
 
 %% 
J	-#J/ %% r   c                 B    ddl m} t        |       } ||       s|S |dz  S )zEstimate the memory footprint of a collective operation in bytes.

    This returns the total bytes that need to be live concurrently in memory.
    For all_reduce, we divide by 2 since it can be done in-place.
    r   )is_all_reduce_tensorr   )#torch._inductor.fx_passes.bucketingr   r   )r   is_all_reducerS   s      r   'estimate_fx_collective_memory_footprintr     s)     'w/D$W-4<419<r   override_sizeuse_nccl_estimatorc                 n   
 ddl m}  j                  t        j                  j
                  j                  j                  u rd}t               }n}t         j                  t              rJ t         j                   j                   j                  d      }|J |\  
d    |      }t         j                  t        j                  j                        sJ t!         j                  j#                               }dt$        dz  f
 fd	}|r |       }	|	|	S t'        |||      S )
r   r   rm   FNT)r   r   normalize_to_only_use_kwargs
group_namer%   c                  "   ddl m} m}  |       }t        j                  j
                  j                  |      |j                  k(  ry t        j                  d      }	 |j                  |      }|j                  sy t        j                  f      \  }}dt        j                  ffddt        dt        ffd|D cg c]
  } |       }}t        j                   ||      \  }}	j"                  }
t%        |
t        j&                  j(                        sJ t        j                  j+                  ||      5 } |
|i |	}t%        |t,        t.        f      r;|D ]5  }t        j0                  j2                  j4                  j7                  |       7 n3t        j0                  j2                  j4                  j7                  |       d d d        j8                  }|dk  ry |d	z  }|S # t        $ r Y y w xY wc c}w # 1 sw Y   6xY w)
Nr   )r   Backendcudar%   c                 >    t        j                  | ng||      S )N)rh   r   )r5   empty)rS   rh   r   r   s      r   _tensorzVestimate_nccl_collective_runtime_from_fx_node.<locals>._nccl_estimate.<locals>._tensor  s&    ;;%-M? r   ec                    t        | t        j                  j                        r | j                  d         S t        | t        j
                        r6 t        | j                               g| j                  | j                        S | S )Nr   )
rM   r5   r   r   r   r   rd   rS   rh   r   )r   r   to_real_tensors    r   r   z]estimate_nccl_collective_runtime_from_fx_node.<locals>._nccl_estimate.<locals>.to_real_tensor  sa    !UXX]]+%affUm44!U\\* 6qvvx @A177AHHUUHr   r   r   )rq   r   r   r5   r   distributed_c10dget_backendFAKEr   _get_backendRuntimeErrorsupports_time_estimater   tree_flattenr   r   tree_unflattentargetrM   _ops
OpOverloadr   r   r   r   r   r   r   r   )r   r   r   r   backend	flat_argsflat_args_pytree_speca	real_argsreal_kwargsr   r   r   r   r   r   r   r   r   r   r   r   r   s                   @@r   _nccl_estimatezEestimate_nccl_collective_runtime_from_fx_node.<locals>._nccl_estimate  s   V#J/--99"=Mf%	oof-G --+1+>+>f~+N(	(	ELL 		c 	c 	 1::1^A&:	:!'!6!6yBW!X	;^^"ejj33444..V / 
 		BI--A!dE]+ FAII..::BB1EF 		**66>>qA		B %33 ?!C'U  		* ;
		B 		Bs%   !G1 H BH1	G=<G=H)rq   rn   r	  r5   r   r   all_to_all_singler   r   rM   strr   r   r   r
  r  rJ   rQ   floatr   )r   r   r   rn   r   opt_args_kwargsr   r   r  r   r   r   r   s   ``        @@@r   -estimate_nccl_collective_runtime_from_fx_noder    s#   " K~~33EEMMM #$?$H!$1!'..#...(\\~~%)	O &&&"LD&%J(4Jgnnejj&;&;<<</0C0C0EFD6EDL 6 6p $&"0!:t r   )i   )NT):r_   loggingr   ra   enumr   typingr   rW   r5   torch.utils._pytreer6   _pytreer   %torch.fx.experimental.symbolic_shapesr   torch.fx.operator_schemasr   r'   r   r	   r
   r   virtualizedr   	getLoggerr   logr   r    	lru_cacher;   r  rJ   IRNoderR   SizerY   r]   rd   rk   rt   rv   r{   r   r   r   r   r  r   r   r   r   r   r   r   boolr  r   r   r   <module>r&     s}           $ $ C 8  C C  g! g  &o & &%c %i % 6bii 6I 6H Hs HS H s S "))  @BII @# @g  
  	
 		  
	 
	 
		,)8edl Bn"n03n;Dn
nl299  *,& ,&3 ,&^=UXX]] =s =  !%#kXX]]k:k k 	kr   