
    9j                     0	   U d dl mZ d dlZd dlZd dlmZmZmZ ddlm	Z	 d dl
mZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlZddgZ ed      Z ed      Z ej<                  e      Z 	 d dl!m"Z# ejN                  jP                  Z(d Z)i Z*e+eef   e,d<   d Z-dDdeeeef   geeef   f   fdZ. e.e(j^                        ddde0fd       Z1 e.e(jd                        dEde0fd       Z3 e.e(jh                        dEde0fd       Z5 e.e(jl                        dEde0fd       Z7 e.e(jp                        	 	 	 	 	 dFde0fd        Z9	 dDd!e:e0   d"e:e0   d#e:e0   d$e;de0f
d%Z< e.e(jz                  e(j|                  e(j~                  e(j                  e(j                  g      ddde0fd&       ZB e.e(j                        de0fd'       ZDd( ZE e.e(j                  e(j                  e(j                  g      ddde0fd)       ZId* ZJdd+deeKeKe0d,f   eKe0d,f   eKe0d,f   eKe0d,f   dz  f      fd-ZLdd+deeKeKe0d,f   eKe0d,f   eKe0d,f   eKe0d,f   dz  f      fd.ZM e.e(j                  d/0      ddde0fd1       ZO e.e(j                  d/0      de0fd2       ZQd3 ZR e.e(j                  e(j                  e(j                  g      ddde0fd4       ZV e.e(j                  d/0      de0fd5       ZX e.e(j                  d/0      de0fd6       ZZdd7de0fd8Z[dd7de0fd9Z\dd7de0fd:Z]i e(j^                  e1e(jd                  e3e(jh                  e5e(jl                  e7e(jp                  e9e(jz                  eBe(j|                  eBe(j~                  eBe(j                  eBe(j                  eBe(j                  eDe(j                  eIe(j                  eIe(j                  eIe(j                  eVe(j                  eVe(j                  eVe(j                  eOe(j                  eQe(j                  eXe(j                  eZiZ*d; Z^g d<Z_d= Z`d> Zadebfd?Zcd@ Zd G dA d      Ze G dB dCe      Zfy# e$$ r&  e%d dD              re jM                  d       eZ#Y w xY w)G    )NoneTypeN)tree_maptree_flattentree_unflatten   )ModuleTracker)AnyTypeVar)Callable)Iterator)	ParamSpec)defaultdict)TorchDispatchModeprodwrapsFlopCounterModeregister_flop_formula_T_PJITFunctionc              #   V   K   | ]!  }t        t        j                  |d       d u # y wN)getattrtorchversion).0attrs     X/media/conek/DATA/Code/OCR/venv/lib/python3.12/site-packages/torch/utils/flop_counter.py	<genexpr>r"      s"     
]d75==$-T9
]s   '))cudahipxpuz@triton not found; flop counting will not work for triton kernelsc                 R    t        | t        j                        r| j                  S | S r   )
isinstancer   Tensorshape)is    r!   	get_shaper+   #   s    !U\\"wwH    flop_registryc                 4     t               d d fd
       }|S )Nout_valc                 F    t        t        ||| f      \  }}} |d|i|S )N	out_shape)r   r+   )r0   argskwargsr2   fs       r!   nfzshape_wrapper.<locals>.nf+   s2    "*9tVW6M"Nfi$6)6v66r,   r   r5   r6   s   ` r!   shape_wrapperr8   *   s#    
1X 7 7 Ir,   returnc                 d     dt         t        t        f   dt         t        t        f   f fd}|S )Nflop_formular9   c                      st                d fd}t        j                  j                  j	                  |        S )Nc                     t        | t        j                  j                  t        f      st        d|  dt        |              | t        v rt        d|        t        | <   y )Nz|register_flop_formula(targets): expected each target to be OpOverloadPacket (i.e. torch.ops.mylib.foo), or JitFunction, got z which is of type zduplicate registrations for )	r'   r   _opsOpOverloadPacket_JITFunction
ValueErrortyper-   RuntimeError)targetr;   s    r!   registerz=register_flop_formula.<locals>.register_fun.<locals>.register7   sm    v

(C(C\'RS #H$6tF|nFG G &"%A&#JKK$0M&!r,   )r9   N)r8   r   utils_pytree	tree_map_)r;   rE   get_rawtargetss   ` r!   register_funz+register_flop_formula.<locals>.register_fun3   s7    (6L	1 	%%h8r,   )r   r   r   )rJ   rI   rK   s   `` r!   r   r   1   s0    8BF#3 R8H & r,   )r2   c                X    | \  }}|\  }}||k7  rt        d| d|       ||z  dz  |z  S )zCount flops for matmul.z3matmul: inner dimensions must match (k == k2), got  and    AssertionError)	a_shapeb_shaper2   r3   r4   mkk2ns	            r!   mm_floprW   H   sM    
 DAqEBBwRSTRUUZ[]Z^_``q519q=r,   c                     t        ||      S )zCount flops for addmm.rW   
self_shaperQ   rR   r2   r4   s        r!   
addmm_flopr\   T   s     7G$$r,   c                     | \  }}}|\  }}}	||k7  rt        d| d|       ||k7  rt        d| d|       ||z  |	z  dz  |z  }
|
S )z"Count flops for the bmm operation.z0bmm: batch dimensions must match (b == b2), got rM   z0bmm: inner dimensions must match (k == k2), got rN   rO   )rQ   rR   r2   r4   brS   rT   b2rU   rV   flops              r!   bmm_flopra   Y   s}    
 GAq!IBABwOPQsRWXZW[\]]BwOPQsRWXZW[\]]q519q=1DKr,   c                     t        ||      S )z&Count flops for the baddbmm operation.)ra   rZ   s        r!   baddbmm_floprc   h   s    
 GW%%r,   c	                     t        | |      S )zCount flops for _scaled_mm.rY   )
rQ   rR   scale_a_shapescale_b_shape
bias_shapescale_result_shape	out_dtypeuse_fast_accumr2   r4   s
             r!   _scaled_mm_floprk   o   s     7G$$r,   x_shapew_shaper2   
transposedc                 t    | d   }|r| n|dd }|^}}}	 t        |      t        |      z  |z  |z  |z  dz  }	|	S )a  Count flops for convolution.

    Note only multiplication is
    counted. Computation for bias are ignored.
    Flops for a transposed convolution are calculated as
    flops = (x_shape[2:] * prod(w_shape) * batch_size).
    Args:
        x_shape (list(int)): The input shape before convolution.
        w_shape (list(int)): The filter shape.
        out_shape (list(int)): The output shape after convolution.
        transposed (bool): is the convolution transposed
    Returns:
        int: the number of flops
    r   rN   Nr   )
rl   rm   r2   rn   
batch_size
conv_shapec_outc_infilter_sizer`   s
             r!   conv_flop_countru      s]    ( J''Y;J 'E4+ 
d;//*<uDtKaODKr,   c                     t        | |||      S )zCount flops for convolution.rn   )ru   )
rl   rm   _bias_stride_padding	_dilationrn   r2   r3   r4   s
             r!   	conv_flopr|      s     7GY:NNr,   c                    d }d}	 |
d   r t        |d         }|t        | |||       z  }|
d   rZt        |d         }|r&|t         ||        ||       ||      d      z  }|S |t         ||       ||        ||      d      z  }|S )Nc                 4    | d   | d   gt        | dd        z   S )Nr   r   rN   )list)r)   s    r!   tzconv_backward_flop.<locals>.t   s$    a%(#d59o55r,   r   r   Frw   )r+   ru   )grad_out_shaperl   rm   rx   ry   rz   r{   rn   _output_padding_groupsoutput_maskr2   r   
flop_countgrad_input_shapegrad_weight_shapes                   r!   conv_backward_flopr      s    6JDL 1~$Yq\2ong?OU_Q_``
1~%il3/!N*;QwZK\I]joppJ
  /!G*a6GK\I]joppJr,   c                 @   | \  }}}}|\  }}}	}
|\  }}}}||cxk(  r|k(  rn n||k(  r
||
k(  r|	|k(  st        d|  d| d|       ||k  s||z  dk7  rt        d| d| d      d}|t        ||z  ||f||z  ||	f      z  }|t        ||z  ||	f||z  |	|f      z  }|S )z
    Count flops for self-attention.

    Supports GQA (grouped-query attention) where key/value have fewer heads
    than the query. The kernel broadcasts KV heads to match query heads.
    z<sdpa_flop_count: query/key/value shapes are incompatible: q=z, k=z, v=r   zsdpa_flop_count: query heads ()) must be a multiple of key/value heads ()rP   ra   )query_shape	key_shapevalue_shaper^   h_qs_qd_q_b2h_kvs_k_d2_b3_h3_s3d_vtotal_flopss                   r!   sdpa_flop_countr     s    #AsC#CsC$Cc3OOs
sczT)D?
 	
 TzS4Z1_,SE 2  $vQ(
 	
 K8QWc3/!c'31DEEK8QWc3/!c'31DEEKr,   c                    t        | ||      S )Count flops for self-attention.r   )r   r   r   r2   r3   r4   s         r!   	sdpa_flopr   5  s     ;	;??r,   c                     ddl m} ddlm} t	        | ||f      s7| j
                  j                  dk7  r| j                         j                         S |g| j                  d      dz
  z  S )z
    If the offsets tensor is fake, then we don't know the actual lengths.
    In that case, we can just assume the worst case; each batch has max length.
    r   )
FakeTensor)FunctionalTensormetar   )
torch._subclasses.fake_tensorr   #torch._subclasses.functional_tensorr   r'   devicerB   difftolistsize)offsetsmax_lenr   r   s       r!   _offsets_to_lengthsr   >  s[    
 9Dg
,<=>7>>CVCVZ`C`||~$$&&9Q!+,,r,   )grad_out.c              #     K   |)t        |j                        dk7  rt        d      t        |j                        dk7  rt        d      |$|j                  | j                  k7  rt        d      | j                  \  }}	}
|j                  \  }}}|j                  \  }}}|t        d      |t        d      |j                  |j                  k7  rt        d      t        ||      }t        ||      }t	        ||d	
      D ]%  \  }}d|	||
f}d|||f}d|||f}||nd}||||f ' y| j                  |j                  |j                  ||j                  ndf yw)a;  
    Given inputs to a flash_attention_(forward|backward) kernel, this will handle behavior for
    NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
    each batch element.

    In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
    N   z7sdpa_flop_count: expected key.shape to be 3-dimensionalz9sdpa_flop_count: expected value.shape to be 3-dimensionalzDsdpa_flop_count: grad_out.shape must match query.shape when providedz+sdpa_flop_count: cum_seq_q must not be Nonez+sdpa_flop_count: cum_seq_k must not be NonezAsdpa_flop_count: cum_seq_q and cum_seq_k must have the same shapeTstrictr   lenr)   rP   r   zip)querykeyvaluer   	cum_seq_q	cum_seq_kmax_qmax_k_r   r   h_kd_kh_vr   seq_q_lengthsseq_k_lengths	seq_q_len	seq_k_lennew_query_shapenew_key_shapenew_value_shapenew_grad_out_shapes                          r!   %_unpack_flash_attention_nested_shapesr   J  s    $  syy>Q !Z[[u{{q  !\]]HNNekk$A !ghhkk3ii3kk3 !NOO !NOO??ioo- !dee+Iu=+Iu=&)-t&T 	V"Y	 #y#6OY4M #y#6O4<4Hd!=/CUUU	V 	
++syy%++AUx~~[_
__s   E"E$c              #     K   |,t        |j                        dk7  rt        d      t        |j                        dk7  rt        d      |$|j                  | j                  k7  rt        d      | j                  \  }}}	}
|j                  \  }}}}|j                  \  }}}}|t        d      |t        d      |j                  |j                  k7  rt        d      t        ||      }t        ||      }t	        ||d	
      D ]%  \  }}d|	||
f}d|||f}d|||f}||nd}||||f ' y| j                  |j                  |j                  ||j                  ndf yw)a?  
    Given inputs to a efficient_attention_(forward|backward) kernel, this will handle behavior for
    NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
    each batch element.

    In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
    N   zQ_unpack_efficient_attention_nested_shapes: expected key.shape to be 4-dimensionalzS_unpack_efficient_attention_nested_shapes: expected value.shape to be 4-dimensionalz^_unpack_efficient_attention_nested_shapes: grad_out.shape must match query.shape when providedzH_unpack_efficient_attention_nested_shapes: cu_seqlens_q must not be NonezH_unpack_efficient_attention_nested_shapes: cu_seqlens_k must not be Noneza_unpack_efficient_attention_nested_shapes: cu_seqlens_q and cu_seqlens_k must have the same shapeTr   r   r   )r   r   r   r   cu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_kr   r   r   r   r   r   r   	seqlens_q	seqlens_klen_qlen_kr   r   r   r   s                          r!   )_unpack_efficient_attention_nested_shapesr   ~  s    $  syy>Q !tuuu{{q  !vwwHNNekk$A   "B  C  C1c31c31c3 !kll !kll!3!33  "Z [ ['lC	'lC		9TB 	VLE5 #uc2OUC0M #uc2O4<4Hd!=/CUUU	V 	
++syy%++AUx~~[_
__s   E%E'T)rI   c          	      J    t        | ||||||      }
t        d |
D              S )r   r   r   r   r   r   r   r   c              3   @   K   | ]  \  }}}}t        |||        y wr   r   r   r   r   r   r   s        r!   r"   z0_flash_attention_forward_flop.<locals>.<genexpr>  )      2KK 	Y<   r   sum)r   r   r   r   r   r   r   r2   r3   r4   sizess              r!   _flash_attention_forward_flopr     s?    " 2E  6;  r,   c           	      J    t        | ||||||      }
t        d |
D              S )r   )r   r   r   r   r   r   r   c              3   @   K   | ]  \  }}}}t        |||        y wr   r   r   s        r!   r"   z4_efficient_attention_forward_flop.<locals>.<genexpr>  r   r   r   r   )r   r   r   biasr   r   r   r   r3   r4   r   s              r!   !_efficient_attention_forward_flopr     s?    " 6!!!!E  6;  r,   c                 2   |\  }}}}|\  }}	}
}|\  }}}}| \  }}}}||cxk(  r|cxk(  r|k(  rn t        d      |	|k(  r||k(  st        d      ||	k  s||	z  dk7  rt        d| d|	 d      ||k(  r||k(  r
|
|k(  r||k(  st        d      d}|t        ||z  ||f||z  ||
f      z  }|t        ||z  ||f||z  ||
f      z  }|t        ||z  |
|f||z  ||f      z  }|t        ||z  ||
f||z  |
|f      z  }|t        ||z  ||f||z  ||
f      z  }|S )Nz<sdpa_backward_flop_count: batch/heads mismatch among tensorsr   z'sdpa_backward_flop_count: query heads (r   r   zJsdpa_backward_flop_count: grad_out/value/key/query shapes are incompatibler   )r   r   r   r   r^   r   r   r   r   r   r   r   r   r   r   r   _b4_h4_s4_d4r   s                        r!   sdpa_backward_flop_countr     s   "AsC#CsC$Cc3'Cc3""s"J
 	
 (,s{sczJ
 	
 TzS4Z1_5cU ;  $vQ(
 	
 3J3#:#*X
 	
 K 8QWc3/!c'31DEEK 8QWc3/!c'31DEEK8QWc3/!c'31DEEK 8QWc3/!c'31DEEK8QWc3/!c'31DEEKr,   c                    t        | |||      S )z(Count flops for self-attention backward.r   )r   r   r   r   r2   r3   r4   s          r!   sdpa_backward_flopr     s    
 $NKKXXr,   c
           
      L    t        |||| ||||	      }t        d |D              S )Nr   r   r   r   r   r   r   r   c              3   B   K   | ]  \  }}}}t        ||||        y wr   r   r   r   r   r   r   s        r!   r"   z1_flash_attention_backward_flop.<locals>.<genexpr><  +      ?KK 	!iU   r   )r   r   r   r   out	logsumexpr   r   r   r   r3   r4   shapess                r!   _flash_attention_backward_flopr   !  sB    " 3	F  CI  r,   c
           
      L    t        |||| ||||	      }t        d |D              S )N)r   r   r   r   r   r   r   r   c              3   B   K   | ]  \  }}}}t        ||||        y wr   r   r   s        r!   r"   z5_efficient_attention_backward_flop.<locals>.<genexpr>]  r   r   r   )r   r   r   r   r   r   r   r   r   r   r3   r4   r   s                r!   "_efficient_attention_backward_flopr   B  sB    " 7!!!!	F  CI  r,   r/   c          	      R    t        | |||||n|||      }
t        d |
D              S )z$Count flops for varlen_attn forward.r   c              3   @   K   | ]  \  }}}}t        |||        y wr   r   r   s        r!   r"   z,_varlen_attn_forward_flop.<locals>.<genexpr>y  r   r   r   )r   r   r   cu_seq_qcu_seq_kr   r   r0   r3   r4   r   s              r!   _varlen_attn_forward_flopr   c  sF     2&2(E  6;  r,   c          	      $    t        |||||||      S )z(Count flops for varlen_attn_out forward.)r   )r   r   r   r   r   r   r   r   r0   r3   r4   s              r!   _varlen_attn_out_flopr     s      %sE8Xue r,   c
          
      L    t        |||| ||||	      }t        d |D              S )z%Count flops for varlen_attn backward.r   c              3   B   K   | ]  \  }}}}t        ||||        y wr   r   r   s        r!   r"   z-_varlen_attn_backward_flop.<locals>.<genexpr>  r   r   r   )r   r   r   r   r   lser   r   r   r   r0   r3   r4   r   s                 r!   _varlen_attn_backward_flopr    sB      2	E  CH  r,   c                 ,    t        | t              s| fS | S r   )r'   tuple)xs    r!   normalize_tupler    s    atHr,   ) KMBTc                     t        dt        t        t              dz
  t        t	        |             dz
  dz              }t        |   S )Nr   r   rN   r   )maxminr   suffixesstr)numberindexs     r!   get_suffix_strr    s=     3s8}q(3s6{+;a+?A*EFGEE?r,   c                 X    t         j                  |      }| d|z  z  d}|t         |   z   S )Ni  z.3f)r  r  )r  suffixr  r   s       r!   convert_num_with_suffixr    s2    NN6"E%c*E8E?""r,   c                     |dk(  ry| |z  dS )Nr   0%z.2% )numdenoms     r!   convert_to_percent_strr    s    zEk#r,   c                 .     t                fd       }|S )Nc                 B    t        |       \  }} | }t        ||      S r   )r   r   )r3   	flat_argsspecr   r5   s       r!   r6   z)_pytreeify_preserve_structure.<locals>.nf  s'    &t,	4mc4((r,   r   r7   s   ` r!   _pytreeify_preserve_structurer#    s     
1X) )
 Ir,   c                        e Zd ZdZ	 	 	 	 ddej
                  j                  eej
                  j                     z  dz  dede	de
eef   dz  ddf
 fdZdefd	Zde
ee
eef   f   fd
ZddZd Zd Zd Z xZS )r   a  
    ``FlopCounterMode`` is a context manager that counts the number of flops within its context.

    It does this using a ``TorchDispatchMode``.

    It also supports hierarchical output by passing a module (or list of
    modules) to FlopCounterMode on construction. If you do not need hierarchical
    output, you do not need to use it with a module.

    Example usage

    .. code-block:: python

        mod = ...
        with FlopCounterMode(mod) as flop_counter:
            mod.sum().backward()

    Nmodsdepthdisplaycustom_mappingr9   c                 d   t         |           t        d       | _        || _        || _        d | _        |i }|t        j                  dd       i t        |j                         D ci c]   \  }}|t        |dd      r|n
t        |      " c}}| _	        t               | _        y c c}}w )Nc                       t        t              S r   )r   intr  r,   r!   <lambda>z*FlopCounterMode.__init__.<locals>.<lambda>  s    +VYJZ r,   z<mods argument is not needed anymore, you can stop passing itrN   )
stacklevel_get_rawF)super__init__r   flop_countsr&  r'  modewarningswarnr-   itemsr   r8   r   mod_tracker)selfr%  r&  r'  r(  rT   v	__class__s          r!   r0  zFlopCounterMode.__init__  s     	6ABZ6[
-1	!NMMXefg

WeWkWkWmntqRSqwq*e4!-:JJn
 )? os   -%B,c                 N    t        | j                  d   j                               S )NGlobal)r   r1  valuesr7  s    r!   get_total_flopszFlopCounterMode.get_total_flops  s!    4##H-44677r,   c                 |    | j                   j                         D ci c]  \  }}|t        |       c}}S c c}}w )a  Return the flop counts as a dictionary of dictionaries.

        The outer
        dictionary is keyed by module name, and the inner dictionary is keyed by
        operation name.

        Returns:
            Dict[str, Dict[Any, int]]: The flop counts as a dictionary.
        )r1  r5  dict)r7  rT   r8  s      r!   get_flop_countszFlopCounterMode.get_flop_counts  s3     (,'7'7'='='?@tq!47
@@@s   8c                    
 | j                   }|d}dd l}d|_        g d}g } j                         
t	        
      d
 fd}t         j                  j                               D ]?  }|dk(  r	|j                  d      d	z   }||kD  r# |||d	z
        }|j                  |       A d j                  v r s|D ]  }	d
|	d   z   |	d<     |dd      |z   }t        |      dk(  rg dg}|j                  ||d      S )Ni?B r   T)ModuleFLOPz% TotalFc           	         t        
j                  |    j                               }	|k\  z  	d|z  }g }|j                  || z   t	        |      t        |      g       
j                  |    j                         D ]<  \  }}|j                  |dz   t        |      z   t	        |      t        |      g       > |S )N z - )r   r1  r<  appendr  r  r5  r  )mod_namer&  r   paddingr<  rT   r8  global_flopsglobal_suffixis_global_subsumedr7  s          r!   process_modz.FlopCounterMode.get_table.<locals>.process_mod8  s     d..x8??ABK+"==EkGFMM("']C&{LA 
 ((288: 1eOc!f,+A}=*1l;  Mr,   r;  .r   rF  )r;  0r  )leftrightrQ  )headerscolalign)r&  tabulatePRESERVE_WHITESPACEr>  r  sortedr1  keyscountextendr   )r7  r&  rT  headerr<  rM  mod	mod_depth
cur_valuesr   rJ  rK  rL  s   `         @@@r!   	get_tablezFlopCounterMode.get_table(  s0   =JJE=E 	'+$.++-&|4"	, $**//12 	&Ch		#*I5 $S)a-8JMM*%	& t'''0B *q>a* !1-6Fv;!+,F  B\ ]]r,   c                     | j                   j                          | j                  j                          t	        |       | _        | j
                  j                          | S r   )r1  clearr6  	__enter___FlopCounterModer2  r=  s    r!   ra  zFlopCounterMode.__enter__g  sG     ""$$T*			r,   c                    | j                   t        d       | j                   j                  | }d | _         | j                  j                          | j                  r$t        | j                  | j                               |S )Nz<Internal error: FlopCounter.__exit__ called but mode is None)r2  rP   __exit__r6  r'  printr^  r&  )r7  r3   r^   s      r!   rd  zFlopCounterMode.__exit__n  sh    99 !_``DII%	!!#<<$..,-r,   c                     || j                   v rY| j                   |   } ||i |d|i}t        | j                  j                        D ]  }| j                  |   |xx   |z  cc<    |S )Nr0   )r-   setr6  parentsr1  )r7  func_packetr   r3   r4   flop_count_funcr   pars           r!   _count_flopszFlopCounterMode._count_flopsx  sv    $,,,"00=O($F&F#FJ4++334 A  %k2j@2A
r,   )NrN   TNr   )__name__
__module____qualname____doc__r   nnrC  r   r+  boolr@  r	   r0  r>  r  rA  r^  ra  rd  rl  __classcell__)r9  s   @r!   r   r     s    * DH 48+((//D$99D@+ + 	+
 !cNT1+
 >B+*8 8
Ac4S>&9!: 
A<^~r,   c                   4    e Zd ZdZdeddfdZd Zd Zd	dZy)
rb  Tcounterr9   Nc                     || _         y r   )ru  )r7  ru  s     r!   r0  z_FlopCounterMode.__init__  s	    r,   c                     ddl }|j                  | j                  j                        }| 5   || }ddd       |j                  | j                  j                        }|| j                  _        |fS # 1 sw Y   CxY w)a  Execute a branch function and capture its FLOP counts without
        affecting self.counter.flop_counts

        Args:
            branch_fn: The branch function to execute
            operands: Arguments to pass to the branch function

        Returns:
            Tuple of (result, flop_counts) where result is the branch output
            and flop_counts is a copy of the FLOP counts after execution
        r   N)copyru  r1  )r7  	branch_fnoperandsrx  checkpointed_flop_countsresultr1  s          r!   $_execute_with_isolated_flop_countingz5_FlopCounterMode._execute_with_isolated_flop_counting  sq     	#'99T\\-E-E#F  	*)F	*ii 8 89#; {""		* 	*s   A44A=c                 V   |t         j                  j                  j                  t         j                  j                  j                  hv }|rhddlm} ddlm}  ||d         }t        ||      s't        |d      r|j                  }nnt        ||      s'| j                  j                  |d ||      S |t         j                  j                  j                  u rI|\  }	}
}}| j                  |
|      \  }}|t         u rt         S | j                  ||      \  }}|t         u rt         S t#        |j%                               t#        |j%                               z  }i }|D ]  }||   }||   }i }t#        |j%                               t#        |j%                               z  }|D ]5  }|j'                  |d      }|j'                  |d      }t)        ||      ||<   7 |||<    |j+                         D ]-  \  }}| j                  j,                  |   j/                  |       / |S t         S )Nr   )
get_kernelr   
kernel_idxfn)r   opshigher_ordertriton_kernel_wrapper_mutation triton_kernel_wrapper_functional*torch._higher_order_ops.triton_kernel_wrapr  triton.runtime.jitr   r'   hasattrr  ru  rl  condr}  NotImplementedrg  rW  getr  r5  r1  update)r7  functypesr3   r4   	is_tritonr  r   kernel_namepredtrue_branchfalse_branchrz  true_outtrue_flop_counts	false_outfalse_flop_countsall_mod_keysmerged_flop_counts	outer_keytrue_func_countsfalse_func_countsmerged_func_countsall_func_keysfunc_keytrue_val	false_val
inner_dicts                               r!   _handle_higher_order_opsz)_FlopCounterMode._handle_higher_order_ops  s<   UYY33RR"YY33TTV V	M6$VL%9:K k:;-"-..K	 !k:
 <<,,[$fMMUYY++000
 9=5D+|X)-)R)RX*&H& >)%%+/+T+Th,(I( N*%% /4467#>O>T>T>V:WWL!#) C	#3I#> $5i$@!%'" #$4$9$9$; <sCTCYCYC[?\ \ - LH/33Ha@H 1 5 5h BI36x3K&x0L
 1C"9-C *<)A)A)C G%	:((3:::FG
 O!!r,   c                 B   |r|ni }|t         j                  j                  j                  j                  t         j                  j                  j
                  j                  t         j                  j                  j
                  j                  t         j                  j                  j                  j                  t         j                  j                  j                  j                  t         j                  j                  j                  j                  t         j                  j                  j                  j                  t         j                  j                  j                  j                  t         j                  j                  j                  j                  t         j                  j                  j                  j                  t         j                  j                  j                  j                  t         j                  j                  j                  j                  t         j                  j                  j                   j                  t         j                  j                  j"                  j                  t         j                  j$                  j&                  j                  hv rt(        S t+        |t         j,                  j.                        r| j1                  ||||      S || j2                  j4                  vra|t         j                  j$                  j6                  j                  ur1| 5   |j8                  |i |}|t(        ur|cd d d        S 	 d d d         ||i |}| j2                  j;                  |j<                  |||      S # 1 sw Y   9xY wr   )r   r  atensym_is_contiguousdefaultis_contiguousmemory_formatis_strides_like_formatis_non_overlapping_and_denser   sym_sizestride
sym_stridestorage_offsetsym_storage_offsetnumel	sym_numeldimprimlayoutr  r'   r>   HigherOrderOperatorr  ru  r-   r   	decomposerl  _overloadpacket)r7  r  r  r3   r4   rr   s          r!   __torch_dispatch__z#_FlopCounterMode.__torch_dispatch__  sK   !r EIINN44<<IINN0088IINN00>>IINN99AAIINN??GGIINN''//IINN++33IINN))11IINN--55IINN1199IINN55==IINN((00IINN,,44IINN&&..IINN))113 3  "!dEJJ::;00udFKK t||111d%))..BWBWB_B_6_ "DNND3F3N* * D#F#||(()=)=sD&QQ s   6NN)r  N)	rm  rn  ro  supports_higher_order_operatorsr   r0  r}  r  r  r  r,   r!   rb  rb    s,    &*# D #(;"z"Rr,   rb  )Fr   )NNNFN)gr  r   loggingr   torch.utils._pytreer   r   r   module_trackerr   typingr	   r
   collections.abcr   r   typing_extensionsr   collectionsr   torch.utils._python_dispatchr   mathr   	functoolsr   r3  __all__r   r   	getLoggerrm  logr  r   r@   ImportErroranywarningr  r  r+   r-   r@  __annotations__r8   r   mmr+  rW   addmmr\   bmmra   baddbmmrc   
_scaled_mmrk   r   rr  ru   convolution_convolutioncudnn_convolution_slow_conv2d_forwardconvolution_overrideabler|   convolution_backwardr   r   '_scaled_dot_product_efficient_attention#_scaled_dot_product_flash_attention#_scaled_dot_product_cudnn_attentionr   r   r  r   r   _flash_attention_forwardr   _efficient_attention_forwardr   r   0_scaled_dot_product_efficient_attention_backward,_scaled_dot_product_flash_attention_backward,_scaled_dot_product_cudnn_attention_backwardr   _flash_attention_backwardr   _efficient_attention_backwardr   r   r   r  r  r  r  r  r  r  r#  r   rb  r  r,   r!   <module>r     s}      F F )  $ $ ' # :   5
6T]t_g!> yy~~
 !#tCH~ "XxB?O>PRZ[]_a[aRb>b5c . tww/3 	# 	  	 tzz"%# % #% txx C  ! t||$&C & %& t' % 	% (%( 	$#Y$#Y$ Cy$ 	$
 	$L (())..1155	7 8
 cg Oux O8
O t001e e 2eN8 DD@@@@B C EI @WZ @C@	-" 1` eE#s(OU38_eCHouSRUXY]G]]^_1`r 4` eE#s(OU38_eCHouSRUXY]G]]^_4`n t44dC  	 D> t88$G 	 H>"J MMIIIIK L ^b Yps YLY t55tD 	 E@ t994H 	 IR  	L  	>  	@GGWJJ
 	HHh 	LL,	
 	OO_ 	i 	y 	I 	!!9 	y 	1 	00) 	,,i 	,,i 	99;M  	557I!" 	557I#$ 	!!#@%%'H""$B&&(J+0 $# #  
N N`yR( yRK  

]F\
]]VWLs   =Q* *'RR