
    9j'                     2   d Z ddlZddlmZ ddlmZmZ ddlmZ ddl	Z	ddl
Z	ddlmZ ddlmZ ddlmZ dd	lmZmZmZmZmZmZ dd
lmZmZmZmZmZ ddlmZ ddl m!Z! de	jD                  jF                  de$e%   fdZ&de	jD                  jN                  de(e	jR                  e	jD                  jT                  f   fdZ+de	jD                  jN                  de%de,dz  fdZ-de	jD                  jN                  de%de,dz  fdZ.de	jD                  jN                  de%fdZ/de	jD                  jN                  de0e,dz     fdZ1de	jD                  jN                  dee   defdZ2 G d d      Z3 e!d e3              	 	 d&d ed!ef   d"ee   d#e4d$e4ded!ee   f   f
d%Z5y)'a  
This module implements CUDA graphs support for TorchDynamo backends.

CUDA graphs allow for capturing and replaying GPU operations, which can significantly
reduce CPU overhead in GPU-accelerated PyTorch models. This module provides:

- CUDA graph creation and management for both forward and backward passes
- Input mutation detection and handling
- Device compatibility checking
- Stack trace management for debugging
- Integration with TorchInductor's cudagraph trees

The backend supports two main modes:
1. cudagraphs: Full CUDA graph support with both forward and backward pass optimization
2. cudagraphs_inner: Lower-level CUDA graph implementation used for benchmarking

Key components:
- CudagraphsBackend: Main backend class for CUDA graph integration
- Mutation detection utilities to ensure graph safety
- Device mapping and compatibility checks
- Stack trace collection for debugging
    N)defaultdict)CallableSequence)Any)config)aot_autograd)	boxed_nop)BoxedDeviceIndex'check_multiple_devices_or_any_cpu_nodesformat_default_skip_messageget_mutation_stack_traceget_placeholder_info#log_cudagraph_skip_and_bump_counter)	BoxedBoolcount_tangents%get_first_incompatible_cudagraph_nodenum_fw_fixed_argumentsoutput_node)StorageWeakRef   )register_backendgreturnc           	         dt         t        t        f   dt        fd}t        t              }d}t	               }| j
                  D ]  }|j                  dk(  rkt         ||j                        t        j                        r;|t         ||j                        j                                  j                  |       |dz  }~|j                  dk(  st        |j                  d      s|j                  j                   }t#        |j$                        D ]  \  }}|t'        |j(                        k  r|j(                  |   }	n2|j*                  |j,                  vrG|j,                  |j*                     }	d	}
|j.                  r|j.                  j0                  rd
}
|
s||t         ||	j                        j                                  z  }  |S )Nmetar   c                     d| v r| d   S | d   S )Nvalfake_result )r   s    a/media/conek/DATA/Code/OCR/venv/lib/python3.12/site-packages/torch/_dynamo/backends/cudagraphs.pymeta_fkz%find_input_mutations.<locals>.meta_fk7   s    #tmtE{Dm1DD    r   placeholderr   call_function_schemaFT)dictstrr   r   setnodesop
isinstancer   torchTensorr   _typed_storageaddhasattrtargetr%   	enumerate	argumentslenargsnamekwargs
alias_infois_write)r   r!   inputs	input_idxmutated_inputsnschemaiargargumentmut_args              r    find_input_mutationsrC   6   s   Ed38n E E FIUNWW 44= '!&&/5<<8~gaffo&D&D&FGHLLYWNITT_$188Y/XX%%F#F$4$45 3s166{? vvayHxxqxx/  xx1H>>~~.."& #f&wx}}'='L'L'NO' N: r"   gmc                     i }| j                   j                  D ]W  }|j                  j                  dd       }t	        |t
        j                        s:|j                  |vsI|||j                  <   Y |S )Nr   )graphr)   r   getr+   r,   r-   device)rD   device_node_mappingr=   ts       r    get_device_node_mappingrK   ]   sg     >@XX^^ .FFJJud#a&188;N+N,-). r"   	aot_model	num_fixedc                     t        | j                        t        t        |            z
  }|sy t	        | j                        }t        ||      S N)rC   rF   r(   ranger   r   )rL   rM   mutation_indicesplaceholderss       r    3check_for_mutation_ignore_cuda_graph_managed_tensorrS   h   sD     ,IOO<s5CS?TT'	8L#L2BCCr"   c                     t         j                  st        | |      x}r|S t        t	        |             x}r|S t        |       x}rt        d|j                   d      S y )Nzincompatible op ())r   (cudagraph_backend_support_input_mutationrS   r   rK   r   r   r6   )rL   rM   mut_skipskipnodes        r    check_for_skiprZ   s   sz    ::Jy
 
8 
 O6	* t  4Y??t?*->tyyk+KLLr"   c                 v    t        t        t        |                   }|j                  dk(  sJ |j                  S )Ncuda)nextiterrK   typeindex)rD   rH   s     r    get_device_indexra      s3    $.r234F;;&   <<r"   c                 $   t        |       }t        |j                        dk(  sJ |j                  d   }t        |d      sg S |D cg c]>  }t	        |t
        j                  j                  j                        r|j                  nd @ c}S c c}w )Nr   r   __iter__)
r   r4   r5   r0   r+   r,   fxrY   Nodestack_trace)rD   outputr5   r@   s       r    get_stack_tracesrh      s    _Fv{{q   ;;q>D4$	  'sEHHMM,>,>?T	I  s   ABdynamo_modeldynamo_inputsc           	         ddl m t        d      t        d       	 ddt        j
                  j                  dt        t           dt        dt        ffd}dt        j
                  j                  dt        t           dt        ffd	}t        ||t        j                  |d
      t        j                  j                  j                        } ||       S )Nr   )cudagraphify_implTrL   
aot_inputsis_inferencer   c                    t        | |      }t        t        
      t        |            }t        | |      x}r%t	        j
                  	       t        d|        |S j                  t        |               ||t        |      j                  d|t        |       t        | j                        t        | j                        	      }d|_        |S )Nskipping cudagraphs due to Fdevice_indexis_backwardrn   stack_tracesrR   mutated_input_idxsT)r	   r   r4   rZ   r   disabler   r(   ra   rP   valuerh   r   rF   rC   _boxed_call)rL   rm   rn   interpfixedskip_msgoutboxed_device_indexrl   do_cudagraphsrj   s          r    forward_cudagraphsz&cudagraphs.<locals>.forward_cudagraphs   s    
 9j1&s='93z?K%i7787m,/-hZ8 M/	:;%L+11%))4-ioo>3IOOD

 
r"   c                     t         |      }s S t               }t         |      x}rpt        d|        	j                  }|d}t
        j                  j                  j                  |d      J dt        t           dt        f fd}d|_        |S  
||t        |      t               ddt               t         j                         t#         j                         		      }d|_        |S )
Nrp   r   F)create_if_none_existsr:   r   c                 4    j                           |       S rO   )set_to_running_backward)r:   rL   managers    r    fnz3cudagraphs.<locals>.backward_cudagraphs.<locals>.fn   s    //1 ((r"   Trq   )r	   r   rZ   r   rw   r,   	_inductorcudagraph_treesget_managerlistr   rx   rP   ra   rh   r   rF   rC   )rL   rm   ry   rz   r{   
device_idxr   r|   r   r}   rl   r~   s   `       @r    backward_cudagraphsz'cudagraphs.<locals>.backward_cudagraphs   s    9j1y)%i7787/-hZ8
 ,11J!
oo55AA% B G &&&)49 ) )
 "BNI%L))4))4-ioo>3IOOD

 
r"   )rn   )fw_compilerbw_compilerinference_compilerkeep_inference_input_mutations)F)torch._inductor.cudagraph_treesrl   r   r
   r,   rd   GraphModuler   r   boolr   	functoolspartial_dynamor   %cudagraph_backend_keep_input_mutation)ri   rj   r   r   aot_cudagraphsr}   rl   r~   s    `   @@@r    
cudagraphsr      s    AdOM)$/
 #88''I  
	 :*88''*59#Y*	*X "&'$,,-?dS',}}';';'a'a	N ,66r"   c                   n    e Zd ZdZedd       Zedej                  j                  de	e
   de
fd       Zy)	CudagraphsBackendr   r   Nc                      ddl m}   |         y )Nr   reset_cudagraph_trees)r   r   r   s    r    resetzCudagraphsBackend.reset   s    Ir"   modelr:   c                     t        | |      S rO   )r   )r   r:   s     r    __call__zCudagraphsBackend.__call__   s    %((r"   )r   N)__name__
__module____qualname__compiler_namestaticmethodr   r,   rd   r   r   r   r   r   r"   r    r   r      sP     M   
 ),, )hsm ) ) )r"   r   r   )r6   compiler_fnr   .r:   copy_outputscopy_inputsc                   	 t        |t        t        f      sJ r$|D cg c]  }t        j                  |       c}nt        |      t        j
                  j                          t        j
                  j                         }|j                  t        j
                  j                                t        j
                  j                  |      5   | |  ddd       |j                          t        j
                  j                         j                  |       t        j
                  j                          t        j
                  j                         t        j
                  j                  |      5   |  	ddd       t        	t        t        f      s	f	dt        dt        t           f	fd}|S c c}w # 1 sw Y   xY w# 1 sw Y   RxY w)zBThis isn't registered as a backend, but is used in some benchmarksN)stream
new_inputsr   c                      t              t        |       k(  sJ r%t        |       D ]  \  }}|j                  |        j                          rD cg c]  }|j	                          c}S S c c}w rO   )r4   zipcopy_replayclone)	r   dstsrcxr   r   rF   static_inputsstatic_outputss	       r    runzcudagraphs_inner.<locals>.run   sp    =!S_444z: S		#'56!AGGI66!! 7s   A4)r+   r   tupler,   
zeros_liker\   synchronizeStreamwait_streamcurrent_streamr   	CUDAGraphrF   r   r   )
r   r:   r   r   r   r   r   rF   r   r   s
     ``   @@@r    cudagraphs_innerr     sm    ftUm,,,6<=))!,=V 
JJZZ F
uzz0023			6	" v
	JJ++F3	JJ JJ  "E			%		/ /./ntUm4(*	" 	"# 	" 	" JA > / /s   GG
?G
GG)TT)6__doc__r   collectionsr   collections.abcr   r   typingr   r,   torch.fxtorch._dynamor   torch._dynamo.backends.commonr    torch._dynamo.backends.debuggingr	   torch._inductor.cudagraph_utilsr
   r   r   r   r   r   torch._inductor.utilsr   r   r   r   r    torch.multiprocessing.reductionsr   registryr   rd   Graphr(   intrC   r   r&   rH   re   rK   r'   rS   rZ   ra   r   rh   r   r   r   r   r   r"   r    <module>r      s  .  # .      6 6   < &$EHHNN $s3x $N	%,,
%&Dxx##D03D4ZDehh22 s sTz $-- # 	-- 	$sTz2B 	U7UXX11 U7(3- U7TW U7p) )  l0A0C D 	)CH)SM) ) 	)
 c8C= !)r"   