
    9jR                        d dl Z d dlZd dlmZ d dlZ	 d dlmZ dZd dl
mZ de j                  fdZde j                  fd	Zde j                  fd
ZdeddfdZdedefdZde j                  fdZde j                  fdZde j                  fdZdee   fdZ	 	 	 	 d$dedededz  dedz  dedz  dedeeef   fdZ G d d      Z G d d      Z	 d%deez  dee   dz  dee edf   z  fdZ!	 d&d ed!ed"edefd#Zy# e	$ r dZdZY w xY w)'    N)Any)runtimeTF)_get_device_indexreturnc                     	 dd l } t        j                  t        | j	                  d      d               }|j                  |_        |j                  |_        |j                   |_        |j$                  |_        |j(                  |_        |S # t
        t        f$ r` t        j                  dk(  r5t        j                  dt        j                  j                  d    d      }nt        j                  d      }Y w xY w)Nr   amdhip64win32	amdhip64_.dllzlibamdhip64.so)rocm_sdkctypesCDLLstrfind_librariesImportError
IndexErrorsysplatformtorchversionhiphipGetErrorStringcuGetErrorStringhipModuleLoadDatacuModuleLoadDatahipModuleGetFunctioncuModuleGetFunctionhipModuleLaunchKernelcuLaunchKernelhipFuncSetAttributecuFuncSetAttribute)r   libs     Q/media/conek/DATA/Code/OCR/venv/lib/python3.12/site-packages/torch/cuda/_utils.py_get_hip_runtime_libraryr$      s    	0kk#h55jA!DEF 00C00C!66C22C 44CJ $ 0<<7"++	%--*;*;A*>)?tDEC++./C	0s   4B A,C<;C<c                  |    t         j                  dk(  rt        j                  d      S t        j                  d      S )Nr	   z
nvcuda.dllzlibcuda.so.1)r   r   r   r        r#   _get_cuda_libraryr(   -   s,    
||w{{<(({{>**r'   c                  ^    t         j                  j                  r
t               S t	               S N)r   r   r   r$   r(   r&   r'   r#   _get_gpu_runtime_libraryr+   5   s!    }}')) ""r'   resultc                     | dk(  ry t        j                         }t               }|j                  | t        j                  |             |j
                  |j
                  j                         nd}t        d|       )Nr   Unknown CUDA errorCUDA error: )r   c_char_pr+   r   byrefvaluedecodeRuntimeError)r,   err_strlibcudaerror_messages       r#   _check_cudar8   =   sn    {ooG&(GVV\\'%:;")--";AU  m_5
66r'   c                 B   t         st        d      | ^}}|t        j                  j                  k7  rJt        j
                  |      \  }}t        |t              r|j                         }t        d| d| d      t        |      dk(  ryt        |      dk(  r|d   S |S )a  Check a cuda.bindings (cuda-python) call result for errors.

    All cuda.bindings runtime calls return ``(error, *outputs)``.  This
    helper unpacks the tuple, raises on non-success, and returns the
    outputs (``None`` for zero outputs, scalar for one, tuple otherwise).
    zcuda.bindings is not availabler/   z ()r   N   )
_HAS_CUDA_BINDINGSr4   _cuda_bindings_runtimecudaError_tcudaSuccesscudaGetErrorString
isinstancebytesr3   len)r,   errout_r5   s        r#   _check_cuda_bindingsrG   I   s     ;<<IC#!--99	: #55 	7
 gu%nn&G\#b	;<<
3x1}
3x1}1vJr'   c                  
   	 dd l } t        j                  t        | j	                  d      d               }|j                  |_        |j                  |_        |j"                  |_        |j&                  |_        |j*                  |_        |j.                  |_        |j2                  |_        |j6                  |_        |j:                  |_        |j>                  |_         |S # t
        t        f$ r t        j                  dk(  redj                  dt        j                  j                  d   dt        j                  j                  d   g      }t        j                  d| d      }nt        j                  d      }Y Kw xY w)	Nr   hiprtcr	    0   r   zlibhiprtc.so)!r   r   r   r   r   r   r   r   r   joinr   r   r   hiprtcGetErrorStringnvrtcGetErrorStringhiprtcCreateProgramnvrtcCreateProgramhiprtcDestroyProgramnvrtcDestroyProgramhiprtcCompileProgramnvrtcCompileProgramhiprtcGetCodeSizenvrtcGetCUBINSizehiprtcGetCodenvrtcGetCUBINhiprtcGetProgramLogSizenvrtcGetProgramLogSizehiprtcGetProgramLognvrtcGetProgramLoghiprtcAddNameExpressionnvrtcAddNameExpressionhiprtcGetLoweredNamenvrtcGetLoweredName)r   r"   version_strs      r#   _get_hiprtc_libraryrc   f   sB   .kk#h55h?BCD "66C 44C!66C!66C11C))C!$!<!<C 44C!$!<!<C!66CJ) $ .<<7"''emm''*C1B1B11EFK ++{m489C++n-C.s   4C" "BFFc                      t        t        j                  j                  j	                  d      d         } t
        j                  dk(  rd|  dg}nd|  dg}|D ]  }	 t        j                  |      c S  t        d      # t        $ r Y 2w xY w)	N.r   r	   nvrtc64_z0_0.dllzlibnvrtc.so.zlibnvrtc.soz Could not find any NVRTC library)
intr   r   cudasplitr   r   r   r   OSError)major_version
nvrtc_libslib_names      r#   _get_nvrtc_libraryrn      s    **005a89M
||w}oW-


 =/*

  	;;x((
 4
55  		s   B	BBc                  ^    t         j                  j                  r
t               S t	               S r*   )r   r   r   rc   rn   r&   r'   r#   _get_gpu_rtc_libraryrp      s#     }}"$$!##r'   c                      ddl m} m} dh}|D cg c]	  }||vs| }}t        j                  j
                  r|j                  |        |S c c}w )z
    Get HIPCC/NVCC flags that are compatible with NVRTC compilation.

    Returns:
        List of HIPCC/NVCC flags that can be safely used with NVRTC.
    r   )COMMON_HIPCC_FLAGSCOMMON_NVCC_FLAGSz--expt-relaxed-constexpr)torch.utils.cpp_extensionrr   rs   r   r   r   extend)rr   rs   nvrtc_unsupported_flagsflagcompatible_flagss        r#   _get_gpu_rtc_compatible_flagsry      sc     P 	# +d:Q.Q  }} 23s
   	AAkernel_sourcekernel_namecompute_capabilitycuda_include_dirsnvcc_optionsauto_pchc           
         ddl }t               ddt        ddffd}| j                  d      }|q|j                  j                  |j                  j                               }	|j                  j                  r|	j                   }n|	j                   |	j                   }g }
|j                  j                  r#|
j                  d| j                                n"|
j                  d| j                                dd	lm}  |d
      }|D ]$  }|
j                  d| j                                & |r)|D ]$  }|
j                  d| j                                & |rYt        |j                  j                        dk  r"t!        d|j                  j                         |g }|j                  d       |r'|D ]"  }|
j                  |j                  d             $ t#               }|
j%                  |D cg c]  }|j                  d       c}       t'        |
      }t)        j*                  |z  |
 }t)        j,                         } |j/                  t)        j0                  |      || dj                         ddd             |j                  d      } |j3                  ||             j5                  |||      }|k7  rt)        j6                         }j9                  |t)        j0                  |             t)        j:                  |j<                        }j?                  ||       tA        d|j<                  jC                                t)        j6                         } |jE                  |t)        j0                  |                   t)        j:                  |j<                        } |jG                  ||             t)        j*                         } |jI                  ||t)        j0                  |                   |j<                  |j<                  jC                         }nd}jK                  t)        j0                  |             |jL                  |fS c c}w )a  
    Compiles a CUDA kernel using NVRTC and returns the PTX code.

    Args:
        kernel_source (str): The CUDA kernel source code as a string
        kernel_name (str): The name of the kernel function to compile
        compute_capability (str, None): The compute capability to target (e.g., "86").
                                           If None, will detect from current device.
        cuda_include_dirs (list, None): List of directories containing CUDA headers
        nvcc_options (list, None): Additional options to pass to NVRTC
        auto_pch (bool): Enable automatic precompiled headers (CUDA 12.8+)

    Returns:
        Tuple[bytes, str]: The compiled PTX code and mangled kernel name
    r   Nr,   r   c                     | k7  rot        j                         }j                  | t        j                  |             |j                  |j                  j                         nd}t        d|       y )Nr.   r/   )r   r0   rO   r1   r2   r3   r4   )r,   r5   r7   NVRTC_SUCCESSlibnvrtcs      r#   check_nvrtcz#_nvrtc_compile.<locals>.check_nvrtc   so    ]"oo'G((g1FG ==, $$&) 
 m_=>> #r'   utf-8z--offload-arch=z--gpu-architecture=sm_)include_pathsrh   z-Iz12.8zPCH requires CUDA 12.8+, got z--pchz.cuzKernel compilation failed:
rJ   )'
torch.cudarp   rg   encoderh   get_device_propertiescurrent_devicer   r   gcnArchNamemajorminorappendrt   r   r   AssertionErrorry   ru   rC   r   r0   c_void_prQ   r1   r_   rU   c_size_tr[   create_string_bufferr2   r]   r4   r3   rW   rY   ra   rS   raw)rz   r{   r|   r}   r~   r   r   r   source_bytespropsoptionsr   cuda_include_paths	cuda_path	directoryoptionnvrtc_compatible_flagsrw   num_optionsoptions_arrayprogc_kernel_namereslog_sizelogbinary_sizebinaryc_mangled_namemangled_namer   r   s                                @@r#   _nvrtc_compiler      s   0  $%H M	?C 	?D 	? !''0L !

001J1J1LM==$)$5$5#6$)KK=!> G}});(<=DDFG/0B/CDKKMN 8&v.' 2	I;'..012 * 	6INNR	{+2245	6 u}}!!"V+ #@ASAS@T!UVVLG$ " 	3FNN6==12	3 ;<NN5KLTDKK(LM g,K__{2W=M ??D##LLm3&&(	
	  &&w/M//mDE 
&
&t[-
HC m??$''fll8.DE))(..9##D#.9#)):J:J:L9MNOO //#K**4k1JKL(():):;F&&tV45 __&N$$T=&,,~:VW '%++224  d!34 ::|##o Ms   Q7c                   @    e Zd Zdej                  ddfdZdeddfdZy)_CudaModulemoduler   Nc                      || _         i | _        y r*   )_module_kernels)selfr   s     r#   __init__z_CudaModule.__init__J  s    02r'   name_CudaKernelc           	         || j                   v r| j                   |   S ddlm}  |       }t        j                         }	 t        |j                  t        j                  |      | j                  |j                  d                   t        || j                        }|| j                   |<   |S # t        $ r}t        d| d      |d }~ww xY w)Nr   )r+   r   zNo kernel named 'z' in this module)r   torch.cuda._utilsr+   r   r   r8   r   r1   r   r   r   r4   AttributeError)r   r   r+   r6   funckernelrD   s          r#   __getattr__z_CudaModule.__getattr__N  s    4== ==&& 	?*, 	V++LL&dkk'6J
 !t||4F"(DMM$M 	V #4TF:J!KLRUU	Vs    A.B/ /	C8CC)__name__
__module____qualname__r   r   r   r   r   r&   r'   r#   r   r   I  s/    3v 34 3V V Vr'   r   c                       e Zd ZdZdej
                  dej
                  ddfdZ	 	 	 	 	 ddeeeef   deeeef   d	e	dz  d
ede
dz  ddfdZdeddfdZy)r   zT
    Represents a compiled CUDA kernel that can be called with PyTorch tensors.
    r   r   r   Nc                 .    || _         || _        d| _        y )Nr   )r   r   _max_shared_mem_bytes)r   r   r   s      r#   r   z_CudaKernel.__init__l  s    	%&"r'   gridblockargs
shared_memstreamc                 B   ddl }|j                  j                  j                         }|sg }g }g }	|D ]O  }
t	        |
|j
                        r|
j                  s'|
j                  r|
j                         st        d      t        j                  |
j                               }|j                  |       |	j                  t        j                  |             t	        |
t              r:t        j                   |
      }|	j                  t        j                  |             t	        |
t"              r;t        j$                  |
      }|	j                  t        j                  |             ;t'        dt)        |
              t        j                  t+        |	      z         }t-        |	      D ],  \  }}
t        j.                  |
t        j                        ||<   . |ddl}|j                  j3                         }|dk\  rQ| j4                  dk(  s|| j4                  kD  r3| j4                  dk(  rdnd| j4                   d}t7        d	| d
| d      t9        |j;                  | j<                  |d   |d   |d   |d   |d   |d   ||j>                  |d             y)a  
        Call the compiled CUDA kernel

        Args:
            grid (tuple): Grid dimensions (grid_x, grid_y, grid_z)
            block (tuple): Block dimensions (block_x, block_y, block_z)
            args (list): List of arguments to pass to the kernel.
                         PyTorch tensor arguments will be automatically converted to pointers.
            shared_mem (int): Shared memory size in bytes
            stream (torch.cuda.Stream): CUDA stream to use. If None, uses current stream.
        r   Nz?All tensor arguments must be CUDA tensors or pinned CPU tensorszUnsupported argument type:    znot configuredzonly z bytes configuredzKernel requires z' bytes of shared memory (>= 48KB), but ze. Call kernel.set_shared_memory_config(shared_mem) after compilation and before launching the kernel.r;   rL   ) r   rh   _utilsr+   rA   Tensoris_cudais_cpu	is_pinned
ValueErrorr   r   data_ptrr   r1   rg   c_intfloatc_double	TypeErrortyperC   	enumeratecastr   current_streamr   r4   r8   r   r   _as_parameter_)r   r   r   r   r   r   r   r6   processed_argsc_argsargptrr   r   c_args_arrayiconfigured_msgs                    r#   __call__z_CudaKernel.__call__q  s_   & 	**##<<>D 13 	KC#u||,{{CJJ3==?$Y  ooclln5%%c*fll3/0C%S)fll512C'!??3/fll845"=d3i[ IJJ+	K0 #f+58' 	@FAs$kk#v?LO	@ >ZZ..0F "&&!+zD<V<V/V --2 !T7788IJ 
 ":, /%& '33  	""		QQQaaa%%	
r'   shared_mem_bytesc                 p   |dk  r|| _         y t               }t        j                  j	                         }t        j
                  j                  r|j                  dk7  rdnd}nt        |dd      }||kD  rt        d| d| d      d	}t        |j                  | j                  ||             || _         y )
Nr   gfx950i   i  shared_memory_per_block_optinzRequested shared memory (z bytes) exceeds device limit (z= bytes). Consider reducing block size or shared memory usage.   )r   r+   r   rh   r   r   r   r   getattrr4   r8   r!   r   )r   r   r6   device_propsmax_shared_mem+cudaFuncAttributeMaxDynamicSharedMemorySizes         r#   set_shared_memory_configz$_CudaKernel.set_shared_memory_config  s    i')9D&*, zz779== &11X=:  %=uN n,+,<+= >!!/ 0 1GG  783&&		; 	
 &6"r'   )r;   r;   r;   r   Nr   N)r   r   r   __doc__r   r   r   tuplerg   listr   r   r   r&   r'   r#   r   r   g  s    'V__ 'foo '$ ' &/&/ !_
CcM"_
 S#s]#_
 Tk	_

 _
 d
_
 
_
B(6 (6 (6r'   r   ptxkernel_namesc           
      8   ddl }t               }t        | t              r| j	                  d      } t        j                         }|j                  j                         }|5  t        |j                  t        j                  |      |              ddd       |st        |      S i }|D ]c  }t        j                         }t        |j                  t        j                  |      ||j	                  d                   t        ||      ||<   e |S # 1 sw Y   xY w)a,  
    Loads a CUDA module from PTX code and returns a module object that can access kernels.

    Args:
        ptx (bytes or str): The PTX code to load
        kernel_names (list, optional): List of kernel names to extract from the module.
                                      If None, will return a module object with __getattr__.

    Returns:
        object: If kernel_names is None, returns a module object with __getattr__ to access kernels.
               If kernel_names is provided, returns a dict mapping kernel names to _CudaKernel objects.
    r   Nr   )r   r+   rA   r   r   r   r   rh   r   r8   r   r1   r   r   r   )	r   r   r   r6   r   r   kernelsr   r   s	            r#   _cuda_load_moduler     s       '(G #sjj! __FZZ&&(F	 IG,,V\\&-A3GHI 6"" G 2 ''T"FDKK,@	

 $D&12 N!I Is    /DDdeviceoptional	allow_cpuc                    t        | t              r| S t        | t              rt        j                  |       } t        | t        j                        r;|r| j
                  dvr+t        d|        | j
                  dk7  rt        d|        t        j                  j                         s0t        | t        j                  j                        r| j                  S t        | ||      S )a  Get the device index from :attr:`device`, which can be a torch.device object, a Python integer, or ``None``.

    If :attr:`device` is a torch.device object, returns the device index if it
    is a CUDA device. Note that for a CUDA device without a specified index,
    i.e., ``torch.device('cuda')``, this will return the current default CUDA
    device if :attr:`optional` is ``True``. If :attr:`allow_cpu` is ``True``,
    CPU devices will be accepted and ``-1`` will be returned in this case.

    If :attr:`device` is a Python integer, it is returned as is.

    If :attr:`device` is ``None``, this will return the current default CUDA
    device if :attr:`optional` is ``True``.
    )rh   cpuz(Expected a cuda or cpu device, but got: rh   z!Expected a cuda device, but got: )rA   rg   r   r   r   r   r   jitis_scriptingrh   idx_torch_get_device_index)r   r   r   s      r#   r   r   -  s      &#&#f%&%,,'{{/1 #KF8!TUU[[F"@IJJ99!!#fejj//0::"68Y??r'   )NNNFr*   )FF)"r   r   typingr   r   cuda.bindingsr   r=   r<   r   torch._utilsr   r   r   r$   r(   r+   rg   r8   rG   rc   rn   rp   r   r   ry   boolr   rB   r   r   r   dictr   r&   r'   r#   <module>r      s    
    F&++ .+6;; +#&++ #	7 	7 	7  :V[[ :6FKK 6&$fkk $tCy 6 &*%) $O$O$O$ d
O$ d{	O$
 +O$ O$ 5#:O$dV V<S6 S6n 8<-	u-$(I$4-4]*++-b <A@@@48@@  !s   D   	DD