
    9jR                       U d Z ddlmZ ddlZddlZddlmZ erddlmZ ddl	m
Z
 ddlmZ ddlmZmZ ddlZdd	lmZ d
dlmZ dgZdaded<   daded<   e
 G d d             Zed.d       Z	 d/	 	 	 d0dZd1dZd2dZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d3dZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d4dZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d5dZ  ed      Z!d6dZ"d7dZ#	 	 	 	 	 	 d8	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d9dZ$	 d:	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d;dZ%	 	 	 d<dddddddddd 		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d=d!Z&dddddddd"	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d>d#Z'ddddddddd$	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d?d%Z(dddd&	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d@d'Z)	 	 	 	 	 	 dAdd(	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dBd)Z*	 	 	 dCdd(	 	 	 	 	 	 	 	 	 	 	 	 	 dDd*Z+dd(	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dEd+Z, ejZ                  d,e-       y)Fz
PROTOTYPE!
Flash Attention 3 implementation.
For fp8: only supports forward pass right now.
For fp16/bf16: supports forward and backward pass.
    )annotationsN)TYPE_CHECKING)Callable)	dataclass)cache)TypeVarTupleUnpack)Library   )	_registryregister_flash_attention_fa3zCallable | None_FA3_CUDA_FWD_FA3_CUDA_BWDc                       e Zd ZU ded<   ddZy)
_FA3HandlezLibrary | Nonelibraryc                P    d | _         t        j                  j                  d       y )NF)r   torch_C_set_sdp_use_fa3)selfs    W/media/conek/DATA/Code/OCR/venv/lib/python3.12/site-packages/torch/nn/attention/_fa3.pyremovez_FA3Handle.remove*   s    !!%(    N)returnNone)__name__
__module____qualname____annotations__r    r   r   r   r   &   s    )r   r   c                J    t         j                  j                  |       \  }}|S N)r   cudaget_device_capability)devicemajor_s      r   _get_device_majorr)   0   s    zz//7HE1Lr   c                |    t        |        t        j                  j                  d       t	        t                     S )z
    Register FA3 flash attention kernels with the PyTorch dispatcher.

    Args:
        module_path: Python module path to the FA3 implementation.
    T)_fa3_import_moduler   r   r   r   _fa3_register_kernelsmodule_paths    r   r   r   6   s/     {# 
HHd#+-..r   c                   t        j                  |        t        t        j                  d      st        d|  d      t        t        j                  j                  d      st        d|  d      t        t        j                  j                  d      st        d|  d      t        j                  j                  j                  at        j                  j                  j                  a
y )Nflash_attn_3zModule 'z' does not expose FA3 kernelsfwdz%' does not expose FA3 forward kernelsbwdz&' does not expose FA3 backward kernels)	importlibimport_modulehasattrr   opsRuntimeErrorr0   r1   r   r2   r   r-   s    r   r+   r+   G   s    K(599n-Xk]2OPQQ599))51{m#HI
 	
 599))51{m#IJ
 	
 II**..MII**..Mr   c                 b   t        ddd      } | j                  dt        d       | j                  dt        d       | j                  dt        d       | j                  dt
        d       | j                  dt        d       | j                  d	t        d       | j                  d
t        d       | S )NatenIMPLCUDAz"_flash_attention_forward.quantizedz-_scaled_dot_product_flash_attention.quantized_flash_attention_forward+_flash_attention_forward_no_dropout_inplace#_scaled_dot_product_flash_attention_flash_attention_backward,_scaled_dot_product_flash_attention_backward)	r
   impl!_fa3_flash_attention_forward_impl4_fa3_scaled_dot_product_flash_attention_forward_impl)_fa3_flash_attention_forward_impl_default4_fa3_flash_attention_forward_no_dropout_inplace_impl<_fa3_scaled_dot_product_flash_attention_forward_impl_default"_fa3_flash_attention_backward_impl5_fa3_scaled_dot_product_flash_attention_backward_impl)libs    r   r,   r,   X   s    
&&&
)CHH,.OQW HH7<
 HH"$Mv HH5<
 HH-D HH(*LfUHH6=
 Jr   c                   |dk7  ryt        d |D              syt        |D ch c]  }|j                   c}      dk7  ry| j                  t        j
                  k(  r |||t        j                  dt               || j                         dk7  ry	|| j                         d
k7  ryt        j                  j                         syt        | j                        dk7  ryy c c}w )N        zdropout_p must be 0c              3  4   K   | ]  }|j                     y wr#   )is_cuda.0ts     r   	<genexpr>z,_fa3_common_support_error.<locals>.<genexpr>   s     *Qqyy*s   zinputs must be CUDA tensorsr   inputs must share devicezWhen using SDPA with fp8, descale tensor should always be used for accurate dequantization. Please use _scaled_dot_product_attention_quantized and provide the descale tensors.   zdense query must be 4D   zragged query must be 3DzCUDA not available	   z#FA3 requires compute capability 9.0)alllenr&   dtyper   float8_e4m3fnwarningswarnUserWarningdimr$   is_availabler)   )querytensors	dropout_p	cum_seq_q	q_descale	k_descale	v_descalerP   s           r   _fa3_common_support_errorrf   y   s     C$*'**,
g&AHH&'1,){{e)))Y.)2C+ 	
 UYY[A-'!1(::""$#&!+4) 's   C#c           	        |ry|y|+|j                   t        j                  k7  ry|j                  syt        j                  t        j
                  t        j                  ft        fd| ||hD              sd S t        | ||hD ch c]  }|j                    c}      dk7  ryt        | | ||f||||	|
      }||d	k(  ry
|S y c c}w )Nzreturn_debug_mask must be Falsezalibi_slopes not supportedzseqused_k must be int32zseqused_k must be CUDAc              3  :   K   | ]  }|j                   v   y wr#   rX   rO   rP   supported_dtypess     r   rQ   z-_fa3_forward_support_error.<locals>.<genexpr>   s     Hqqww**H   inputs must be one of r   #all inputs must have the same dtyperR   z(query, key, value must be on same device)
rX   r   int32rM   rY   float16bfloat16rV   rW   rf   )r_   keyvaluera   return_debug_maskalibi_slopes	seqused_krb   rc   rd   re   rP   errorrk   s                @r   _fa3_forward_support_errorrx      s     0+??ekk),  +++U]]ENNKHUC4GHH'(8'9::
eS%01AGG12a74%	UE ..= 2s   C	c
           	        |j                   t        j                  k(  r	 y|j                   t        j                  k7  ryt        j                  t        j
                  ft        fd| ||||hD              sd S t        | ||||hD 
ch c]  }
|
j                    c}
      dk7  ryt        || |||||f||d d d       }||S y c c}
w )NzHFA3 backward does not support fp8 - use inference only (torch.no_grad())zlogsumexp dtype must be float32c              3  :   K   | ]  }|j                   v   y wr#   ri   rj   s     r   rQ   z._fa3_backward_support_error.<locals>.<genexpr>   s     Wqqww**Wrl   rm   r   rn   )	rX   r   rY   float32rp   rq   rV   rW   rf   )grad_outr_   rr   rs   out	logsumexpra   rb   window_size_leftwindow_size_rightrP   rw   rk   s               @r   _fa3_backward_support_errorr      s     {{e)))V	
 %--'0u~~6WXuc5RU4VWW'(8'9::
hsE3?@AGG@AQF4%	5#uc95E  As   CTsc                 &    t        d | D              S )Nc              3  @   K   | ]  }|j                  d d        yw)r      N)	transposerN   s     r   rQ   z#_transpose_dense.<locals>.<genexpr>   s     4qQ"4s   )tuple)r`   s    r   _transpose_denser      s    4G444r   c                R    | $| j                  d      dk7  r| j                         S | S )z2Ensure tensor is contiguous in the last dimension.r   )stride
contiguous)xs    r   _maybe_contiguousr      s&    ]qxx|q/@1<<>GaGr   c                   t         t        d      t        |       }t        |      }|j                  t        j
                  k(  r8|j                  d      dk7  r$|j                  d      dk7  r|j                         n
t        |      }t        |      }t        |      }t        |      }t        |      }t        g |||ddd|||dd||||ddddd||||||	|	nd|
|
nddddd|xs t	        j                         rdnddt        j                  j                         xs d \  }}}}||j                         fS )	zF
    Run the FA3 forward pass by calling the C++ kernel directly.
    NFA3 not registeredr   r   r   rK   T)r   r7   r   rX   r   rY   r   r   $are_deterministic_algorithms_enabledr   _get_sm_carveout_experimental)r_   rr   rs   cu_seq_qcu_seq_kmax_qmax_kscale	is_causalr   r   rv   r}   rc   rd   re   block_table
num_splitsqkvcu_seqlens_qcu_seqlens_ksoftmax_lse	out_accumsoftmax_lse_accums                             r   _fa3_run_forwardr      s`   . /00% A#A ;;%---LL!LL! 	 u%  %X.L$X.L!),I#K0K5B $6	$6	$6 	
$6 		$6
 	$6 	$6 	$6 	$6 	$6 	$6 	$6 	$6 	$6 	$6 	$6  	!$6" 	#$6$ 	%$6& 	'$6( 	)$6* 	+$6, 	-$6. 	/$60 	1$62 	3$64 -8b5$66 /:7$68 	
9$6: 	;$6< 	=$6> 	?$6@ 	 	F;;=A1C$6D 	E$6F 	..05AG$62Ci!2J &&(((r   c                R   t         t        d      t        |       }|j                  d      dk7  r|j	                         n|}|j                  d      dk7  r|j	                         n|}|j                  d      dk7  r|j	                         n|}t        |      }t        |      }t        j                  |      }t        j                  |      }t        j                  |      }t        |||||||||||d d ||	|
|||d|t
        j                  j                         xs d       |||fS )Nr   r   r   rK   r   )	r   r7   r   r   r   r   
empty_liker   r   )r|   r_   rr   rs   r}   r~   r   r   max_seqlen_qmax_seqlen_kr   r   r   r   deterministicdoutr   r   r   olsedqdkdvs                           r   _fa3_run_backwardr   L  s/   " /00 X&D#ll2.!3AJJrNa/SA#ll2.!3A#A
I
&C 
		!	B			!	B			!	B				


..05A-0 r2:r   r   T	r   r   r   rv   ru   r}   r   compute_auxiliaryr   c       	           t        | ||||	||||
||      }|t        d|       t        | |||||||||||||
||||      \  }}|rt        j                  dt        j
                  | j                        }t        j                  dt        j
                  | j                        }t        j                  d| j                  | j                        }nd }d }d }|||||fS )Nz)FA3 flash_attention forward unsupported: )r   )rX   r&   r!   r   )	rx   r7   r   r   zerosuint64r&   emptyrX   )r_   rr   rs   rb   	cum_seq_kr   r   ra   r   rt   rc   rd   re   r   r   r   rv   ru   r}   r   r   r   rw   r   	rng_statephilox_offset
debug_masks                              r   rB   rB     s   2 'E FugNOO%HC( KKELLN	Bell5<<P[[%++ellK
	
Yz99r   )r   r   r   rv   ru   r   r   c               R    t        |||||||||	|
d d d f|||||| |d|d	\  }}}}}|S )NFr   rB   )r}   r_   rr   rs   rb   r   r   r   ra   r   rt   r   r   r   rv   ru   r   r   r(   r   s                       r   rE   rE     sh    * 8 )+!-OAsAq!0 Jr   )r   r   r   rv   ru   r   r}   r   c
               B    t        | |||||||||	d d d |
|||||||      S )N)r   r   r   rv   ru   r}   r   r   r   )r_   rr   rs   rb   r   r   r   ra   r   rt   r   r   r   rv   ru   r   r}   r   s                     r   rD   rD     sP    * -)+!+ r   )r   r   r   c                   t        | ||||||
|||
      }|t        d|       t        j                         }t	        | |||||||||	||||nd||nd|      \  }}}|||fS )z0FA3 implementation of _flash_attention_backward.z*FA3 flash_attention backward unsupported: r   )r   r7   r   r   r   )r|   r_   rr   rs   r}   r~   rb   r   r   r   ra   r   r   unusedr   r   r   rw   r   r   r   r   s                         r   rG   rG   ,  s    * (E GwOPP>>@M",8b.:JBB" r2:r   r   c	                  t        | ||||d d d |||      }
|
t        d|
       t        | ||      \  }}}| j                  t        j
                  k(  rt        j                  n| j                  }t	        j                  | |      }|j                  dd      }|j                  d      }|j                  d      }t        |||d d ||||||	||||      \  }}}}}| j                  d      }|j                  d      }||d d |||||f	S )NzFA3 SDPA forward unsupported: ri   r   r   )r   r}   rc   rd   re   )rx   r7   r   rX   r   rY   rq   r   r   sizerB   )r_   rr   rs   rc   rd   re   ra   r   rt   r   rw   r   r   r   	out_dtypeout_bhsdout_bshdmax_q_flashmax_k_flashr(   r   r   r   r   r   r   s                             r   rC   rC   g  sA    'E ;E7CDDuc51GAq!
 #(++1D1D"D%++IY7H!!!Q'H&&)K&&)K3T			40AsI}j" JJqMEHHQKE
 
r   c               ,    t        | ||d d d ||||
      S )Nr   )rC   )r_   rr   rs   ra   r   rt   r   s          r   rF   rF     s0     @ r   c                   t        | ||||||
ddd
      }|t        d|       t        | ||||      \  }}}}}t        ||||||dd||	|
||||      \  }}}t        |||      \  }}}|||fS )zCFA3 implementation of _scaled_dot_product_flash_attention_backward.NzFA3 SDPA backward unsupported: r   )r   r7   r   rG   )r|   r_   rr   rs   r}   r~   rb   r   r   r   ra   r   philox_seedr   r   rw   
grad_out_tq_tk_tv_tout_tr   r   r   dq_outdk_outdv_outs                              r   rH   rH     s    & (%eS)YdDE <UGDEE (8%eS($JS#u 4JBB& .b"b9FFF66!!r   FA3)register_fn)r&   ztorch.devicer   int)flash_attn_interface)r.   strr   r   )r.   r   r   r   )r   r
   )r_   torch.Tensorr`   ztuple[torch.Tensor, ...]ra   floatrb   torch.Tensor | Nonerc   r   rd   r   re   r   r   
str | None)r_   r   rr   r   rs   r   ra   r   rt   boolru   r   rv   r   rb   r   rc   r   rd   r   re   r   r   r   )r|   r   r_   r   rr   r   rs   r   r}   r   r~   r   ra   r   rb   r   r   
int | Noner   r   r   r   )r`   z
Unpack[Ts]r   ztuple[Unpack[Ts]])r   r   r   r   )NNNNNN)&r_   r   rr   r   rs   r   r   r   r   r   r   r   r   r   r   float | Noner   r   r   r   r   r   rv   r   r}   r   rc   r   rd   r   re   r   r   r   r   r   r   z!tuple[torch.Tensor, torch.Tensor])F) r|   r   r_   r   rr   r   rs   r   r}   r   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   z/tuple[torch.Tensor, torch.Tensor, torch.Tensor])NNN),r_   r   rr   r   rs   r   rb   r   r   r   r   r   r   r   ra   r   r   r   rt   r   rc   r   rd   r   re   r   r   r   r   r   r   r   rv   r   ru   r   r}   r   r   r   r   r   r   r   )$r}   r   r_   r   rr   r   rs   r   rb   r   r   r   r   r   r   r   ra   r   r   r   rt   r   r   r   r   r   r   r   rv   r   ru   r   r   r   r   r   )$r_   r   rr   r   rs   r   rb   r   r   r   r   r   r   r   ra   r   r   r   rt   r   r   r   r   r   r   r   rv   r   ru   r   r   r   r}   r   r   r   )"r|   r   r_   r   rr   r   rs   r   r}   r   r~   r   rb   r   r   r   r   r   r   r   ra   r   r   r   r   r   r   r   r   r   r   r   r   r   )NNNrK   FF)r_   r   rr   r   rs   r   rc   r   rd   r   re   r   ra   r   r   r   rt   r   r   r   )rK   FF)r_   r   rr   r   rs   r   ra   r   r   r   rt   r   r   r   )r|   r   r_   r   rr   r   rs   r   r}   r   r~   r   rb   r   r   r   r   r   r   r   ra   r   r   r   r   r   r   r   r   r   ).__doc__
__future__r   r3   rZ   typingr   collections.abcr   dataclassesr   	functoolsr   typing_extensionsr   r	   r   torch.libraryr
    r   __all__r   r    r   r   r)   r   r+   r,   rf   rx   r   r   r   r   r   r   rB   rE   rD   rG   rC   rF   rH   register_flash_attention_implr!   r   r   <module>r      s   #     ( !  2  !  #
 "& %!% % ) ) )   .///"/"B""%" " #	"
 #" #" #" "J((	( ( 	(
 ( &( #( #( #( #( #( (V### 
# 	#
 
# # # ## !# "# #L $5H$  $%)%)%)'+!%N)N)	N) N) "	N)
 "N) N) N) N) N) !N) "N) #N) 
N) #N) #N)  #!N)" %#N)$ %N)& ''N)@  888 
8 	8
 
8 8 "8 "8 8 8 8 8 8 8 8  5!8L &*%)%)D: %)(,#'+"!/D:D:	D: D: #	D:
 #D: D: D: D: D: D: #D: #D: #D: D:  !D:" #D:$ #%D:& &'D:( 
)D:* %+D:, -D:. /D:h %)(,'+!'-	-- 
- 	-
 #- #- - - - - - - - -  #!-" &#-$ %%-& '-x %)(,'+#!'++	+ + #	+
 #+ + + + + + + + + #+  &!+" %#+$ 
%+& '+| #'$(%888 
8 	8
 
8 8 #8 #8 8 8 8 8 8 8  !8" !#8$ "%8~ &*%)%)#D DD	D D #	D
 #D #D D D D DV # 	  	
   P !2"2"2" 
2" 	2"
 
2" 2" #2" #2" 2" 2" 2" 2" 2"  2"  !2"j (	 ' ';W Xr   