
    9jA9              
         U d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	m
Z
 ddlmZmZ dd	lmZ e
rdd
lmZ ddlZddlmZ dgZdaded<   e G d d             Zed)d       Z	 d*	 	 	 d+dZed,d       Zd-dZ	 d.	 	 	 	 	 	 	 	 	 d/dZ	 	 d0	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d1dZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d2dZd3dZ  ed      Z!d4dZ"	 	 	 d5	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d6dZ#	 d7	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d8dZ$dddddddddd		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d9dZ%dddddddd 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d:d!Z&dddd"	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d;d#Z'	 	 	 d<dd$	 	 	 	 	 	 	 	 	 	 	 	 	 d=d%Z(dd$	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d>d&Z) ejT                  d'e(       y)?zUBER PROTOTYPE!!!    )annotationsN)	dataclass)cache)AnyTYPE_CHECKING)TypeVarTupleUnpack   )	_registry)
ModuleType)Libraryregister_flash_attention_fa4
str | None_FA4_MODULE_PATHc                       e Zd ZU ded<   ddZy)
_FA4HandlezLibrary | Nonelibraryc                    d | _         y N)r   )selfs    W/media/conek/DATA/Code/OCR/venv/lib/python3.12/site-packages/torch/nn/attention/_fa4.pyremovez_FA4Handle.remove"   s	        N)returnNone)__name__
__module____qualname____annotations__r    r   r   r   r      s    r   r   c                J    t         j                  j                  |       \  }}|S r   )torchcudaget_device_capability)devicemajor_s      r   _get_device_majorr(   &   s    zz//7HE1Lr   c                B    t        |       }| at        t                     S )z
    Register FA4 flash attention kernels with the PyTorch dispatcher.

    Args:
        module_path: Python module path to the FA4 implementation.
    )_fa4_import_moduler   r   _fa4_register_kernels)module_pathr'   s     r   r   r   ,   s#     	;'A"+-..r   c                ~    t        j                  |       }t        |d      rt        |d      st        d|  d      |S )N_flash_attn_fwd_flash_attn_bwdzModule 'z' does not expose FA4 kernels)	importlibimport_modulehasattrRuntimeError)r,   modules     r   r*   r*   ;   sA    $$[1F6,-WVEV5WXk]2OPQQMr   c                    t        ddd      } | j                  dt        d       | j                  dt        d       | j                  dt        d       | j                  dt
        d       | j                  dt        d       | S )	NatenIMPLCUDA_flash_attention_forward+_flash_attention_forward_no_dropout_inplace_flash_attention_backward#_scaled_dot_product_flash_attention,_scaled_dot_product_flash_attention_backward)r   impl!_fa4_flash_attention_forward_impl4_fa4_flash_attention_forward_no_dropout_inplace_impl"_fa4_flash_attention_backward_impl4_fa4_scaled_dot_product_flash_attention_forward_impl5_fa4_scaled_dot_product_flash_attention_backward_impl)libs    r   r+   r+   C   s~    
&&&
)CHH')JFSHH5<
 HH(*LfUHH-<
 HH6=
 Jr   c                   t        d |D              syt        |D ch c]  }|j                   c}      dk7  ry| j                  t        j
                  t        j                  fvry|D ])  \  }}|j                  t        j                  k7  s$| dc S  || j                         dk7  ry|| j                         d	k7  ry
t        j                  j                         syt        | j                        dvryy c c}w )Nc              3  4   K   | ]  }|j                     y wr   )is_cuda.0ts     r   	<genexpr>z,_fa4_common_support_error.<locals>.<genexpr>_   s     *Qqyy*s   zinputs must be CUDA tensorsr
   inputs must share devicez'query dtype must be float16 or bfloat16z dtype must be float32   zdense query must be 4D   zragged query must be 3DzCUDA not available)	   
   z+FA4 requires compute capability 9.0 or 10.0)alllenr%   dtyper"   float16bfloat16float32dimr#   is_availabler(   )querytensors	cum_seq_qrequire_fp32rJ   nametensors          r   _fa4_common_support_errorr_   Y   s     *'**,
g&AHH&'1,){{5==%..998$ 3f<<5==(V1223 UYY[A-'!1(::""$#&g5< 's   C:c
                   |dk7  ry|ry|y|+|j                   t        j                  k7  ry|j                  syt	        | j
                        }
||
dk7  rd|
 d	S |	|	d
kD  r|
dk7  rd|
 d	S t        | | ||f|      }||dk(  ry|S y )N        dropout_p must be 0zreturn_debug_mask must be Falsezalibi_slopes not supportedzseqused_k must be int32zseqused_k must be CUDArP   z+paged KV (block_table) not supported on SM 0r
   z-SplitKV (num_splits > 1) not supported on SM rL   z(query, key, value must be on same device)rS   r"   int32rG   r(   r%   r_   )rY   keyvalue	dropout_preturn_debug_maskalibi_slopes	seqused_kr[   block_table
num_splitsr&   errors               r   _fa4_forward_support_errorrn   s   s     C$0+??ekk),  +ell+E5B;<UG1EE*q.Ub[>ugQGG%	UE
 ..=r   c           	     H    |dk7  ryt        || |||||f|d|ff      }||S y )Nra   rb   	logsumexp)r\   )r_   )	grad_outrY   re   rf   outrp   rg   r[   rm   s	            r   _fa4_backward_support_errorrs      sJ     C$%	5#uc95"I.0	E r   c                    | dk(  rdS | S )z"need to convert -1 to None for FA4Nr    )vals    r   _aten_to_fa4_window_sizerw      s    "94%#%r   Tsc                 &    t        d | D              S )Nc              3  @   K   | ]  }|j                  d d        yw)r
      N)	transposerH   s     r   rK   z#_transpose_dense.<locals>.<genexpr>   s     4qQ"4s   )tuple)rZ   s    r   _transpose_denser~      s    4G444r   c                   t         t        d      t        t               }||t        |	      t        |
      d||||||j	                         nd ||xs d|d} |j
                  | ||fi |\  }}||j	                         fS )NFA4 not registeredTr
   )softmax_scalecausalwindow_size_leftwindow_size_right
return_lsecu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_krj   
page_tablerl   rr   )r   r3   r*   rw   
contiguousr.   )rY   re   rf   cu_seq_qcu_seq_kmax_qmax_kscale	is_causalr   r   rj   rr   rk   rl   r4   kwargslses                     r   _fa4_run_forwardr      s    " /00 01F 45EF56GH  /8/DY))+$! oAF &v%%eS%B6BHC   r   c                    t         t        d      t        t               }|j                  ||||| |j	                         ||	t        |
      t        |      |||      \  }}}|||fS )Nr   )r   r   r   r   r   r   deterministic)r   r3   r*   r/   r   rw   )rq   rY   re   rf   rr   rp   r   r   r   r   r   r   r   r4   dqdkdvs                    r   _fa4_run_backwardr      s     /00 01F''12BC23DE# ( JBB r2:r   T	r   r   r   rj   ri   rr   rk   compute_auxiliaryrl   c
       	           t        | ||||	|||||
      }|t        d|       t        | |||||||
|||||||      \  }}|rt        j                  dt        j
                  | j                        }t        j                  dt        j
                  | j                        }t        j                  d| j                  | j                        }nd }d }d }|||||fS )Nz)FA4 flash_attention forward unsupported: )r{   )rS   r%   r    r   )	rn   r3   r   r"   zerosuint64r%   emptyrS   )rY   re   rf   r[   	cum_seq_kr   r   rg   r   rh   r   r   r   rj   ri   rr   rk   r   rl   rm   r   	rng_statephilox_offset
debug_masks                           r   r?   r?   
  s    , 'E FugNOOHC" KKELLN	Bell5<<P[[%++ellK
	
Yz99r   )r   r   r   rj   ri   rk   rl   c               N    t        |||||||||	|
|||||| |d|      \  }}}}}|S )NFr   )r?   )rr   rY   re   rf   r[   r   r   r   rg   r   rh   r   r   r   rj   ri   rk   rl   r'   r   s                       r   r@   r@   J  sX    * 8)+!'OAsAq!* Jr   )r   r   r   c                   t        | ||||||
|      }|t        d|       t        j                         }t	        | ||||||||||||      \  }}}|||fS )Nz*FA4 flash_attention backward unsupported: )rs   r3   r"   $are_deterministic_algorithms_enabledr   )rq   rY   re   rf   rr   rp   r[   r   r   r   rg   r   r   unusedr   r   r   rm   r   r   r   r   s                         r   rA   rA   w  s    ( (	E GwOPP>>@M"JBB r2:r   r   c                  t        | ||||d d d       }|t        d|       t        | ||      \  }}	}
t        j                  |       }|j                  dd      }|j                  d      }|	j                  d      }t        ||	|
d d |||||||      \  }}}}}| j                  d      }|j                  d      }||d d |||||f	S )NzFA4 SDPA forward unsupported: r
   r{   )r   rr   )rn   r3   r~   r"   
empty_liker|   sizer?   )rY   re   rf   rg   r   rh   r   rm   qkvout_bhsdout_bshdmax_q_flashmax_k_flashr'   r   r   r   r   r   r   s                         r   rB   rB     s    '	E ;E7CDDuc51GAq!
 &H!!!Q'H&&)K&&)K3T			40AsI}j JJqMEHHQKE
 
r   c                  t        | ||||||
d       }|t        d|       t        |||||       \  }}}}}|j                  d      }|j                  d      }	t	        ||||||d d ||	|
||||      \  }}}t        |||      \  }}}|||fS )NzFA4 SDPA backward unsupported: r{   r   )rs   r3   r~   r   rA   )rq   rY   re   rf   rr   rp   r[   r   r   r   rg   r   philox_seedr   r   rm   r   r   r   ogor   r   r   s                           r   rC   rC     s    $ (	E <UGDEE%eS%hGNAq!QJJqMEHHQKE3
				JBB" ""b"-JBBr2:r   FA4)register_fn)r%   ztorch.devicer   int)zflash_attn.cute.interface)r,   strr   r   )r,   r   r   r   )r   r   )r    )
rY   torch.TensorrZ   ztuple[torch.Tensor, ...]r[   torch.Tensor | Noner\   z$tuple[tuple[str, torch.Tensor], ...]r   r   )NN)rY   r   re   r   rf   r   rg   floatrh   boolri   r   rj   r   r[   r   rk   r   rl   
int | Noner   r   )rq   r   rY   r   re   r   rf   r   rr   r   rp   r   rg   r   r[   r   r   r   )rv   r   r   r   )rZ   z
Unpack[Ts]r   ztuple[Unpack[Ts]])NNN) rY   r   re   r   rf   r   r   r   r   r   r   r   r   r   r   float | Noner   r   r   r   r   r   rj   r   rr   r   rk   r   rl   r   r   z!tuple[torch.Tensor, torch.Tensor])F)rq   r   rY   r   re   r   rf   r   rr   r   rp   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   z/tuple[torch.Tensor, torch.Tensor, torch.Tensor])&rY   r   re   r   rf   r   r[   r   r   r   r   r   r   r   rg   r   r   r   rh   r   r   r   r   r   r   r   rj   r   ri   r   rr   r   rk   r   r   r   rl   r   )$rr   r   rY   r   re   r   rf   r   r[   r   r   r   r   r   r   r   rg   r   r   r   rh   r   r   r   r   r   r   r   rj   r   ri   r   rk   r   rl   r   )"rq   r   rY   r   re   r   rf   r   rr   r   rp   r   r[   r   r   r   r   r   r   r   rg   r   r   r   r   r   r   r   r   r   r   r   r   r   )ra   FF)rY   r   re   r   rf   r   rg   r   r   r   rh   r   r   r   )rq   r   rY   r   re   r   rf   r   rr   r   rp   r   r[   r   r   r   r   r   r   r   rg   r   r   r   r   r   r   r   r   r   )+__doc__
__future__r   r0   dataclassesr   	functoolsr   typingr   r   typing_extensionsr   r	    r   typesr   r"   torch.libraryr   __all__r   r   r   r(   r   r*   r+   r_   rn   rs   rw   rx   r~   r   r   r?   r@   rA   rB   rC   register_flash_attention_implr    r   r   <module>r      sx    #  !  % 2     ! #
  $ * #      3///  4 :<	% # 7	
 F (,!%%	% % 	%
 % &% #% #% %% % %P 
 	
 
   # .&
 $5"  $'+!%!%!	%! %! "	%!
 "%! %! %! %! %! !%! "%! #%! 
%! %%! %!  '!%!j  !!! 
! 	!
 
! ! "! "! ! ! !! "! ! 5!` #'$(%)(,#'+"!)=:=:	=: =: #	=:
 #=: =: =: =: =: =: =: !=: "=: #=:  &!=:" 
#=:$ %%=:& '=:( )=:Z #'$(%)(,'+!'*	** 
* 	*
 #* #* * * * * * * !* "*  #!*" &#*$ %%*& '*z #'$(%000 
0 	0
 
0 0 #0 #0 0 0 0 0 0 0  !0" !#0$ "%0n #: ::	: : 	:
 : : :Z !333 
3 	3
 
3 3 #3 #3 3 3 3 3 3  3  !3l (	 ' ';W Xr   