
    9jX              $          d Z ddlZddlmZ ddlmZmZ ddlZ ej                  e	      Z
g dZdee   dz  dee   fdZ ed	
      dedefd       Z G d de      Zej$                  j'                  di       	 	 	 	 	 	 	 d:dej(                  dej(                  dej(                  dej(                  dej(                  dz  dededededz  dee   dz  dedej(                  dz  dej(                  dz  dedz  deej(                  ej(                  ej(                  f   fd       Zej0                  	 	 	 	 	 	 	 d:dej(                  dej(                  dej(                  dej(                  dej(                  dz  dededededz  dee   dz  dedej(                  dz  dej(                  dz  dedz  deej(                  ej(                  ej(                  f   fd        Zddd!ddddd"dej(                  dej(                  dej(                  dej(                  dej(                  dz  deded#edz  dedz  deeef   dedej(                  dz  dej(                  dz  dedz  dej(                  eej(                  ej(                  f   z  fd$Zej$                  j'                  d%d&h      	 	 	 	 	 	 	 d:d&ej(                  dej(                  dej(                  dej(                  dej(                  dej(                  dz  dededededz  dee   dz  dedej(                  dz  dej(                  dz  dedz  dej(                  f d'       Zej0                  	 	 	 	 	 	 	 d:d&ej(                  dej(                  dej(                  dej(                  dej(                  dej(                  dz  dededededz  dee   dz  dedej(                  dz  dej(                  dz  dedz  dej(                  f d(       Zddd!ddddd"d&ej(                  dej(                  dej(                  dej(                  dej(                  dej(                  dz  deded#edz  dedz  deeef   dedej(                  dz  dej(                  dz  dedz  dej(                  eej(                  ej(                  f   z  f d)Zd*ed+eed,f   d-eddfd.Zej$                  j'                  d/i       	 	 d;d0ej(                  dej(                  dej(                  dej(                  d&ej(                  d1ej(                  dej(                  dej(                  dededed2ej(                  dedz  dee   dz  deej(                  ej(                  ej(                  f   fd3       Zej0                  	 	 d;d0ej(                  dej(                  dej(                  dej(                  d&ej(                  d1ej(                  dej(                  dej(                  dededed2ej(                  dedz  dee   dz  deej(                  ej(                  ej(                  f   fd4       Z d*ed0ej(                  d5ej(                  d6ej(                  deej(                  dz  d,f   f
d7Z!ejE                  e!e8       ejF                  jI                  ejJ                  jL                  jN                         dd9l(m)Z)m*Z*m+Z+m,Z, e*e,ejJ                  jZ                  j.                  <   e+e,ejJ                  jZ                  j6                  <   e)e,ejJ                  jZ                  j>                  <   y)<z
Variable-length attention implementation using Flash Attention.

This module provides a high-level Python interface for variable-length attention
that calls into the optimized Flash Attention kernels.
    N)	lru_cache)Any
NamedTuple)varlen_attnvarlen_attn_out
AuxRequestwindow_sizereturnc                 \    | ddg} t        |       dk7  rt        dt        |              | S )N   z$window_size must have length 2, got )len
ValueError)r	   s    Y/media/conek/DATA/Code/OCR/venv/lib/python3.12/site-packages/torch/nn/attention/varlen.py_normalize_window_sizer      s=    2h
;1?K@P?QRSS       )maxsizedevice_indexc                      y)z;Cache device capability check to avoid repeated CUDA calls.F )r   s    r   _should_use_cudnnr      s     r   c                        e Zd ZU dZdZeed<   y)r   z
    Request which auxiliary outputs to compute from varlen_attn.

    Each field is a boolean indicating whether that auxiliary output should be computed.
    FlseN)__name__
__module____qualname____doc__r   bool__annotations__r   r   r   r   r   #   s     Cr   r   ztorch_attn::_varlen_attn)mutates_argsFquerykeyvaluecu_seq_qcu_seq_kmax_qmax_k	is_causalscale
enable_gqa	seqused_kblock_table
num_splitsc                    t        |	      }	| j                  xr t        | j                  j                        }|rt
        j                  d       |
rt        d      |t        d      |	d   dk7  s|	d   dk7  rt        d      ||t        d	      t        j                  j                  j                  | ||d||||d
d|d|      }|d   |d   |d   }}}nZt
        j                  d       t        j                  j                  j                  | ||||||d|d||	d   |	d   |||      \  }}}}}t        j                  dt        j                  | j                        }|||fS )z
    Private custom op for variable-length attention.

    This is the internal implementation. Users should use the public varlen_attn function instead.
    #Using cuDNN backend for varlen_attnz,GQA is not supported with the cuDNN backend.Nz3num_splits is not supported with the cuDNN backend.r   r      TcuDNN backend does not support window attention. Please use Flash Attention backend.zBseqused_k/block_table is not yet supported with the cuDNN backend.T        Fr*      -Using Flash Attention backend for varlen_attn)return_debug_maskr*   window_size_leftwindow_size_rightr,   r-   r.   r   dtypedevice)r   is_cudar   r=   indexloginfoRuntimeErrortorchopsaten_cudnn_attention_forward_flash_attention_forwardzerosuint64)r"   r#   r$   r%   r&   r'   r(   r)   r*   r	   r+   r,   r-   r.   	use_cudnnresultoutputsoftmax_lse	rng_state_
rng_state_s                        r   _varlen_attnrQ   -   s   , )5KG"3ELL4F4F"GI67MNN!TUUq>R;q>R#7f   K$; T  88 9 
  *0F1IvayY@A/4yy~~/V/V#(^)!n#!! 0W 0
,Y1& ELLJ ;
**r   c                    t        |	      }	t        j                  |       }| j                  d      }| j                  d      }t        j                  ||ft        j
                  | j                        }t        j                  j                  rt        j                  j                         }|t        j                  j                  j                  k(  rG|j                  d      dz
  }t        j                  |||ft        j
                  | j                        }t        j                  dt        j                  | j                        }|||fS )z
    Fake implementation for meta tensor computation and tracing.

    Based on the 3D varlen path from meta__flash_attention_forward:
    - query shape: (total, num_heads, head_dim)
    - logsumexp shape: (num_heads, total_q)
    r   r1   r;   r:   )r   rC   
empty_likesizeemptyfloatr=   versionhip_C_get_rocm_fa_preferred_backend_ROCmFABackendAOTritonrI   )r"   r#   r$   r%   r&   r'   r(   r)   r*   r	   r+   r,   r-   r.   rL   total_q	num_heads	logsumexp	preferred
batch_sizerN   s                        r   _varlen_attn_fakerb      s    0 )5K e$F jjmG

1I	GEKKI }}HH;;=	//888!q)A-JY.ekk%,,I DU\\JI9i''r   )r   r   )
return_auxr*   r	   r+   r,   r-   r.   rc   c                   | j                  d      }||j                  d      n|j                  d      }|
s||k7  rt        d| d| d      |
r||z  dk7  rt        d| d| d      |	d	k(  }t        j                  j                  j                  | ||||||||t        |	      |
|||      \  }}}||j                  r||fS |S )
a  Compute variable-length attention using Flash Attention.

    This function is similar to scaled_dot_product_attention but optimized for
    variable-length sequences using cumulative sequence position tensors.

    Args:
        query (Tensor): Query tensor; shape :math:`(T_q, H_q, D)`
        key (Tensor): Key tensor; shape :math:`(T_k, H_{kv}, D)`, or
            :math:`(\text{total\_pages}, \text{page\_size}, H_{kv}, D)` when ``block_table`` is provided.
        value (Tensor): Value tensor; shape :math:`(T_k, H_{kv}, D)`, or
            :math:`(\text{total\_pages}, \text{page\_size}, H_{kv}, D)` when ``block_table`` is provided.
        cu_seq_q (Tensor): Cumulative sequence positions for queries; shape :math:`(N+1,)`
        cu_seq_k (Tensor): Cumulative sequence positions for keys/values; shape :math:`(N+1,)`
        max_q (int): Maximum query sequence length in the batch.
        max_k (int): Maximum key/value sequence length in the batch.
        return_aux (Optional[AuxRequest]): If not None and ``return_aux.lse`` is True, also returns the logsumexp tensor.
        scale (float, optional): Scaling factor for attention scores
        window_size (tuple[int, int], optional): Window size for sliding window attention as (left, right).
            Use (-1, -1) for full attention (default), (-1, 0) for causal attention,
            or (W, 0) for causal attention with sliding window of size W.
        enable_gqa (bool): If set to True, enables Grouped Query Attention (GQA)
            and allows key/value to have fewer heads than query.
            Each KV head is shared by a group of :math:`H_q / H_{kv}` query heads,
            so :math:`H_q` must be divisible by :math:`H_{kv}`.
            Default is False.
        seqused_k (Tensor, optional): Number of valid KV tokens per batch element; shape :math:`(N,)`.
            When set, only the first ``seqused_k[i]`` tokens in the key/value sequence for batch
            element *i* participate in attention. Useful for KV-cache decoding where the cache slot
            is larger than the actual sequence. Inference-only (not supported in backward).
        block_table (Tensor, optional): Block table for paged KV cache; shape
            :math:`(N, \text{max\_pages\_per\_seq})`, dtype ``int32``.
            Requires ``seqused_k``. Inference-only (not supported in backward).

            When ``block_table`` is provided, ``key`` and ``value`` are a "pool" of
            pages of tokens of KV data and the pages belong to any sequence/order.
            The ``block_table`` is what maps each sequence's logical chunks
            back to physical pages in this pool.

            ``seqused_k[i]`` tells the kernel how many tokens in sequence *i* are
            actually valid, since the last page is typically only partially filled.
        num_splits (int, optional): Number of splits for split-KV. Set to ``1``
            to disable split-KV which enables batch invariance. Split-KV
            parallelizes the key/value sequence dimension across multiple thread
            blocks and combines partial results. The split decision depends
            on ``max_k`` (the longest sequence in the batch), so different batch
            compositions can change the reduction order and produce different
            floating-point results for the same sequence. When this is disabled,
            bitwise identical outputs are guaranteed for a given sequence
            regardless of what other sequences are in the batch, at the
            cost of lower GPU utilization when there are few queries. When
            ``None`` (default), the kernel chooses automatically.

    Returns:
        output (Tensor): Output tensor from attention computation; shape :math:`(T_q, H_q, D)`.

        If ``return_aux`` is not None and ``return_aux.lse`` is True:
            lse (Tensor): Log-sum-exp of attention scores; shape :math:`(T_q, H_q)`.

    Shape legend:
        - :math:`N`: Batch size
        - :math:`T_q`: Total number of query tokens in the batch (sum of all query sequence lengths)
        - :math:`T_k`: Total number of key/value tokens in the batch (sum of all key/value sequence lengths)
        - :math:`H_q`: Number of query attention heads
        - :math:`H_{kv}`: Number of key/value attention heads (equal to :math:`H_q` unless GQA is enabled)
        - :math:`D`: Head dimension

    Example::

        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
        >>> batch_size, max_seq_len, embed_dim, num_heads = 2, 512, 1024, 16
        >>> head_dim = embed_dim // num_heads
        >>> seq_lengths = []
        >>> for _ in range(batch_size):
        ...     length = torch.randint(1, max_seq_len // 64 + 1, (1,)).item() * 64
        ...     seq_lengths.append(min(length, max_seq_len))
        >>> seq_lengths = torch.tensor(seq_lengths, device="cuda")
        >>> total_tokens = seq_lengths.sum().item()
        >>>
        >>> # Create packed query, key, value tensors
        >>> query = torch.randn(
        ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
        ... )
        >>> key = torch.randn(
        ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
        ... )
        >>> value = torch.randn(
        ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
        ... )
        >>>
        >>> # Build cumulative sequence tensor
        >>> cu_seq = torch.zeros(batch_size + 1, device="cuda", dtype=torch.int32)
        >>> cu_seq[1:] = seq_lengths.cumsum(0)
        >>> max_len = seq_lengths.max().item()
        >>>
        >>> # Call varlen_attn
        >>> output = varlen_attn(
        ...     query, key, value, cu_seq, cu_seq, max_len, max_len
        ... )
    r1   r   GExpect query and key/value to have the same number of heads but got Hq=	 and Hkv=&. Try setting enable_gqa=True for GQA.r   MExpect number of query heads to be a multiple of kv heads for GQA but got Hq=.r   r   )rT   r   rC   rD   
torch_attnrQ   listr   )r"   r#   r$   r%   r&   r'   r(   rc   r*   r	   r+   r,   r-   r.   num_heads_qnum_heads_kr)   outr   rO   s                       r   r   r      s   j **Q-K!,!8#((1+chhqkK+4%i} =34
 	

 kK/14%i}A?
 	

 w&I))&&33[KCa  *..CxJr   ztorch_attn::_varlen_attn_outro   c                 :   t        |
      }
|j                  xr t        |j                  j                        }|rt        d      t        j                  d       t        j                  j                  j                  | |||||||d|d|	|
d   |
d   |||      }|S )z
    Private custom op for variable-length attention with pre-allocated output.
    Same as _varlen_attn but writes the attention output into the provided out tensor.
    z+cuDNN backend does not support out variant.z1Using Flash Attention backend for varlen_attn_outr3   Fr   r1   )r*   r8   r9   r,   r-   r.   )r   r>   r   r=   r?   rB   r@   rA   rC   rD   rE   +_flash_attention_forward_no_dropout_inplace)ro   r"   r#   r$   r%   r&   r'   r(   r)   r*   r	   r+   r,   r-   r.   rJ   rM   s                    r   _varlen_attn_outrr   R  s    , )5KG"3ELL4F4F"GIHIIHH@A))..LL$Q%a.# M K( r   c                    |j                  d      }|j                  d      }t        j                  ||ft        j                  |j                        }t        j
                  j                  rt        j                  j                         }|t        j                  j                  j                  k(  rG|j                  d      dz
  }t        j                  |||ft        j                  |j                        }|S )F
    Fake implementation for meta tensor computation and tracing.
    r   r1   r;   )rT   rC   rU   rV   r=   rW   rX   rY   rZ   r[   r\   )ro   r"   r#   r$   r%   r&   r'   r(   r)   r*   r	   r+   r,   r-   r.   r]   r^   r_   r`   ra   s                       r   _varlen_attn_out_fakeru     s    * jjmG

1I	GEKKI }}HH;;=	//888!q)A-JY.ekk%,,I r   c                   |j                  d      }||j                  d      n|j                  d      }|s||k7  rt        d| d| d      |r||z  dk7  rt        d| d| d      |
d	k(  }t        j                  j                  j                  | |||||||||	t        |
      ||||      }||j                  r| |fS | S )
zCompute variable-length attention using Flash Attention with a pre-allocated output tensor.

    Same as :func:`varlen_attn` but writes the attention output into the provided ``out`` tensor
    instead of allocating a new one.

    r1   r   re   rf   rg   r   rh   ri   rj   )rT   r   rC   rD   rk   rr   rl   r   )ro   r"   r#   r$   r%   r&   r'   r(   rc   r*   r	   r+   r,   r-   r.   rm   rn   r)   r   s                      r   r   r     s   0 **Q-K!,!8#((1+chhqkK+4%i} =34
 	

 kK/14%i}A?
 	

 w&I
))


/
/[C" *..CxJr   ctxinputs.rL   c                     |\  }}}}}}}	}
}}}}}}|\  }}}|t        d      |t        d      | j                  ||||||||       || _        |	| _        |
| _        || _        || _        y )Nz)seqused_k is an inference-only parameter.z+block_table is an inference-only parameter.)rB   save_for_backwardr'   r(   r)   r*   r	   )rw   rx   rL   r"   r#   r$   r%   r&   r'   r(   r)   r*   r	   r+   r,   r-   r.   ro   r   rN   s                       r   _setup_contextr{     s      	 CiFGGHII%eXxc9UCICICMCI!COr   z!torch_attn::_varlen_attn_backwardgrad_outr   rN   c                 N   t        |      }t        j                  d|j                        }|j                  xr t        |j                  j                        }|rmt        j                  d       |d   dk7  s|d   dk7  rt        d      t        j                  j                  j                  | |||||||||	d|
|||      \  }}}nYt        j                  d	       t        j                  j                  j                  | |||||||||	d|
||||d   |d   
      \  }}}|||fS )Nr   )r=   r0   r   r1   r2   r3   r4   r6   )r*   r8   r9   )r   rC   rU   r=   r>   r   r?   r@   rA   rB   rD   rE   _cudnn_attention_backward_flash_attention_backward)r|   r"   r#   r$   ro   r   r%   r&   r'   r(   r)   rN   r*   r	   unusedrJ   dqdkdvs                      r   _varlen_attn_backwardr     sD   " )5K[[5<<0FG"3ELL4F4F"GI67q>R;q>R#7f  YY^^== > 

B$ 	@AYY^^==(^)!n# > 

B& r2:r   c                     t        |      }t        j                  |      }t        j                  |      }t        j                  |      }|||fS )rt   )r   rC   rS   )r|   r"   r#   r$   ro   r   r%   r&   r'   r(   r)   rN   r*   r	   
grad_querygrad_key
grad_values                    r   _varlen_attn_backward_faker   Q  sK    ( )5K!!%(J$H!!%(Jx++r   grad_lsegrad_rngc                 2   | j                   \  }}}}}}	}
}| j                  }| j                  }| j                  }| j                  }| j
                  }t        j                  j                  j                  |||||	|
||||||||      \  }}}d}|||gd|z  S )N   )N)
saved_tensorsr'   r(   r)   r*   r	   rC   rD   rk   r   )rw   r|   r   r   r"   r#   r$   r%   r&   ro   r   rN   r'   r(   r)   r*   r	   r   r   r   
num_paramss                        r   	_backwardr   n  s     BEARAR>E3x3YIIEIIEIIIE//K%%;;JBB$ JB0'J.00r   )setup_context)_varlen_attn_backward_flop_varlen_attn_forward_flop_varlen_attn_out_flopflop_registry)FNNFNNN)NN).r   logging	functoolsr   typingr   r   rC   	getLoggerr   r@   __all__rl   intr   r   r   r   library	custom_opTensorrV   tuplerQ   register_fakerb   r   rr   ru   r   r{   r   r   r   register_autograd_dynamodisallow_in_graphrD   rE   rq   torch.utils.flop_counterr   r   r   r   rk   r   r   r   <module>r      sc
     "  g!
:S	D(8 T#Y  1C D  
  3"E $(%)'+!V+<<V+	V+ <<V+ ll	V+
 llT!V+ V+ V+ V+ 4<V+ cT!V+ V+ ||d"V+ $V+ d
V+ 5<<u||34V+ FV+r  $(%)'+!.(<<.(	.( <<.( ll	.(
 llT!.( .( .( .( 4<.( cT!.( .( ||d".( $.( d
.( 5<<u||34.( .(t %)#+%)'+!V<<V	V <<V ll	V
 llT!V V V T!V 4<V sCxV V ||d"V $V d
V  \\E%,,455!Vr 7ugN $(%)'+!2	2<<2 
2 <<	2
 ll2 llT!2 2 2 2 4<2 cT!2 2 ||d"2 $2 d
2  \\!2 O2j  $(%)'+!"	"<<" 
" <<	"
 ll" llT!" " " " 4<" cT!" " ||d"" $" d
"  \\!"  "^ %)#+%)'+!!:	:<<: 
: <<	:
 ll: llT!: : : T!: 4<: sCx: : ||d": $:  d
!:" \\E%,,455#:z" "U38_ "c "d "B <2N $(AllA<<A 
A <<	A
 
A 
A llA llA A A A ||A 4<A cT!A 5<<u||34A OAH $$ $(,ll,<<, 
, <<	,
 
, 
, ll, ll, , , , ||, 4<, cT!, 5<<u||34, %,81	11051HM1
5<<$#$1B   y  G   	IINN>>  4Meii""// 07Leii""33 4<Veii""88 9r   