
    9j:                      U d Z ddlmZ ddlZddlZddlZddlZddlZddlZddl	Z	ddl
Z
ddlmZ ddlmZ ddl	mZmZmZmZmZmZmZ ddlmZmZmZmZmZ ddlZddlmZ dd	lmZ  dd
l!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z(m)Z)m*Z* e	jV                  rddl,m-Z- ddl.m/Z/ dZ0 e1       Z2de3d<   e4f	 	 	 	 	 	 	 dSdZ5g dZ6eeeeeegef   Z7eeeeegef   Z8ed   Z9de3d<    ed      Z: G d ded      Z; G d de;d      Z< G d de      Z= G d d e      Z> G d! d"e      Z?	 	 	 	 dTd#Z@	 	 	 dU	 	 	 	 	 	 	 	 	 	 	 dVd%ZA	 	 	 	 	 	 	 	 	 	 	 	 dWd&ZB	 	 	 	 	 	 	 	 	 	 dXd'ZC	 	 	 	 	 	 	 	 	 	 dYd(ZDd)ZEd*ZFdZd+ZGd[d,ZH	 	 	 	 	 	 d\d-ZI	 	 	 	 	 	 	 	 	 	 d]d.ZJ e'd$      d/   ZK G d0 d1      ZL eL       ZM G d2 d3e	j$                        ZN G d4 d5e	j$                        ZO	 d^	 	 	 d_d6ZPd7 ZQ G d8 d9      ZR G d: d;      ZSd`d<ZTdad=ZUeEeEdf	 	 	 	 	 	 	 	 	 dbd>ZVdcd?ZWdcd@ZXeEeEf	 	 	 	 	 	 	 dddAZYeEeEf	 	 	 	 	 	 	 	 	 	 	 dedBZZ	 d^	 	 	 	 	 	 	 	 	 	 	 	 	 dfdCZ[deEdf	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dgdDZ\dhdEZ]	 d^	 	 	 	 	 	 	 	 	 	 	 	 	 didFZ^djdGZ_djdHZ`	 	 	 	 	 	 	 	 dkdIZae	 	 	 	 	 	 dldJdK	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dmdL       Ze edMebN      	 	 	 	 	 	 dldJdK	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dndO              Ze	 	 	 	 	 	 dl	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dodP       Ze	 	 	 	 	 	 dl	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dpdQ       Z	 	 	 	 	 	 dqddK	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 drdRZy)szIThis module implements the user facing API for flex_attention in PyTorch.    )annotationsN)Callable)Enum)AnycastLiteral
NamedTupleoverload	TypeAliasTypeVar)
deprecatedNeverNotRequiredSelf	TypedDict)Tensor)flex_attention)setup_compilation_env)_validate_sdpa_input)
GetAttrKeytree_flattentree_map_onlytree_unflattenTreeSpec)DeviceLikeType)BaseArgumentTypesFzset[str]_WARNINGS_SHOWNc                    | t         vrLt        j                  j                         st	        j
                  ||d       t         j                  |        yy)z=Helper to ensure each warning is shown only once per process.   
stacklevelN)r   torchcompileris_compilingwarningswarnadd)
warning_idmessagecategorys      a/media/conek/DATA/Code/OCR/venv/lib/python3.12/site-packages/torch/nn/attention/flex_attention.py
_warn_oncer,   <   s?     (~~**,MM'8:J' )    )
	BlockMaskr   	AuxOutput
AuxRequestFlexKernelOptionscreate_block_maskcreate_maskor_masks	and_masks	noop_mask)AUTOTRITONFLASHTRITON_DECODEr   _Backend_Rc                      e Zd ZU dZded<   	 ded<   	 ded<   	 ded<   	 ded<   	 ded<   	 ded	<   	 ded
<   	 ded<   	 ded<   	 ded<   	 ded<   	 ded<   	 ded<   	 ded<   	 ded<   	 ded<   	 ded<   y)r1   a  Options for controlling the behavior of FlexAttention kernels.

    These options are passed to the underlying Triton kernels to control performance
    and numerical behavior. Most users will not need to specify these options as the
    default autotuning provides good performance.

    The options can be prefixed with ``fwd_`` or ``bwd_`` to apply only to forward or
    backward pass respectively. For example: ``fwd_BLOCK_M`` and ``bwd_BLOCK_M1``.

    Note:
      We currently do not provide any backward compatibility guarantees for these options.
      That being said most of these have remained pretty stable since their introduction. But
      We do not consider this part of the public API just yet. We think that some documentation
      Is better than secret hidden flags, but we may change these options in the future.

    Example Usage:
        .. code-block:: python

            # Using dictionary (backward compatible)
            kernel_opts = {"BLOCK_M": 64, "BLOCK_N": 64, "PRESCALE_QK": True}
            output = flex_attention(q, k, v, kernel_options=kernel_opts)

            # Using TypedDict (recommended for type safety)
            from torch.nn.attention.flex_attention import FlexKernelOptions

            kernel_opts: FlexKernelOptions = {
                "BLOCK_M": 64,
                "BLOCK_N": 64,
                "PRESCALE_QK": True,
            }
            output = flex_attention(q, k, v, kernel_options=kernel_opts)

            # Forward/backward specific options
            kernel_opts: FlexKernelOptions = {
                "fwd_BLOCK_M": 64,
                "bwd_BLOCK_M1": 32,
                "PRESCALE_QK": False,
            }
            output = flex_attention(q, k, v, kernel_options=kernel_opts)
    zNotRequired[int]	num_warps
num_stagesBLOCK_MBLOCK_NBLOCK_M1BLOCK_N1BLOCK_M2BLOCK_N2zNotRequired[bool]PRESCALE_QKROWS_GUARANTEED_SAFEBLOCKS_ARE_CONTIGUOUSWRITE_DQFORCE_USE_FLEX_ATTENTIONUSE_TMAkpackmatrix_instr_nonkdimwaves_per_euzNotRequired[_Backend]BACKENDN__name__
__module____qualname____doc____annotations__ r-   r+   r1   r1   Y   s    'V  L ! N b b
 , , , , #"6 ,+Q
 -,A  G 0/- / 1**;""1""r-   r1   )totalc                  "    e Zd ZU ded<   ded<   y)_KernelOptionsWithInternalsboolOUTPUT_LOGSUMEXP
OUTPUT_MAXN)rQ   rR   rS   rU   rV   r-   r+   rY   rY      s    r-   rY   c                  .    e Zd ZU dZdZded<   dZded<   y)r0   zRequest which auxiliary outputs to compute from flex_attention.

    Each field is a boolean indicating whether that auxiliary output should be computed.
    FrZ   lse
max_scoresNrQ   rR   rS   rT   r^   rU   r_   rV   r-   r+   r0   r0      s    
 CJr-   r0   c                  .    e Zd ZU dZdZded<   dZded<   y)r/   zAuxiliary outputs from flex_attention operation.

    Fields will be None if not requested, or contain the tensor if requested.
    NTensor | Noner^   r_   r`   rV   r-   r+   r/   r/      s    
 C $J$r-   r/   c                      e Zd ZdZdZdZdZy)_ModificationTypezEnum for the type of modification function.
    - SCORE_MOD: score_mod function which accepts a score as the first argument
    - mask_mod: mask function which does not accept a score and is only used for generating
    block mask
       r      N)rQ   rR   rS   rT   	SCORE_MODMASK_MODUNKNOWNrV   r-   r+   rd   rd      s     IHGr-   rd   c                   t        | d      rG| j                  }|j                  }d}t        | d      r| j                  xs d}t	        |      }||z
  }n=t        d t        j                  |       j                  j                         D              }|dk7  r|dk7  rt        d|       |dk(  rt        j                  S |dk(  rt        j                  S t        j                  S )a\  Get the type of modification function.
    This function inspects the number of positional arguments of the function to determine
    the type of modification function. If the function has 5 positional arguments, it is
    considered as a score_mod function. If the function has 4 positional arguments, it is
    considered as a mask function.
    __code__rV   __defaults__c              3  l   K   | ],  }|j                   t        j                  j                  u rd  . yw)re   N)defaultinspect	Parameterempty).0params     r+   	<genexpr>z _get_mod_type.<locals>.<genexpr>  s0      "
}} 1 1 7 77 "
s   24      z%Expected 4 or 5 positional args, got )hasattrrk   co_argcountrl   lensumro   	signature
parametersvaluesAssertionErrorrd   rg   rh   ri   )fncodenum_positional_totaldefaultsnum_defaultsnum_positional_argss         r+   _get_mod_typer     s     r:{{#//2~&,"H8}2\A! "
 **2.99@@B"
 

 a$71$<34G3HI
 	
 a ***		! ))) (((r-   rV   c                x    g }g d}|r|dgz  }|dgz  }|D ]   }t        j                  | ||z   |z   |      } " | S )a  Used to vmap both score_mods and mask_mods over 4-dimensional/5-dimension inputs.
    Mapping over the [b, hq, q_idx, kv_idx] or [b, hkv, g, q_idx, kv_idx] dimensions.

    Args:
        fn (callable): The function to vmap.
        prefix (tuple): The prefix of the vmap. For score mod functions,
                        this should be set to (0,). For mask_mods = ()
        suffix (tuple): We need to add (0,) if gradOut is being mapped over,
                        and (None,) * len(other_buffers).
        out_dims (tuple): For forward cases, keep this as the default 0 since
                          we are only returning 1 output. For backwards, the joint
                          graph returns grads for B, H, Q_idx, KV_idx and other_buffers,
                          so we set this to (0, None, None, None, None) + (None,) * len(other_buffers).

    Returns:
        callable: The vmapped function.
    ))NNNr   )NNr   NNr   NNr   )r   NNN)in_dimsout_dims)r"   vmap)r   prefixsuffixr   	group_dim
dimensionsdimss          r+   _vmap_for_bhqkvr   (  sp    2 OQJJ !
 	

  J  OZZFTMF$:XNOIr-   c                    | S NrV   )scorebatchheadtoken_qtoken_kvs        r+   	_identityr   V  s	     Lr-   c                Z    | j                  dt        j                  | j                        S )zReturns a noop mask_modrV   )sizedtypedevice)new_onesr"   rZ   r   r   r   r   r   s       r+   r6   r6   `  s!     >>rELL>IIr-   c                    t        d      )z
    Raises helpful error when using mask_mod from a sliced BlockMask.

    After slicing a BlockMask, the mask_mod is reset and cannot be used directly.
    Users must reassign mask_mod from the original (unsliced) BlockMask.
    a&  Cannot use mask_mod from a sliced BlockMask. When you slice a BlockMask using [], the mask_mod attribute is reset. You must set it from the original BlockMask's mask_mod.

Incorrect usage:
  base_mask = create_block_mask(my_mask_fn, ...)
  sliced_mask = base_mask[:, :, block_idx]
  sliced_mask.mask_mod = apply_offset(sliced_mask.mask_mod, offset)  # WRONG!

Correct usage:
  base_mask = create_block_mask(my_mask_fn, ...)
  sliced_mask = base_mask[:, :, block_idx]
  sliced_mask.mask_mod = apply_offset(base_mask.mask_mod, offset)  # Use base_mask!)RuntimeErrorr   s       r+   _sliced_mask_mod_errorr   j  s     
	` r-      i   @c                  	 |j                   d   	|j                   d   | j                   d d }| j                  	fd}|}t        t        |            D ]  }t	        j
                  |d      }  || |      }|S )Nc                   |j                  	dz   t        j                        }t        j                  	t        j                        j                  d      }t        j                  t        j                        }|| j                  d      k  }t        j                  ||      }|j                  d      |||f<   |d d d f   j                         S )Nre   r   r   r   r   rV   )		new_zerosr"   int32arangeint	unsqueezewherer   
contiguous)
kv_num_blocks
kv_indices
dense_maskrow_indices	col_range
index_maskvalid_indicesr   num_colsnum_rowss
          r+   create_dense_onez+_ordered_to_dense.<locals>.create_dense_one  s    ))(HqL)T
ll8599VLVV
 LL6J	!8!8!<<
 J
HE 2<1D1DR1H
;-.!YhY,'2244r-   )r   r   )r   )shaper   rangery   r"   r   )
num_blocks_in_rowcol_indices
batch_dimsr   create_dense_batched_outr   r   r   s
          @@@r+   _ordered_to_denser     s      $H  $H"(("-J%%F5  ,3z?# P$zz*>OP 0+
>CJr-   c                T   | j                  t        j                        } | j                  d      }t        j                  | ddd      }|j                  t        j                  t        j
                        |j                  t        j                  t        j
                        fS )Nr   r   dimT)r   
descendingstable)memory_format)tor"   r   rz   argsortcontiguous_format)r   r   r   s      r+   _dense_to_orderedr     s|    U[[1J"2.--
tDQKU[[8O8OPu{{%2I2IJ r-   c                P    t        | |      }t        |j                  dd            S )Nr   r   )r   r   	transpose)r   r   denses      r+   _transpose_orderedr     s'     /=EU__R455r-   c                   |d d d d d |d |f   }| d d d d d |f   } t        j                  | |k  | |      } t        j                  || d d d d d d d f   k  d      j                  t         j                        } | |fS )Nr   r   )r"   r   rz   r   r   )
num_blocksindicesnew_num_rowsnew_num_colss       r+   _adjust_num_blocks_and_indicesr     s     aM\M=L=89GAq-<-/0JZ,6
LQJ7Z1a%>>BGJJ5;;WJwr-   re   c                      e Zd ZdZdZddZy)_ExtractedLeafzSentinel in _StrippedClosure.leaf_entries marking a position that is
    filled from the extracted pytree leaves list during reconstruction.rV   c                     y)N_EXTRACTED_LEAFrV   selfs    r+   __repr__z_ExtractedLeaf.__repr__  s     r-   Nreturnstr)rQ   rR   rS   rT   	__slots__r   rV   r-   r+   r   r     s    K I!r-   r   c                  0    e Zd ZU dZded<   ded<   ded<   y)	_FunctionLeafzEntry in _StrippedClosure.leaf_entries for a recursively processed
    function.  Stores enough information to reconstruct the function from
    the extracted leaves during unflattening.z%_StrippedClosure | Callable[..., Any]strippedr   closure_specr   n_extractedNrP   rV   r-   r+   r   r     s    1 43r-   r   c                  b    e Zd ZU dZded<   ded<   ded<   ded<   d	ed
<   ded<   ded<   ded<   y)_StrippedClosureu(  Data container holding the parts of a function needed for reconstruction.

    Created by _extract_closure_pytree when closure tensors are lifted into
    pytree leaves.  Unlike a FunctionType with None-filled cells, this is not
    callable — it is pure data stored in the pytree context.
    ztypes.CodeTyper   zdict[str, Any]globals_dictr   namequalnameztuple[Any, ...] | Noner   zdict[str, Any] | None
kwdefaults
extra_dictz*tuple[_ExtractedLeaf | _FunctionLeaf, ...]leaf_entriesNrP   rV   r-   r+   r   r     s9       
IM$$%% =<r-   r   c                   t        j                  |       rt        j                  j	                         r	dt
        | fS |
t               }t        |       |v r	dt
        | fS |j                  t        |              | j                  }|s	dt
        | fS 	 t        d |D              }t        |      \  }}g }g }|D ]  }t        j                  |      rGt        ||      \  }	}
}|j                  |	       |j                  t!        ||
t#        |	                   _|j                  |       |j                  t$                t'        | j(                  | j*                  | j,                  | j.                  | j0                  | j2                  | j4                  rt7        | j4                        ni t        |            }t        |      ||fS # t        $ r dt
        | fcY S w xY w)aw  Extract closure contents as a flattened sub-pytree.

    Returns (extracted_leaves, closure_spec, fn_or_stripped) where:
    - extracted_leaves: flattened non-function contents from the closure,
      plus any tensors/scalars recursively extracted from nested function
      closures
    - closure_spec: TreeSpec describing how to reconstruct the closure contents
    - fn_or_stripped: either the original fn (no extraction) or a
      _StrippedClosure carrying the function parts needed for reconstruction

    Functions found among the closure leaves are recursively processed: their
    own closure tensors are extracted into the leaves list, and their skeleton
    is stored in _StrippedClosure.leaf_entries as a _FunctionLeaf.  All other
    values (tensors, scalars, None, etc.) remain as extracted leaves.

    If fn is not a plain function, has no closure, or has empty cells, returns
    the original function unchanged with no closure leaves.

    Skipped under Dynamo tracing (torch.compiler.is_compiling) because Dynamo
    can't trace through closure cell introspection and handles freevars via its
    own lifting mechanism.
    rV   c              3  4   K   | ]  }|j                     y wr   )cell_contents)rr   cells     r+   rt   z*_extract_closure_pytree.<locals>.<genexpr>  s     @++@s   )r   r   r   r   r   r   r   r   )ro   
isfunctionr"   r#   r$   _EMPTY_CLOSURE_SPECsetidr'   __closure__tuple
ValueErrorr   _extract_closure_pytreeextendappendr   ry   r   r   rk   __globals__rQ   rS   rl   __kwdefaults____dict__dict)r   _seenclosurecontentsclosure_leavesr   	extractedr   leafchild_extracted
child_specchild_strippedr   s                r+   r   r     s   6 b!U^^%@%@%B&** }	"v&**	IIbfnnG&**+@@@
 $0#9 NL)+I9;L 1d#:Qe;7OZ _-nj#o:NO T"01  [[^^[[$$(*4$<(	H \833A  +&**+s   G GGc                   t        | t              s| S g }d}| j                  D ]}  }t        |t              rRt	        |j
                  ||||j                  z    |j                        }|j                  |       ||j                  z  }e|j                  ||          |dz  } t        ||      }t        d |D              }t        j                  | j                  | j                  | j                  | j                   |      }	| j"                  |	_        | j&                  r| j&                  |	_        | j*                  r%|	j,                  j/                  | j*                         |	S )zJRebuild a function from a _StrippedClosure and flattened extracted leaves.r   re   c              3  F   K   | ]  }t        j                  |        y wr   )typesCellType)rr   vs     r+   rt   z*_reconstruct_closure_fn.<locals>.<genexpr>Y  s     :AennQ':s   !)
isinstancer   r   r   _reconstruct_closure_fnr   r   r   r   r   r   r  FunctionTyper   r   r   r   r   rS   r   r   r   r   update)
r   extracted_leavesr   
all_leavesidxentrychild_fnr  	new_cellsrestoreds
             r+   r  r  C  sI   h 01?AJ
C&& e]+. sU->->'>?""H
 h'5$$$C .s341HC j,7H:::I!!H %--H"*"5"5  !4!45Or-   c                  >    e Zd ZdZdZd	d
dZddZddZddZddZ	y)_MaskModWrapperu  Wraps a mask_mod or _StrippedClosure with value-based equality.

    BlockMask stores an arbitrary callable (mask_mod) in its pytree context.
    The default __eq__ for functions uses identity comparison, which is too
    strict when the same closure is recreated (e.g., defined inside forward()).

    When closure tensors have been extracted (by _extract_closure_pytree), fn
    is a _StrippedClosure (pure data, not callable).  Equality compares the
    code objects + closure_spec — no tensor dispatch is triggered.

    When extraction is skipped (e.g., under Dynamo), fn is the original
    callable and equality compares code objects + closure contents (for plain
    functions) or delegates to __eq__ (for callable objects).
    r   r   Nc                     || _         || _        y r   r  )r   r   r   s      r+   __init__z_MaskModWrapper.__init__}  s    (r-   c                t    t        | j                  t              rt        d      | j                  ||||      S )Nus   _MaskModWrapper with _StrippedClosure is not callable — use _reconstruct_closure_fn to rebuild the function first)r  r   r   r   )r   bhq_idxkv_idxs        r+   __call__z_MaskModWrapper.__call__  s;    dgg/0L  wwq!UF++r-   c                   t        |t              sy| j                  |j                  u r| j                  |j                  u ryt        | j                  t              rbt        |j                  t              rH| j                  j
                  |j                  j
                  k(  xr | j                  |j                  k(  S t        j                  | j                        rLt        j                  |j                        r-| j                  j                  |j                  j                  k(  S t        | j                  t              s3t        |j                  t              s| j                  |j                  k(  S y)NFT)	r  r  r   r   r   r   ro   r   rk   )r   others     r+   __eq__z_MaskModWrapper.__eq__  s	   %177ehh4#4#48J8J#Jdgg/0ZHH&6
 - <%%););;
 dgg&7+=+=ehh+G77##uxx'8'888$''#34ZHH&>
 77ehh&&r-   c                   t        | j                  t              rt        | j                  j                        S t        j                  | j                        rt        | j                  j                        S t        | j                        S r   )r  r   r   hashr   ro   r   rk   r   s    r+   __hash__z_MaskModWrapper.__hash__  sY    dgg/0%%dgg&(())DGG}r-   c                "    d| j                    dS )Nz_MaskModWrapper())r   r   s    r+   r   z_MaskModWrapper.__repr__  s    !$''!,,r-   r   )r   None
r   r   r!  r   r"  r   r#  r   r   r   )r&  objectr   rZ   r   r   r   )
rQ   rR   rS   rT   r   r  r$  r'  r*  r   rV   r-   r+   r  r  k  s'     'I),.-r-   r  c                     e Zd ZU dZded<   ded<   ded<   ded<   ded	<   ded
<   ded<   ded<   ded<   ded<   ded<   g dZg dZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d)dZedde	dddf	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d*d       Z
e	 d+	 	 	 d,d       Ze	 	 	 	 d-d       Zd.d/dZed0d       Zd1dZ	 	 	 	 d2dZd1dZd3dZd4dZd5d Zd6d!Z	 d7	 	 	 	 	 d8d"Zd9d#Zed:d$       Zed:d%       Z	 	 d;d&Ze	 	 	 	 	 	 d<d'       Z	 	 d=d(Zy)>r.   a/
  
    BlockMask is our format for representing a block-sparse attention mask.
    It is somewhat of a cross in-between BCSR and a non-sparse format.

    **Basics**

    A block-sparse mask means that instead of representing the sparsity of
    individual elements in the mask, a KV_BLOCK_SIZE x Q_BLOCK_SIZE block is
    considered sparse only if every element within that block is sparse.
    This aligns well with hardware, which generally expects to perform
    contiguous loads and computation.

    This format is primarily optimized for 1. simplicity, and 2. kernel
    efficiency. Notably, it is *not* optimized for size, as this mask is always
    reduced by a factor of KV_BLOCK_SIZE * Q_BLOCK_SIZE. If the size is a
    concern, the tensors can be reduced in size by increasing the block size.

    The essentials of our format are:

    num_blocks_in_row: Tensor[ROWS]:
    Describes the number of blocks present in each row.

    col_indices: Tensor[ROWS, MAX_BLOCKS_IN_COL]:
    `col_indices[i]` is the sequence of block positions for row i. The values of
    this row after `col_indices[i][num_blocks_in_row[i]]` are undefined.

    For example, to reconstruct the original tensor from this format:

    .. code-block:: python

        dense_mask = torch.zeros(ROWS, COLS)
        for row in range(ROWS):
            for block_idx in range(num_blocks_in_row[row]):
                dense_mask[row, col_indices[row, block_idx]] = 1

    Notably, this format makes it easier to implement a reduction along the
    *rows* of the mask.

    **Details**

    The basics of our format require only kv_num_blocks and kv_indices. But, we
    have up to 8 tensors on this object. This represents 4 pairs:

    1. (kv_num_blocks, kv_indices): Used for the forwards pass of attention, as
    we reduce along the KV dimension.

    2. [OPTIONAL] (full_kv_num_blocks, full_kv_indices): This is optional and
    purely an optimization. As it turns out, applying masking to every block
    is quite expensive! If we specifically know which blocks are "full" and
    don't require masking at all, then we can skip applying mask_mod to these
    blocks. This requires the user to split out a separate mask_mod from the
    score_mod. For causal masks, this is about a 15% speedup.

    3. [GENERATED] (q_num_blocks, q_indices): Required for the backwards pass,
    as computing dKV requires iterating along the mask along the Q dimension. These are autogenerated from 1.

    4. [GENERATED] (full_q_num_blocks, full_q_indices): Same as above, but for
    the backwards pass. These are autogenerated from 2.
    tuple[int, int]seq_lengthsr   r   r   rb   full_kv_num_blocksfull_kv_indicesq_num_blocks	q_indicesfull_q_num_blocksfull_q_indices
BLOCK_SIZE_mask_mod_signaturemask_mod)r   r   r4  r5  r6  r7  r8  r9  )r3  r:  r<  c                ^   |j                         dk  rt        d      |t        d      |t        d      |d u |d u k7  rt        d      |d u |	d u k7  rt        d      || _        || _        || _        || _        || _        || _        || _	        || _
        |	| _        |
| _        || _        y )Nr   )BlockMask must have at least 2 dimensionszkv_num_blocks must be providedzkv_indices must be providedGfull_kv_num_blocks and full_kv_indices must be both provided or omittedzEfull_q_num_blocks and full_q_indices must be both provided or omitted)r   r   r~   r3  r   r   r4  r5  r6  r7  r8  r9  r:  r<  )r   r3  r   r   r4  r5  r6  r7  r8  r9  r:  r<  s               r+   r  zBlockMask.__init__  s     >>aJKK  !ABB !>??$&Ot,CD Y  %>T+AB W  '*$"4.("!2,$ r-   NTc	                   |j                         dk  rt        d      |du |du k7  rt        d      |r4t        ||      \  }	}
||t        d      t        ||      \  }}nd\  }}n
d\  }	}
d\  }}t	        |t
              r||f}||nt        }|.|j                  d   |d   z  }|j                  d	   |d
   z  }||f} | ||||||	|
||||      S )a  
        Creates a BlockMask instance from key-value block information.

        Args:
            kv_num_blocks (Tensor): Number of kv_blocks in each Q_BLOCK_SIZE row tile.
            kv_indices (Tensor): Indices of key-value blocks in each Q_BLOCK_SIZE row tile.
            full_kv_num_blocks (Optional[Tensor]): Number of full kv_blocks in each Q_BLOCK_SIZE row tile.
            full_kv_indices (Optional[Tensor]): Indices of full key-value blocks in each Q_BLOCK_SIZE row tile.
            BLOCK_SIZE (Union[int, tuple[int, int]]): Size of KV_BLOCK_SIZE x Q_BLOCK_SIZE tiles.
            mask_mod (Optional[Callable]): Function to modify the mask.

        Returns:
            BlockMask: Instance with full Q information generated via _transposed_ordered

        Raises:
            RuntimeError: If kv_indices has < 2 dimensions.
            AssertionError: If only one of full_kv_* args is provided.
        r   r>  Nr?   full_kv_indices must not be NoneNNr   r   r   re   )r3  r   r   r4  r5  r6  r7  r8  r9  r:  r<  )r   r   r~   r   r  r   r6   r   )clsr   r   r4  r5  r:  r<  r3  compute_q_blocksr6  r7  r8  r9  q_length	kv_lengths                  r+   from_kv_blockszBlockMask.from_kv_blocks/  s6   < >>aJKK$&Ot,CD Y 
 &8
&S#L)!-"*()KLL4F&51!> 5?1!>&0#L)0:-~j#&$j1J'38!''+jm;H"((,z!}<I#Y/K#'!1+%/)!
 	
r-   c                     y r   rV   r   flattens     r+   as_tuplezBlockMask.as_tuple{  s    " r-   c                     y r   rV   rI  s     r+   rK  zBlockMask.as_tuple  s     r-   c                   |r=| j                   d   | j                   d   f}| j                  d   | j                  d   f}n| j                   f}| j                  f}g || j                  | j                  | j                  | j
                  | j                  | j                  | j                  | j                  || j                  S )z
        Returns a tuple of the attributes of the BlockMask.

        Args:
            flatten (bool): If True, it will flatten the tuple of (KV_BLOCK_SIZE, Q_BLOCK_SIZE)
        r   re   )r:  r3  r   r   r4  r5  r6  r7  r8  r9  r<  )r   rJ  
block_sizer3  s       r+   rK  zBlockMask.as_tuple  s    //!,dooa.@AJ++A.0@0@0CDK//+J++-K


 OO
 ##	

   
 
 NN
 ""
 
 
 MM
 	
r-   c                f    | j                   j                  ^ }}}t        |      | j                  z   S r   )r   r   r   r3  )r   r   r   s      r+   r   zBlockMask.shape  s.     OO11QZ 4#3#333r-   c                    d| j                    d| j                         dd}| j                         j                         }||z  }|dz  }|S )NzBlockMask(shape=z, sparsity=.2fz%, 

))r   sparsity	to_stringstrip)r   smask_strs      r+   __str__zBlockMask.__str__  sO    tzzl+dmmoc5J%P>>#))+	X	U
r-   c                6   t        |t              s|fn|}g |t        d      t        d      t        d      dd }| j                  j                  dd }t        d t        ||d      D              }| j                  |   }| j                  |   }| j                  6| j                  t        d      | j                  |   }| j                  |   }nd}d}t        j                  ||||| j                  t        | j                  | j                  du      S )ae  
        Returns a new BlockMask instance by getting the mask for the given index position.

        Args:
            index: Index to apply to all attributes.

        Example Usage:
            .. code-block:: python

                def causal_mask(b, h, q_idx, kv_idx):
                    return q_idx >= kv_idx


                block_mask = create_block_mask(
                    causal_mask, 4, 2, 512, 512, device="cuda"
                )
                assert block_mask.kv_num_blocks.shape == (4, 2, 4)
                assert block_mask.kv_indices.shape == (4, 2, 4, 4)

                # Index on batch dimension
                new_block_mask = block_mask[0]
                assert new_block_mask.kv_num_blocks.shape == (2, 4)
                assert new_block_mask.kv_indices.shape == (2, 4, 4)

                # Index on batch and head dimension
                new_block_mask = block_mask[0, 1]
                assert new_block_mask.kv_num_blocks.shape == (4,)
                assert new_block_mask.kv_indices.shape == (4, 4)

                # slicing on batch and head dimension
                new_block_mask = block_mask[0:2, 1:2]
                assert new_block_mask.kv_num_blocks.shape == (2, 1, 4)
                assert new_block_mask.kv_indices.shape == (2, 1, 4, 4)

                # slicing on batch, head, and query dimension
                new_block_mask = block_mask[
                    0:2, 1:2, torch.tensor([1], dtype=torch.int32)
                ]
                assert new_block_mask.kv_num_blocks.shape == (2, 1, 1)
                assert new_block_mask.kv_indices.shape == (2, 1, 1, 4)
        Nrf   c              3     K   | ]L  \  }}t        |t              r3| |cxk  rd k  rn nt        ||z   ||z   dz         nt        ||dz         n| N yw)r   re   N)r  r   slice)rr   ins      r+   rt   z(BlockMask.__getitem__.<locals>.<genexpr>  s]      
 1 !S! *+a!U1q5!a%!)$q!a%
s   AAT)strictrA  )r:  r<  r3  rD  )r  r   r[  r   r   zipr   r4  r5  r~   r.   rG  r:  r   r3  r7  )r   indexpaddedsizesnew_kv_num_blocksnew_kv_indicesnew_full_kv_num_blocksnew_full_kv_indicess           r+   __getitem__zBlockMask.__getitem__  s3   X !+5% 8e@5@%+@uT{@E$K@!D""((!, 
 FE$7	
 
 !..u5/"".##+$%GHH%)%<%<U%C""&"6"6u"=%)""&''"+((!^^47 ( 	
 		
r-   c                    dd}d| j                   j                   d| j                  j                   d || j                         d || j                         d || j
                         d || j                         d || j                         d	 || j                         d
| j                   d| j                   d| j                         ddt        | j                  d      r| j                  j                   dS | j                   dS )Nc                "    | | j                   S d S r   r   )xs    r+   shape_or_nonez)BlockMask.__repr__.<locals>.shape_or_none  s    m17755r-   zBlockMask(
    kv_num_blocks=z,
    kv_indices=z,
    full_kv_num_blocks=z,
    full_kv_indices=z,
    q_num_blocks=z,
    q_indices=z,
    full_q_num_blocks=z,
    full_q_indices=z,
    BLOCK_SIZE=z,
    shape=z,
    sparsity=rQ  z%,
    mask_mod=rQ   rR  )rk  ztorch.Tensor | None)r   r   r   r4  r5  r6  r7  r8  r9  r:  rS  rw   r<  rQ   )r   rl  s     r+   r   zBlockMask.__repr__  s<   	6!!%!3!3!9!9 : ;"oo334 5&&3D4K4K&L%M N##01E1E#F"G H  -d.?.? @A B*4>>:; <%%243I3I%J$K L""/0C0C"D!E F"oo. / % MMOC0 16=dmmZ6XDMM22l m	
 _c^k^kl m	
r-   c                   || j                   d   z   dz
  | j                   d   z  }|| j                   d   z   dz
  | j                   d   z  }t        | j                  | j                  ||      \  }}| j                  =| j
                  t        d      t        | j                  | j
                  ||      \  }}nd }d }| j                  ||||| j                   | j                        S )Nr   re   rA  )	r:  r   r   r   r4  r5  r~   rG  r<  )	r   	new_q_len
new_kv_lenr   r   rc  rd  re  rf  s	            r+   _adjustzBlockMask._adjust'  s    !DOOA$66:tq?QQ"T__Q%77!;PQ@RR,J|-
)> "".##+$%GHH /''$$	&# &*""&"""OOMM
 	
r-   c                0    | j                   }d } ||      S )zIReturns the number of elements (not accounting for sparsity) in the mask.c                L    t        j                  t        j                  | d      S Nre   )	functoolsreduceoperatormul)xss    r+   _prodzBlockMask.numel.<locals>._prodI  s    ##HLL"a88r-   rj  )r   r   ry  s      r+   numelzBlockMask.numelE  s    

	9 U|r-   c                "   | j                         }| j                  j                         }| j                  || j                  j                         z  }|j	                         | j
                  d   z  | j
                  d   z  }||z  }dd|z
  z  S )zEComputes the percentage of blocks that are sparse (i.e. not computed)r   re   d   )rz  r   rz   r4  itemr:  )r   
total_sizecomputed_blockscomputed_sizedense_ratios        r+   rS  zBlockMask.sparsityN  s    ZZ\
,,002"".t66::<<O',,.1CCdooVWFXX#j0a+o&&r-   c                    t        | j                  | j                        }| j                  :| j                  t        d      |t        | j                  | j                        z  S |S )z;Returns a dense block that is equivalent to the block mask.rA  )r   r   r   r4  r5  r~   )r   partial_denses     r+   to_densezBlockMask.to_denseY  sj    )$*<*<dooN"".##+$%GHH #4'')=)=$   r-   c           
       
 | j                         

j                  ^ }t        |t              r||n|dk(  rn|\  
fd}g }t	        t        j                  |D cg c]  }t        |       c}       D ]U  \  }}||k(  r5|j                  d       |j                  d       |j                  d        n || }	|j                  |	       W dj                  |      S c c}w )zReturns a string representation of the block mask. Quite nifty.

        If grid_size is -1, prints out an uncompressed version. Warning, it can be quite big!
        r   c            	     p   g }|j                  |         dj                  t        |            dz   }dd}d }t        d |            }t        d |            }t	        d|      D ]F  }t	        d|      D ]/  }}	| D ]  }
|	|
   }		  ||	|||z   |||z   f         }||dz  z  }1 |dz  }H |S )	Nz, 
c                t    | j                         j                         j                         }|dk(  ry|dk(  ryy)Nre   u   █r    u   ░)floatmeanr}  )section
percentages     r+   summarize_sectionzHBlockMask.to_string.<locals>.create_block_vis.<locals>.summarize_section~  s6    $]]_11388:
? 1_ r-   c                    | |dz
  z   |z  S rs  rV   )ar   s     r+   cdivz;BlockMask.to_string.<locals>.create_block_vis.<locals>.cdiv  s    QU))r-   re   r   r   r   )r   joinreversedmaxr   )	batch_idxdescriptorsvisr  r  row_stepcol_steprccur_maskr  charr   max_colsmax_rowsr   r   s               r+   create_block_visz-BlockMask.to_string.<locals>.create_block_visw  s    K).))H[12T9C!* 1d8X67H1d8X67H1h1 	q(H5 $A)H( 1#+C=1, Q\!11q8|3C!CDD 4!8OC$ t	 Jr-   z...z3To print out more, set BlockMask.to_string(limit=N)zNYou can also index (BlockMask[batch, head]) to choose a specific batch or headr  )
r  r   r  r   	enumerate	itertoolsproductr   r   r  )r   	grid_sizelimitr   r  	total_visr\  r  r  	block_visr   r  r  r   r   s             @@@@@r+   rT  zBlockMask.to_stringe  s    ]]_
*4*:*:'Xxi% H H"_HH!*Hh 	  	D 	'*=Qa=>
 	(NC e|  '  !VW  d ()4IY'	( yy##  >s   -C0c                p    t        t        j                  fd| j                  d            }t	        | S )a  Moves the BlockMask to the specified device.

        Args:
            device (torch.device or str): The target device to move the BlockMask to.
                Can be a torch.device object or a string (e.g., 'cpu', 'cuda:0').

        Returns:
            BlockMask: A new BlockMask instance with all tensor components moved
            to the specified device.

        Note:
            This method does not modify the original BlockMask in-place.
            Instead, it returns a new BlockMask instance where individual tensor attributes
            may or may not be moved to the specified device, depending on their
            current device placement.
        c                &    | j                        S r   )r   )rk  r   s    r+   <lambda>zBlockMask.to.<locals>.<lambda>  s    add6l r-   F)rJ  )r   r"   r   rK  r.   )r   r   mapped_attributess    ` r+   r   zBlockMask.to  s6    " *LL"MM%M(

 +,,r-   c                &    | dk(  rt        |      S |S )Nr<  )r  attrvalues     r+   _wrap_context_valuezBlockMask._wrap_context_value  s    :"5))r-   c                v    | dk(  r3t        |t              st        dt        |             |j                  S |S )Nr<  zExpected _MaskModWrapper, got )r  r  r~   typer   r  s     r+   _unwrap_context_valuezBlockMask._unwrap_context_value  s9    :e_5$'Ed5k]%STT88Or-   c                     t         fd j                  D              }t         j                        \  }||z   }t         fd j                  D              }||fS )a  Flatten BlockMask into a list of tensors and context.

        Closure tensors from mask_mod are extracted into the leaves via
        _extract_closure_pytree so they are visible to the tracing
        infrastructure (instead of being hidden in the pytree context).
        c              3  6   K   | ]  }t        |        y wr   )getattrrr   r  r   s     r+   rt   z%BlockMask._flatten.<locals>.<genexpr>  s     Kd+Ks   c              3  x   K   | ]1  }|d k7  rj                  |t        |            nt               3 ywr<  N)r  r  r  rr   r  r   r   r   s     r+   rt   z%BlockMask._flatten.<locals>.<genexpr>  sH      
  z! $$T74+>? <89
s   7:)r   _TENSOR_ATTRSr   r<  _CONTEXT_ATTRS)r   tensorsr  r  contextr   r   s   `    @@r+   _flattenzBlockMask._flatten  sf     K8J8JKK1H1W.h~-
 
 ++	
 
 7""r-   c                x   t        | j                        }|d| }||d }i }t        | j                  |      D ]T  \  }}|dk(  r5t	        |t
              r%t        |j                  ||j                        ||<   @| j                  ||      ||<   V |j                  t        | j                  |              | di |S )z3Unflatten leaves and context back into a BlockMask.Nr<  rV   )ry   r  r_  r  r  r  r  r   r   r  r  )	rC  leavesr  	n_regularregular_leavesr  kwargsr  vals	            r+   
_unflattenzBlockMask._unflatten  s     ))*	
+	
+S//9 	DID#z!jo&F6FFNC,<,< t  #88sCt	D 	c#++^<=}V}r-   c                     t         fd j                  D              }t         j                        \  }t        d t	        |      D              }||z   }t         fd j
                  D              }||fS )a  Flatten BlockMask with keys for better tracing.

        Closure tensors from mask_mod are extracted into the leaves via
        _extract_closure_pytree so they are visible to the tracing
        infrastructure (instead of being hidden in the pytree context).
        c              3  L   K   | ]  }t        |      t        |      f  y wr   )r   r  r  s     r+   rt   z/BlockMask._flatten_with_keys.<locals>.<genexpr>  s'      
8<ZwtT23
s   !$c              3  B   K   | ]  \  }}t        d |       |f  yw)	_closure_N)r   )rr   r\  r  s      r+   rt   z/BlockMask._flatten_with_keys.<locals>.<genexpr>  s)      "
4;AtZ)A3($/"
s   c           	   3     K   | ]G  }|d k7  r't        |      j                  |t        |            fnt        |      t              f I ywr  )r   r  r  r  r  s     r+   rt   z/BlockMask._flatten_with_keys.<locals>.<genexpr>	  s]      
  z! t77gdD>QRST"OHl$KLM
s   AA)r   r  r   r<  r  r  )r   r  r  closure_with_keysr  r  r   r   s   `     @@r+   _flatten_with_keyszBlockMask._flatten_with_keys  s      
@D@R@R
 
 2I1W.h! "
?H?X"
 
 00
 
 ++	
 
 7""r-   )r3  r2  r   r   r   r   r4  rb   r5  rb   r6  rb   r7  rb   r8  rb   r9  rb   r:  r2  r<  r;  r   r-  )r   r   r   r   r4  rb   r5  rb   r:  int | tuple[int, int]r<  _mask_mod_signature | Noner3  ztuple[int, int] | NonerD  rZ   r   r   ).)rJ  Literal[True]r   ztuple[int, int, Tensor, Tensor, Tensor | None, Tensor | None, Tensor | None, Tensor | None, Tensor | None, Tensor | None, int, int, _mask_mod_signature])rJ  Literal[False]r   ztuple[tuple[int, int], Tensor, Tensor, Tensor | None, Tensor | None, Tensor | None, Tensor | None, Tensor | None, Tensor | None, int | tuple[int, int], _mask_mod_signature])T)rJ  rZ   r   tuple[Any, ...])r   ztuple[int, ...]r   )r`  z7int | slice | Tensor | tuple[int | slice | Tensor, ...]r   r   )rn  r   ro  r   r   r   r0  )r   r  )r   r   ))   r  rv   )r  r  r  r   r   r   )r   ztorch.device | strr   r.   )r  r   r  r   r   r   )r   z<tuple[tuple[BaseArgumentTypes | None, ...], tuple[Any, ...]])r  r  r  r  r   r   )r   zMtuple[tuple[tuple[GetAttrKey, Any], ...], tuple[tuple[GetAttrKey, Any], ...]])rQ   rR   rS   rT   rU   r  r  r  classmethod_DEFAULT_SPARSE_BLOCK_SIZErG  r
   rK  propertyr   rX  rg  r   rp  rz  rS  r  rT  r   staticmethodr  r  r  r  r  rV   r-   r+   r.   r.     s   :x ! %%""$$!!!!	MN'!$'! '! 	'!
 *'! ''! $'! !'! )'! &'! $'! &'! 
'!R 
 -1)-,F/3.2!%I
I
 I
 *	I

 'I
 *I
 -I
 ,I
 I
 
I
 I
V '*$
 $ %
  
: 4 4H
LH
	H
T
*
<	'
 IJB$.B$BEB$	B$H-0  
  #	E#(  ! 
	 (#	V#r-   r.   c                v    | j                         |k  r%| j                  d      } | j                         |k  r%| S )Nr   )r   r   )rk  r   s     r+   _broadcast_to_dimr    s0    
%%'C-KKN %%'C-Hr-   c                    | |z   dz
  |z  |z  S rs  rV   rk  multiples     r+   _round_up_to_multipler    s    L1)H44r-   c           
     T   | j                   t        j                  k7  rt        d| j                          t	        | d      } d }t        j
                  j                  j                  | d || j                  d   |      d || j                  d   |      f      } | j                  \  }}}}||z  dk7  rt        d| d| d	      ||z  dk7  rt        d
| d| d	      | j                  ||||z  |||z  |      } | j                  dddddd      } | j                  ddg      }	|rY||z  }
|	|
k(  }|	dkD  |	|
k  z  }|j                  t        j                        }|j                  t        j                        }||fS |	dkD  }|j                  t        j                        }|d fS )Nz#mask.dtype must be torch.bool, got rv   c                     t        | |      | z
  S r   )r  r  s     r+   padding_needed_for_multiplez@_convert_mask_to_block_mask.<locals>.padding_needed_for_multiple&  s    $Q1A55r-   r   r   r   zQ (z%) must be divisible by Q_BLOCK_SIZE (r,  zKV (z&) must be divisible by KV_BLOCK_SIZE (re   r   rf   ru   r   r   )r   r"   rZ   r~   r  nn
functionalpadr   viewpermuterz   r   int8)maskQ_BLOCK_SIZEKV_BLOCK_SIZEseparate_full_blocksr  BHQKVmask_block_sumfull_block_sumfull_blockspartial_blockss                r+   _convert_mask_to_block_maskr    s    zzUZZB4::,OPPT1%D6 88""'

2F'

2E		
D **KAq!R<1!9,qI
 	
 
MQ2$<]O1M
 	
 99	1a<r]/BMD <<	1aAqD XXH  N %5$6(1,.1PQ'***<!nn5::n6{**'!+'***<t##r-   c                 T     t        d  D              st        d        d fd}|S )z9Returns a mask_mod that's the union of provided mask_modsc              3  2   K   | ]  }t        |        y wr   callablerr   args     r+   rt   zor_masks.<locals>.<genexpr>S       2x}2   )All inputs should be callable mask_mods: c                t    | j                  dt        j                        }D ]  }| || |||      z  } |S NrV   r   )r   r"   rZ   r   r!  r"  r#  resultr  	mask_modss         r+   or_maskzor_masks.<locals>.or_maskV  sB    Ruzz2 	8Dd1a77F	8r-   r.  allr   )r  r  s   ` r+   r4   r4   Q  s0    2	22FykRSS Nr-   c                 T     t        d  D              st        d        d fd}|S )z@Returns a mask_mod that's the intersection of provided mask_modsc              3  2   K   | ]  }t        |        y wr   r  r  s     r+   rt   zand_masks.<locals>.<genexpr>a  r  r  r  c                t    | j                  dt        j                        }D ]  }| || |||      z  } |S r  )r   r"   rZ   r  s         r+   and_maskzand_masks.<locals>.and_maskd  sB    Bejj1 	8Dd1a77F	8r-   r.  r  )r  r  s   ` r+   r5   r5   _  s0    2	22FykRSS Or-   c                   | j                         dk7  rt        d| j                                | j                  \  }}}} | j                  ||g| j                   } | j	                  dddddd      j                  ||||z  ||z        } | S )Nrv   z block_mask.dim() must be 4, got r   rf   r   ru   re   )r   r~   r   expandr  reshape)
block_maskr  r  r  r  r  r  s          r+   _convert_block_mask_to_maskr  m  s    
 ~~1?
@P?QRSS""KAq!R"""<RAQAQRJ##Aq!Q15==	1a,] 2J r-   c           	         | \  }}t        |      }|t        |      }nd}t        j                  |d   |d   |d   |d   ||f||      S )NrB  r   re   )r:  r<  r3  )r   r.   rG  )	r  r<  r3  r  r  r  r  
partial_bmfull_bms	            r+   $_create_sparse_block_from_block_maskr  |  so     #-NK">2J7H7U##11

 -0 $  r-   c           
        |"t         j                  j                         xs d}|d}|d}t        j                  d||      }t        j                  d||      }t        j                  d||      }t        j                  d||      }	t	        |       }
ddlm}  |       5  |
t        j                  k(  rh| }t        |d      } |t        j                  |||||      ||||	      }t        j                  t        j                  |      d	d
      }|cddd       S |
t        j                  k(  r%| }t        |d      } |||||	      }|cddd       S t        # 1 sw Y   yxY w)a  This function creates a mask tensor from a mod_fn function.

    Args:
        mod_fn (Union[_score_mod_signature, _mask_mod_signature]): Function to modify attention scores.
        B (int): Batch size.
        H (int): Number of query heads.
        Q_LEN (int): Sequence length of query.
        KV_LEN (int): Sequence length of key/value.
        device (str): Device to run the mask creation on.

    Returns:
        mask (Tensor): A mask tensor with shape (B, H, M, N).
    Ncpure   r   )r   )TransformGetItemToIndex)r   )r   FTrV   )r"   acceleratorcurrent_acceleratorr   r   ,torch._dynamo._trace_wrapped_higher_order_opr  rd   rg   r   zerosr   isneginfrh   r~   )mod_fnr  r  Q_LENKV_LENr   r   r!  mr]  mod_typer  	score_modr   r  r<  s                   r+   r3   r3     sV   * ~""668AEyyQ&)AQ&)AQf-AQv.AV$HT	 	" !(222I'	$?IEKK1eVFKQPQSTVWXC;;u~~c2E4@D! ! *333H&x;HAq!Q'D! ! ! ! !s   %A2E!.EEE(c           	        |"t         j                  j                         xs d}t        |       }|t        j
                  k7  rt        d|        |d}|d}t        |t              r|}	|}
n|\  }	}
|rAt        j                  dt        d        t        j                  t              | ||||||      S t        | |||||      }t        ||	|
d      \  }}t!        ||f| ||f|	|
      }|S )	a  This function creates a block mask tuple from a mask_mod function.

    Args:
        mask_mod (Callable): mask_mod function. This is a callable that defines the
            masking pattern for the attention mechanism. It takes four arguments:
            b (batch size), h (number of heads), q_idx (query index), and kv_idx (key/value index).
            It should return a boolean tensor indicating which attention connections are allowed (True)
            or masked out (False).
        B (int): Batch size.
        H (int): Number of query heads.
        Q_LEN (int): Sequence length of query.
        KV_LEN (int): Sequence length of key/value.
        device (str): Device to run the mask creation on.
        BLOCK_SIZE (int or tuple[int, int]): Block size for the block mask. If a single int is provided it is used for both query and key/value.

    Returns:
        BlockMask:  A BlockMask object that contains the block mask information.

    Example Usage:
        .. code-block:: python

            def causal_mask(b, h, q_idx, kv_idx):
                return q_idx >= kv_idx


            block_mask = create_block_mask(causal_mask, 1, 1, 8192, 8192, device="cuda")
            query = torch.randn(1, 1, 8192, 64, device="cuda", dtype=torch.float16)
            key = torch.randn(1, 1, 8192, 64, device="cuda", dtype=torch.float16)
            value = torch.randn(1, 1, 8192, 64, device="cuda", dtype=torch.float16)
            output = flex_attention(query, key, value, block_mask=block_mask)
    r  z4create-block_mask requires a mask_mod function! Got re   a  _compile flag on create_block_mask was originally added to work around a torch.compile limitation. That limitation has since been addressed. So, to compile create_block_mask, we suggest doing torch.compile(create_block_mask). This still works for now, but will be removed in the future.r   r    T)r  r  r  )r"   r  r  r   rd   rh   r~   r  r   r%   r&   DeprecationWarningcompiler2   r3   r  r  )r<  r  r  r  r  r   r:  _compiler  r  r  mask_tensorpartial_block_maskfull_block_maskr  s                  r+   r2   r2     s*   R ~""668AEX&H$---B8*M
 	
 	yy*c"!"&0#m m	

 0u}}./aE66:
 	
 h1eVVDK*E!#!	+' 6	_-	J r-   c                    | j                   }t        j                  t        j                  g dt        j
                  |      t        j                  g dt        j
                  |      t        d      S )zDefault block mask for flex attention.
    If users don't specify any block sparse mask info, we create this
    empty block sparse mask. Which creates a BlockMask with 1 block that is the full length
    of the query and key tensors.
    )re   re   re   r   )re   re   re   re   )re   re   )r   r   r:  r3  )r   r.   rG  r"   onesr   r  _LARGE_SPARSE_BLOCK_SIZE)querykeyr   s      r+   _create_empty_block_maskr,    sS     \\F##jj%++fM;;|5;;vN+	 $  r-   c                X   t        t        |i n
t        |            }d|v r|j                  dd      rt	        d      d|v r4t        j                  t              }|d   |vrt        d|d    d|       |j                  dd       |j                  dd       |j                  d	d       |j                  d
d       |j                  dd       | j                  j                  dk(  xs4 |j                  j                  dk(  xs |j                  j                  dk(  }|}d}	||j                  }|j                  }	d|v rt        d      d|d<   |st        j                          |d<   |rd|d<   d|v rt        d      |d   dk(  r|	rt#        d      |	|d<   |r|	rt#        d      |S )NrO   rJ   FzBACKEND cannot be combined with legacy FORCE_USE_FLEX_ATTENTION. BACKEND supersedes the legacy knob; please drop FORCE_USE_FLEX_ATTENTION and only specify the desired BACKEND.zInvalid BACKEND value 'z'. Must be one of r7   rF   rG   rH   rI   Tr  r[   z.OUTPUT_LOGSUMEXP must not be in kernel_optionsr\   z(OUTPUT_MAX must not be in kernel_optionsr9   zsReturning max scores is not supported with BACKEND='FLASH'. Use return_aux=AuxRequest(lse=True) or omit max_scores.z-Returning max scores is not supported on CPU.)r   rY   r   getr   typingget_argsr;   r   
setdefaultr   r  r^   r_   r~   r"   is_grad_enabledNotImplementedError)
r*  r+  r  
return_lsekernel_options
return_auxvalid_backendsany_inputs_on_cpu_device
output_lse
output_maxs
             r+   _apply_kernel_optionsr;  -  s    #$$~*>N
 N"~'9'9"E( 4
 	
 N"2)$N:).*C)D E""0!13 
 i0mU34e<5u=j$/ 	U" 	&::??e#	&<<%  JJ^^
**
 ^+MNN)-N%& .3-B-B-D)*# 27N-. ~%GHHi G+
!F
 	
 $.N< J ""QRR r-   c                    | j                  d      |j                  d      k7  r0t        d| j                  d       d|j                  d       d      y )Nr   zJExpect query and key/value to have the same embedding dimension but got E=z and E=.)r   r   )r*  r+  r  s      r+   _validate_embed_dimr>    sU    zz"~"%B(~Q@
 	
 &r-   c                   | j                   j                  dk(  r/| j                  s|j                  s|j                  rt        d      h d}| j                   j                  |vr#t	        d| j                   j                   d      y)zTODO: Remove once non cuda/cpu devices support is added
    We only need to check query since we have already that q,k,v are on the same device
    r  zrFlexAttention does not support backward on CPU. Please set the input requires_grad to False or use another device.>   r  hpuxpucudazTFlexAttention is only supported on CUDA, CPU or HPU devices. Found input tensors on z device.N)r   r  requires_gradr3  r   )r*  r+  r  supported_devicess       r+   _validate_devicerE    s     ||E!s00E4G4G! A
 	
 6|| 11&&+ll&7&7%8B
 	
 2r-   c                &   d	d}d	d}t         j                  t         j                  f}| j                  }||v xrd t         j                  j
                  duxrF t         j
                  j                  d      dk\  xr" t         j
                  j                  d      dk  }|s| ||fS  ||       s| j                         }  ||      s|j                         } ||      s0|j                  dd      j                         j                  dd      }| ||fS )
a  
    Enforce memory layouts for query, key, and value tensors.

    For non-FP8 dtypes, no action is taken.

    For FP8 dtypes, we enforce the following memory layouts:
    - Query tensor must be in row-major memory layout, as it will be the left-operand in the FP8 GEMM `q @ k.T`.
    - Key tensor must be in row-major memory layout, as it will be transposed when used as the right-operand
      in the FP8 GEMM `q @ k.T`, meaning it will correctly be in column-major memory layout for the GEMM.
    - Value tensor must be in column-major memory layout, as it will be the right-operand in the FP8 GEMM `softmax_scores @ v`.

    Returns the query, key, and value tensors with the enforced memory layouts.
    c                .    | j                         d   dk(  S )Nr   re   stridetensors    r+   is_row_majorz*_enforce_mem_layouts.<locals>.is_row_major      }}r"a''r-   c                .    | j                         d   dk(  S )Nr   re   rH  rJ  s    r+   is_col_majorz*_enforce_mem_layouts.<locals>.is_col_major  rM  r-   NrB  )   	   )
   r   r   r   )rK  r   r   rZ   )	r"   float8_e4m3fnfloat8_e5m2r   versionrB  get_device_capabilityr   r   )r*  r+  r  rL  rO  
fp8_dtypesgemm_precisionshould_enforce_mem_layouts           r+   _enforce_mem_layoutsrZ    s   "(( 	J [[N 	*$ 	?MMd*	?JJ,,V4>	? JJ,,V4w>	  %c5     " nn B'224>>r2F#ur-   .)r6  c	                    y r   rV   
r*  r+  r  r  r  scale
enable_gqar4  r5  r6  s
             r+   r   r     s     r-   zcreturn_lse is deprecated and will be removed in v2.10. Use return_aux=AuxRequest(lse=True) instead.r*   c	                    y r   rV   r\  s
             r+   r   r     s    $  r-   c	                    y r   rV   r\  s
             r+   r   r     s      #r-   c	                    y r   rV   r\  s
             r+   r   r     s     r-   c	          
     r   t        | ||d       t        | ||       t        | ||       t        | ||      \  } }}| j	                         dk7  s&|j	                         dk7  s|j	                         dk7  rt        d      |sS| j                  d      |j                  d      k7  r0t        d| j                  d       d|j                  d       d      |r<| j                  d	      }
|j                  d	      }|
|z  d
k7  rt        d|
 d| d      | j                  d
      |j                  d
      k7  r|0t        d| j                  d
       d|j                  d
       d      |j                  j                  d
      | j                  d
      k7  rLt        d| j                  d
       d|j                  d
       d|j                  j                  d
       d      |t        }|t        | |      }t        |dd      t        u rt        d      |j                  d
   t        k(  r|j                  d	   t        k(  rnz|j                   d   }|j                   d   }| j                  d      |kD  s|j                  d      |kD  r=t        d|j                    d| j                  d       d|j                  d       d      | j                  d      |k  r|j                  d      |k  s(| j                  d      |k  rQ|j                  d      |k  r=t        d|j                    d| j                  d       d|j                  d       d      | j                  d      |k7  r!t#        d| j                  d       d| d      |j                  d      |k7  r!t#        d|j                  d       d | d      |'d!t%        j&                  | j                  d            z  }| j(                  |j                  j(                  k7  r0t        d"| j(                   d#|j                  j(                   d      |r|	t        d$      |r|	t+        d%d&t,        '       t/        | |||||	      }	 	 	 	 d0d(}t0        j2                  j5                         rz| ||fD ]B  }t0        j6                  j9                  |d       t0        j6                  j9                  |d       D t;        | ||||j=                         ||      \  }}} |||||	|)      S t>        st+        d*d+,       t0        j6                  jA                         st        d-      d. }tC               5 }t>        r|}nt1        jD                  ||d/      } || ||||j=                         ||      \  }}}ddd        ||	|)      S # 1 sw Y   xY w)1a  This function implements scaled dot product attention with an arbitrary attention score modification function
    described in the `Flex Attention <https://arxiv.org/abs/2412.05496>`_ paper. See also the
    `blog post <https://pytorch.org/blog/flexattention/>`_.

    This function computes the scaled dot product attention between query, key, and value tensors with a user-defined
    attention score modification function. The attention score modification function will be applied after the attention
    scores have been calculated between the query and key tensors. The attention scores are calculated as follows:

    The ``score_mod`` function should have the following signature:

    .. code-block:: python

        def score_mod(
            score: Tensor,
            batch: Tensor,
            head: Tensor,
            q_idx: Tensor,
            k_idx: Tensor
        ) -> Tensor:

    Where:
        - ``score``: A scalar tensor representing the attention score,
          with the same data type and device as the query, key, and value tensors.
        - ``batch``, ``head``, ``q_idx``, ``k_idx``: Scalar tensors indicating
          the batch index, query head index, query index, and key/value index, respectively.
          These should have the ``torch.int`` data type and be located on the same device as the score tensor.

    Args:
        query (Tensor): Query tensor; shape :math:`(B, Hq, L, E)`. For FP8 dtypes, should be in row-major memory layout for optimal performance.
        key (Tensor): Key tensor; shape :math:`(B, Hkv, S, E)`. For FP8 dtypes, should be in row-major memory layout for optimal performance.
        value (Tensor): Value tensor; shape :math:`(B, Hkv, S, Ev)`. For FP8 dtypes, should be in column-major memory layout for optimal performance.
        score_mod (Optional[Callable]): Function to modify attention scores. By default no score_mod is applied.
        block_mask (Optional[BlockMask]): BlockMask object that controls the blocksparsity pattern of the attention.
        scale (Optional[float]): Scaling factor applied prior to softmax. If none, the default value is set to :math:`\frac{1}{\sqrt{E}}`.
        enable_gqa (bool): If set to True, enables Grouped Query Attention (GQA) and broadcasts key/value heads to query heads.
        return_lse (bool): Whether to return the logsumexp of the attention scores. Default is False. **Deprecated**: Use ``return_aux=AuxRequest(lse=True)`` instead.
        kernel_options (Optional[FlexKernelOptions]):
            Options to control the behavior of the underlying Triton kernels.
            See :class:`FlexKernelOptions` for available options and usage examples.
        return_aux (Optional[AuxRequest]): Specifies which auxiliary outputs to compute and return.
            If None, only the attention output is returned. Use ``AuxRequest(lse=True, max_scores=True)``
            to request both auxiliary outputs.

    Returns:
        output (Tensor): Attention output; shape :math:`(B, Hq, L, Ev)`.

        When ``return_aux`` is not None:
            aux (AuxOutput): Auxiliary outputs with requested fields populated.

        When ``return_aux`` is None (deprecated paths):
            lse (Tensor): Log-sum-exp of attention scores; shape :math:`(B, Hq, L)`. Only returned if ``return_lse=True``.

    Shape legend:
        - :math:`N: \text{Batch size} ... : \text{Any number of other batch dimensions (optional)}`
        - :math:`S: \text{Source sequence length}`
        - :math:`L: \text{Target sequence length}`
        - :math:`E: \text{Embedding dimension of the query and key}`
        - :math:`Ev: \text{Embedding dimension of the value}`

    .. warning::
        `torch.nn.attention.flex_attention` is a prototype feature in PyTorch.
        Please look forward to a more stable implementation in a future version of PyTorch.
        Read more about feature classification at: https://pytorch.org/blog/pytorch-feature-classification-changes/#prototype

    T)allow_lowp_kvrv   z-NYI: query, key, and value must be 4D tensorszGExpect query and key/value to have the same number of heads but got Hq=z	 and Hkv=z&. Try setting enable_gqa=True for GQA.re   r   zMExpect number of query heads to be a multiple of kv heads for GQA but got Hq=r=  NzlExpect query and key/value to have the same batch size, or non-none block_mask, but got block_mask=None, Bq=z
, and Bkv=zxExpect query and key/value to have the same batch size, or block_mask and query to have the same batch size, but got Bq=z, Bkv=z, B_block_mask=r<  z+Cannot use mask_mod from a sliced BlockMaskr   r   z,block_mask was created for block_mask.shape=z but got q_len=z and kv_len=zz. As the block mask was created for a smaller length than you're using it for, you likely need to create a new block mask.ad  . As the block mask was created for a larger length than you're using it for, you can either 1. create a new block mask with the correct length, or 2. 'adjust' the existing block mask to the correct length by calling block_mask._adjust(q_len, kv_len). This essentially 'crops' the block mask to the upper left corner, which does not work for all mask_mods!zquery.size(-2) (z) != block_mask_q_len (r,  zkey.size(-2) (z) != block_mask_kv_len (g      ?z=Expect q/k/v and block_mask to be on the same device but got z and z|Cannot specify both return_lse and return_aux. return_lse is deprecated, please use return_aux=AuxRequest(lse=True) instead.deprecated_return_lsezjreturn_lse is deprecated and will be removed in v2.10. Please use return_aux=AuxRequest(lse=True) instead.r_  c                  t        j                  d      }|xs |duxr |j                  }|duxr |j                  }|r|j	                         dkD  r||z  nd}|r|j	                         dkD  r||z  nd}|| t        ||      fS |r| |fS | S )zFNormalize stats and build return value (aux-aware, legacy-compatible).g       @Nr   )r^   r_   )mathlogr^   r_   rz  r/   )	r   r^   r_   r6  r4  ln2
return_max
lse_scaled
max_scaleds	            r+   _finalize_outputsz)flex_attention.<locals>._finalize_outputs  s     hhsmL:T#9#Ljnn
t+E
0E0E
#-#))+/S3Y
!+
0@0@0BQ0FJT 	 !	%  
 
?"
r-   )r6  r4  flex_attention_performancea  flex_attention called without torch.compile() - this will use an unfused implementation that materializes the full scores matrix instead of generating a fused kernel.

SOLUTION: Use torch.compile(flex_attention)(...)

If you want to debug your score_mod/mask_mod, you can set:
torch.nn.attention.flex_attention._FLEX_ATTENTION_DISABLE_COMPILE_DEBUG = True

This will allow you to use print statements or breakpoints. Note: This doesn't work with the backwards pass and may produce incorrect results.)r(   r)   z&flex_attention requires dynamo supportc                     t        | i |S r   )flex_attention_hop)argsr  s     r+   _flex_attention_hop_wrapperz3flex_attention.<locals>._flex_attention_hop_wrapper  s    !42622r-   )backend	fullgraph)r6  AuxRequest | Noner4  rZ   )#r   r>  rE  rZ  r   r3  r   r   r   r   r,  r  r   r   r:  r)  r   r~   rh  sqrtr   r,   FutureWarningr;  r"   r#   is_dynamo_compiling_dynamomark_staticrq  rK  %_FLEX_ATTENTION_DISABLE_COMPILE_DEBUGis_dynamo_supportedr   r"  )r*  r+  r  r  r  r]  r^  r4  r5  r6  HqHkvblock_mask_q_lenblock_mask_kv_lenrn  rk  r   r^   r_   rs  rt  flex_fns                         r+   r   r     sX   ^ U$?sE*UC',UC?E3yy{a3779>UYY[A-=!"QRREJJrNchhrl:**R.)388B<. A34
 	

 ZZ]hhqk8q= T3%q2  zz!}#//4zz!}oZQR}TUW  ##((+uzz!}<#jjm_F388A;-zOgOgOlOlmnOoNppqs  	-eS9
 z:t,0FFHII 	a $<<!!!$(@@ 	%++B/&,,R0::b>,,?P0P>z?O?O>PP_`e`j`jkm`n_oo{|  }E  }E  FH  }I  |J JK K 
 JJrN--#((2,BS2Sjjn 00SXXb\DU5U>z?O?O>PP_`e`j`jkm`n_oo{|  }E  }E  FH  }I  |J Ju u  ::b>-- "5::b>"22IJZI[[\]  88B<,,  ".FGXFYYZ[  }dii

2//||z//666||nE**B*B*I*I)J!M
 	
 j,\
 	
 

*#B"		
 +N
 & : ~~))+e$ 	-AMM%%a,MM%%a,	-  2! 
S* !jZJ
 	
 13a		
 ==,,.CDD3 
	  
G01Gmm+WG  '! 
S*
" S*
 #
 
s   AV--V6)r(   r   r)   r   r*   ztype[Warning]r   r-  )r   z?_score_mod_signature | _mask_mod_signature | Callable[..., Any]r   rd   )rV   r   F)r   Callable[..., _R]r   tuple[int | None, ...]r   r  r   zint | list[int | None]r   rZ   r   r  )r   r   r   r   r   r   r   r   r   r   r   r   )
r   r   r   r   r   r   r   r   r   r   )
r   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   tuple[Tensor, Tensor])r   r   r   r   r   r  )
r   r   r   r   r   r   r   r   r   r  r   )r  zset[int] | Noner   zUtuple[tuple[BaseArgumentTypes, ...], TreeSpec, _StrippedClosure | Callable[..., Any]])rk  r   r   r   r   r   )rk  r   r  r   r   r   )
r  r   r  r   r  r   r  rZ   r   tuple[Tensor, Tensor | None])r  r;  r   r;  )r  r   r  r   r  r   r   r   )r  r  r<  r  r3  r2  r  r   r  r   r   r.   )r  z*_score_mod_signature | _mask_mod_signaturer  
int | Noner  r  r  r   r  r   r   DeviceLikeType | Noner   r   )r<  r;  r  r  r  r  r  r   r  r   r   r  r:  r  r   r.   )r*  r   r+  r   r   r.   )r*  r   r+  r   r  r   r4  rZ   r5  FlexKernelOptions | Noner6  rv  r   rY   )r*  r   r+  r   r  r   r   r-  )r*  r   r+  r   r  r   r   z/tuple[torch.Tensor, torch.Tensor, torch.Tensor])......)r*  r   r+  r   r  r   r  _score_mod_signature | Noner  BlockMask | Noner]  float | Noner^  rZ   r4  r  r5  r  r6  r-  r   r   )r*  r   r+  r   r  r   r  r  r  r  r]  r  r^  rZ   r4  r  r5  r  r6  r-  r   r  )r*  r   r+  r   r  r   r  r  r  r  r]  r  r^  rZ   r4  rZ   r5  r  r6  r0   r   ztuple[Tensor, AuxOutput])r*  r   r+  r   r  r   r  r  r  r  r]  r  r^  rZ   r4  r  r5  r  r6  r0   r   r   )NNNFFN)r*  r   r+  r   r  r   r  r  r  r  r]  r  r^  rZ   r4  rZ   r5  r  r6  rv  r   z9Tensor | tuple[Tensor, Tensor] | tuple[Tensor, AuxOutput])crT   
__future__r   rt  ro   r  rh  rv  r  r/  r%   collections.abcr   enumr   r   r   r   r	   r
   r   r   typing_extensionsr   r   r   r   r   r"   r   &torch._higher_order_ops.flex_attentionr   rq  torch._higher_order_ops.utilsr   torch.nn.attention._utilsr   torch.utils._pytreer   r   r   r   r   TYPE_CHECKINGtorch._prims_commonr   torch.fx.noder   r|  r   r   rU   UserWarningr,   __all___score_mod_signaturer;  r;   r<   r1   rY   r0   r/   rd   r   r   r   r6   r   r  r)  r   r   r   r   r   r   r   r   r   r   r  r  r.   r  r  r  r4   r5   r  r  r3   r2   r,  r;  r>  rE  rZ  rx  rV   r-   r+   <module>r     sH   P "         $  O O O M M   W ? :  
2/& ). %E ! >I((!(-:(	(   H& PQ ?GH HI) IT]C	 CL"35 
 %
 %	 	 )G ) )N &('(++"+ #+ %	+
 + +\  	
  JJ
J J 	J
 J
  	
 6 ! " <66,266


 
 	

 
 #2&q) ! ! !"F%% =v(( =, "&K4K4K4\%P=- =-@d	# d	#N5 33!&	2$
2$2$ 2$ 	2$
 "2$j  42  	& 33,( ! 	
  @ %)0!60!0! 0! 	0!
 0! "0! 0!r %)(BR!RR R 	R
 R "R &R Rj* %)QQ	Q Q 	Q
 -Q "Q !Qh

$888'-848v 

 .1#&!$/2 	  +	
 !    -   
 
3 .1#& #/2    	    +	 
 !        -      
  

 .1#&/2##	# # +	#
 !# # # # -# # # 
# 

 .1#& #/2	  +	
 !    -   
& .2#'/3R %)RR	R R +	R
 !R R R R -R "R ?Rr-   