pallas_operations.splash_attention.tpu.splash_attention_kernel

Implementation of Sparse Flash Attention, a.k.a. "Splash" attention.

`BlockSizes` `dataclass`

Tile sizes parameterizing SplashAttention kernels.

Those parameters have negligible effect on numerics, but affect performance greatly.

Note that changing the layouts only influences the physical layout that the kernel will enforce. The logical interface to splash attention always takes the head dimension as the minormost one.

Source code in src/fjformer/pallas_operations/splash_attention/tpu/splash_attention_kernel.py

@dataclasses.dataclass(unsafe_hash=True)
class BlockSizes:
    """Tile sizes parameterizing SplashAttention kernels.

    Those parameters have negligible effect on numerics, but affect performance
    greatly.

    Note that changing the layouts only influences the physical layout that the
    kernel will enforce. The logical interface to splash attention always takes
    the head dimension as the minormost one.

    Defaults filled in after construction:
      * ``block_kv_compute`` falls back to ``block_kv``.
      * ``block_kv_dkv_compute`` falls back to ``block_kv_dkv`` (so it stays
        ``None`` when ``block_kv_dkv`` is ``None``).

    Raises:
      ValueError: if ``use_fused_bwd_kernel`` is set while ``block_q_dq`` or
        ``block_kv_dq`` is also given.
    """
    # NOTE(review): judging by the names these are the forward-kernel tile
    # sizes along the q and kv sequences -- confirm against the kernel code.
    block_q: int
    block_kv: int
    block_kv_compute: int | None = None

    # NOTE(review): presumably the tile sizes of the dkv backward kernel.
    block_q_dkv: int | None = None
    block_kv_dkv: int | None = None
    block_kv_dkv_compute: int | None = None

    # NOTE(review): presumably the tile sizes of the dq backward kernel; they
    # must be left unset when the fused backward kernel is requested.
    block_q_dq: int | None = None
    block_kv_dq: int | None = None

    use_fused_bwd_kernel: bool = False

    q_layout: QKVLayout = QKVLayout.HEAD_DIM_MINOR
    k_layout: QKVLayout = QKVLayout.HEAD_DIM_MINOR
    v_layout: QKVLayout = QKVLayout.HEAD_DIM_MINOR

    def __post_init__(self):
        """Fill in derived defaults and reject contradictory settings."""
        if self.block_kv_compute is None:
            self.block_kv_compute = self.block_kv
        if self.block_kv_dkv_compute is None:
            self.block_kv_dkv_compute = self.block_kv_dkv
        if self.use_fused_bwd_kernel:
            if self.block_q_dq is not None or self.block_kv_dq is not None:
                raise ValueError(
                    "Block sizes for dq kernel are not needed with a fused kernel."
                )

    @property
    def has_backward_blocks(self) -> bool:
        """Whether every block size required for the backward pass is set.

        The dkv sizes are always required; the dq sizes are required only
        when the fused backward kernel is not in use.
        """
        backward_blocks = (
            self.block_q_dkv, self.block_kv_dkv, self.block_kv_dkv_compute,
        )
        if not self.use_fused_bwd_kernel:
            backward_blocks += (self.block_q_dq, self.block_kv_dq)
        return all(b is not None for b in backward_blocks)

    @classmethod
    def get_default(cls):
        """Return an instance with every block size set to 128.

        The remaining fields (fused-backward flag and layouts) keep their
        declared defaults. The instance is built through ``cls`` so that a
        subclass calling ``get_default()`` receives an instance of itself.
        """
        # TODO(apaszke,sharadmv): Select better parameters based on a heuristic.
        return cls(
            block_q=128,
            block_kv=128,
            block_kv_compute=128,
            block_q_dkv=128,
            block_kv_dkv=128,
            block_kv_dkv_compute=128,
            block_q_dq=128,
            block_kv_dq=128,
        )

`SegmentIds`

Bases: NamedTuple

SegmentIds for Q and KV sequences.

SegmentIds are a mechanism to ensure that there is no cross-attention between segments (fractions of a sequence) that have been concatenated together into a sequence. Each array is a list of ids (integers). Only tokens with the same id are allowed to attend to each other.

The static mask (e.g. causal) is "and-ed" with the segment id mask to form the actual attention mask. It is important that the latter does not have any all-zero rows (along dimension kv). Otherwise it would result in an invalid softmax (the denominator would be 0). This condition holds for causal self-attention because in this case segment ids form a block diagonal matrix so at least one element in each row is set. It is easy to break this condition with non-self-attention configurations. Attributes: `q` — segment ids along the Q sequence; `kv` — segment ids along the KV sequence.

Source code in src/fjformer/pallas_operations/splash_attention/tpu/splash_attention_kernel.py

class SegmentIds(NamedTuple):
    """SegmentIds for Q and KV sequences.

    SegmentIds are a mechanism to ensure that there is no cross-attention
    between segments (fractions of a sequence) that have been concatenated
    together into a sequence. Each array is a list of ids (integers). Only
    tokens with the same id are allowed to attend to each other.

    The static mask (e.g. causal) is "and-ed" with the segment id mask to form
    the actual attention mask. It is important that the latter does not have any
    all-zero rows (along dimension kv). Otherwise it would result in an invalid
    softmax (the denominator would be 0).

    This condition holds for causal self-attention because in this case segment
    ids form a block diagonal matrix so at least one element in each row is set.
    It is easy to break this condition with non-self-attention configurations.

    Attributes:
      q: segment ids along the Q sequence
      kv: segment ids along the KV sequence
    """

    q: jax.Array  # shape: [q_seq_len]
    kv: jax.Array  # shape: [kv_seq_len]

`SplashAttentionKernel`

Source code in src/fjformer/pallas_operations/splash_attention/tpu/splash_attention_kernel.py

@jax.tree_util.register_pytree_node_class
class SplashAttentionKernel:
    """Callable that pairs mask metadata with fixed kernel keyword options.

    Registered as a pytree node: the three mask-info objects are the children
    and the keyword options captured at construction are the auxiliary data.
    """

    def __init__(
            self,
            fwd_mask_info: mask_info_lib.MaskInfo,
            dq_mask_info: mask_info_lib.MaskInfo | None,
            dkv_mask_info: mask_info_lib.MaskInfo | None,
            **kwargs,
    ):
        """Store the mask infos and the extra keyword options unchanged."""
        self.fwd_mask_info = fwd_mask_info
        self.dq_mask_info = dq_mask_info
        self.dkv_mask_info = dkv_mask_info
        self.kwargs = kwargs

    def __call__(self, *args, **kwargs) -> SplashCustomReturnType:
        """Run `_splash_attention` with the stored mask infos and options.

        The mask infos are passed first, followed by the caller's positional
        and keyword arguments, followed by the options stored at construction.
        """
        mask_infos = (self.fwd_mask_info, self.dq_mask_info, self.dkv_mask_info)
        return _splash_attention(*mask_infos, *args, **kwargs, **self.kwargs)

    def manual_sharding_spec(self, sharding: jax.sharding.NamedSharding):
        """Returns a value that can be used as a shard_map partition spec for the kernel.

        The result is a `SplashAttentionKernel` whose mask-info fields hold
        partition specs (or None where the corresponding forward field is
        None) instead of arrays.

        Raises:
          ValueError: if `sharding` cannot compute a shard shape for the
            forward `data_next` array, or if it would split that array's last
            dimension.
        """
        fwd_info = self.fwd_mask_info
        data_next = fwd_info.data_next
        if data_next is not None:
            full_shape = data_next.shape
            try:
                per_device_shape = sharding.shard_shape(full_shape)
            except ValueError as exc:
                raise ValueError(
                    "The sharding must divide the mask blocks evenly between devices"
                ) from exc
            # The last dimension must stay whole on every device.
            if full_shape[-1] != per_device_shape[-1]:
                raise ValueError("Sharding the kv sequence dimension is not supported")

        spec = sharding.spec
        assert len(spec) == 2
        replicated = jax.sharding.PartitionSpec()
        # Shard q_sequence over the sequence dimension only.
        q_sequence_spec = jax.sharding.PartitionSpec(spec[1])

        def spec_if_present(field, field_spec):
            # A spec is emitted only for fields that actually hold data.
            return None if field is None else field_spec

        mask_info_specs = mask_info_lib.MaskInfo(  # pytype: disable=wrong-arg-types
            data_next=spec_if_present(fwd_info.data_next, spec),
            mask_next=spec_if_present(fwd_info.mask_next, spec),
            block_mask=spec_if_present(fwd_info.block_mask, spec),
            partial_mask_blocks=spec_if_present(
                fwd_info.partial_mask_blocks, replicated
            ),
            q_sequence=spec_if_present(fwd_info.q_sequence, q_sequence_spec),
        )
        # The backward mask infos reuse the forward specs whenever they exist.
        dq_specs = None if self.dq_mask_info is None else mask_info_specs
        dkv_specs = None if self.dkv_mask_info is None else mask_info_specs
        return SplashAttentionKernel(
            mask_info_specs, dq_specs, dkv_specs, **self.kwargs
        )

    def tree_flatten(self):
        """Return `(children, aux_data)`: the three mask infos and the options."""
        children = (self.fwd_mask_info, self.dq_mask_info, self.dkv_mask_info)
        aux_data = self.kwargs
        return children, aux_data

    @classmethod
    def tree_unflatten(cls, kwargs, values):
        """Rebuild a kernel from the output of `tree_flatten`."""
        fwd_values, dq_values, dkv_values = values

        def rebuild(info_values):
            # NamedTuples are not preserved during pytree serialization.
            if info_values is None:
                return None
            return mask_info_lib.MaskInfo(*info_values)

        dq_mask_info = rebuild(dq_values)
        dkv_mask_info = rebuild(dkv_values)
        # The forward mask info is always present, so it is rebuilt directly.
        fwd_mask_info = mask_info_lib.MaskInfo(*fwd_values)
        return SplashAttentionKernel(
            fwd_mask_info, dq_mask_info, dkv_mask_info, **kwargs
        )

`manual_sharding_spec(sharding)`

Returns a value that can be used as a shard_map partition spec for the kernel.

Source code in src/fjformer/pallas_operations/splash_attention/tpu/splash_attention_kernel.py

def manual_sharding_spec(self, sharding: jax.sharding.NamedSharding):
    """Returns a value that can be used as a shard_map partition spec for the kernel.

    The result is a `SplashAttentionKernel` whose mask-info fields hold
    partition specs (or None where the corresponding forward field is None)
    instead of arrays.

    Raises:
      ValueError: if `sharding` cannot compute a shard shape for the forward
        `data_next` array, or if it would split that array's last dimension.
    """
    fwd_info = self.fwd_mask_info
    data_next = fwd_info.data_next
    if data_next is not None:
        full_shape = data_next.shape
        try:
            per_device_shape = sharding.shard_shape(full_shape)
        except ValueError as exc:
            raise ValueError(
                "The sharding must divide the mask blocks evenly between devices"
            ) from exc
        # The last dimension must stay whole on every device.
        if full_shape[-1] != per_device_shape[-1]:
            raise ValueError("Sharding the kv sequence dimension is not supported")

    spec = sharding.spec
    assert len(spec) == 2
    replicated = jax.sharding.PartitionSpec()
    # Shard q_sequence over the sequence dimension only.
    q_sequence_spec = jax.sharding.PartitionSpec(spec[1])

    def spec_if_present(field, field_spec):
        # A spec is emitted only for fields that actually hold data.
        return None if field is None else field_spec

    mask_info_specs = mask_info_lib.MaskInfo(  # pytype: disable=wrong-arg-types
        data_next=spec_if_present(fwd_info.data_next, spec),
        mask_next=spec_if_present(fwd_info.mask_next, spec),
        block_mask=spec_if_present(fwd_info.block_mask, spec),
        partial_mask_blocks=spec_if_present(
            fwd_info.partial_mask_blocks, replicated
        ),
        q_sequence=spec_if_present(fwd_info.q_sequence, q_sequence_spec),
    )
    # The backward mask infos reuse the forward specs whenever they exist.
    dq_specs = None if self.dq_mask_info is None else mask_info_specs
    dkv_specs = None if self.dkv_mask_info is None else mask_info_specs
    return SplashAttentionKernel(
        mask_info_specs, dq_specs, dkv_specs, **self.kwargs
    )

`get_kernel_name(is_mqa, save_residuals, is_segmented, phase)`

Returns a unique name for all SplashAttention kernel variants.

Source code in src/fjformer/pallas_operations/splash_attention/tpu/splash_attention_kernel.py

def get_kernel_name(
        is_mqa: bool, save_residuals: bool, is_segmented: bool, phase: str
) -> str:
    """Returns a unique name for all SplashAttention kernel variants.

    The name has the form ``splash_<mqa|mha>_<phase>[_segmented][<residuals>]``
    where the residuals suffix is ``_residuals`` or ``_no_residuals`` for the
    ``fwd`` phase and empty for the backward phases.

    Args:
      is_mqa: selects the ``mqa`` tag when true, ``mha`` otherwise.
      save_residuals: whether the kernel saves residuals; only valid for the
        ``fwd`` phase.
      is_segmented: appends ``_segmented`` when true.
      phase: one of ``"fwd"``, ``"dq"`` or ``"dkv"``.

    Returns:
      The kernel name string.
    """
    assert phase in ("dq", "dkv", "fwd")
    # Saving residuals is supported only for the fwd phase.
    assert phase == "fwd" or not save_residuals

    segments = "_segmented" if is_segmented else ""
    attention_type = "mqa" if is_mqa else "mha"
    if save_residuals:
        residuals = "_residuals"
    else:
        # Only the forward kernel advertises that it skips residuals.
        residuals = "_no_residuals" if phase == "fwd" else ""
    return f"splash_{attention_type}_{phase}{segments}{residuals}"

pallas_operations.splash_attention.tpu.splash_attention_kernel

BlockSizes dataclass

SegmentIds

SplashAttentionKernel

manual_sharding_spec(sharding)

get_kernel_name(is_mqa, save_residuals, is_segmented, phase)

`BlockSizes` `dataclass`

`SegmentIds`

`SplashAttentionKernel`

`manual_sharding_spec(sharding)`

`get_kernel_name(is_mqa, save_residuals, is_segmented, phase)`