# Source code for torch_geometric.nn.attention.qformer
from typing import Callable
import torch
class QFormer(torch.nn.Module):
    r"""The Querying Transformer (Q-Former) from
    `"BLIP-2: Bootstrapping Language-Image Pre-training
    with Frozen Image Encoders and Large Language Models"
    <https://arxiv.org/pdf/2301.12597>`_ paper.

    Args:
        input_dim (int): The number of features in the input.
        hidden_dim (int): The dimension of the fnn in the encoder layer.
        output_dim (int): The final output dimension.
        num_heads (int): The number of multi-attention-heads.
        num_layers (int): The number of sub-encoder-layers in the encoder.
        dropout (float, optional): The dropout value in each encoder layer.
            (default: :obj:`0.0`)
        activation (Callable, optional): The activation function of the
            intermediate layer in each encoder layer.
            (default: :obj:`torch.nn.ReLU()`)

    .. note::
        This is a simplified version of the original Q-Former implementation.
    """
    def __init__(
        self,
        input_dim: int,
        hidden_dim: int,
        output_dim: int,
        num_heads: int,
        num_layers: int,
        dropout: float = 0.0,
        activation: Callable = torch.nn.ReLU(),
    ) -> None:
        super().__init__()
        self.num_layers = num_layers
        self.num_heads = num_heads

        # Normalize features before feeding the transformer encoder.
        self.layer_norm = torch.nn.LayerNorm(input_dim)
        # `batch_first=True` so inputs are (batch, seq, feature).
        self.encoder_layer = torch.nn.TransformerEncoderLayer(
            d_model=input_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim,
            dropout=dropout,
            activation=activation,
            batch_first=True,
        )
        self.encoder = torch.nn.TransformerEncoder(
            self.encoder_layer,
            num_layers=num_layers,
        )
        # Final projection from the encoder width to the output dimension.
        self.project = torch.nn.Linear(input_dim, output_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        r"""Forward pass.

        Args:
            x (torch.Tensor): Input sequence to the encoder layer.
                :math:`\mathbf{X} \in \mathbb{R}^{B \times N \times F}`, with
                batch-size :math:`B`, sequence length :math:`N`,
                and feature dimension :math:`F`.

        Returns:
            torch.Tensor: The encoded sequence of shape
            :math:`\mathbf{X} \in \mathbb{R}^{B \times N \times F_{out}}`.
        """
        x = self.layer_norm(x)
        x = self.encoder(x)
        out = self.project(x)
        return out

    def __repr__(self) -> str:
        return (f'{self.__class__.__name__}('
                f'num_heads={self.num_heads}, '
                f'num_layers={self.num_layers})')