Click here to view the next lesson.

Capítulo 3: Atención y el auge de los Transformers

Ejercicios Prácticos para el Capítulo 3

Los siguientes ejercicios prácticos refuerzan los conceptos clave cubiertos en el Capítulo 3, incluyendo los desafíos de las arquitecturas anteriores, la autoatención (self-attention), la atención multi-cabezal y la atención dispersa. Cada ejercicio viene acompañado de una solución detallada y ejemplos de código para profundizar en la comprensión.

Ejercicio 1: Simulando Desafíos con RNNs

Tarea: Crear una RNN simple utilizando PyTorch para demostrar la dificultad de manejar dependencias de largo alcance.

Pasos:

Implementar un RNN para el procesamiento de secuencias.
Generar un conjunto de datos sintético con secuencias largas.
Observar cómo la RNN tiene dificultades para capturar dependencias a largo plazo.

Solución:

import torch
import torch.nn as nn

# Define a simple RNN model
class SimpleRNN(nn.Module):
    """Recurrent layer (`nn.RNN`, batch-first) followed by a linear read-out.

    Only the recurrent output at the final timestep is fed to the linear
    layer, so the prediction depends on whatever the hidden state still
    carries at the end of the sequence.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the linear projection of the last timestep's RNN output."""
        hidden_states, _final_state = self.rnn(x)
        last_step = hidden_states[:, -1, :]  # (batch, hidden_size)
        return self.fc(last_step)

# Parameters
input_size = 10  # Number of features per timestep
hidden_size = 20
output_size = 1
sequence_length = 100
batch_size = 32

# Generate synthetic dataset.
# The label is decided by the sign of the first feature at the FIRST timestep,
# while SimpleRNN only reads the recurrent output of the LAST timestep. The
# signal therefore has to survive `sequence_length` recurrent steps, which is
# the long-range dependency this exercise sets out to demonstrate. (Purely
# random labels would be unrelated to X and reveal nothing about memory.)
X = torch.randn(batch_size, sequence_length, input_size)
y = (X[:, 0, :1] > 0).to(torch.float32)  # Binary labels, shape (batch_size, 1)

# Initialize and train the model
model = SimpleRNN(input_size, hidden_size, output_size)
criterion = nn.BCEWithLogitsLoss()  # takes raw logits; the sigmoid is applied inside the loss
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training loop: every epoch is one full-batch update on the same synthetic batch
for epoch in range(10):
    optimizer.zero_grad()
    outputs = model(X)
    loss = criterion(outputs, y)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch + 1}, Loss: {loss.item():.4f}")

Ejercicio 2: Implementación de Self-Attention

Tarea: Escribir una función en Python para calcular la self-attention para una secuencia de tokens utilizando NumPy.

Solución:

import numpy as np

def self_attention(X, W_Q, W_K, W_V):
    """
    Compute single-head scaled dot-product self-attention for a sequence.

    X: Input sequence (n_tokens, d_model)
    W_Q, W_K, W_V: Weight matrices for Query, Key, Value

    Returns a tuple (output, weights): `weights` is the row-wise softmax of
    the scaled Q.K^T scores and `output` is `weights` applied to the values.
    """
    Q = np.dot(X, W_Q)  # Compute Queries
    K = np.dot(X, W_K)  # Compute Keys
    V = np.dot(X, W_V)  # Compute Values

    # Scaled dot-product scores; K.shape[1] is the key dimension
    scores = np.dot(Q, K.T) / np.sqrt(K.shape[1])

    # Softmax is invariant to a per-row shift, so subtract the row maximum:
    # the largest exponent becomes 0 and np.exp can no longer overflow to inf
    # (which previously produced NaN weights for large scores).
    exp_scores = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
    weights = exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)
    output = np.dot(weights, V)

    return output, weights

# Toy sequence: three tokens, each described by two features
X = np.array([
    [1, 0],
    [0, 1],
    [1, 1],
])

# Fixed 2x2 projection matrices for queries, keys and values
W_Q = np.array([[0.1, 0.3],
                [0.5, 0.7]])
W_K = np.array([[0.2, 0.4],
                [0.6, 0.8]])
W_V = np.array([[0.1, 0.5],
                [0.3, 0.7]])

# Run attention, then show the attention matrix followed by the attended values
output, weights = self_attention(X, W_Q, W_K, W_V)
print("Self-Attention Weights:\n", weights)
print("Self-Attention Output:\n", output)

Ejercicio 3: Atención Multi-Cabezal

Tarea: Implementar un mecanismo simplificado de atención multi-cabezal utilizando NumPy.

Solución:

def multi_head_attention(X, W_Q, W_K, W_V, W_O, n_heads):
    """
    Compute multi-head attention.

    X: Input sequence (n_tokens, d_model)
    W_Q, W_K, W_V: Weight matrices for Query, Key, Value; their columns are
        split into n_heads contiguous groups, one group per head
    W_O: Output projection matrix applied to the concatenated head outputs
    n_heads: Number of attention heads

    Returns the projected output, `concat(head_0, ..., head_{n-1}) @ W_O`.
    """
    head_dim = W_Q.shape[1] // n_heads
    scale = np.sqrt(head_dim)  # same for every head, so compute it once
    outputs = []

    for i in range(n_heads):
        cols = slice(i * head_dim, (i + 1) * head_dim)  # this head's columns
        Q = np.dot(X, W_Q[:, cols])
        K = np.dot(X, W_K[:, cols])
        V = np.dot(X, W_V[:, cols])

        scores = np.dot(Q, K.T) / scale
        # Shift by the row maximum before exponentiating: softmax is unchanged
        # but np.exp cannot overflow to inf (which would yield NaN weights).
        exp_scores = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
        weights = exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)
        outputs.append(np.dot(weights, V))

    concatenated = np.concatenate(outputs, axis=-1)
    final_output = np.dot(concatenated, W_O)
    return final_output

# Example setup: two heads over a three-token sequence with two features each
n_heads = 2
X = np.array([
    [1, 0],
    [0, 1],
    [1, 1],
])

# Random projections drawn in the order Q, K, V, then O.
# Each of Q/K/V maps 2 features to 4 columns, i.e. 2 columns per head.
W_Q, W_K, W_V = (np.random.rand(2, 4) for _ in range(3))
W_O = np.random.rand(4, 2)  # maps the 4 concatenated columns back to 2

# Run the multi-head computation and show the projected result
output = multi_head_attention(X, W_Q, W_K, W_V, W_O, n_heads)
print("Multi-Head Attention Output:\n", output)

Ejercicio 4: Atención Dispersa

Tarea: Implementar un mecanismo de atención dispersa utilizando una máscara personalizada para limitar las interacciones entre tokens.

Solución:

def sparse_attention(Q, K, V, sparsity_mask):
    """
    Compute sparse (masked) scaled dot-product attention.

    Q: Queries
    K: Keys
    V: Values
    sparsity_mask: Binary mask defining allowable token interactions;
        entry [i, j] is non-zero when query i may attend to key j

    Returns a tuple (output, weights). Disallowed pairs get a weight of
    exactly 0; a query whose row is entirely masked gets an all-zero weight
    row (and hence an all-zero output row) instead of NaN.
    """
    d_k = Q.shape[-1]  # Dimension of keys
    scores = np.dot(Q, K.T) / np.sqrt(d_k)  # Compute scaled dot-product

    # Masking must happen in score space with -inf: multiplying the scores by
    # the mask only sets them to 0, and exp(0) = 1 still gives the "masked"
    # positions a share of the attention.
    allowed = np.asarray(sparsity_mask, dtype=bool)
    sparse_scores = np.where(allowed, scores, -np.inf)

    # Stable softmax: shift by the row maximum. A fully masked row has a
    # maximum of -inf; use 0 there so the subtraction does not produce NaN.
    row_max = np.max(sparse_scores, axis=-1, keepdims=True)
    row_max = np.where(np.isfinite(row_max), row_max, 0.0)
    exp_scores = np.exp(sparse_scores - row_max)  # exp(-inf) == 0
    denom = np.sum(exp_scores, axis=-1, keepdims=True)
    weights = np.divide(
        exp_scores, denom, out=np.zeros_like(exp_scores), where=denom > 0
    )

    output = np.dot(weights, V)  # Weighted sum of values
    return output, weights

# Example inputs: three tokens with 2-dimensional queries, keys and values
Q = np.array([[1, 0],
              [0, 1],
              [1, 1]])  # Query
K = np.array([[1, 0],
              [0, 1],
              [1, 1]])  # Keys
V = np.array([[0.5, 1.0],
              [0.2, 0.8],
              [0.9, 0.3]])  # Values

# Local attention pattern: row i marks with 1 the keys that query i may use
sparsity_mask = np.array([
    [1, 1, 0],  # Token 1 -> Tokens 1 and 2
    [1, 1, 1],  # Token 2 -> every token
    [0, 1, 1],  # Token 3 -> Tokens 2 and 3
])

# Run the masked attention and show the weights followed by the outputs
output, weights = sparse_attention(Q, K, V, sparsity_mask)
print("Sparse Attention Weights:\n", weights)
print("Sparse Attention Output:\n", output)

Estos ejercicios te guían a través de la implementación práctica de conceptos clave como la self-attention, la atención multi-cabezal y la atención dispersa. Completarlos profundizará tu comprensión de cómo los mecanismos de atención abordan los desafíos de arquitecturas anteriores y permiten la escalabilidad y eficiencia de los modelos Transformer.

Ejercicios Prácticos para el Capítulo 3

Ejercicio 1: Simulando Desafíos con RNNs

Tarea: Crear una RNN simple utilizando PyTorch para demostrar la dificultad de manejar dependencias de largo alcance.

Pasos:

Implementar un RNN para el procesamiento de secuencias.
Generar un conjunto de datos sintético con secuencias largas.
Observar cómo la RNN tiene dificultades para capturar dependencias a largo plazo.

Solución:

import torch
import torch.nn as nn

# Define a simple RNN model
class SimpleRNN(nn.Module):
    """Recurrent layer (`nn.RNN`, batch-first) followed by a linear read-out.

    Only the recurrent output at the final timestep is fed to the linear
    layer, so the prediction depends on whatever the hidden state still
    carries at the end of the sequence.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the linear projection of the last timestep's RNN output."""
        hidden_states, _final_state = self.rnn(x)
        last_step = hidden_states[:, -1, :]  # (batch, hidden_size)
        return self.fc(last_step)

# Parameters
input_size = 10  # Number of features per timestep
hidden_size = 20
output_size = 1
sequence_length = 100
batch_size = 32

# Generate synthetic dataset.
# The label is decided by the sign of the first feature at the FIRST timestep,
# while SimpleRNN only reads the recurrent output of the LAST timestep. The
# signal therefore has to survive `sequence_length` recurrent steps, which is
# the long-range dependency this exercise sets out to demonstrate. (Purely
# random labels would be unrelated to X and reveal nothing about memory.)
X = torch.randn(batch_size, sequence_length, input_size)
y = (X[:, 0, :1] > 0).to(torch.float32)  # Binary labels, shape (batch_size, 1)

# Initialize and train the model
model = SimpleRNN(input_size, hidden_size, output_size)
criterion = nn.BCEWithLogitsLoss()  # takes raw logits; the sigmoid is applied inside the loss
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training loop: every epoch is one full-batch update on the same synthetic batch
for epoch in range(10):
    optimizer.zero_grad()
    outputs = model(X)
    loss = criterion(outputs, y)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch + 1}, Loss: {loss.item():.4f}")

Ejercicio 2: Implementación de Self-Attention

Tarea: Escribir una función en Python para calcular la self-attention para una secuencia de tokens utilizando NumPy.

Solución:

import numpy as np

def self_attention(X, W_Q, W_K, W_V):
    """
    Compute single-head scaled dot-product self-attention for a sequence.

    X: Input sequence (n_tokens, d_model)
    W_Q, W_K, W_V: Weight matrices for Query, Key, Value

    Returns a tuple (output, weights): `weights` is the row-wise softmax of
    the scaled Q.K^T scores and `output` is `weights` applied to the values.
    """
    Q = np.dot(X, W_Q)  # Compute Queries
    K = np.dot(X, W_K)  # Compute Keys
    V = np.dot(X, W_V)  # Compute Values

    # Scaled dot-product scores; K.shape[1] is the key dimension
    scores = np.dot(Q, K.T) / np.sqrt(K.shape[1])

    # Softmax is invariant to a per-row shift, so subtract the row maximum:
    # the largest exponent becomes 0 and np.exp can no longer overflow to inf
    # (which previously produced NaN weights for large scores).
    exp_scores = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
    weights = exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)
    output = np.dot(weights, V)

    return output, weights

# Toy sequence: three tokens, each described by two features
X = np.array([
    [1, 0],
    [0, 1],
    [1, 1],
])

# Fixed 2x2 projection matrices for queries, keys and values
W_Q = np.array([[0.1, 0.3],
                [0.5, 0.7]])
W_K = np.array([[0.2, 0.4],
                [0.6, 0.8]])
W_V = np.array([[0.1, 0.5],
                [0.3, 0.7]])

# Run attention, then show the attention matrix followed by the attended values
output, weights = self_attention(X, W_Q, W_K, W_V)
print("Self-Attention Weights:\n", weights)
print("Self-Attention Output:\n", output)

Ejercicio 3: Atención Multi-Cabezal

Tarea: Implementar un mecanismo simplificado de atención multi-cabezal utilizando NumPy.

Solución:

def multi_head_attention(X, W_Q, W_K, W_V, W_O, n_heads):
    """
    Compute multi-head attention.

    X: Input sequence (n_tokens, d_model)
    W_Q, W_K, W_V: Weight matrices for Query, Key, Value; their columns are
        split into n_heads contiguous groups, one group per head
    W_O: Output projection matrix applied to the concatenated head outputs
    n_heads: Number of attention heads

    Returns the projected output, `concat(head_0, ..., head_{n-1}) @ W_O`.
    """
    head_dim = W_Q.shape[1] // n_heads
    scale = np.sqrt(head_dim)  # same for every head, so compute it once
    outputs = []

    for i in range(n_heads):
        cols = slice(i * head_dim, (i + 1) * head_dim)  # this head's columns
        Q = np.dot(X, W_Q[:, cols])
        K = np.dot(X, W_K[:, cols])
        V = np.dot(X, W_V[:, cols])

        scores = np.dot(Q, K.T) / scale
        # Shift by the row maximum before exponentiating: softmax is unchanged
        # but np.exp cannot overflow to inf (which would yield NaN weights).
        exp_scores = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
        weights = exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)
        outputs.append(np.dot(weights, V))

    concatenated = np.concatenate(outputs, axis=-1)
    final_output = np.dot(concatenated, W_O)
    return final_output

# Example setup: two heads over a three-token sequence with two features each
n_heads = 2
X = np.array([
    [1, 0],
    [0, 1],
    [1, 1],
])

# Random projections drawn in the order Q, K, V, then O.
# Each of Q/K/V maps 2 features to 4 columns, i.e. 2 columns per head.
W_Q, W_K, W_V = (np.random.rand(2, 4) for _ in range(3))
W_O = np.random.rand(4, 2)  # maps the 4 concatenated columns back to 2

# Run the multi-head computation and show the projected result
output = multi_head_attention(X, W_Q, W_K, W_V, W_O, n_heads)
print("Multi-Head Attention Output:\n", output)

Ejercicio 4: Atención Dispersa

Tarea: Implementar un mecanismo de atención dispersa utilizando una máscara personalizada para limitar las interacciones entre tokens.

Solución:

def sparse_attention(Q, K, V, sparsity_mask):
    """
    Compute sparse (masked) scaled dot-product attention.

    Q: Queries
    K: Keys
    V: Values
    sparsity_mask: Binary mask defining allowable token interactions;
        entry [i, j] is non-zero when query i may attend to key j

    Returns a tuple (output, weights). Disallowed pairs get a weight of
    exactly 0; a query whose row is entirely masked gets an all-zero weight
    row (and hence an all-zero output row) instead of NaN.
    """
    d_k = Q.shape[-1]  # Dimension of keys
    scores = np.dot(Q, K.T) / np.sqrt(d_k)  # Compute scaled dot-product

    # Masking must happen in score space with -inf: multiplying the scores by
    # the mask only sets them to 0, and exp(0) = 1 still gives the "masked"
    # positions a share of the attention.
    allowed = np.asarray(sparsity_mask, dtype=bool)
    sparse_scores = np.where(allowed, scores, -np.inf)

    # Stable softmax: shift by the row maximum. A fully masked row has a
    # maximum of -inf; use 0 there so the subtraction does not produce NaN.
    row_max = np.max(sparse_scores, axis=-1, keepdims=True)
    row_max = np.where(np.isfinite(row_max), row_max, 0.0)
    exp_scores = np.exp(sparse_scores - row_max)  # exp(-inf) == 0
    denom = np.sum(exp_scores, axis=-1, keepdims=True)
    weights = np.divide(
        exp_scores, denom, out=np.zeros_like(exp_scores), where=denom > 0
    )

    output = np.dot(weights, V)  # Weighted sum of values
    return output, weights

# Example inputs: three tokens with 2-dimensional queries, keys and values
Q = np.array([[1, 0],
              [0, 1],
              [1, 1]])  # Query
K = np.array([[1, 0],
              [0, 1],
              [1, 1]])  # Keys
V = np.array([[0.5, 1.0],
              [0.2, 0.8],
              [0.9, 0.3]])  # Values

# Local attention pattern: row i marks with 1 the keys that query i may use
sparsity_mask = np.array([
    [1, 1, 0],  # Token 1 -> Tokens 1 and 2
    [1, 1, 1],  # Token 2 -> every token
    [0, 1, 1],  # Token 3 -> Tokens 2 and 3
])

# Run the masked attention and show the weights followed by the outputs
output, weights = sparse_attention(Q, K, V, sparsity_mask)
print("Sparse Attention Weights:\n", weights)
print("Sparse Attention Output:\n", output)

Ejercicios Prácticos para el Capítulo 3

Ejercicio 1: Simulando Desafíos con RNNs

Tarea: Crear una RNN simple utilizando PyTorch para demostrar la dificultad de manejar dependencias de largo alcance.

Pasos:

Implementar un RNN para el procesamiento de secuencias.
Generar un conjunto de datos sintético con secuencias largas.
Observar cómo la RNN tiene dificultades para capturar dependencias a largo plazo.

Solución:

import torch
import torch.nn as nn

# Define a simple RNN model
class SimpleRNN(nn.Module):
    """Recurrent layer (`nn.RNN`, batch-first) followed by a linear read-out.

    Only the recurrent output at the final timestep is fed to the linear
    layer, so the prediction depends on whatever the hidden state still
    carries at the end of the sequence.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the linear projection of the last timestep's RNN output."""
        hidden_states, _final_state = self.rnn(x)
        last_step = hidden_states[:, -1, :]  # (batch, hidden_size)
        return self.fc(last_step)

# Parameters
input_size = 10  # Number of features per timestep
hidden_size = 20
output_size = 1
sequence_length = 100
batch_size = 32

# Generate synthetic dataset.
# The label is decided by the sign of the first feature at the FIRST timestep,
# while SimpleRNN only reads the recurrent output of the LAST timestep. The
# signal therefore has to survive `sequence_length` recurrent steps, which is
# the long-range dependency this exercise sets out to demonstrate. (Purely
# random labels would be unrelated to X and reveal nothing about memory.)
X = torch.randn(batch_size, sequence_length, input_size)
y = (X[:, 0, :1] > 0).to(torch.float32)  # Binary labels, shape (batch_size, 1)

# Initialize and train the model
model = SimpleRNN(input_size, hidden_size, output_size)
criterion = nn.BCEWithLogitsLoss()  # takes raw logits; the sigmoid is applied inside the loss
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training loop: every epoch is one full-batch update on the same synthetic batch
for epoch in range(10):
    optimizer.zero_grad()
    outputs = model(X)
    loss = criterion(outputs, y)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch + 1}, Loss: {loss.item():.4f}")

Ejercicio 2: Implementación de Self-Attention

Tarea: Escribir una función en Python para calcular la self-attention para una secuencia de tokens utilizando NumPy.

Solución:

import numpy as np

def self_attention(X, W_Q, W_K, W_V):
    """
    Compute single-head scaled dot-product self-attention for a sequence.

    X: Input sequence (n_tokens, d_model)
    W_Q, W_K, W_V: Weight matrices for Query, Key, Value

    Returns a tuple (output, weights): `weights` is the row-wise softmax of
    the scaled Q.K^T scores and `output` is `weights` applied to the values.
    """
    Q = np.dot(X, W_Q)  # Compute Queries
    K = np.dot(X, W_K)  # Compute Keys
    V = np.dot(X, W_V)  # Compute Values

    # Scaled dot-product scores; K.shape[1] is the key dimension
    scores = np.dot(Q, K.T) / np.sqrt(K.shape[1])

    # Softmax is invariant to a per-row shift, so subtract the row maximum:
    # the largest exponent becomes 0 and np.exp can no longer overflow to inf
    # (which previously produced NaN weights for large scores).
    exp_scores = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
    weights = exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)
    output = np.dot(weights, V)

    return output, weights

# Toy sequence: three tokens, each described by two features
X = np.array([
    [1, 0],
    [0, 1],
    [1, 1],
])

# Fixed 2x2 projection matrices for queries, keys and values
W_Q = np.array([[0.1, 0.3],
                [0.5, 0.7]])
W_K = np.array([[0.2, 0.4],
                [0.6, 0.8]])
W_V = np.array([[0.1, 0.5],
                [0.3, 0.7]])

# Run attention, then show the attention matrix followed by the attended values
output, weights = self_attention(X, W_Q, W_K, W_V)
print("Self-Attention Weights:\n", weights)
print("Self-Attention Output:\n", output)

Ejercicio 3: Atención Multi-Cabezal

Tarea: Implementar un mecanismo simplificado de atención multi-cabezal utilizando NumPy.

Solución:

def multi_head_attention(X, W_Q, W_K, W_V, W_O, n_heads):
    """
    Compute multi-head attention.

    X: Input sequence (n_tokens, d_model)
    W_Q, W_K, W_V: Weight matrices for Query, Key, Value; their columns are
        split into n_heads contiguous groups, one group per head
    W_O: Output projection matrix applied to the concatenated head outputs
    n_heads: Number of attention heads

    Returns the projected output, `concat(head_0, ..., head_{n-1}) @ W_O`.
    """
    head_dim = W_Q.shape[1] // n_heads
    scale = np.sqrt(head_dim)  # same for every head, so compute it once
    outputs = []

    for i in range(n_heads):
        cols = slice(i * head_dim, (i + 1) * head_dim)  # this head's columns
        Q = np.dot(X, W_Q[:, cols])
        K = np.dot(X, W_K[:, cols])
        V = np.dot(X, W_V[:, cols])

        scores = np.dot(Q, K.T) / scale
        # Shift by the row maximum before exponentiating: softmax is unchanged
        # but np.exp cannot overflow to inf (which would yield NaN weights).
        exp_scores = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
        weights = exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)
        outputs.append(np.dot(weights, V))

    concatenated = np.concatenate(outputs, axis=-1)
    final_output = np.dot(concatenated, W_O)
    return final_output

# Example setup: two heads over a three-token sequence with two features each
n_heads = 2
X = np.array([
    [1, 0],
    [0, 1],
    [1, 1],
])

# Random projections drawn in the order Q, K, V, then O.
# Each of Q/K/V maps 2 features to 4 columns, i.e. 2 columns per head.
W_Q, W_K, W_V = (np.random.rand(2, 4) for _ in range(3))
W_O = np.random.rand(4, 2)  # maps the 4 concatenated columns back to 2

# Run the multi-head computation and show the projected result
output = multi_head_attention(X, W_Q, W_K, W_V, W_O, n_heads)
print("Multi-Head Attention Output:\n", output)

Ejercicio 4: Atención Dispersa

Tarea: Implementar un mecanismo de atención dispersa utilizando una máscara personalizada para limitar las interacciones entre tokens.

Solución:

def sparse_attention(Q, K, V, sparsity_mask):
    """
    Compute sparse (masked) scaled dot-product attention.

    Q: Queries
    K: Keys
    V: Values
    sparsity_mask: Binary mask defining allowable token interactions;
        entry [i, j] is non-zero when query i may attend to key j

    Returns a tuple (output, weights). Disallowed pairs get a weight of
    exactly 0; a query whose row is entirely masked gets an all-zero weight
    row (and hence an all-zero output row) instead of NaN.
    """
    d_k = Q.shape[-1]  # Dimension of keys
    scores = np.dot(Q, K.T) / np.sqrt(d_k)  # Compute scaled dot-product

    # Masking must happen in score space with -inf: multiplying the scores by
    # the mask only sets them to 0, and exp(0) = 1 still gives the "masked"
    # positions a share of the attention.
    allowed = np.asarray(sparsity_mask, dtype=bool)
    sparse_scores = np.where(allowed, scores, -np.inf)

    # Stable softmax: shift by the row maximum. A fully masked row has a
    # maximum of -inf; use 0 there so the subtraction does not produce NaN.
    row_max = np.max(sparse_scores, axis=-1, keepdims=True)
    row_max = np.where(np.isfinite(row_max), row_max, 0.0)
    exp_scores = np.exp(sparse_scores - row_max)  # exp(-inf) == 0
    denom = np.sum(exp_scores, axis=-1, keepdims=True)
    weights = np.divide(
        exp_scores, denom, out=np.zeros_like(exp_scores), where=denom > 0
    )

    output = np.dot(weights, V)  # Weighted sum of values
    return output, weights

# Example inputs: three tokens with 2-dimensional queries, keys and values
Q = np.array([[1, 0],
              [0, 1],
              [1, 1]])  # Query
K = np.array([[1, 0],
              [0, 1],
              [1, 1]])  # Keys
V = np.array([[0.5, 1.0],
              [0.2, 0.8],
              [0.9, 0.3]])  # Values

# Local attention pattern: row i marks with 1 the keys that query i may use
sparsity_mask = np.array([
    [1, 1, 0],  # Token 1 -> Tokens 1 and 2
    [1, 1, 1],  # Token 2 -> every token
    [0, 1, 1],  # Token 3 -> Tokens 2 and 3
])

# Run the masked attention and show the weights followed by the outputs
output, weights = sparse_attention(Q, K, V, sparsity_mask)
print("Sparse Attention Weights:\n", weights)
print("Sparse Attention Output:\n", output)

Ejercicios Prácticos para el Capítulo 3

Ejercicio 1: Simulando Desafíos con RNNs

Tarea: Crear una RNN simple utilizando PyTorch para demostrar la dificultad de manejar dependencias de largo alcance.

Pasos:

Implementar un RNN para el procesamiento de secuencias.
Generar un conjunto de datos sintético con secuencias largas.
Observar cómo la RNN tiene dificultades para capturar dependencias a largo plazo.

Solución:

import torch
import torch.nn as nn

# Define a simple RNN model
class SimpleRNN(nn.Module):
    """Recurrent layer (`nn.RNN`, batch-first) followed by a linear read-out.

    Only the recurrent output at the final timestep is fed to the linear
    layer, so the prediction depends on whatever the hidden state still
    carries at the end of the sequence.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the linear projection of the last timestep's RNN output."""
        hidden_states, _final_state = self.rnn(x)
        last_step = hidden_states[:, -1, :]  # (batch, hidden_size)
        return self.fc(last_step)

# Parameters
input_size = 10  # Number of features per timestep
hidden_size = 20
output_size = 1
sequence_length = 100
batch_size = 32

# Generate synthetic dataset.
# The label is decided by the sign of the first feature at the FIRST timestep,
# while SimpleRNN only reads the recurrent output of the LAST timestep. The
# signal therefore has to survive `sequence_length` recurrent steps, which is
# the long-range dependency this exercise sets out to demonstrate. (Purely
# random labels would be unrelated to X and reveal nothing about memory.)
X = torch.randn(batch_size, sequence_length, input_size)
y = (X[:, 0, :1] > 0).to(torch.float32)  # Binary labels, shape (batch_size, 1)

# Initialize and train the model
model = SimpleRNN(input_size, hidden_size, output_size)
criterion = nn.BCEWithLogitsLoss()  # takes raw logits; the sigmoid is applied inside the loss
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training loop: every epoch is one full-batch update on the same synthetic batch
for epoch in range(10):
    optimizer.zero_grad()
    outputs = model(X)
    loss = criterion(outputs, y)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch + 1}, Loss: {loss.item():.4f}")

Ejercicio 2: Implementación de Self-Attention

Tarea: Escribir una función en Python para calcular la self-attention para una secuencia de tokens utilizando NumPy.

Solución:

import numpy as np

def self_attention(X, W_Q, W_K, W_V):
    """
    Compute single-head scaled dot-product self-attention for a sequence.

    X: Input sequence (n_tokens, d_model)
    W_Q, W_K, W_V: Weight matrices for Query, Key, Value

    Returns a tuple (output, weights): `weights` is the row-wise softmax of
    the scaled Q.K^T scores and `output` is `weights` applied to the values.
    """
    Q = np.dot(X, W_Q)  # Compute Queries
    K = np.dot(X, W_K)  # Compute Keys
    V = np.dot(X, W_V)  # Compute Values

    # Scaled dot-product scores; K.shape[1] is the key dimension
    scores = np.dot(Q, K.T) / np.sqrt(K.shape[1])

    # Softmax is invariant to a per-row shift, so subtract the row maximum:
    # the largest exponent becomes 0 and np.exp can no longer overflow to inf
    # (which previously produced NaN weights for large scores).
    exp_scores = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
    weights = exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)
    output = np.dot(weights, V)

    return output, weights

# Toy sequence: three tokens, each described by two features
X = np.array([
    [1, 0],
    [0, 1],
    [1, 1],
])

# Fixed 2x2 projection matrices for queries, keys and values
W_Q = np.array([[0.1, 0.3],
                [0.5, 0.7]])
W_K = np.array([[0.2, 0.4],
                [0.6, 0.8]])
W_V = np.array([[0.1, 0.5],
                [0.3, 0.7]])

# Run attention, then show the attention matrix followed by the attended values
output, weights = self_attention(X, W_Q, W_K, W_V)
print("Self-Attention Weights:\n", weights)
print("Self-Attention Output:\n", output)

Ejercicio 3: Atención Multi-Cabezal

Tarea: Implementar un mecanismo simplificado de atención multi-cabezal utilizando NumPy.

Solución:

def multi_head_attention(X, W_Q, W_K, W_V, W_O, n_heads):
    """
    Compute multi-head attention.

    X: Input sequence (n_tokens, d_model)
    W_Q, W_K, W_V: Weight matrices for Query, Key, Value; their columns are
        split into n_heads contiguous groups, one group per head
    W_O: Output projection matrix applied to the concatenated head outputs
    n_heads: Number of attention heads

    Returns the projected output, `concat(head_0, ..., head_{n-1}) @ W_O`.
    """
    head_dim = W_Q.shape[1] // n_heads
    scale = np.sqrt(head_dim)  # same for every head, so compute it once
    outputs = []

    for i in range(n_heads):
        cols = slice(i * head_dim, (i + 1) * head_dim)  # this head's columns
        Q = np.dot(X, W_Q[:, cols])
        K = np.dot(X, W_K[:, cols])
        V = np.dot(X, W_V[:, cols])

        scores = np.dot(Q, K.T) / scale
        # Shift by the row maximum before exponentiating: softmax is unchanged
        # but np.exp cannot overflow to inf (which would yield NaN weights).
        exp_scores = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
        weights = exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)
        outputs.append(np.dot(weights, V))

    concatenated = np.concatenate(outputs, axis=-1)
    final_output = np.dot(concatenated, W_O)
    return final_output

# Example setup: two heads over a three-token sequence with two features each
n_heads = 2
X = np.array([
    [1, 0],
    [0, 1],
    [1, 1],
])

# Random projections drawn in the order Q, K, V, then O.
# Each of Q/K/V maps 2 features to 4 columns, i.e. 2 columns per head.
W_Q, W_K, W_V = (np.random.rand(2, 4) for _ in range(3))
W_O = np.random.rand(4, 2)  # maps the 4 concatenated columns back to 2

# Run the multi-head computation and show the projected result
output = multi_head_attention(X, W_Q, W_K, W_V, W_O, n_heads)
print("Multi-Head Attention Output:\n", output)

Ejercicio 4: Atención Dispersa

Tarea: Implementar un mecanismo de atención dispersa utilizando una máscara personalizada para limitar las interacciones entre tokens.

Solución:

def sparse_attention(Q, K, V, sparsity_mask):
    """
    Compute sparse (masked) scaled dot-product attention.

    Q: Queries
    K: Keys
    V: Values
    sparsity_mask: Binary mask defining allowable token interactions;
        entry [i, j] is non-zero when query i may attend to key j

    Returns a tuple (output, weights). Disallowed pairs get a weight of
    exactly 0; a query whose row is entirely masked gets an all-zero weight
    row (and hence an all-zero output row) instead of NaN.
    """
    d_k = Q.shape[-1]  # Dimension of keys
    scores = np.dot(Q, K.T) / np.sqrt(d_k)  # Compute scaled dot-product

    # Masking must happen in score space with -inf: multiplying the scores by
    # the mask only sets them to 0, and exp(0) = 1 still gives the "masked"
    # positions a share of the attention.
    allowed = np.asarray(sparsity_mask, dtype=bool)
    sparse_scores = np.where(allowed, scores, -np.inf)

    # Stable softmax: shift by the row maximum. A fully masked row has a
    # maximum of -inf; use 0 there so the subtraction does not produce NaN.
    row_max = np.max(sparse_scores, axis=-1, keepdims=True)
    row_max = np.where(np.isfinite(row_max), row_max, 0.0)
    exp_scores = np.exp(sparse_scores - row_max)  # exp(-inf) == 0
    denom = np.sum(exp_scores, axis=-1, keepdims=True)
    weights = np.divide(
        exp_scores, denom, out=np.zeros_like(exp_scores), where=denom > 0
    )

    output = np.dot(weights, V)  # Weighted sum of values
    return output, weights

# Example inputs: three tokens with 2-dimensional queries, keys and values
Q = np.array([[1, 0],
              [0, 1],
              [1, 1]])  # Query
K = np.array([[1, 0],
              [0, 1],
              [1, 1]])  # Keys
V = np.array([[0.5, 1.0],
              [0.2, 0.8],
              [0.9, 0.3]])  # Values

# Local attention pattern: row i marks with 1 the keys that query i may use
sparsity_mask = np.array([
    [1, 1, 0],  # Token 1 -> Tokens 1 and 2
    [1, 1, 1],  # Token 2 -> every token
    [0, 1, 1],  # Token 3 -> Tokens 2 and 3
])

# Run the masked attention and show the weights followed by the outputs
output, weights = sparse_attention(Q, K, V, sparsity_mask)
print("Sparse Attention Weights:\n", weights)
print("Sparse Attention Output:\n", output)

Compra este libro