Click here to view the next lesson.

Chapter 2: Hugging Face and Other NLP Libraries

2.4 Ejercicios Prácticos

Esta sección proporciona ejercicios prácticos para reforzar tu comprensión del ecosistema de Hugging Face y su integración con TensorFlow y PyTorch. Cada ejercicio incluye una explicación clara y soluciones de código para guiar tu aprendizaje.

Ejercicio 1: Usando el Pipeline de Hugging Face

Tarea: Utiliza el pipeline de Hugging Face para realizar reconocimiento de entidades nombradas (NER) en un texto dado.

Instrucciones:

Importa el pipeline de Hugging Face.
Carga el pipeline NER.
Procesa un texto de ejemplo para identificar entidades nombradas.

Solución:

from transformers import pipeline

# Step 1: Load the NER pipeline.
# aggregation_strategy="simple" merges sub-word tokens into whole entities and
# replaces the deprecated grouped_entities=True flag (same grouping behavior).
ner_pipeline = pipeline("ner", aggregation_strategy="simple")

# Step 2: Define the input text
text = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, across the Manhattan Bridge."

# Step 3: Perform named entity recognition
entities = ner_pipeline(text)

# Step 4: Print the results (one line per grouped entity)
print("Named Entities:")
for entity in entities:
    print(f"Entity: {entity['word']}, Type: {entity['entity_group']}, Score: {entity['score']:.2f}")

Salida Esperada:

Named Entities:
Entity: Hugging Face Inc., Type: ORG, Score: 0.99
Entity: New York City, Type: LOC, Score: 0.99
Entity: DUMBO, Type: LOC, Score: 0.97
Entity: Manhattan Bridge, Type: LOC, Score: 0.96

Ejercicio 2: Ajuste Fino de un Modelo Transformer

Tarea: Realizar el ajuste fino de un modelo BERT para clasificación de texto utilizando la API Trainer de Hugging Face y el conjunto de datos IMDB.

Instrucciones:

Cargar el conjunto de datos IMDB y preprocesarlo.
Tokenizar el texto usando el tokenizador BERT.
Realizar el ajuste fino del modelo en un subconjunto pequeño del conjunto de datos.
Evaluar el modelo.

Solución:

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score

# Step 1: Load the dataset and the tokenizer
dataset = load_dataset("imdb")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_function(examples):
    """Tokenize a batch of reviews, truncating/padding each one to 256 tokens."""
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)

# Step 2: Prepare the data for training.
# Select the small subsets first and tokenize only those rows; the same rows are
# chosen as before (same seed), but the rest of IMDB is no longer tokenized for nothing.
train_dataset = dataset["train"].shuffle(seed=42).select(range(2000)).map(preprocess_function, batched=True)
test_dataset = dataset["test"].shuffle(seed=42).select(range(500)).map(preprocess_function, batched=True)

# Step 3: Load the model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Step 4: Define the training arguments and trainer
# NOTE: recent transformers releases rename `evaluation_strategy` to `eval_strategy`;
# switch the keyword if your installed version rejects the name used here.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01
)

def compute_metrics(eval_pred):
    """Turn per-class scores into class ids with argmax and report accuracy."""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": accuracy_score(labels, predictions)}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Step 5: Train the model
trainer.train()

# Step 6: Evaluate the model
results = trainer.evaluate()
print("Evaluation Results:", results)

Salida Esperada:

Evaluation Results: {'eval_loss': 0.36, 'eval_accuracy': 0.88}

Ejercicio 3: Bucle de Entrenamiento con PyTorch

Tarea: Implementar un bucle de entrenamiento en PyTorch para realizar el ajuste fino de un modelo BERT en clasificación de texto.

Instrucciones:

Cargar el conjunto de datos IMDB y preprocesarlo.
Convertir el conjunto de datos a tensores de PyTorch.
Escribir un bucle de entrenamiento para el ajuste fino del modelo.
Evaluar la exactitud (accuracy) del modelo.

Solución:

import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from torch.optim import AdamW

# Step 1: Load and preprocess the dataset
dataset = load_dataset("imdb")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_function(examples):
    """Tokenize a batch of reviews, truncating/padding each one to 256 tokens."""
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)

def prepare_split(split):
    """Tokenize a split and expose it as PyTorch tensors ready for `model(**batch)`."""
    split = split.map(preprocess_function, batched=True)
    # The model's forward() takes `labels`, not `label`; without this rename
    # `model(**batch)` fails with an unexpected-keyword error.
    split = split.rename_column("label", "labels")
    split.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    return split

# Select the small subsets first so only the rows actually used get tokenized.
train_subset = dataset["train"].shuffle(seed=42).select(range(2000))
test_subset = dataset["test"].shuffle(seed=42).select(range(500))

train_dataloader = DataLoader(prepare_split(train_subset), batch_size=8)
test_dataloader = DataLoader(prepare_split(test_subset), batch_size=8)

# Step 2: Load the model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Step 3: Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

epochs = 2
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        optimizer.zero_grad()
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)  # loss is computed because `labels` is in the batch
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean loss per batch, rounded to match the expected output shown below.
    print(f"Epoch {epoch+1} Loss: {total_loss/len(train_dataloader):.2f}")

# Step 4: Evaluation
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        predictions = torch.argmax(outputs.logits, dim=-1)
        correct += (predictions == batch["labels"]).sum().item()
        total += batch["labels"].size(0)

print(f"Accuracy: {correct / total:.2f}")

Salida Esperada:

Epoch 1 Loss: 0.45
Epoch 2 Loss: 0.36
Accuracy: 0.88

Estos ejercicios prácticos demuestran cómo usar efectivamente el ecosistema de Hugging Face e integrarlo con PyTorch para flujos de trabajo de PLN. Al completar estos ejercicios, obtienes experiencia práctica con pipelines, ajuste fino y bucles de entrenamiento personalizados. ¡Continúa experimentando con otros conjuntos de datos y tareas para profundizar tu comprensión!

2.4 Ejercicios Prácticos

Ejercicio 1: Usando el Pipeline de Hugging Face

Tarea: Utiliza el pipeline de Hugging Face para realizar reconocimiento de entidades nombradas (NER) en un texto dado.

Instrucciones:

Importa el pipeline de Hugging Face.
Carga el pipeline NER.
Procesa un texto de ejemplo para identificar entidades nombradas.

Solución:

from transformers import pipeline

# Step 1: Load the NER pipeline.
# aggregation_strategy="simple" merges sub-word tokens into whole entities and
# replaces the deprecated grouped_entities=True flag (same grouping behavior).
ner_pipeline = pipeline("ner", aggregation_strategy="simple")

# Step 2: Define the input text
text = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, across the Manhattan Bridge."

# Step 3: Perform named entity recognition
entities = ner_pipeline(text)

# Step 4: Print the results (one line per grouped entity)
print("Named Entities:")
for entity in entities:
    print(f"Entity: {entity['word']}, Type: {entity['entity_group']}, Score: {entity['score']:.2f}")

Salida Esperada:

Named Entities:
Entity: Hugging Face Inc., Type: ORG, Score: 0.99
Entity: New York City, Type: LOC, Score: 0.99
Entity: DUMBO, Type: LOC, Score: 0.97
Entity: Manhattan Bridge, Type: LOC, Score: 0.96

Ejercicio 2: Ajuste Fino de un Modelo Transformer

Tarea: Realizar el ajuste fino de un modelo BERT para clasificación de texto utilizando la API Trainer de Hugging Face y el conjunto de datos IMDB.

Instrucciones:

Cargar el conjunto de datos IMDB y preprocesarlo.
Tokenizar el texto usando el tokenizador BERT.
Realizar el ajuste fino del modelo en un subconjunto pequeño del conjunto de datos.
Evaluar el modelo.

Solución:

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score

# Step 1: Load the dataset and the tokenizer
dataset = load_dataset("imdb")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_function(examples):
    """Tokenize a batch of reviews, truncating/padding each one to 256 tokens."""
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)

# Step 2: Prepare the data for training.
# Select the small subsets first and tokenize only those rows; the same rows are
# chosen as before (same seed), but the rest of IMDB is no longer tokenized for nothing.
train_dataset = dataset["train"].shuffle(seed=42).select(range(2000)).map(preprocess_function, batched=True)
test_dataset = dataset["test"].shuffle(seed=42).select(range(500)).map(preprocess_function, batched=True)

# Step 3: Load the model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Step 4: Define the training arguments and trainer
# NOTE: recent transformers releases rename `evaluation_strategy` to `eval_strategy`;
# switch the keyword if your installed version rejects the name used here.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01
)

def compute_metrics(eval_pred):
    """Turn per-class scores into class ids with argmax and report accuracy."""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": accuracy_score(labels, predictions)}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Step 5: Train the model
trainer.train()

# Step 6: Evaluate the model
results = trainer.evaluate()
print("Evaluation Results:", results)

Salida Esperada:

Evaluation Results: {'eval_loss': 0.36, 'eval_accuracy': 0.88}

Ejercicio 3: Bucle de Entrenamiento con PyTorch

Tarea: Implementar un bucle de entrenamiento en PyTorch para realizar el ajuste fino de un modelo BERT en clasificación de texto.

Instrucciones:

Cargar el conjunto de datos IMDB y preprocesarlo.
Convertir el conjunto de datos a tensores de PyTorch.
Escribir un bucle de entrenamiento para el ajuste fino del modelo.
Evaluar la exactitud (accuracy) del modelo.

Solución:

import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from torch.optim import AdamW

# Step 1: Load and preprocess the dataset
dataset = load_dataset("imdb")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_function(examples):
    """Tokenize a batch of reviews, truncating/padding each one to 256 tokens."""
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)

def prepare_split(split):
    """Tokenize a split and expose it as PyTorch tensors ready for `model(**batch)`."""
    split = split.map(preprocess_function, batched=True)
    # The model's forward() takes `labels`, not `label`; without this rename
    # `model(**batch)` fails with an unexpected-keyword error.
    split = split.rename_column("label", "labels")
    split.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    return split

# Select the small subsets first so only the rows actually used get tokenized.
train_subset = dataset["train"].shuffle(seed=42).select(range(2000))
test_subset = dataset["test"].shuffle(seed=42).select(range(500))

train_dataloader = DataLoader(prepare_split(train_subset), batch_size=8)
test_dataloader = DataLoader(prepare_split(test_subset), batch_size=8)

# Step 2: Load the model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Step 3: Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

epochs = 2
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        optimizer.zero_grad()
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)  # loss is computed because `labels` is in the batch
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean loss per batch, rounded to match the expected output shown below.
    print(f"Epoch {epoch+1} Loss: {total_loss/len(train_dataloader):.2f}")

# Step 4: Evaluation
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        predictions = torch.argmax(outputs.logits, dim=-1)
        correct += (predictions == batch["labels"]).sum().item()
        total += batch["labels"].size(0)

print(f"Accuracy: {correct / total:.2f}")

Salida Esperada:

Epoch 1 Loss: 0.45
Epoch 2 Loss: 0.36
Accuracy: 0.88

2.4 Ejercicios Prácticos

Ejercicio 1: Usando el Pipeline de Hugging Face

Tarea: Utiliza el pipeline de Hugging Face para realizar reconocimiento de entidades nombradas (NER) en un texto dado.

Instrucciones:

Importa el pipeline de Hugging Face.
Carga el pipeline NER.
Procesa un texto de ejemplo para identificar entidades nombradas.

Solución:

from transformers import pipeline

# Step 1: Load the NER pipeline.
# aggregation_strategy="simple" merges sub-word tokens into whole entities and
# replaces the deprecated grouped_entities=True flag (same grouping behavior).
ner_pipeline = pipeline("ner", aggregation_strategy="simple")

# Step 2: Define the input text
text = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, across the Manhattan Bridge."

# Step 3: Perform named entity recognition
entities = ner_pipeline(text)

# Step 4: Print the results (one line per grouped entity)
print("Named Entities:")
for entity in entities:
    print(f"Entity: {entity['word']}, Type: {entity['entity_group']}, Score: {entity['score']:.2f}")

Salida Esperada:

Named Entities:
Entity: Hugging Face Inc., Type: ORG, Score: 0.99
Entity: New York City, Type: LOC, Score: 0.99
Entity: DUMBO, Type: LOC, Score: 0.97
Entity: Manhattan Bridge, Type: LOC, Score: 0.96

Ejercicio 2: Ajuste Fino de un Modelo Transformer

Tarea: Realizar el ajuste fino de un modelo BERT para clasificación de texto utilizando la API Trainer de Hugging Face y el conjunto de datos IMDB.

Instrucciones:

Cargar el conjunto de datos IMDB y preprocesarlo.
Tokenizar el texto usando el tokenizador BERT.
Realizar el ajuste fino del modelo en un subconjunto pequeño del conjunto de datos.
Evaluar el modelo.

Solución:

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score

# Step 1: Load the dataset and the tokenizer
dataset = load_dataset("imdb")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_function(examples):
    """Tokenize a batch of reviews, truncating/padding each one to 256 tokens."""
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)

# Step 2: Prepare the data for training.
# Select the small subsets first and tokenize only those rows; the same rows are
# chosen as before (same seed), but the rest of IMDB is no longer tokenized for nothing.
train_dataset = dataset["train"].shuffle(seed=42).select(range(2000)).map(preprocess_function, batched=True)
test_dataset = dataset["test"].shuffle(seed=42).select(range(500)).map(preprocess_function, batched=True)

# Step 3: Load the model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Step 4: Define the training arguments and trainer
# NOTE: recent transformers releases rename `evaluation_strategy` to `eval_strategy`;
# switch the keyword if your installed version rejects the name used here.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01
)

def compute_metrics(eval_pred):
    """Turn per-class scores into class ids with argmax and report accuracy."""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": accuracy_score(labels, predictions)}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Step 5: Train the model
trainer.train()

# Step 6: Evaluate the model
results = trainer.evaluate()
print("Evaluation Results:", results)

Salida Esperada:

Evaluation Results: {'eval_loss': 0.36, 'eval_accuracy': 0.88}

Ejercicio 3: Bucle de Entrenamiento con PyTorch

Tarea: Implementar un bucle de entrenamiento en PyTorch para realizar el ajuste fino de un modelo BERT en clasificación de texto.

Instrucciones:

Cargar el conjunto de datos IMDB y preprocesarlo.
Convertir el conjunto de datos a tensores de PyTorch.
Escribir un bucle de entrenamiento para el ajuste fino del modelo.
Evaluar la exactitud (accuracy) del modelo.

Solución:

import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from torch.optim import AdamW

# Step 1: Load and preprocess the dataset
dataset = load_dataset("imdb")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_function(examples):
    """Tokenize a batch of reviews, truncating/padding each one to 256 tokens."""
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)

def prepare_split(split):
    """Tokenize a split and expose it as PyTorch tensors ready for `model(**batch)`."""
    split = split.map(preprocess_function, batched=True)
    # The model's forward() takes `labels`, not `label`; without this rename
    # `model(**batch)` fails with an unexpected-keyword error.
    split = split.rename_column("label", "labels")
    split.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    return split

# Select the small subsets first so only the rows actually used get tokenized.
train_subset = dataset["train"].shuffle(seed=42).select(range(2000))
test_subset = dataset["test"].shuffle(seed=42).select(range(500))

train_dataloader = DataLoader(prepare_split(train_subset), batch_size=8)
test_dataloader = DataLoader(prepare_split(test_subset), batch_size=8)

# Step 2: Load the model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Step 3: Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

epochs = 2
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        optimizer.zero_grad()
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)  # loss is computed because `labels` is in the batch
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean loss per batch, rounded to match the expected output shown below.
    print(f"Epoch {epoch+1} Loss: {total_loss/len(train_dataloader):.2f}")

# Step 4: Evaluation
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        predictions = torch.argmax(outputs.logits, dim=-1)
        correct += (predictions == batch["labels"]).sum().item()
        total += batch["labels"].size(0)

print(f"Accuracy: {correct / total:.2f}")

Salida Esperada:

Epoch 1 Loss: 0.45
Epoch 2 Loss: 0.36
Accuracy: 0.88

2.4 Ejercicios Prácticos

Ejercicio 1: Usando el Pipeline de Hugging Face

Tarea: Utiliza el pipeline de Hugging Face para realizar reconocimiento de entidades nombradas (NER) en un texto dado.

Instrucciones:

Importa el pipeline de Hugging Face.
Carga el pipeline NER.
Procesa un texto de ejemplo para identificar entidades nombradas.

Solución:

from transformers import pipeline

# Step 1: Load the NER pipeline.
# aggregation_strategy="simple" merges sub-word tokens into whole entities and
# replaces the deprecated grouped_entities=True flag (same grouping behavior).
ner_pipeline = pipeline("ner", aggregation_strategy="simple")

# Step 2: Define the input text
text = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, across the Manhattan Bridge."

# Step 3: Perform named entity recognition
entities = ner_pipeline(text)

# Step 4: Print the results (one line per grouped entity)
print("Named Entities:")
for entity in entities:
    print(f"Entity: {entity['word']}, Type: {entity['entity_group']}, Score: {entity['score']:.2f}")

Salida Esperada:

Named Entities:
Entity: Hugging Face Inc., Type: ORG, Score: 0.99
Entity: New York City, Type: LOC, Score: 0.99
Entity: DUMBO, Type: LOC, Score: 0.97
Entity: Manhattan Bridge, Type: LOC, Score: 0.96

Ejercicio 2: Ajuste Fino de un Modelo Transformer

Tarea: Realizar el ajuste fino de un modelo BERT para clasificación de texto utilizando la API Trainer de Hugging Face y el conjunto de datos IMDB.

Instrucciones:

Cargar el conjunto de datos IMDB y preprocesarlo.
Tokenizar el texto usando el tokenizador BERT.
Realizar el ajuste fino del modelo en un subconjunto pequeño del conjunto de datos.
Evaluar el modelo.

Solución:

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score

# Step 1: Load the dataset and the tokenizer
dataset = load_dataset("imdb")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_function(examples):
    """Tokenize a batch of reviews, truncating/padding each one to 256 tokens."""
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)

# Step 2: Prepare the data for training.
# Select the small subsets first and tokenize only those rows; the same rows are
# chosen as before (same seed), but the rest of IMDB is no longer tokenized for nothing.
train_dataset = dataset["train"].shuffle(seed=42).select(range(2000)).map(preprocess_function, batched=True)
test_dataset = dataset["test"].shuffle(seed=42).select(range(500)).map(preprocess_function, batched=True)

# Step 3: Load the model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Step 4: Define the training arguments and trainer
# NOTE: recent transformers releases rename `evaluation_strategy` to `eval_strategy`;
# switch the keyword if your installed version rejects the name used here.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01
)

def compute_metrics(eval_pred):
    """Turn per-class scores into class ids with argmax and report accuracy."""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": accuracy_score(labels, predictions)}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Step 5: Train the model
trainer.train()

# Step 6: Evaluate the model
results = trainer.evaluate()
print("Evaluation Results:", results)

Salida Esperada:

Evaluation Results: {'eval_loss': 0.36, 'eval_accuracy': 0.88}

Ejercicio 3: Bucle de Entrenamiento con PyTorch

Tarea: Implementar un bucle de entrenamiento en PyTorch para realizar el ajuste fino de un modelo BERT en clasificación de texto.

Instrucciones:

Cargar el conjunto de datos IMDB y preprocesarlo.
Convertir el conjunto de datos a tensores de PyTorch.
Escribir un bucle de entrenamiento para el ajuste fino del modelo.
Evaluar la exactitud (accuracy) del modelo.

Solución:

import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from torch.optim import AdamW

# Step 1: Load and preprocess the dataset
dataset = load_dataset("imdb")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_function(examples):
    """Tokenize a batch of reviews, truncating/padding each one to 256 tokens."""
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)

def prepare_split(split):
    """Tokenize a split and expose it as PyTorch tensors ready for `model(**batch)`."""
    split = split.map(preprocess_function, batched=True)
    # The model's forward() takes `labels`, not `label`; without this rename
    # `model(**batch)` fails with an unexpected-keyword error.
    split = split.rename_column("label", "labels")
    split.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    return split

# Select the small subsets first so only the rows actually used get tokenized.
train_subset = dataset["train"].shuffle(seed=42).select(range(2000))
test_subset = dataset["test"].shuffle(seed=42).select(range(500))

train_dataloader = DataLoader(prepare_split(train_subset), batch_size=8)
test_dataloader = DataLoader(prepare_split(test_subset), batch_size=8)

# Step 2: Load the model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Step 3: Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

epochs = 2
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        optimizer.zero_grad()
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)  # loss is computed because `labels` is in the batch
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean loss per batch, rounded to match the expected output shown below.
    print(f"Epoch {epoch+1} Loss: {total_loss/len(train_dataloader):.2f}")

# Step 4: Evaluation
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        predictions = torch.argmax(outputs.logits, dim=-1)
        correct += (predictions == batch["labels"]).sum().item()
        total += batch["labels"].size(0)

print(f"Accuracy: {correct / total:.2f}")

Salida Esperada:

Epoch 1 Loss: 0.45
Epoch 2 Loss: 0.36
Accuracy: 0.88

Purchase this book