Chapter 4: Language Modeling
Practical Exercises
Exercise 1: N-grams
Task: Generate trigrams (3-grams) from the following text: "Natural Language Processing with Python."
Solution:
from nltk import ngrams
import nltk
nltk.download('punkt')
# Sample text
text = "Natural Language Processing with Python"
# Tokenize the text into words
tokens = nltk.word_tokenize(text)
# Generate trigrams
trigrams = ngrams(tokens, 3)
print("Trigrams:")
for grams in trigrams:
    print(grams)
Output:
Trigrams:
('Natural', 'Language', 'Processing')
('Language', 'Processing', 'with')
('Processing', 'with', 'Python')
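For sentence-level language models it is often useful to pad the token sequence with boundary symbols so that the first and last words also appear in complete trigrams. The variation below is a minimal sketch using the padding options of NLTK's ngrams; the <s> and </s> symbols are illustrative choices, not anything NLTK requires.
from nltk import ngrams
import nltk
tokens = nltk.word_tokenize("Natural Language Processing with Python")
# Pad both ends so edge words appear in full trigrams
padded_trigrams = ngrams(tokens, 3,
                         pad_left=True, left_pad_symbol="<s>",
                         pad_right=True, right_pad_symbol="</s>")
for grams in padded_trigrams:
    print(grams)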
Exercise 2: Bigram Language Model
Task: Train a bigram language model on the following text corpus and calculate the probability of the bigram ("Language", "Processing"):
corpus = [
"Natural Language Processing is fascinating.",
"Language models are important in NLP.",
"Machine learning and NLP are closely related."
]
Solution:
from collections import defaultdict
import numpy as np
from nltk import ngrams
import nltk
nltk.download('punkt')
# Sample text corpus
corpus = [
"Natural Language Processing is fascinating.",
"Language models are important in NLP.",
"Machine learning and NLP are closely related."
]
# Tokenize the text into words
tokenized_corpus = [nltk.word_tokenize(sentence) for sentence in corpus]
# Function to calculate bigram probabilities
def train_bigram_model(tokenized_corpus):
    model = defaultdict(lambda: defaultdict(lambda: 0))
    # Count bigrams
    for sentence in tokenized_corpus:
        for w1, w2 in ngrams(sentence, 2):
            model[w1][w2] += 1
    # Calculate probabilities
    for w1 in model:
        total_count = float(sum(model[w1].values()))
        for w2 in model[w1]:
            model[w1][w2] /= total_count
    return model
# Train the bigram model
bigram_model = train_bigram_model(tokenized_corpus)
# Function to get the probability of a bigram
def get_bigram_probability(bigram_model, w1, w2):
    return bigram_model[w1][w2]
print("Bigram Probability (Processing | Language):")
print(get_bigram_probability(bigram_model, 'Language', 'Processing'))
Output:
Bigram Probability (Processing | Language):
0.5
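The value 0.5 follows directly from the counts: "Language" begins a bigram twice in this corpus (followed once by "Processing" and once by "models"), so P(Processing | Language) = 1/2. A quick check, assuming tokenized_corpus and the ngrams import from the solution above are still in scope:
from collections import Counter
# Count the words that follow "Language" anywhere in the corpus
followers = Counter(w2 for sentence in tokenized_corpus
                    for w1, w2 in ngrams(sentence, 2) if w1 == "Language")
print(followers)                                           # expected: Counter({'Processing': 1, 'models': 1})
print(followers["Processing"] / sum(followers.values()))   # expected: 0.5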
Exercise 3: HMM for Part-of-Speech Tagging
Task: Implement an HMM for part-of-speech tagging using the following sentences and tags:
sentences = [
["I", "run", "to", "the", "store"],
["She", "jumps", "over", "the", "fence"]
]
tags = [
["PRON", "VERB", "ADP", "DET", "NOUN"],
["PRON", "VERB", "ADP", "DET", "NOUN"]
]
Solution:
import numpy as np
from hmmlearn import hmm
# Define the states and observations
states = ["PRON", "VERB", "ADP", "DET", "NOUN"]
n_states = len(states)
observations = ["I", "run", "to", "the", "store", "She", "jumps", "over", "fence"]
n_observations = len(observations)
# Encode the states and observations
state_to_idx = {state: idx for idx, state in enumerate(states)}
observation_to_idx = {obs: idx for idx, obs in enumerate(observations)}
# Create the sequences for training
X = [[observation_to_idx[word] for word in sentence] for sentence in sentences]
y = [[state_to_idx[tag] for tag in tag_sequence] for tag_sequence in tags]
# Convert to numpy arrays
X = np.concatenate([np.array(x).reshape(-1, 1) for x in X])
lengths = [len(x) for x in sentences]
y = np.concatenate(y)
# Create the HMM model
# Note: fit() trains the HMM by unsupervised EM and never uses the tag labels in y,
# so the learned state indices do not necessarily correspond to the tag names in `states`.
# On hmmlearn 0.3+, integer-coded symbol sequences are handled by hmm.CategoricalHMM instead.
model = hmm.MultinomialHMM(n_components=n_states, n_iter=100)
model.fit(X, lengths)
# Predict the hidden states (decoding problem)
logprob, hidden_states = model.decode(X, algorithm="viterbi")
# Map the states back to their original labels
hidden_states = [states[state] for state in hidden_states]
print("Observations:", sentences[0] + sentences[1])
print("Predicted states:", hidden_states)
Output:
Observations: ['I', 'run', 'to', 'the', 'store', 'She', 'jumps', 'over', 'the', 'fence']
Predicted states: ['PRON', 'VERB', 'ADP', 'DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'DET', 'NOUN']
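Because model.fit() above estimates the HMM parameters by unsupervised EM, it never looks at the tag labels, and the decoded state indices are not guaranteed to line up with the tag names. A supervised alternative is to estimate the start, transition, and emission probabilities directly from the tagged sentences by counting, and then decode with those fixed parameters. The sketch below is one way to do this, assuming hmmlearn 0.3+ where hmm.CategoricalHMM handles integer-coded symbols; the add-one smoothing and variable names are illustrative choices.
import numpy as np
from hmmlearn import hmm
sentences = [["I", "run", "to", "the", "store"],
             ["She", "jumps", "over", "the", "fence"]]
tags = [["PRON", "VERB", "ADP", "DET", "NOUN"],
        ["PRON", "VERB", "ADP", "DET", "NOUN"]]
states = ["PRON", "VERB", "ADP", "DET", "NOUN"]
vocab = sorted({w for s in sentences for w in s})
s_idx = {s: i for i, s in enumerate(states)}
w_idx = {w: i for i, w in enumerate(vocab)}
# Count-based estimates with add-one smoothing so every row is a valid distribution
start = np.ones(len(states))
trans = np.ones((len(states), len(states)))
emit = np.ones((len(states), len(vocab)))
for sent, tag_seq in zip(sentences, tags):
    start[s_idx[tag_seq[0]]] += 1
    for t1, t2 in zip(tag_seq, tag_seq[1:]):
        trans[s_idx[t1], s_idx[t2]] += 1
    for w, t in zip(sent, tag_seq):
        emit[s_idx[t], w_idx[w]] += 1
# Plug the estimated parameters into the HMM instead of calling fit()
model = hmm.CategoricalHMM(n_components=len(states))
model.startprob_ = start / start.sum()
model.transmat_ = trans / trans.sum(axis=1, keepdims=True)
model.emissionprob_ = emit / emit.sum(axis=1, keepdims=True)
# Decode both sentences with Viterbi using the supervised estimates
X = np.concatenate([[w_idx[w] for w in s] for s in sentences]).reshape(-1, 1)
lengths = [len(s) for s in sentences]
_, decoded = model.decode(X, lengths, algorithm="viterbi")
print([states[i] for i in decoded])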
Exercise 4: Simple RNN for Text Generation
Task: Implement a simple RNN for text generation using the following text:
text = "hello world"
Solution:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN
from tensorflow.keras.utils import to_categorical
# Sample text corpus
text = "hello world"
# Create a character-level vocabulary
chars = sorted(set(text))
char_to_idx = {char: idx for idx, char in enumerate(chars)}
idx_to_char = {idx: char for char, idx in char_to_idx.items()}
# Create input-output pairs for training
sequence_length = 3
X = []
y = []
for i in range(len(text) - sequence_length):
    X.append([char_to_idx[char] for char in text[i:i + sequence_length]])
    y.append(char_to_idx[text[i + sequence_length]])
X = np.array(X)
y = to_categorical(y, num_classes=len(chars))
# Reshape input to be compatible with RNN input
X = X.reshape((X.shape[0], X.shape[1], 1))
# Define the RNN model
model = Sequential()
model.add(SimpleRNN(50, input_shape=(sequence_length, 1)))
model.add(Dense(len(chars), activation='softmax'))
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy')
# Train the model
model.fit(X, y, epochs=200, verbose=1)
# Function to generate text using the trained model
def generate_text(model, start_string, num_generate):
    input_eval = [char_to_idx[s] for s in start_string]
    input_eval = np.array(input_eval).reshape((1, len(input_eval), 1))
    text_generated = []
    for i in range(num_generate):
        predictions = model.predict(input_eval, verbose=0)
        predicted_id = np.argmax(predictions[0])
        # Slide the window: drop the oldest step and append the new id with shape (1, 1, 1)
        input_eval = np.append(input_eval[:, 1:], [[[predicted_id]]], axis=1)
        text_generated.append(idx_to_char[predicted_id])
    return start_string + ''.join(text_generated)
# Generate new text
start_string = "hel"
generated_text = generate_text(model, start_string, 5)
print("Generated text:")
print(generated_text)
Output:
Generated text:
hello wo
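Greedy argmax decoding always emits the single most likely next character, which can make the generated text repetitive. A common alternative is to sample the next character from the predicted distribution with a temperature parameter. The helper below is a minimal sketch that assumes the trained model, char_to_idx, and idx_to_char from the solution above are in scope; the temperature value is an arbitrary choice.
import numpy as np
def sample_next_char(model, seed, temperature=0.5):
    # Predict a probability distribution over the next character
    input_eval = np.array([char_to_idx[c] for c in seed]).reshape((1, len(seed), 1))
    probs = model.predict(input_eval, verbose=0)[0]
    # Rescale by temperature: lower values are greedier, higher values more random
    logits = np.log(probs + 1e-8) / temperature
    probs = np.exp(logits) / np.sum(np.exp(logits))
    return idx_to_char[np.random.choice(len(probs), p=probs)]
print(sample_next_char(model, "hel"))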
Exercise 5: LSTM for Text Generation
Task: Implement an LSTM for text generation using the following text:
text = "hello world"
Solution:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.utils import to_categorical
# Sample text corpus
text = "hello world"
# Create a character-level vocabulary
chars = sorted(set(text))
char_to_idx = {char: idx for idx, char in enumerate(chars)}
idx_to_char = {idx: char for char, idx in char_to_idx.items()}
# Create input-output pairs for training
sequence_length = 3
X = []
y = []
for i in range(len(text) - sequence_length):
    X.append([char_to_idx[char] for char in text[i:i + sequence_length]])
    y.append(char_to_idx[text[i + sequence_length]])
X = np.array(X)
y = to_categorical(y, num_classes=len(chars))
# Reshape input to be compatible with LSTM input
X = X.reshape((X.shape[0], X.shape[1], 1))
# Define the LSTM model
model = Sequential()
model.add(LSTM(50, input_shape=(sequence_length, 1)))
model.add(Dense(len(chars), activation='softmax'))
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy')
# Train the model
model.fit(X, y, epochs=200, verbose=1)
# Function to generate text using the trained model
def generate_text(model, start_string, num_generate):
    input_eval = [char_to_idx[s] for s in start_string]
    input_eval = np.array(input_eval).reshape((1, len(input_eval), 1))
    text_generated = []
    for i in range(num_generate):
        predictions = model.predict(input_eval, verbose=0)
        predicted_id = np.argmax(predictions[0])
        # Slide the window: drop the oldest step and append the new id with shape (1, 1, 1)
        input_eval = np.append(input_eval[:, 1:], [[[predicted_id]]], axis=1)
        text_generated.append(idx_to_char[predicted_id])
    return start_string + ''.join(text_generated)
# Generate new text
start_string = "hel"
generated_text = generate_text(model, start_string, 5)
print("Generated text:")
print(generated_text)
Output:
Generated text:
hello wo
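Feeding raw integer character IDs into the network as a single numeric feature works for this toy example, but a more idiomatic setup lets the model learn a small embedding vector per character. The sketch below is one possible variant, assuming text, chars, char_to_idx, and sequence_length from the solution above are in scope; the embedding size of 8 is an arbitrary choice.
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
# Rebuild integer-coded inputs and targets (no one-hot labels, no trailing feature axis)
X_ids = np.array([[char_to_idx[c] for c in text[i:i + sequence_length]]
                  for i in range(len(text) - sequence_length)])
y_ids = np.array([char_to_idx[text[i + sequence_length]]
                  for i in range(len(text) - sequence_length)])
embed_model = Sequential([
    Embedding(input_dim=len(chars), output_dim=8),  # learn an 8-dimensional vector per character
    LSTM(50),
    Dense(len(chars), activation='softmax')
])
# Integer targets pair with sparse categorical cross-entropy
embed_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
embed_model.fit(X_ids, y_ids, epochs=200, verbose=0)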
These exercises provide hands-on experience with N-grams, Hidden Markov Models, Recurrent Neural Networks, and Long Short-Term Memory Networks, reinforcing the concepts covered in Chapter 4.