Natural Language Processing with Python Updated Edition

Chapter 4: Language Modeling

Practical Exercises

Exercise 1: N-grams

Task: Generate trigrams (3-grams) from the following text: "Natural Language Processing with Python."

Solution:

from nltk import ngrams
import nltk
nltk.download('punkt')  # recent NLTK releases may also require nltk.download('punkt_tab')

# Sample text
text = "Natural Language Processing with Python"

# Tokenize the text into words
tokens = nltk.word_tokenize(text)

# Generate trigrams
trigrams = ngrams(tokens, 3)

print("Trigrams:")
for grams in trigrams:
    print(grams)

Output:

Trigrams:
('Natural', 'Language', 'Processing')
('Language', 'Processing', 'with')
('Processing', 'with', 'Python')
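
As a small supplement (not part of the original exercise), the same ngrams helper works for any order n, and collections.Counter can tally how often each n-gram occurs; the longer example sentence below is invented purely for illustration.

from collections import Counter
from nltk import ngrams
import nltk

# A made-up sentence with a repeated phrase so that the counts differ
text = "Natural Language Processing with Python makes Natural Language Processing fun"
tokens = nltk.word_tokenize(text)

# Count bigram frequencies and show the most common ones
bigram_counts = Counter(ngrams(tokens, 2))
for bigram, count in bigram_counts.most_common(3):
    print(bigram, count)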

Exercise 2: Bigram Language Model

Task: Train a bigram language model on the following text corpus and calculate the probability of the bigram ("Language", "Processing"):

corpus = [
    "Natural Language Processing is fascinating.",
    "Language models are important in NLP.",
    "Machine learning and NLP are closely related."
]

Solution:

from collections import defaultdict
from nltk import ngrams
import nltk
nltk.download('punkt')

# Sample text corpus
corpus = [
    "Natural Language Processing is fascinating.",
    "Language models are important in NLP.",
    "Machine learning and NLP are closely related."
]

# Tokenize the text into words
tokenized_corpus = [nltk.word_tokenize(sentence) for sentence in corpus]

# Function to calculate bigram probabilities
def train_bigram_model(tokenized_corpus):
    model = defaultdict(lambda: defaultdict(lambda: 0))

    # Count bigrams
    for sentence in tokenized_corpus:
        for w1, w2 in ngrams(sentence, 2):
            model[w1][w2] += 1

    # Calculate probabilities
    for w1 in model:
        total_count = float(sum(model[w1].values()))
        for w2 in model[w1]:
            model[w1][w2] /= total_count

    return model

# Train the bigram model
bigram_model = train_bigram_model(tokenized_corpus)

# Function to get the probability of a bigram
def get_bigram_probability(bigram_model, w1, w2):
    return bigram_model[w1][w2]

print("Bigram Probability (Processing | Language):")
print(get_bigram_probability(bigram_model, 'Language', 'Processing'))

Output:

Bigram Probability (Processing | Language):
0.5
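
As a supplementary sketch, the trained bigram model can also score a whole sentence by multiplying its bigram probabilities (the chain rule under a bigram assumption). This reuses bigram_model from the solution above; sentence_probability is just an illustrative helper name, and unseen bigrams get probability 0 here, whereas a real model would apply smoothing.

def sentence_probability(bigram_model, sentence):
    # Product of P(w2 | w1) over consecutive word pairs in the sentence
    tokens = nltk.word_tokenize(sentence)
    prob = 1.0
    for w1, w2 in ngrams(tokens, 2):
        prob *= bigram_model[w1][w2]
    return prob

print(sentence_probability(bigram_model, "Natural Language Processing is fascinating."))  # 0.5 for the corpus above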

Exercise 3: HMM for Part-of-Speech Tagging

Task: Implement an HMM for part-of-speech tagging using the following sentences and tags:

sentences = [
    ["I", "run", "to", "the", "store"],
    ["She", "jumps", "over", "the", "fence"]
]

tags = [
    ["PRON", "VERB", "ADP", "DET", "NOUN"],
    ["PRON", "VERB", "ADP", "DET", "NOUN"]
]

Solution:

import numpy as np
from hmmlearn import hmm

# Define the states and observations
states = ["PRON", "VERB", "ADP", "DET", "NOUN"]
n_states = len(states)

observations = ["I", "run", "to", "the", "store", "She", "jumps", "over", "fence"]
n_observations = len(observations)

# Encode the states and observations
state_to_idx = {state: idx for idx, state in enumerate(states)}
observation_to_idx = {obs: idx for idx, obs in enumerate(observations)}

# Create the sequences for training
X = [[observation_to_idx[word] for word in sentence] for sentence in sentences]
y = [[state_to_idx[tag] for tag in tag_sequence] for tag_sequence in tags]

# Convert to numpy arrays
X = np.concatenate([np.array(x).reshape(-1, 1) for x in X])
lengths = [len(x) for x in sentences]
y = np.concatenate(y)

# Create the HMM model and fit it. Note that fit() is unsupervised: the tag
# sequences in `y` are not used, so the learned state indices are arbitrary and
# may not line up with the intended tags on every run. In hmmlearn >= 0.2.8 the
# behaviour of the old MultinomialHMM lives in hmm.CategoricalHMM.
model = hmm.MultinomialHMM(n_components=n_states, n_iter=100)
model.fit(X, lengths)

# Predict the hidden states (decoding problem)
logprob, hidden_states = model.decode(X, algorithm="viterbi")

# Map the states back to their original labels
hidden_states = [states[state] for state in hidden_states]

print("Observations:", sentences[0] + sentences[1])
print("Predicted states:", hidden_states)

Output:

Observations: ['I', 'run', 'to', 'the', 'store', 'She', 'jumps', 'over', 'the', 'fence']
Predicted states: ['PRON', 'VERB', 'ADP', 'DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'DET', 'NOUN']
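
Because the tag sequences are given, a useful supplement (not part of the original solution) is to estimate the HMM parameters by simple counting instead of unsupervised fitting; the sketch below reuses sentences, tags, and the index mappings defined above. In recent hmmlearn versions these matrices could then be assigned to a hmm.CategoricalHMM through its startprob_, transmat_, and emissionprob_ attributes and decoded with Viterbi, which avoids the arbitrary state labels produced by unsupervised training.

import numpy as np

# Count start, transition, and emission events from the labelled data
transitions = np.zeros((n_states, n_states))
emissions = np.zeros((n_states, n_observations))
starts = np.zeros(n_states)

for sentence, tag_seq in zip(sentences, tags):
    starts[state_to_idx[tag_seq[0]]] += 1
    for word, tag in zip(sentence, tag_seq):
        emissions[state_to_idx[tag], observation_to_idx[word]] += 1
    for t1, t2 in zip(tag_seq, tag_seq[1:]):
        transitions[state_to_idx[t1], state_to_idx[t2]] += 1

def normalize(counts):
    # Turn each row of counts into a probability distribution
    # (rows with no counts become uniform because of the small constant)
    counts = counts + 1e-12
    return counts / counts.sum(axis=-1, keepdims=True)

print("Start probabilities:", normalize(starts))
print("Transition matrix:\n", normalize(transitions))
print("Emission matrix:\n", normalize(emissions))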

Exercise 4: Simple RNN for Text Generation

Task: Implement a simple RNN for text generation using the following text:

text = "hello world"

Solution:

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN
from tensorflow.keras.utils import to_categorical

# Sample text corpus
text = "hello world"

# Create a character-level vocabulary
chars = sorted(set(text))
char_to_idx = {char: idx for idx, char in enumerate(chars)}
idx_to_char = {idx: char for char, idx in char_to_idx.items()}

# Create input-output pairs for training
sequence_length = 3
X = []
y = []
for i in range(len(text) - sequence_length):
    X.append([char_to_idx[char] for char in text[i:i + sequence_length]])
    y.append(char_to_idx[text[i + sequence_length]])

X = np.array(X)
y = to_categorical(y, num_classes=len(chars))

# Reshape input to be compatible with RNN input
X = X.reshape((X.shape[0], X.shape[1], 1))

# Define the RNN model
model = Sequential()
model.add(SimpleRNN(50, input_shape=(sequence_length, 1)))
model.add(Dense(len(chars), activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy')

# Train the model
model.fit(X, y, epochs=200, verbose=1)

# Function to generate text using the trained model
def generate_text(model, start_string, num_generate):
    input_eval = [char_to_idx[s] for s in start_string]
    input_eval = np.array(input_eval).reshape((1, len(input_eval), 1))

    text_generated = []

    for i in range(num_generate):
        predictions = model.predict(input_eval)
        predicted_id = np.argmax(predictions[-1])

        # Slide the window forward; the triple brackets keep the (1, steps, 1)
        # shape that np.append with axis=1 expects
        input_eval = np.append(input_eval[:, 1:], [[[predicted_id]]], axis=1)
        text_generated.append(idx_to_char[predicted_id])

    return start_string + ''.join(text_generated)

# Generate new text
start_string = "hel"
generated_text = generate_text(model, start_string, 5)
print("Generated text:")
print(generated_text)

Output:

Generated text:
hello wo
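
The exact characters produced depend on the random weight initialization, so results can differ between runs. As a supplementary sketch, the greedy argmax above can be replaced by temperature sampling, which draws the next character from the predicted distribution and makes generation less repetitive; this assumes the trained model, char_to_idx, and idx_to_char from the solution above, and sample_next is just an illustrative helper name.

def sample_next(model, seed, temperature=1.0):
    # Predict a distribution over characters, rescale it by the temperature,
    # and draw one character index at random from the result
    x = np.array([char_to_idx[c] for c in seed]).reshape((1, len(seed), 1))
    probs = model.predict(x, verbose=0)[0]
    logits = np.log(probs + 1e-9) / temperature
    probs = np.exp(logits) / np.sum(np.exp(logits))
    return idx_to_char[np.random.choice(len(probs), p=probs)]

print(sample_next(model, "hel", temperature=0.5))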

Exercise 5: LSTM for Text Generation

Task: Implement an LSTM for text generation using the following text:

text = "hello world"

Solution:

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.utils import to_categorical

# Sample text corpus
text = "hello world"

# Create a character-level vocabulary
chars = sorted(set(text))
char_to_idx = {char: idx for idx, char in enumerate(chars)}
idx_to_char = {idx: char for char, idx in char_to_idx.items()}

# Create input-output pairs for training
sequence_length = 3
X = []
y = []
for i in range(len(text) - sequence_length):
    X.append([char_to_idx[char] for char in text[i:i + sequence_length]])
    y.append(char_to_idx[text[i + sequence_length]])

X = np.array(X)
y = to_categorical(y, num_classes=len(chars))

# Reshape input to be compatible with LSTM input
X = X.reshape((X.shape[0], X.shape[1], 1))

# Define the LSTM model
model = Sequential()
model.add(LSTM(50, input_shape=(sequence_length, 1)))
model.add(Dense(len(chars), activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy')

# Train the model
model.fit(X, y, epochs=200, verbose=1)

# Function to generate text using the trained model
def generate_text(model, start_string, num_generate):
    input_eval = [char_to_idx[s] for s in start_string]
    input_eval = np.array(input_eval).reshape((1, len(input_eval), 1))

    text_generated = []

    for i in range(num_generate):
        predictions = model.predict(input_eval)
        predicted_id = np.argmax(predictions[-1])

        # Slide the window forward; the triple brackets keep the (1, steps, 1)
        # shape that np.append with axis=1 expects
        input_eval = np.append(input_eval[:, 1:], [[[predicted_id]]], axis=1)
        text_generated.append(idx_to_char[predicted_id])

    return start_string + ''.join(text_generated)

# Generate new text
start_string = "hel"
generated_text = generate_text(model, start_string, 5)
print("Generated text:")
print(generated_text)

Output:

Generated text:
hello wo
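
As with the RNN, the generated characters vary between runs. A more idiomatic variant, sketched below under the assumption that chars, sequence_length, X, and y from the solution above are in scope, feeds integer character indices through an Embedding layer instead of raw values reshaped to (steps, 1); embed_model is an illustrative name, not part of the original exercise.

from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.models import Sequential

# Same character-level task, but the network learns an 8-dimensional
# embedding for each character instead of consuming raw integer values
embed_model = Sequential([
    Embedding(input_dim=len(chars), output_dim=8),
    LSTM(50),
    Dense(len(chars), activation='softmax'),
])
embed_model.compile(optimizer='adam', loss='categorical_crossentropy')

# The Embedding layer expects 2-D integer input (samples, sequence_length),
# so undo the earlier reshape to (samples, sequence_length, 1)
embed_model.fit(X.reshape(X.shape[0], sequence_length), y, epochs=200, verbose=0)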

These exercises provide hands-on experience with N-grams, Hidden Markov Models, Recurrent Neural Networks, and Long Short-Term Memory Networks, reinforcing the concepts covered in Chapter 4.
