Chapter 3: Data Preprocessing and Feature Engineering
Practical Exercises Chapter 3
Exercise 1: Handling Missing Data
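Task: You have the following dataset (the Name/Age/Salary table that the solution code below constructs, with one missing Age and one missing Salary):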
Your task is to:
- Detect missing data.
- Impute the missing values in the "Age" and "Salary" columns using the mean of the respective columns.
Solution:
import pandas as pd
# Create the DataFrame; None marks a missing entry
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, None, 35, 40],
    'Salary': [50000, 60000, None, 80000],
}
df = pd.DataFrame(data)

# Detect missing data: count the missing entries in each column
print("Missing data:\n", df.isnull().sum())

# Impute missing values with the mean of each column
# (mean() ignores missing entries by default, so it averages the known values only)
for column in ['Age', 'Salary']:
    df[column] = df[column].fillna(df[column].mean())

print("\nDataFrame after imputation:\n", df)
Exercise 2: Encoding Categorical Variables
Task: You have the following dataset:
Apply one-hot encoding to the "City" column.
Solution:
# Sample DataFrame: one categorical column ("City") next to a numeric one
data = dict(
    City=['New York', 'London', 'Paris', 'London'],
    Temperature=[30, 25, 28, 26],
)
df = pd.DataFrame(data)

# One-hot encode the "City" column: it is replaced by one indicator
# column per distinct city, while "Temperature" is left untouched
columns_to_encode = ['City']
df_encoded = pd.get_dummies(df, columns=columns_to_encode)
print(df_encoded)
Exercise 3: Feature Engineering - Interaction Terms
Task: You are given a dataset with two features: "Age" and "Salary". Create an interaction term between these two features.
Solution:
from sklearn.preprocessing import PolynomialFeatures
import pandas as pd
# Sample DataFrame with the two features to combine
data = {
    'Age': [25, 30, 35, 40],
    'Salary': [50000, 60000, 70000, 80000],
}
df = pd.DataFrame(data)

# Initialize PolynomialFeatures with interaction only:
# the original features are kept and only their product is added
# (no squared terms); include_bias=False leaves out the constant column
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

# Create interaction terms
interaction_features = poly.fit_transform(df)

# Convert back to DataFrame, labelling the columns by hand
output_columns = ['Age', 'Salary', 'Age*Salary']
df_interaction = pd.DataFrame(interaction_features, columns=output_columns)
print(df_interaction)
Exercise 4: Data Scaling
Task: You are given a dataset with the following features: "Age" and "Income". Apply Min-Max Scaling to both features to scale them between 0 and 1.
Solution:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
# Sample DataFrame with two features on very different scales
data = {
    'Age': [25, 30, 35, 40],
    'Income': [50000, 60000, 70000, 80000],
}
df = pd.DataFrame(data)

# Initialize MinMaxScaler with its default settings
scaler = MinMaxScaler()

# Fit and transform the data in one step, then wrap the scaled
# values in a DataFrame again so the column labels are kept
scaled_values = scaler.fit_transform(df)
df_scaled = pd.DataFrame(scaled_values, columns=['Age', 'Income'])
print(df_scaled)
Exercise 5: Train-Test Split
Task: Given the following dataset:
Split the data into 80% training data and 20% test data.
Solution:
from sklearn.model_selection import train_test_split
import pandas as pd
# Sample DataFrame
data = {
    'Age': [25, 30, 35, 40, 45],
    'Salary': [50000, 60000, 70000, 80000, 90000],
    'Purchased': [0, 1, 0, 1, 1],
}
df = pd.DataFrame(data)

# Features (X) and target (y)
X = df[['Age', 'Salary']]
y = df['Purchased']

# Split into training and test sets (80% train, 20% test);
# a fixed random_state makes the split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training Features:\n", X_train)
print("Test Features:\n", X_test)
Exercise 6: Cross-Validation
Task: Use 5-fold cross-validation to evaluate the performance of a logistic regression model on the following dataset:
Solution:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
import pandas as pd
# KFold is needed for the explicit fold definition below
from sklearn.model_selection import KFold

# Sample DataFrame
data = {
    'Age': [25, 30, 35, 40, 45],
    'Salary': [50000, 60000, 70000, 80000, 90000],
    'Purchased': [0, 1, 0, 1, 1],
}
df = pd.DataFrame(data)

# Features (X) and target (y)
X = df[['Age', 'Salary']]
y = df['Purchased']

# Initialize the model
model = LogisticRegression()

# Perform 5-fold cross-validation.
# Passing cv=5 with a classifier selects stratified folds, which fails with
# "n_splits=5 cannot be greater than the number of members in each class"
# because the classes here have only 2 and 3 members. Plain KFold splits the
# 5 rows into five folds of one row each, and every training part still
# contains both classes.
scores = cross_val_score(model, X, y, cv=KFold(n_splits=5))
print("Cross-Validation Scores:", scores)
print("Average Cross-Validation Accuracy:", scores.mean())
Exercise 7: Data Augmentation for Images
Task: Apply image augmentation techniques such as rotation, zooming, and flipping to an image using Keras’ ImageDataGenerator.
Solution:
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing import image
import matplotlib.pyplot as plt
# Initialize the ImageDataGenerator with the random transformations to apply
datagen = ImageDataGenerator(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)

# Load an example image (replace the placeholder path with a real file)
img_path = 'path_to_image.jpg'
img = image.load_img(img_path, target_size=(150, 150))
x = image.img_to_array(img)
x = x.reshape((1,) + x.shape)  # prepend an axis of size 1 so the array is a batch of one image

# Generate augmented images.
# The loop stops itself with an explicit break once enough images were drawn.
i = 0
for batch in datagen.flow(x, batch_size=1):
    plt.figure(i)
    plt.imshow(image.array_to_img(batch[0]))
    i += 1
    if i % 4 == 0:  # Display 4 augmented images
        break
plt.show()
Exercise 8: Data Augmentation for Text
Task: Use synonym replacement to augment the following sentence:
"The quick brown fox jumps over the lazy dog."
Solution:
import random
from nltk.corpus import wordnet
# Function to get synonyms of a word
def get_synonyms(word):
    """Return the distinct WordNet synonyms of ``word``.

    The lemma names of every synset found for ``word`` are collected in the
    order WordNet yields them. Underscores (used by WordNet to join
    multi-word lemmas) are replaced by spaces, repeated names are dropped,
    and the word itself is left out so that a replacement always changes
    the text. Comparison is case-insensitive.

    Returns an empty list when WordNet offers no other word.
    """
    synonyms = []
    seen = {word.lower()}  # pre-seeded so the input word is never returned
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            name = lemma.name().replace('_', ' ')
            key = name.lower()
            if key not in seen:
                seen.add(key)
                synonyms.append(name)
    return synonyms
# Sample sentence
sentence = "The quick brown fox jumps over the lazy dog"
words = sentence.split()

# Randomly replace some words with their synonyms
augmented_words = []
for word in words:
    synonyms = get_synonyms(word)
    # Replace with a synonym 50% of the time; words without synonyms are kept
    if synonyms and random.random() > 0.5:
        augmented_words.append(random.choice(synonyms))
    else:
        augmented_words.append(word)

augmented_sentence = ' '.join(augmented_words)
print("Original sentence:", sentence)
print("Augmented sentence:", augmented_sentence)
These hands-on exercises provide a solid foundation for building and improving machine learning models.
Practical Exercises Chapter 3
Exercise 1: Handling Missing Data
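Task: You have the following dataset (the Name/Age/Salary table that the solution code below constructs, with one missing Age and one missing Salary):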
Your task is to:
- Detect missing data.
- Impute the missing values in the "Age" and "Salary" columns using the mean of the respective columns.
Solution:
import pandas as pd
# Create the DataFrame; None marks a missing entry
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, None, 35, 40],
    'Salary': [50000, 60000, None, 80000],
}
df = pd.DataFrame(data)

# Detect missing data: count the missing entries in each column
print("Missing data:\n", df.isnull().sum())

# Impute missing values with the mean of each column
# (mean() ignores missing entries by default, so it averages the known values only)
for column in ['Age', 'Salary']:
    df[column] = df[column].fillna(df[column].mean())

print("\nDataFrame after imputation:\n", df)
Exercise 2: Encoding Categorical Variables
Task: You have the following dataset:
Apply one-hot encoding to the "City" column.
Solution:
# Sample DataFrame: one categorical column ("City") next to a numeric one
data = dict(
    City=['New York', 'London', 'Paris', 'London'],
    Temperature=[30, 25, 28, 26],
)
df = pd.DataFrame(data)

# One-hot encode the "City" column: it is replaced by one indicator
# column per distinct city, while "Temperature" is left untouched
columns_to_encode = ['City']
df_encoded = pd.get_dummies(df, columns=columns_to_encode)
print(df_encoded)
Exercise 3: Feature Engineering - Interaction Terms
Task: You are given a dataset with two features: "Age" and "Salary". Create an interaction term between these two features.
Solution:
from sklearn.preprocessing import PolynomialFeatures
import pandas as pd
# Sample DataFrame with the two features to combine
data = {
    'Age': [25, 30, 35, 40],
    'Salary': [50000, 60000, 70000, 80000],
}
df = pd.DataFrame(data)

# Initialize PolynomialFeatures with interaction only:
# the original features are kept and only their product is added
# (no squared terms); include_bias=False leaves out the constant column
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

# Create interaction terms
interaction_features = poly.fit_transform(df)

# Convert back to DataFrame, labelling the columns by hand
output_columns = ['Age', 'Salary', 'Age*Salary']
df_interaction = pd.DataFrame(interaction_features, columns=output_columns)
print(df_interaction)
Exercise 4: Data Scaling
Task: You are given a dataset with the following features: "Age" and "Income". Apply Min-Max Scaling to both features to scale them between 0 and 1.
Solution:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
# Sample DataFrame with two features on very different scales
data = {
    'Age': [25, 30, 35, 40],
    'Income': [50000, 60000, 70000, 80000],
}
df = pd.DataFrame(data)

# Initialize MinMaxScaler with its default settings
scaler = MinMaxScaler()

# Fit and transform the data in one step, then wrap the scaled
# values in a DataFrame again so the column labels are kept
scaled_values = scaler.fit_transform(df)
df_scaled = pd.DataFrame(scaled_values, columns=['Age', 'Income'])
print(df_scaled)
Exercise 5: Train-Test Split
Task: Given the following dataset:
Split the data into 80% training data and 20% test data.
Solution:
from sklearn.model_selection import train_test_split
import pandas as pd
# Sample DataFrame
data = {
    'Age': [25, 30, 35, 40, 45],
    'Salary': [50000, 60000, 70000, 80000, 90000],
    'Purchased': [0, 1, 0, 1, 1],
}
df = pd.DataFrame(data)

# Features (X) and target (y)
X = df[['Age', 'Salary']]
y = df['Purchased']

# Split into training and test sets (80% train, 20% test);
# a fixed random_state makes the split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training Features:\n", X_train)
print("Test Features:\n", X_test)
Exercise 6: Cross-Validation
Task: Use 5-fold cross-validation to evaluate the performance of a logistic regression model on the following dataset:
Solution:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
import pandas as pd
# KFold is needed for the explicit fold definition below
from sklearn.model_selection import KFold

# Sample DataFrame
data = {
    'Age': [25, 30, 35, 40, 45],
    'Salary': [50000, 60000, 70000, 80000, 90000],
    'Purchased': [0, 1, 0, 1, 1],
}
df = pd.DataFrame(data)

# Features (X) and target (y)
X = df[['Age', 'Salary']]
y = df['Purchased']

# Initialize the model
model = LogisticRegression()

# Perform 5-fold cross-validation.
# Passing cv=5 with a classifier selects stratified folds, which fails with
# "n_splits=5 cannot be greater than the number of members in each class"
# because the classes here have only 2 and 3 members. Plain KFold splits the
# 5 rows into five folds of one row each, and every training part still
# contains both classes.
scores = cross_val_score(model, X, y, cv=KFold(n_splits=5))
print("Cross-Validation Scores:", scores)
print("Average Cross-Validation Accuracy:", scores.mean())
Exercise 7: Data Augmentation for Images
Task: Apply image augmentation techniques such as rotation, zooming, and flipping to an image using Keras’ ImageDataGenerator.
Solution:
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing import image
import matplotlib.pyplot as plt
# Initialize the ImageDataGenerator with the random transformations to apply
datagen = ImageDataGenerator(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)

# Load an example image (replace the placeholder path with a real file)
img_path = 'path_to_image.jpg'
img = image.load_img(img_path, target_size=(150, 150))
x = image.img_to_array(img)
x = x.reshape((1,) + x.shape)  # prepend an axis of size 1 so the array is a batch of one image

# Generate augmented images.
# The loop stops itself with an explicit break once enough images were drawn.
i = 0
for batch in datagen.flow(x, batch_size=1):
    plt.figure(i)
    plt.imshow(image.array_to_img(batch[0]))
    i += 1
    if i % 4 == 0:  # Display 4 augmented images
        break
plt.show()
Exercise 8: Data Augmentation for Text
Task: Use synonym replacement to augment the following sentence:
"The quick brown fox jumps over the lazy dog."
Solution:
import random
from nltk.corpus import wordnet
# Function to get synonyms of a word
def get_synonyms(word):
    """Return the distinct WordNet synonyms of ``word``.

    The lemma names of every synset found for ``word`` are collected in the
    order WordNet yields them. Underscores (used by WordNet to join
    multi-word lemmas) are replaced by spaces, repeated names are dropped,
    and the word itself is left out so that a replacement always changes
    the text. Comparison is case-insensitive.

    Returns an empty list when WordNet offers no other word.
    """
    synonyms = []
    seen = {word.lower()}  # pre-seeded so the input word is never returned
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            name = lemma.name().replace('_', ' ')
            key = name.lower()
            if key not in seen:
                seen.add(key)
                synonyms.append(name)
    return synonyms
# Sample sentence
sentence = "The quick brown fox jumps over the lazy dog"
words = sentence.split()

# Randomly replace some words with their synonyms
augmented_words = []
for word in words:
    synonyms = get_synonyms(word)
    # Replace with a synonym 50% of the time; words without synonyms are kept
    if synonyms and random.random() > 0.5:
        augmented_words.append(random.choice(synonyms))
    else:
        augmented_words.append(word)

augmented_sentence = ' '.join(augmented_words)
print("Original sentence:", sentence)
print("Augmented sentence:", augmented_sentence)
These hands-on exercises provide a solid foundation for building and improving machine learning models.
Practical Exercises Chapter 3
Exercise 1: Handling Missing Data
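Task: You have the following dataset (the Name/Age/Salary table that the solution code below constructs, with one missing Age and one missing Salary):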
Your task is to:
- Detect missing data.
- Impute the missing values in the "Age" and "Salary" columns using the mean of the respective columns.
Solution:
import pandas as pd
# Create the DataFrame; None marks a missing entry
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, None, 35, 40],
    'Salary': [50000, 60000, None, 80000],
}
df = pd.DataFrame(data)

# Detect missing data: count the missing entries in each column
print("Missing data:\n", df.isnull().sum())

# Impute missing values with the mean of each column
# (mean() ignores missing entries by default, so it averages the known values only)
for column in ['Age', 'Salary']:
    df[column] = df[column].fillna(df[column].mean())

print("\nDataFrame after imputation:\n", df)
Exercise 2: Encoding Categorical Variables
Task: You have the following dataset:
Apply one-hot encoding to the "City" column.
Solution:
# Sample DataFrame: one categorical column ("City") next to a numeric one
data = dict(
    City=['New York', 'London', 'Paris', 'London'],
    Temperature=[30, 25, 28, 26],
)
df = pd.DataFrame(data)

# One-hot encode the "City" column: it is replaced by one indicator
# column per distinct city, while "Temperature" is left untouched
columns_to_encode = ['City']
df_encoded = pd.get_dummies(df, columns=columns_to_encode)
print(df_encoded)
Exercise 3: Feature Engineering - Interaction Terms
Task: You are given a dataset with two features: "Age" and "Salary". Create an interaction term between these two features.
Solution:
from sklearn.preprocessing import PolynomialFeatures
import pandas as pd
# Sample DataFrame with the two features to combine
data = {
    'Age': [25, 30, 35, 40],
    'Salary': [50000, 60000, 70000, 80000],
}
df = pd.DataFrame(data)

# Initialize PolynomialFeatures with interaction only:
# the original features are kept and only their product is added
# (no squared terms); include_bias=False leaves out the constant column
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

# Create interaction terms
interaction_features = poly.fit_transform(df)

# Convert back to DataFrame, labelling the columns by hand
output_columns = ['Age', 'Salary', 'Age*Salary']
df_interaction = pd.DataFrame(interaction_features, columns=output_columns)
print(df_interaction)
Exercise 4: Data Scaling
Task: You are given a dataset with the following features: "Age" and "Income". Apply Min-Max Scaling to both features to scale them between 0 and 1.
Solution:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
# Sample DataFrame with two features on very different scales
data = {
    'Age': [25, 30, 35, 40],
    'Income': [50000, 60000, 70000, 80000],
}
df = pd.DataFrame(data)

# Initialize MinMaxScaler with its default settings
scaler = MinMaxScaler()

# Fit and transform the data in one step, then wrap the scaled
# values in a DataFrame again so the column labels are kept
scaled_values = scaler.fit_transform(df)
df_scaled = pd.DataFrame(scaled_values, columns=['Age', 'Income'])
print(df_scaled)
Exercise 5: Train-Test Split
Task: Given the following dataset:
Split the data into 80% training data and 20% test data.
Solution:
from sklearn.model_selection import train_test_split
import pandas as pd
# Sample DataFrame
data = {
    'Age': [25, 30, 35, 40, 45],
    'Salary': [50000, 60000, 70000, 80000, 90000],
    'Purchased': [0, 1, 0, 1, 1],
}
df = pd.DataFrame(data)

# Features (X) and target (y)
X = df[['Age', 'Salary']]
y = df['Purchased']

# Split into training and test sets (80% train, 20% test);
# a fixed random_state makes the split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training Features:\n", X_train)
print("Test Features:\n", X_test)
Exercise 6: Cross-Validation
Task: Use 5-fold cross-validation to evaluate the performance of a logistic regression model on the following dataset:
Solution:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
import pandas as pd
# KFold is needed for the explicit fold definition below
from sklearn.model_selection import KFold

# Sample DataFrame
data = {
    'Age': [25, 30, 35, 40, 45],
    'Salary': [50000, 60000, 70000, 80000, 90000],
    'Purchased': [0, 1, 0, 1, 1],
}
df = pd.DataFrame(data)

# Features (X) and target (y)
X = df[['Age', 'Salary']]
y = df['Purchased']

# Initialize the model
model = LogisticRegression()

# Perform 5-fold cross-validation.
# Passing cv=5 with a classifier selects stratified folds, which fails with
# "n_splits=5 cannot be greater than the number of members in each class"
# because the classes here have only 2 and 3 members. Plain KFold splits the
# 5 rows into five folds of one row each, and every training part still
# contains both classes.
scores = cross_val_score(model, X, y, cv=KFold(n_splits=5))
print("Cross-Validation Scores:", scores)
print("Average Cross-Validation Accuracy:", scores.mean())
Exercise 7: Data Augmentation for Images
Task: Apply image augmentation techniques such as rotation, zooming, and flipping to an image using Keras’ ImageDataGenerator.
Solution:
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing import image
import matplotlib.pyplot as plt
# Initialize the ImageDataGenerator with the random transformations to apply
datagen = ImageDataGenerator(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)

# Load an example image (replace the placeholder path with a real file)
img_path = 'path_to_image.jpg'
img = image.load_img(img_path, target_size=(150, 150))
x = image.img_to_array(img)
x = x.reshape((1,) + x.shape)  # prepend an axis of size 1 so the array is a batch of one image

# Generate augmented images.
# The loop stops itself with an explicit break once enough images were drawn.
i = 0
for batch in datagen.flow(x, batch_size=1):
    plt.figure(i)
    plt.imshow(image.array_to_img(batch[0]))
    i += 1
    if i % 4 == 0:  # Display 4 augmented images
        break
plt.show()
Exercise 8: Data Augmentation for Text
Task: Use synonym replacement to augment the following sentence:
"The quick brown fox jumps over the lazy dog."
Solution:
import random
from nltk.corpus import wordnet
# Function to get synonyms of a word
def get_synonyms(word):
    """Return the distinct WordNet synonyms of ``word``.

    The lemma names of every synset found for ``word`` are collected in the
    order WordNet yields them. Underscores (used by WordNet to join
    multi-word lemmas) are replaced by spaces, repeated names are dropped,
    and the word itself is left out so that a replacement always changes
    the text. Comparison is case-insensitive.

    Returns an empty list when WordNet offers no other word.
    """
    synonyms = []
    seen = {word.lower()}  # pre-seeded so the input word is never returned
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            name = lemma.name().replace('_', ' ')
            key = name.lower()
            if key not in seen:
                seen.add(key)
                synonyms.append(name)
    return synonyms
# Sample sentence
sentence = "The quick brown fox jumps over the lazy dog"
words = sentence.split()

# Randomly replace some words with their synonyms
augmented_words = []
for word in words:
    synonyms = get_synonyms(word)
    # Replace with a synonym 50% of the time; words without synonyms are kept
    if synonyms and random.random() > 0.5:
        augmented_words.append(random.choice(synonyms))
    else:
        augmented_words.append(word)

augmented_sentence = ' '.join(augmented_words)
print("Original sentence:", sentence)
print("Augmented sentence:", augmented_sentence)
These hands-on exercises provide a solid foundation for building and improving machine learning models.
Practical Exercises Chapter 3
Exercise 1: Handling Missing Data
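Task: You have the following dataset (the Name/Age/Salary table that the solution code below constructs, with one missing Age and one missing Salary):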
Your task is to:
- Detect missing data.
- Impute the missing values in the "Age" and "Salary" columns using the mean of the respective columns.
Solution:
import pandas as pd
# Create the DataFrame; None marks a missing entry
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, None, 35, 40],
    'Salary': [50000, 60000, None, 80000],
}
df = pd.DataFrame(data)

# Detect missing data: count the missing entries in each column
print("Missing data:\n", df.isnull().sum())

# Impute missing values with the mean of each column
# (mean() ignores missing entries by default, so it averages the known values only)
for column in ['Age', 'Salary']:
    df[column] = df[column].fillna(df[column].mean())

print("\nDataFrame after imputation:\n", df)
Exercise 2: Encoding Categorical Variables
Task: You have the following dataset:
Apply one-hot encoding to the "City" column.
Solution:
# Sample DataFrame: one categorical column ("City") next to a numeric one
data = dict(
    City=['New York', 'London', 'Paris', 'London'],
    Temperature=[30, 25, 28, 26],
)
df = pd.DataFrame(data)

# One-hot encode the "City" column: it is replaced by one indicator
# column per distinct city, while "Temperature" is left untouched
columns_to_encode = ['City']
df_encoded = pd.get_dummies(df, columns=columns_to_encode)
print(df_encoded)
Exercise 3: Feature Engineering - Interaction Terms
Task: You are given a dataset with two features: "Age" and "Salary". Create an interaction term between these two features.
Solution:
from sklearn.preprocessing import PolynomialFeatures
import pandas as pd
# Sample DataFrame with the two features to combine
data = {
    'Age': [25, 30, 35, 40],
    'Salary': [50000, 60000, 70000, 80000],
}
df = pd.DataFrame(data)

# Initialize PolynomialFeatures with interaction only:
# the original features are kept and only their product is added
# (no squared terms); include_bias=False leaves out the constant column
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

# Create interaction terms
interaction_features = poly.fit_transform(df)

# Convert back to DataFrame, labelling the columns by hand
output_columns = ['Age', 'Salary', 'Age*Salary']
df_interaction = pd.DataFrame(interaction_features, columns=output_columns)
print(df_interaction)
Exercise 4: Data Scaling
Task: You are given a dataset with the following features: "Age" and "Income". Apply Min-Max Scaling to both features to scale them between 0 and 1.
Solution:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
# Sample DataFrame with two features on very different scales
data = {
    'Age': [25, 30, 35, 40],
    'Income': [50000, 60000, 70000, 80000],
}
df = pd.DataFrame(data)

# Initialize MinMaxScaler with its default settings
scaler = MinMaxScaler()

# Fit and transform the data in one step, then wrap the scaled
# values in a DataFrame again so the column labels are kept
scaled_values = scaler.fit_transform(df)
df_scaled = pd.DataFrame(scaled_values, columns=['Age', 'Income'])
print(df_scaled)
Exercise 5: Train-Test Split
Task: Given the following dataset:
Split the data into 80% training data and 20% test data.
Solution:
from sklearn.model_selection import train_test_split
import pandas as pd
# Sample DataFrame
data = {
    'Age': [25, 30, 35, 40, 45],
    'Salary': [50000, 60000, 70000, 80000, 90000],
    'Purchased': [0, 1, 0, 1, 1],
}
df = pd.DataFrame(data)

# Features (X) and target (y)
X = df[['Age', 'Salary']]
y = df['Purchased']

# Split into training and test sets (80% train, 20% test);
# a fixed random_state makes the split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training Features:\n", X_train)
print("Test Features:\n", X_test)
Exercise 6: Cross-Validation
Task: Use 5-fold cross-validation to evaluate the performance of a logistic regression model on the following dataset:
Solution:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
import pandas as pd
# KFold is needed for the explicit fold definition below
from sklearn.model_selection import KFold

# Sample DataFrame
data = {
    'Age': [25, 30, 35, 40, 45],
    'Salary': [50000, 60000, 70000, 80000, 90000],
    'Purchased': [0, 1, 0, 1, 1],
}
df = pd.DataFrame(data)

# Features (X) and target (y)
X = df[['Age', 'Salary']]
y = df['Purchased']

# Initialize the model
model = LogisticRegression()

# Perform 5-fold cross-validation.
# Passing cv=5 with a classifier selects stratified folds, which fails with
# "n_splits=5 cannot be greater than the number of members in each class"
# because the classes here have only 2 and 3 members. Plain KFold splits the
# 5 rows into five folds of one row each, and every training part still
# contains both classes.
scores = cross_val_score(model, X, y, cv=KFold(n_splits=5))
print("Cross-Validation Scores:", scores)
print("Average Cross-Validation Accuracy:", scores.mean())
Exercise 7: Data Augmentation for Images
Task: Apply image augmentation techniques such as rotation, zooming, and flipping to an image using Keras’ ImageDataGenerator.
Solution:
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing import image
import matplotlib.pyplot as plt
# Initialize the ImageDataGenerator with the random transformations to apply
datagen = ImageDataGenerator(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)

# Load an example image (replace the placeholder path with a real file)
img_path = 'path_to_image.jpg'
img = image.load_img(img_path, target_size=(150, 150))
x = image.img_to_array(img)
x = x.reshape((1,) + x.shape)  # prepend an axis of size 1 so the array is a batch of one image

# Generate augmented images.
# The loop stops itself with an explicit break once enough images were drawn.
i = 0
for batch in datagen.flow(x, batch_size=1):
    plt.figure(i)
    plt.imshow(image.array_to_img(batch[0]))
    i += 1
    if i % 4 == 0:  # Display 4 augmented images
        break
plt.show()
Exercise 8: Data Augmentation for Text
Task: Use synonym replacement to augment the following sentence:
"The quick brown fox jumps over the lazy dog."
Solution:
import random
from nltk.corpus import wordnet
# Function to get synonyms of a word
def get_synonyms(word):
    """Return the distinct WordNet synonyms of ``word``.

    The lemma names of every synset found for ``word`` are collected in the
    order WordNet yields them. Underscores (used by WordNet to join
    multi-word lemmas) are replaced by spaces, repeated names are dropped,
    and the word itself is left out so that a replacement always changes
    the text. Comparison is case-insensitive.

    Returns an empty list when WordNet offers no other word.
    """
    synonyms = []
    seen = {word.lower()}  # pre-seeded so the input word is never returned
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            name = lemma.name().replace('_', ' ')
            key = name.lower()
            if key not in seen:
                seen.add(key)
                synonyms.append(name)
    return synonyms
# Sample sentence
sentence = "The quick brown fox jumps over the lazy dog"
words = sentence.split()

# Randomly replace some words with their synonyms
augmented_words = []
for word in words:
    synonyms = get_synonyms(word)
    # Replace with a synonym 50% of the time; words without synonyms are kept
    if synonyms and random.random() > 0.5:
        augmented_words.append(random.choice(synonyms))
    else:
        augmented_words.append(word)

augmented_sentence = ' '.join(augmented_words)
print("Original sentence:", sentence)
print("Augmented sentence:", augmented_sentence)
These hands-on exercises provide a solid foundation for building and improving machine learning models.