Chapter 1: Image Generation and Vision with OpenAI Models
Practical Exercises — Chapter 1
Exercise 1: Generate an Image from a Text Prompt Using DALL·E 3
Task:
Create an assistant with image generation capabilities and use it to generate an image of “a cozy mountain cabin at sunrise in watercolor style.”
Solution:
import openai
import time

# Create an assistant with DALL·E image generation tool
assistant = openai.beta.assistants.create(
    name="Image Generator",
    instructions="You generate artistic images based on user prompts.",
    model="gpt-4o",
    tools=[{"type": "image_generation"}]
)

# Create a thread
thread = openai.beta.threads.create()

# Add user message with prompt
openai.beta.threads.messages.create(
    thread_id=thread.id,
    role="user",
    content="Create an image of a cozy mountain cabin at sunrise in watercolor style."
)

# Run the assistant
run = openai.beta.threads.runs.create(
    assistant_id=assistant.id,
    thread_id=thread.id
)
# Wait for the run to finish (also stop on terminal failure states)
while True:
    run_status = openai.beta.threads.runs.retrieve(run.id, thread_id=thread.id)
    if run_status.status in ("completed", "failed", "cancelled", "expired"):
        break
    time.sleep(1)

# Retrieve the response; generated images arrive as image_file content blocks
messages = openai.beta.threads.messages.list(thread_id=thread.id)
for msg in messages.data:
    for content in msg.content:
        if content.type == "image_file":
            # image_file blocks carry a file ID, not a URL; download the bytes via the Files API
            print("Image file ID:", content.image_file.file_id)
Exercise 2: Edit an Existing Image Using Inpainting
Task:
Upload a PNG image and ask DALL·E to replace the dog in the image with a cat.
Solution:
# Upload the source image (purpose="vision" is the upload type the Assistants API accepts for image inputs)
image_file = openai.files.create(
    file=open("dog_scene.png", "rb"),
    purpose="vision"
)

# Create the assistant
assistant = openai.beta.assistants.create(
    name="Editor",
    instructions="You edit uploaded images based on user commands.",
    model="gpt-4o",
    tools=[{"type": "image_editing"}]
)

# Create a thread
thread = openai.beta.threads.create()

# Add the edit request, attaching the image as an image_file content block
openai.beta.threads.messages.create(
    thread_id=thread.id,
    role="user",
    content=[
        {"type": "text", "text": "Replace the dog in this image with a gray cat sitting calmly."},
        {"type": "image_file", "image_file": {"file_id": image_file.id}}
    ]
)
# Run the assistant
run = openai.beta.threads.runs.create(
    assistant_id=assistant.id,
    thread_id=thread.id
)

# Wait for the run to complete (also stop on terminal failure states)
while True:
    run_status = openai.beta.threads.runs.retrieve(run.id, thread_id=thread.id)
    if run_status.status in ("completed", "failed", "cancelled", "expired"):
        break
    time.sleep(1)

# Get the edited image; as before, image_file blocks carry a file ID rather than a URL
messages = openai.beta.threads.messages.list(thread_id=thread.id)
for msg in messages.data:
    for content in msg.content:
        if content.type == "image_file":
            print("Edited image file ID:", content.image_file.file_id)
Exercise 3: Vision-Based Image Analysis with GPT-4o
Task:
Upload an image of a pie chart and ask GPT-4o to summarize the main takeaway from it.
Solution:
import base64

# Read the pie chart and encode it as a base64 data URI
# (the Chat Completions vision input expects a URL or data URI, not a Files API ID)
with open("company_expenses_piechart.png", "rb") as f:
    chart_data_uri = "data:image/png;base64," + base64.b64encode(f.read()).decode("utf-8")

# Send the image plus a text prompt to GPT-4o
response = openai.chat.completions.create(
    model="gpt-4o",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Summarize the key insights from this pie chart."},
                {"type": "image_url", "image_url": {"url": chart_data_uri}}
            ]
        }
    ],
    max_tokens=300,
    temperature=0.4
)
print("Summary:", response.choices[0].message.content)
Exercise 4: Multimodal Comparison Between Two Designs
Task:
Upload two UI design mockups and ask GPT-4o to compare them and suggest improvements.
Solution:
import base64

def encode_image(path):
    # Return a local PNG as a base64 data URI for the vision input
    with open(path, "rb") as f:
        return "data:image/png;base64," + base64.b64encode(f.read()).decode("utf-8")

design_a = encode_image("design_A.png")
design_b = encode_image("design_B.png")

# Send both images with the comparison prompt
response = openai.chat.completions.create(
    model="gpt-4o",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Compare these two app designs and suggest improvements for the second one."},
                {"type": "image_url", "image_url": {"url": design_a}},
                {"type": "image_url", "image_url": {"url": design_b}}
            ]
        }
    ],
    max_tokens=500
)
print("Comparison Summary:", response.choices[0].message.content)
These exercises gave you practical experience with:
- Image generation using DALL·E 3
- Image editing and inpainting with natural language
- Visual reasoning and analysis with GPT-4o
- Multimodal input combining text and multiple images
You now have the tools to build engaging visual applications—whether you're creating art, automating content generation, analyzing data, or enhancing accessibility.