Project: Voice Assistant Recorder — Use Whisper + GPT-4o to Transcribe, Summarize, and Analyze

Project Code

Download the audio sample: https://files.cuantum.tech/audio/voice_recording.mp3
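
Before running the script, install the two libraries it imports (for example, pip install openai python-dotenv) and create a .env file next to the script. A minimal sketch of that file, assuming the variable name the code reads below (the key value shown is a placeholder):

# .env — keep this file out of version control
OPENAI_API_KEY=sk-your-key-here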

import os
from openai import OpenAI, OpenAIError
from dotenv import load_dotenv
import datetime

# --- Configuration ---
load_dotenv()

# Get the current date and location context
current_timestamp = datetime.datetime.now().astimezone().strftime("%Y-%m-%d %H:%M:%S %Z")  # astimezone() so %Z is not empty
current_location = "Houston, Texas, United States" # User location context
print(f"Running Voice Assistant Recorder project at: {current_timestamp}")
print(f"Location Context: {current_location}")


# Initialize the OpenAI client
try:
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("OPENAI_API_KEY not found in environment variables.")
    client = OpenAI(api_key=api_key)
    print("OpenAI client initialized.")
except ValueError as e:
    print(f"Configuration Error: {e}")
    exit()
except Exception as e:
    print(f"Error initializing OpenAI client: {e}")
    exit()

# Define the path to the input audio file
# IMPORTANT: Replace 'voice_recording.mp3' with the actual filename.
#            Ensure this file exists in the same directory.
audio_file_path = "voice_recording.mp3"

# --- Prerequisites Check ---
if not os.path.exists(audio_file_path):
    print(f"\nError: Input audio file not found at '{audio_file_path}'")
    print("Please make sure the audio file exists and the path is correct.")
    exit()

# --- Step 1: Transcribe Audio using Whisper ---
def transcribe_audio(client, file_path):
    """Transcribes the audio file using Whisper."""
    print(f"\nStep 1: Transcribing audio file: {file_path}")
    # Informational size check: the Whisper endpoint rejects files over 25 MB per request.
    try:
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
        print(f"File size: {file_size_mb:.2f} MB (Whisper limit: 25 MB per API call)")
        if file_size_mb > 25:
            print("Warning: File exceeds 25 MB. Consider chunking for full transcription.")
            # See the chunking sketch after the code breakdown for one approach.
    except OSError:
        pass  # Size check failed; proceed and let the API report any limit errors

    try:
        with open(file_path, "rb") as audio_file:
            response = client.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file,
                response_format="text"
            )
        print("Transcription successful.")
        return response # Returns plain text
    except OpenAIError as e:
        print(f"OpenAI API Error during transcription: {e}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred during transcription: {e}")
        return None

# --- Step 2: Summarize Transcription using GPT-4o ---
def summarize_text(client, text_to_summarize):
    """Generates a concise summary of the provided text using GPT-4o."""
    print("\nStep 2: Generating summary...")
    if not text_to_summarize:
        print("Error: No text provided for summarization.")
        return None

    system_prompt = "You are an expert summarizer. Create a concise summary of the following text."
    user_prompt = f"""Please provide a brief summary (1-3 sentences) of the key points in the following transcription:

Transcription:
---
{text_to_summarize}
---

Summary:
"""
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            max_tokens=150,
            temperature=0.5
        )
        summary = response.choices[0].message.content
        print("Summary generation successful.")
        return summary.strip()
    except OpenAIError as e:
        print(f"OpenAI API Error during summarization: {e}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred during summarization: {e}")
        return None

# --- Step 3: (Optional) Extract Action Items using GPT-4o ---
def extract_action_items(client, text_to_analyze):
    """Identifies and extracts action items from the text using GPT-4o."""
    print("\nStep 3: Extracting action items (Optional)...")
    if not text_to_analyze:
        print("Error: No text provided for extraction.")
        return None

    system_prompt = "You are an expert meeting analyst focused on identifying actionable tasks."
    user_prompt = f"""Analyze the following transcription. List any specific action items mentioned. Include who is assigned (if stated) and any deadlines (if stated). If no action items are found, state "None identified".

Transcription Text:
---
{text_to_analyze}
---

Extracted Action Items (Format as a list):
"""
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            max_tokens=300,
            temperature=0.1 # Low temp for factual extraction
        )
        action_items = response.choices[0].message.content
        # Check if GPT actually found items or returned "None identified" etc.
        if "none identified" in action_items.lower() and len(action_items) < 30:
             print("No specific action items identified by the model.")
             return None # Return None if no items found
        else:
             print("Action item extraction successful.")
             return action_items.strip()

    except OpenAIError as e:
        print(f"OpenAI API Error during action item extraction: {e}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred during action item extraction: {e}")
        return None

# --- Main Execution Logic ---
if __name__ == "__main__":
    print(f"\nProcessing audio file: {audio_file_path}")

    # 1. Transcribe
    transcription = transcribe_audio(client, audio_file_path)

    if transcription:
        print(f"\n--- Full Transcription ---")
        print(transcription)
        print("-" * 26)

        # 2. Summarize
        summary = summarize_text(client, transcription)
        if summary:
            print(f"\n--- Summary ---")
            print(summary)
            print("-" * 13)

        # 3. Extract Action Items (Optional Step)
        action_items = extract_action_items(client, transcription)
        if action_items:
            print(f"\n--- Action Items ---")
            print(action_items)
            print("-" * 18)
        else:
            # This is expected if no action items were in the audio/transcription
            print("\nNo specific action items were extracted.")

        print("\n--- Processing Complete ---")

    else:
        print("\nProcessing failed due to transcription error.")

Code Breakdown Explanation

This Python script implements the "Voice Assistant Recorder" project, processing an audio file to generate a transcription, a summary, and, optionally, a list of action items.

  1. Setup and Initialization:
    • Imports: Imports the necessary libraries: os (for file path and size checks), openai (for the OpenAI client and the OpenAIError exception), dotenv (for loading the API key from a .env file), and datetime (for the run timestamp).
    • API Key & Client: Loads the OpenAI API key from a .env file and initializes the OpenAI client object (client) for API interactions. Includes error handling for missing keys or initialization failures.
    • File Path: Defines the audio_file_path variable, pointing to the input voice recording file.
    • Prerequisite Check: Includes a basic check using os.path.exists() to ensure the specified audio file actually exists before proceeding.
  2. Step 1: Transcribe Audio (transcribe_audio function):
    • Purpose: Converts the input audio file into text using Whisper.
    • Input: Takes the client object and the file_path of the audio.
    • File Handling: Opens the audio file in binary read mode ("rb") and performs an informational size check, warning when the recording exceeds Whisper's 25 MB per-request limit (see the chunking sketch after this breakdown for one way to handle longer files).
    • API Call: Uses client.audio.transcriptions.create with model="whisper-1" and response_format="text".
    • Output: Returns the plain text transcription as a string, or None if an error occurs.
  3. Step 2: Summarize Transcription (summarize_text function):
    • Purpose: Creates a concise summary of the transcribed text using GPT-4o.
    • Input: Takes the client object and the text_to_summarize (the output from Whisper).
    • Prompt Engineering: Constructs a prompt instructing GPT-4o to act as an expert summarizer and provide a brief summary (1-3 sentences) of the key points from the provided transcription.
    • API Call: Uses client.chat.completions.create with model="gpt-4o", the system/user prompts, and appropriate max_tokens and temperature settings for summarization.
    • Output: Returns the generated summary text as a string, or None on error.
  4. Step 3: Extract Action Items (extract_action_items function - Optional):
    • Purpose: Identifies and lists specific tasks or commitments mentioned in the transcription using GPT-4o.
    • Input: Takes the client object and the text_to_analyze (the transcription).
    • Prompt Engineering: Uses a prompt specifically designed to instruct GPT-4o to find action items, to include the assigned person and deadline when stated, to format the result as a list, and to reply "None identified" when no action items are present. A low temperature (0.1) is used for more deterministic, factual extraction.
    • API Call: Uses client.chat.completions.create with model="gpt-4o".
    • Output: Returns the list of action items as a string. A basic check returns None when the model explicitly indicates no items were found, so the script does not print a bare "None identified." (A JSON-mode variant for machine-readable output is sketched after this breakdown.)
  5. Main Execution (if __name__ == "__main__":):
    • Orchestration: This block controls the overall flow.
    • It calls transcribe_audio first.
    • If transcription is successful, it proceeds to call summarize_text.
    • It then calls extract_action_items.
    • Output Display: It prints the full transcription, the generated summary, and the extracted action items (if any were found) to the console with clear headings.
    • Error Handling: Ensures the summary and action-item steps run only when transcription succeeded, and prints a clear message if processing fails.
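
As noted in the file-handling step above, the Whisper endpoint accepts at most 25 MB per request. Below is a minimal chunking sketch for longer recordings, assuming the pydub library (and its ffmpeg dependency) is installed; the 10-minute chunk length is an illustrative choice that keeps typical MP3 chunks well under the limit, not an API requirement.

import io
from pydub import AudioSegment

def transcribe_long_audio(client, file_path, chunk_minutes=10):
    """Split a long recording into chunks and transcribe each with Whisper."""
    audio = AudioSegment.from_file(file_path)
    chunk_ms = chunk_minutes * 60 * 1000
    transcripts = []
    for start in range(0, len(audio), chunk_ms):  # len(audio) is in milliseconds
        buffer = io.BytesIO()
        buffer.name = "chunk.mp3"  # the SDK infers the upload format from the name
        audio[start:start + chunk_ms].export(buffer, format="mp3")
        buffer.seek(0)
        transcripts.append(client.audio.transcriptions.create(
            model="whisper-1",
            file=buffer,
            response_format="text"
        ))
    return " ".join(transcripts)

Note that cutting at fixed time boundaries can split a word mid-chunk; overlapping the chunks slightly or splitting on silence are common refinements.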

This script provides a clear workflow for transforming raw voice recordings into structured, actionable information using the combined power of Whisper and GPT-4o.
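
As an optional extension, the action-item step can return machine-readable output instead of free text. The sketch below assumes GPT-4o's JSON response format; the schema is an illustrative choice, not part of the original project.

import json

def extract_action_items_json(client, text_to_analyze):
    """Variant of Step 3 that asks GPT-4o for structured JSON output."""
    response = client.chat.completions.create(
        model="gpt-4o",
        response_format={"type": "json_object"},  # constrains the reply to valid JSON
        messages=[
            {"role": "system",
             "content": "You extract action items from transcriptions and reply only in JSON."},
            {"role": "user",
             "content": ('Return JSON of the form {"action_items": [{"task": "...", '
                         '"assignee": null, "deadline": null}]} for this transcription:\n'
                         + text_to_analyze)},
        ],
        max_tokens=300,
        temperature=0.1
    )
    return json.loads(response.choices[0].message.content)

Parsed this way, the items can be written to a task tracker or database without any post-processing of free text.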
