Whisper API Tutorial 2026: Transcription, Translation, and Meeting Intelligence
Build automated meeting transcription, speaker diarization, and intelligent meeting summaries using OpenAI Whisper and GPT-4o
Whisper API Tutorial 2026: Transcription, Translation, and Meeting Intelligence
Build automated meeting transcription, speaker diarization, and intelligent meeting summaries using OpenAI Whisper and GPT-4o
Complete guide to using the OpenAI Whisper API for audio transcription in 2026. Covers real-time transcription, speaker identification, meeting summarization, automated action item extraction, and building a complete meeting intelligence system.
Whisper API Tutorial 2026: Transcription, Translation, and Meeting Intelligence
Meeting recordings hold enormous value—but most never get reviewed. This tutorial builds an automated system that transcribes, analyzes, and extracts actionable intelligence from any recorded meeting.
Whisper Capabilities in 2026
Setup
python
from openai import OpenAI
from pathlib import Path
import json

# Module-level API client shared by every helper in this tutorial. It is
# constructed with no arguments, so credentials must come from the SDK's
# default configuration (presumably the OPENAI_API_KEY environment variable;
# confirm against the SDK documentation).
client = OpenAI()
Basic Transcription
python
def transcribe_audio(file_path: str, language: str = None) -> dict:
    """Transcribe one audio file with Whisper and return text plus timing data.

    Args:
        file_path: Path of the audio file. It is opened in binary mode and
            uploaded through the module-level ``client``.
        language: Language hint forwarded to the API. ``None`` (the default)
            leaves language detection to the service.

    Returns:
        A dict with the keys ``text``, ``language``, ``duration``,
        ``segments`` and ``words``, each copied from the API response.
    """
    with open(file_path, "rb") as audio_file:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            response_format="verbose_json",  # Includes timestamps and segments
            timestamp_granularities=["word", "segment"],
            language=language,  # None = auto-detect
        )

    return {
        "text": transcript.text,
        "language": transcript.language,
        "duration": transcript.duration,
        "segments": transcript.segments,
        "words": transcript.words,
    }


# Basic usage
# Transcribe a sample recording, then report the detected language, the
# duration rounded to whole seconds, and the first 500 characters of text.
result = transcribe_audio("meeting_recording.mp3")
print(
    f"Language: {result['language']}",
    f"Duration: {result['duration']:.0f}s",
    f"\nTranscript:\n{result['text'][:500]}...",
    sep="\n",
)
Handle Large Files with Chunking
python
import os
import tempfile

from pydub import AudioSegment


def transcribe_large_file(file_path: str, chunk_minutes: int = 10) -> str:
    """Handle files larger than 25MB by splitting into chunks.

    The recording is cut into consecutive pieces of ``chunk_minutes``
    minutes. Each piece is exported to a temporary MP3 file, transcribed
    with ``transcribe_audio``, and the chunk texts are joined with single
    spaces.

    Args:
        file_path: Path of the audio file to load.
        chunk_minutes: Length of each chunk in minutes; must be positive.

    Returns:
        The concatenated transcript text of all chunks.

    Raises:
        ValueError: If ``chunk_minutes`` is zero or negative.
    """
    if chunk_minutes <= 0:
        # A zero step makes range() raise an opaque error and a negative one
        # silently yields an empty transcript, so reject both up front.
        raise ValueError("chunk_minutes must be a positive number of minutes")

    audio = AudioSegment.from_file(file_path)
    chunk_ms = chunk_minutes * 60 * 1000  # slice positions below are in ms
    full_transcript = []

    for start_ms in range(0, len(audio), chunk_ms):
        chunk = audio[start_ms:start_ms + chunk_ms]

        # Reserve a temp path and close the handle immediately, so the
        # exporter and the transcriber can each reopen the file by name
        # (an open handle blocks that on some platforms).
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
            tmp_path = tmp.name

        try:
            # Export chunk to temp file. Doing it inside the try means the
            # file is removed even when the export itself fails.
            chunk.export(tmp_path, format="mp3")
            result = transcribe_audio(tmp_path)
            full_transcript.append(result["text"])
        finally:
            os.unlink(tmp_path)

    return " ".join(full_transcript)
Translation (Non-English to English)
python
def translate_audio(file_path: str) -> str:
    """Transcribe AND translate any language to English.

    Args:
        file_path: Path of the audio file. It is opened in binary mode and
            sent to the translations endpoint via the module-level ``client``.

    Returns:
        The ``text`` attribute of the translation response.
    """
    with open(file_path, "rb") as audio_file:
        translation = client.audio.translations.create(
            model="whisper-1",
            file=audio_file,
        )

    return translation.text


# Translates Spanish/French/German/Japanese/etc. to English
# Example: run the translation helper on one recording and print the
# English text it returns.
english_text = translate_audio("spanish_meeting.mp3")
print(english_text)
Meeting Intelligence System
The core feature: turn raw transcripts into actionable meeting intelligence.
python
import re
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class MeetingInsights:
    """Structured result of analysing one meeting transcript.

    Attributes:
        summary: Short executive summary of the meeting.
        action_items: One dict per action item; the analysis prompt asks for
            the keys ``task``, ``owner``, ``due_date`` and ``priority``.
        decisions_made: Decisions reached during the meeting.
        open_questions: Questions left unresolved.
        attendees_mentioned: Names of people mentioned in the transcript.
        key_topics: Main topics discussed.
        sentiment: Overall tone, e.g. positive/neutral/negative/mixed.
        follow_up_required: Whether a follow-up is needed.
    """

    summary: str
    action_items: List[dict]
    decisions_made: List[str]
    open_questions: List[str]
    attendees_mentioned: List[str]
    key_topics: List[str]
    sentiment: str
    follow_up_required: bool
# System prompt for the meeting-analysis chat completion. It asks the model
# to reply with one JSON object using exactly the keys listed in the template
# below. This text is sent verbatim, so edits here change model behaviour.
MEETING_ANALYSIS_PROMPT = """Analyze this meeting transcript and extract structured information.
Return JSON with:
{
"summary": "3-5 sentence executive summary",
"action_items": [
{
"task": "specific action",
"owner": "person name or 'unassigned'",
"due_date": "mentioned date or null",
"priority": "high/medium/low"
}
],
"decisions_made": ["decision 1", "decision 2"],
"open_questions": ["question 1", "question 2"],
"attendees_mentioned": ["name1", "name2"],
"key_topics": ["topic1", "topic2"],
"sentiment": "positive/neutral/negative/mixed",
"follow_up_required": true/false
}
Be specific with task descriptions. Capture ALL action items even if no owner is assigned."""
def analyze_meeting(transcript: str) -> MeetingInsights:
    """Extract structured insights from a meeting transcript with GPT-4o.

    Sends ``MEETING_ANALYSIS_PROMPT`` as the system message and the
    transcript as the user message, requests a JSON-object response, and
    maps the parsed JSON onto a ``MeetingInsights`` instance.

    Args:
        transcript: The meeting transcript text to analyse.

    Returns:
        A ``MeetingInsights`` whose fields fall back to empty/neutral
        defaults when the model omits a key or sets it to ``null``.
    """
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": MEETING_ANALYSIS_PROMPT},
            {"role": "user", "content": f"Meeting transcript:\n\n{transcript}"},
        ],
        response_format={"type": "json_object"},
    )
    data = json.loads(response.choices[0].message.content)

    def field(key, default):
        # dict.get(key, default) only falls back when the key is absent; an
        # explicit JSON null would otherwise leak through as None and break
        # callers that expect a list (e.g. len() on the action items).
        value = data.get(key)
        return default if value is None else value

    return MeetingInsights(
        summary=field("summary", ""),
        action_items=field("action_items", []),
        decisions_made=field("decisions_made", []),
        open_questions=field("open_questions", []),
        attendees_mentioned=field("attendees_mentioned", []),
        key_topics=field("key_topics", []),
        sentiment=field("sentiment", "neutral"),
        follow_up_required=field("follow_up_required", False),
    )
Speaker Diarization
python
def identify_speakers(
    transcript: str,
    known_attendees: List[str] = None,
    max_chars: Optional[int] = 8000,
) -> str:
    """Identify and label different speakers in transcript.

    Asks GPT-4o to rewrite the transcript with a ``[SPEAKER X]:`` prefix per
    turn, using real names where the context reveals them.

    Args:
        transcript: Raw transcript text.
        known_attendees: Optional attendee names passed to the model as a
            hint.
        max_chars: Only this many leading characters of ``transcript`` are
            sent to the model; anything beyond is dropped from the labelled
            result. Defaults to 8000. Pass ``None`` to send the whole
            transcript.

    Returns:
        The model's labelled transcript text.

    Raises:
        ValueError: If ``max_chars`` is negative.
    """
    if max_chars is not None and max_chars < 0:
        # A negative bound would slice from the end instead of truncating.
        raise ValueError("max_chars must be non-negative or None")

    attendee_context = ""
    if known_attendees:
        attendee_context = f"Known attendees: {', '.join(known_attendees)}"

    prompt = (
        "Identify different speakers in this meeting transcript.\n"
        f"{attendee_context}\n"
        "Format as:\n"
        "[SPEAKER A]: text\n"
        "[SPEAKER B]: text\n"
        "If you can identify who's speaking from context (introductions, names mentioned),\n"
        "use their actual name. Otherwise use Speaker A, B, C etc.\n"
        "Transcript:\n"
        f"{transcript[:max_chars]}"  # Truncate for context window
    )

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content
Complete Pipeline
python
def process_meeting(audio_path: str, attendees: List[str] = None) -> dict:
    """Run the full pipeline on one recording and save the result as JSON.

    Steps: transcribe the audio, label speakers, extract insights, assemble
    the output dict, and write it to ``<audio file stem>_intelligence.json``
    (a relative path, so it lands in the current working directory).

    Args:
        audio_path: Path of the meeting recording.
        attendees: Optional attendee names used as a hint for speaker
            labelling.

    Returns:
        The dict that was written to disk, with the keys ``file``,
        ``duration_minutes``, ``language``, ``transcript`` and ``insights``.
    """
    print(f"Processing: {audio_path}")

    # Step 1: Transcribe
    print(" Transcribing audio...")
    result = transcribe_audio(audio_path)
    transcript = result["text"]
    duration_minutes = result["duration"] / 60

    # Step 2: Identify speakers
    # NOTE: identify_speakers() only sends the first 8000 characters of the
    # transcript to the model, so for long meetings the labelled transcript,
    # and therefore the insights below, cover just the beginning.
    print(" Identifying speakers...")
    labeled_transcript = identify_speakers(transcript, attendees)

    # Step 3: Extract insights
    print(" Extracting meeting insights...")
    insights = analyze_meeting(labeled_transcript)

    # Step 4: Format output
    output = {
        "file": audio_path,
        "duration_minutes": round(duration_minutes, 1),
        "language": result["language"],
        "transcript": labeled_transcript,
        "insights": {
            "summary": insights.summary,
            "action_items": insights.action_items,
            "decisions": insights.decisions_made,
            "open_questions": insights.open_questions,
            "key_topics": insights.key_topics,
            "sentiment": insights.sentiment,
        },
    }

    # Step 5: Save results
    output_path = Path(audio_path).stem + "_intelligence.json"
    # Explicit encoding so the file does not depend on the platform default.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)

    print(f" Complete! Output saved to: {output_path}")
    print(f" Found {len(insights.action_items)} action items")
    return output


# Usage
meeting_data = process_meeting(
    "q4_planning_meeting.mp3",
    attendees=["Sarah (CEO)", "Marcus (CTO)", "Priya (VP Sales)"],
)

# Print summary
print("\n=== MEETING INTELLIGENCE REPORT ===")
print(f"Duration: {meeting_data['duration_minutes']} minutes")
print(f"\nSummary:\n{meeting_data['insights']['summary']}")
print("\nAction Items:")
for item in meeting_data['insights']['action_items']:
    # The analysis prompt allows null for fields such as due_date, and
    # .get(key, default) does not fall back when the key is present with a
    # None value, so use `or` to cover both missing and null.
    owner = item.get('owner') or 'Unassigned'
    due = item.get('due_date') or 'No date'
    priority = str(item.get('priority') or 'unspecified').upper()
    task = item.get('task') or '(no task description)'
    print(f" [{priority}] {task} → {owner} ({due})")
Integrate with Calendar
python
import datetime
from googleapiclient.discovery import build


def post_to_google_calendar(meeting_data: dict, calendar_id: str, service):
    """Add meeting notes to Google Calendar event.

    Builds a plain-text description (summary, action items, decisions) and
    lists up to ten events starting from today 00:00 UTC. Selecting the
    matching event and writing the description to it is NOT implemented
    here; both ``description`` and ``events_result`` are prepared for that
    final step.

    Args:
        meeting_data: Pipeline output containing an ``insights`` dict with
            the keys ``summary``, ``action_items`` and ``decisions``.
        calendar_id: Calendar to search, passed as ``calendarId``.
        service: Calendar API service object exposing ``events().list()``.

    Returns:
        None.
    """
    insights = meeting_data['insights']

    # `or 'TBD'` also covers an owner that is present but null.
    action_items_text = "\n".join(
        f"• {item['task']} ({item.get('owner') or 'TBD'})"
        for item in insights['action_items']
    )
    decisions_text = "\n".join('• ' + d for d in insights['decisions'])

    description = (
        "MEETING SUMMARY\n"
        f"{insights['summary']}\n"
        "ACTION ITEMS\n"
        f"{action_items_text}\n"
        "DECISIONS MADE\n"
        f"{decisions_text}"
    )

    # Find today's meeting event and update description
    # datetime.utcnow() is deprecated; use an aware UTC timestamp instead.
    now = datetime.datetime.now(datetime.timezone.utc)
    events_result = service.events().list(
        calendarId=calendar_id,
        timeMin=now.strftime("%Y-%m-%dT00:00:00Z"),
        maxResults=10,
        singleEvents=True,
    ).execute()

    # Update the matching event with meeting notes
    # Implementation depends on how you match recordings to calendar events
Cost and Performance
Analysis cost (GPT-4o): ~$0.10-0.30 per meeting
Total per meeting (transcription plus analysis): $0.40-1.40 — vs. $10-20/meeting for human transcription
Conclusion
The meeting intelligence pipeline above transforms recorded meetings into structured, searchable knowledge. The cost is under $1.50 per meeting, so at 10 meetings/week that's roughly $60/month to never lose a meeting insight again. Most teams find the action item extraction alone justifies the cost—no more wondering who owns what after a meeting.
Related Tools
Related Tutorials
Automatically classify, summarize, and draft replies to emails using AI
Build voice AI applications with natural-sounding TTS and custom voice cloning
Transcribe audio files, meetings, and real-time speech with Whisper