Whisper API Tutorial 2026: Transcription, Translation, and Meeting Intelligence

Build automated meeting transcription, speaker diarization, and intelligent meeting summaries using OpenAI Whisper and GPT-4o

返回教程列表
进阶 · 28 分钟

Whisper API Tutorial 2026: Transcription, Translation, and Meeting Intelligence

Build automated meeting transcription, speaker diarization, and intelligent meeting summaries using OpenAI Whisper and GPT-4o

Complete guide to using the OpenAI Whisper API for audio transcription in 2026. Covers real-time transcription, speaker identification, meeting summarization, automated action item extraction, and building a complete meeting intelligence system.

whisper · transcription · meeting · openai · python · audio

Whisper API Tutorial 2026: Transcription, Translation, and Meeting Intelligence

Meeting recordings hold enormous value—but most never get reviewed. This tutorial builds an automated system that transcribes, analyzes, and extracts actionable intelligence from any recorded meeting.

Whisper Capabilities in 2026

  • 99+ languages with high accuracy
  • Technical vocabulary (medical, legal, technical)
  • Multiple audio formats: MP3, MP4, MPEG, MPGA, M4A, WAV, WEBM
  • Word-level timestamps
  • Large-v3 model available via API
Setup

    python
    import json
    from pathlib import Path
    from typing import Optional

    from openai import OpenAI

    # Shared API client used by the transcription, translation and chat helpers in this file.
    # NOTE(review): presumably reads credentials from the environment (e.g. OPENAI_API_KEY) — confirm.
    client = OpenAI()

    Basic Transcription

    python
    def transcribe_audio(file_path: str, language: Optional[str] = None) -> dict:
        """Transcribe an audio file with Whisper and return the text plus timing data.

        Args:
            file_path: Path of the audio file to upload.
            language: Optional language hint for the audio. When it is ``None``
                the parameter is left out of the request so the service
                auto-detects the language.

        Returns:
            A dict with the keys ``"text"``, ``"language"``, ``"duration"``,
            ``"segments"`` and ``"words"``, copied from the API response.
        """
        request_args = {
            "model": "whisper-1",
            "response_format": "verbose_json",  # Includes timestamps and segments
            "timestamp_granularities": ["word", "segment"],
        }
        # Omit `language` entirely instead of sending an explicit None:
        # "not provided" is what requests auto-detection.
        if language is not None:
            request_args["language"] = language

        with open(file_path, "rb") as audio_file:
            transcript = client.audio.transcriptions.create(
                file=audio_file,
                **request_args,
            )

        return {
            "text": transcript.text,
            "language": transcript.language,
            "duration": transcript.duration,
            "segments": transcript.segments,
            "words": transcript.words,
        }

    Basic usage

    # Transcribe one recording and show the detected language, the length
    # and the first 500 characters of the transcript.
    result = transcribe_audio("meeting_recording.mp3")
    print(f"Language: {result['language']}")
    print(f"Duration: {result['duration']:.0f}s")
    print(f"\nTranscript:\n{result['text'][:500]}...")

    Handle Large Files with Chunking

    python
    from pydub import AudioSegment
    import tempfile
    import os

    def transcribe_large_file(file_path: str, chunk_minutes: int = 10) -> str:
        """Handle files larger than 25MB by splitting into chunks.

        The audio is cut into consecutive pieces of ``chunk_minutes`` minutes,
        each piece is exported to a temporary MP3 file and transcribed with
        ``transcribe_audio``, and the partial transcripts are joined with spaces.

        Args:
            file_path: Path of the audio file to split and transcribe.
            chunk_minutes: Length of each chunk in minutes; must be positive.

        Returns:
            The concatenated transcript text of all chunks.

        Raises:
            ValueError: If ``chunk_minutes`` is not positive.
        """
        if chunk_minutes <= 0:
            raise ValueError("chunk_minutes must be a positive number of minutes")

        audio = AudioSegment.from_file(file_path)
        chunk_ms = chunk_minutes * 60 * 1000
        full_transcript = []

        for start_ms in range(0, len(audio), chunk_ms):
            chunk = audio[start_ms:start_ms + chunk_ms]

            # Reserve a temp path and release our handle immediately, so the
            # exporter can reopen the file on platforms that lock open files.
            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
                tmp_path = tmp.name

            try:
                # export() hands back an open file object; close it so the
                # temp file can be reopened for upload and deleted afterwards.
                chunk.export(tmp_path, format="mp3").close()
                result = transcribe_audio(tmp_path)
                full_transcript.append(result["text"])
            finally:
                # Runs even when export or transcription fails, so no temp
                # file is left behind.
                os.unlink(tmp_path)

        return " ".join(full_transcript)

    Translation (Non-English to English)

    python
    def translate_audio(file_path: str) -> str:
        """Return the English text for the speech in an audio file.

        The file is opened in binary mode and sent to the translations
        endpoint with the "whisper-1" model; the response's text is returned.
        """
        with open(file_path, "rb") as source_audio:
            english = client.audio.translations.create(
                file=source_audio,
                model="whisper-1",
            )
        return english.text

    Translates Spanish/French/German/Japanese/etc. to English

    # Translate a non-English recording and print the English text.
    english_text = translate_audio("spanish_meeting.mp3")
    print(english_text)

    Meeting Intelligence System

    The core feature: turn raw transcripts into actionable meeting intelligence.

    python
    import re
    from dataclasses import dataclass
    from typing import List, Optional

    @dataclass
    class MeetingInsights:
        """Structured information extracted from one meeting transcript."""

        # Short executive summary of the meeting.
        summary: str
        # One dict per action item; the analysis prompt asks for the keys
        # "task", "owner", "due_date" and "priority".
        action_items: List[dict]
        # Decisions reached during the meeting.
        decisions_made: List[str]
        # Questions that were raised but left unanswered.
        open_questions: List[str]
        # Names of people mentioned in the transcript.
        attendees_mentioned: List[str]
        # Main topics that were discussed.
        key_topics: List[str]
        # Overall tone; the prompt asks for positive/neutral/negative/mixed.
        sentiment: str
        # Whether a follow-up is needed.
        follow_up_required: bool

    # System prompt sent by analyze_meeting; it describes the JSON fields the
    # model is asked to return for a transcript.
    MEETING_ANALYSIS_PROMPT = """Analyze this meeting transcript and extract structured information.

    Return JSON with: { "summary": "3-5 sentence executive summary", "action_items": [ { "task": "specific action", "owner": "person name or 'unassigned'", "due_date": "mentioned date or null", "priority": "high/medium/low" } ], "decisions_made": ["decision 1", "decision 2"], "open_questions": ["question 1", "question 2"], "attendees_mentioned": ["name1", "name2"], "key_topics": ["topic1", "topic2"], "sentiment": "positive/neutral/negative/mixed", "follow_up_required": true/false }

    Be specific with task descriptions. Capture ALL action items even if no owner is assigned."""

    def analyze_meeting(transcript: str) -> MeetingInsights:
        """Extract structured insights from a meeting transcript with GPT-4o.

        The transcript is sent together with ``MEETING_ANALYSIS_PROMPT`` and a
        JSON-object response format; the returned JSON is mapped onto a
        ``MeetingInsights`` instance.

        Args:
            transcript: Plain or speaker-labelled transcript text.

        Returns:
            A ``MeetingInsights`` whose fields fall back to empty or neutral
            values when the model leaves a key out or sets it to null.
        """
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": MEETING_ANALYSIS_PROMPT},
                {"role": "user", "content": f"Meeting transcript:\n\n{transcript}"},
            ],
            response_format={"type": "json_object"},
        )

        # Treat a missing message body as an empty object so json.loads
        # does not fail on None.
        data = json.loads(response.choices[0].message.content or "{}")

        # `or` rather than a .get default: a key that is present but null
        # would otherwise reach callers as None and break len() or iteration.
        return MeetingInsights(
            summary=data.get("summary") or "",
            action_items=data.get("action_items") or [],
            decisions_made=data.get("decisions_made") or [],
            open_questions=data.get("open_questions") or [],
            attendees_mentioned=data.get("attendees_mentioned") or [],
            key_topics=data.get("key_topics") or [],
            sentiment=data.get("sentiment") or "neutral",
            follow_up_required=bool(data.get("follow_up_required", False)),
        )

    Speaker Diarization

    python
    def identify_speakers(
        transcript: str,
        known_attendees: Optional[List[str]] = None,
        max_chars: Optional[int] = 8000,
    ) -> str:
        """Identify and label different speakers in transcript.

        Args:
            transcript: Raw transcript text without speaker labels.
            known_attendees: Optional attendee names, passed to the model as
                context so it can use real names instead of generic labels.
            max_chars: Maximum number of transcript characters sent to the
                model. Text beyond this limit is NOT labelled or returned.
                Pass ``None`` to send the whole transcript.

        Returns:
            The model's speaker-labelled rendering of the (possibly truncated)
            transcript.
        """
        attendee_context = ""
        if known_attendees:
            attendee_context = f"Known attendees: {', '.join(known_attendees)}"
        
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": f"""Identify different speakers in this meeting transcript.
                {attendee_context}
                
                Format as:
                [SPEAKER A]: text
                [SPEAKER B]: text
                
                If you can identify who's speaking from context (introductions, names mentioned),
                use their actual name. Otherwise use Speaker A, B, C etc.
                
                Transcript:
                {transcript[:max_chars]}"""  # Truncate for context window
            }]
        )
        
        return response.choices[0].message.content
    

    Complete Pipeline

    python
    def process_meeting(audio_path: str, attendees: Optional[List[str]] = None) -> dict:
        """Run the full pipeline on one recording and save the result as JSON.

        Steps: transcribe the audio, label the speakers, extract insights,
        assemble a report dict and write it to ``<audio stem>_intelligence.json``
        in the current working directory. Progress is printed along the way.

        Args:
            audio_path: Path of the meeting recording.
            attendees: Optional attendee names used as a hint for speaker labelling.

        Returns:
            The report dict that was written to disk.
        """
        print(f"Processing: {audio_path}")
        
        # Step 1: Transcribe
        print("  Transcribing audio...")
        result = transcribe_audio(audio_path)
        transcript = result["text"]
        duration_minutes = result["duration"] / 60
        
        # Step 2: Identify speakers
        print("  Identifying speakers...")
        labeled_transcript = identify_speakers(transcript, attendees)
        
        # Step 3: Extract insights
        print("  Extracting meeting insights...")
        insights = analyze_meeting(labeled_transcript)
        
        # Step 4: Format output
        output = {
            "file": audio_path,
            "duration_minutes": round(duration_minutes, 1),
            "language": result["language"],
            "transcript": labeled_transcript,
            "insights": {
                "summary": insights.summary,
                "action_items": insights.action_items,
                "decisions": insights.decisions_made,
                "open_questions": insights.open_questions,
                "key_topics": insights.key_topics,
                "sentiment": insights.sentiment
            }
        }
        
        # Step 5: Save results
        output_path = Path(audio_path).stem + "_intelligence.json"
        # Explicit UTF-8 plus ensure_ascii=False keeps non-English transcripts
        # readable in the file and independent of the platform's default encoding.
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(output, f, indent=2, ensure_ascii=False)
        
        print(f"  Complete! Output saved to: {output_path}")
        print(f"  Found {len(insights.action_items)} action items")
        
        return output

    Usage

    # Run the whole pipeline on one recording, naming the expected attendees.
    meeting_data = process_meeting(
        "q4_planning_meeting.mp3",
        attendees=["Sarah (CEO)", "Marcus (CTO)", "Priya (VP Sales)"],
    )

    Print summary

    print("\n=== MEETING INTELLIGENCE REPORT ===")
    print(f"Duration: {meeting_data['duration_minutes']} minutes")
    print(f"\nSummary:\n{meeting_data['insights']['summary']}")
    print("\nAction Items:")
    for item in meeting_data['insights']['action_items']:
        # The analysis prompt asks for null when no date was mentioned, so the
        # key can be present with a None value; `or` covers that case as well
        # as a missing key.
        owner = item.get('owner') or 'Unassigned'
        due = item.get('due_date') or 'No date'
        priority = str(item.get('priority') or 'unspecified').upper()
        print(f" [{priority}] {item['task']} → {owner} ({due})")

    Integrate with Calendar

    python
    import datetime
    from googleapiclient.discovery import build

    def post_to_google_calendar(meeting_data: dict, calendar_id: str, service):
        """Add meeting notes to Google Calendar event.

        Builds a plain-text description (summary, action items, decisions) from
        ``meeting_data`` and lists up to 10 events starting from today 00:00 UTC.
        Matching the recording to one of those events and writing the
        description back is left to the caller's integration.

        Args:
            meeting_data: Report dict with an ``"insights"`` entry holding
                ``"summary"``, ``"action_items"`` and ``"decisions"``.
            calendar_id: Identifier of the calendar to query.
            service: Calendar API service object exposing ``events().list(...)``.
        """
        insights = meeting_data['insights']

        action_items_text = "\n".join(
            f"• {item['task']} ({item.get('owner', 'TBD')})"
            for item in insights['action_items']
        )
        decisions_text = "\n".join('• ' + d for d in insights['decisions'])

        # Each heading sits on its own line, followed by its content.
        description = (
            "MEETING SUMMARY\n"
            f"{insights['summary']}\n\n"
            "ACTION ITEMS\n"
            f"{action_items_text}\n\n"
            "DECISIONS MADE\n"
            f"{decisions_text}"
        )

        # Find today's meeting event and update description
        # Timezone-aware "now" in UTC; datetime.utcnow() is deprecated.
        now = datetime.datetime.now(datetime.timezone.utc)
        events_result = service.events().list(
            calendarId=calendar_id,
            timeMin=now.strftime("%Y-%m-%dT00:00:00Z"),
            maxResults=10,
            singleEvents=True
        ).execute()

        # Update the matching event with meeting notes
        # Implementation depends on how you match recordings to calendar events
        # TODO: pick the event from `events_result` and store `description` on it.

    Cost and Performance

    | File Length | Transcription Time | API Cost |
    |---|---|---|
    | 30 min meeting | ~45 seconds | $0.27 |
    | 60 min meeting | ~90 seconds | $0.54 |
    | 2 hour meeting | ~3 minutes | $1.08 |

    Analysis cost (GPT-4o): ~$0.10-0.30 per meeting

    Total per meeting: $0.40-1.40 — vs. $10-20/meeting for human transcription

    Conclusion

    The meeting intelligence pipeline above transforms recorded meetings into structured, searchable knowledge. The cost is under $1.50 per meeting. At 10 meetings/week, that's $60/month to never lose a meeting insight again. Most teams find the action item extraction alone justifies the cost—no more wondering who owns what after a meeting.

    相关工具

    openai · whisper · python