MQL-Development
/
meetings_app


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316
							import Foundation
import AVFoundation
import Speech

/// One attributed span of transcript text.
///
/// Both offsets are measured in seconds from the start of the recording.
/// Field order and names are significant: they define the memberwise
/// initializer and the synthesized `Codable` keys.
struct TranscriptSegment: Hashable, Codable {
    /// Label of the channel this text is attributed to.
    let speaker: String

    /// Where the span begins, in seconds from the start of the recording.
    let startOffset: TimeInterval

    /// Where the span ends, in seconds from the start of the recording.
    let endOffset: TimeInterval

    /// The transcribed text for this span.
    let text: String
}

/// Speaker labels attached to segments when per-channel transcripts are merged.
///
/// The raw value is the text stored in `TranscriptSegment.speaker`.
enum TranscriptSpeaker: String {
    /// Label for the microphone channel.
    case microphone = "You"

    /// Label for the system-audio channel.
    case system = "Meeting"
}

/// Immutable progress snapshot handed to the UI while a transcription runs.
struct MeetingTranscriptionProgress: Sendable {
    /// Number of chunks planned across all channels.
    let totalChunks: Int

    /// Number of chunks that have finished so far.
    let completedChunks: Int
}

/// Failures surfaced by the meeting transcription service, each carrying a
/// user-presentable description via `LocalizedError`.
enum MeetingTranscriptionError: Error, LocalizedError {
    /// Speech recognition permission was not granted.
    case authorizationDenied
    /// Speech recognition is restricted on this device.
    case authorizationRestricted
    /// No speech recognizer could be used for the given locale description.
    case recognizerUnavailable(locale: String)
    /// There was no audio to run recognition on.
    case noAudioToTranscribe

    /// Human-readable message for the error; never `nil` for the cases above.
    var errorDescription: String? {
        let message: String
        switch self {
        case .noAudioToTranscribe:
            message = "No audio was available to transcribe."
        case let .recognizerUnavailable(locale):
            message = "Speech recognizer is unavailable for \(locale)."
        case .authorizationRestricted:
            message = "Speech recognition is restricted on this Mac."
        case .authorizationDenied:
            message = "Speech recognition permission denied. Enable it in System Settings and try again."
        }
        return message
    }
}

/// Transcribes meeting audio by running Apple Speech on per-channel files
/// in fixed-size chunks, falling back across a list of locales per chunk.
///
/// The microphone and system channels are transcribed concurrently; chunks
/// within a channel are processed sequentially. Each chunk yields at most one
/// `TranscriptSegment` whose offsets are the chunk's own time range.
final class MeetingTranscriptionService {
    /// Chunk length used when the caller passes a non-positive or NaN `chunkSeconds`.
    private static let fallbackChunkSeconds: TimeInterval = 30

    /// Locale used when the caller passes an empty `locales` list.
    private static let fallbackLocale = Locale(identifier: "en-US")

    /// Error domain of the recognition errors that are treated as "no speech".
    private static let speechErrorDomain = "kAFAssistantErrorDomain"

    /// Error codes in `speechErrorDomain` that are mapped to an empty transcript
    /// instead of a failure.
    private static let noSpeechErrorCodes: Set<Int> = [203, 1110]

    /// Frame range and matching time range of one chunk of an audio file.
    private struct ChunkPlan {
        let index: Int
        let startFrame: AVAudioFramePosition
        let frameCount: AVAudioFrameCount
        let startOffset: TimeInterval
        let endOffset: TimeInterval
    }

    /// An audio file together with its non-empty list of planned chunks.
    private struct ChannelPlan {
        let url: URL
        let chunks: [ChunkPlan]
    }

    /// Shared progress counter used across concurrent channels.
    private actor ProgressCounter {
        private let total: Int
        private var completed: Int = 0
        private let onProgress: (@Sendable (MeetingTranscriptionProgress) -> Void)?

        init(total: Int, onProgress: (@Sendable (MeetingTranscriptionProgress) -> Void)?) {
            self.total = total
            self.onProgress = onProgress
        }

        /// Reports the starting state (zero completed chunks).
        func emitInitial() {
            onProgress?(MeetingTranscriptionProgress(totalChunks: total, completedChunks: 0))
        }

        /// Records one more finished chunk and reports the new state.
        func increment() {
            completed += 1
            onProgress?(MeetingTranscriptionProgress(totalChunks: total, completedChunks: completed))
        }
    }

    /// Ensures speech recognition is authorized, prompting the user if the
    /// status has not been determined yet.
    ///
    /// - Throws: `MeetingTranscriptionError.authorizationRestricted` when the
    ///   (possibly just-requested) status is `.restricted`, and
    ///   `MeetingTranscriptionError.authorizationDenied` for every other
    ///   non-authorized status.
    func requestAuthorization() async throws {
        var status = SFSpeechRecognizer.authorizationStatus()
        if status == .notDetermined {
            status = await withCheckedContinuation { continuation in
                SFSpeechRecognizer.requestAuthorization { continuation.resume(returning: $0) }
            }
        }

        // Classify the final status in one place so the outcome of the prompt
        // is reported as precisely as a pre-existing status.
        switch status {
        case .authorized:
            return
        case .restricted:
            throw MeetingTranscriptionError.authorizationRestricted
        case .denied, .notDetermined:
            throw MeetingTranscriptionError.authorizationDenied
        @unknown default:
            throw MeetingTranscriptionError.authorizationDenied
        }
    }

    /// Transcribes the mic and system channel audio (either may be nil) and
    /// returns a flat, time-ordered list of transcript segments labeled with
    /// the speaker channel.
    ///
    /// - Parameters:
    ///   - micURL: Microphone recording; ignored when nil or missing on disk.
    ///   - systemURL: System-audio recording; ignored when nil or missing on disk.
    ///   - chunkSeconds: Length of each chunk. Non-positive or NaN values fall
    ///     back to 30 seconds.
    ///   - overlapSeconds: Overlap between consecutive chunks. Negative or NaN
    ///     values mean no overlap; an overlap that is not shorter than the chunk
    ///     is ignored.
    ///   - locales: Locales to try, in order, for every chunk. An empty list
    ///     falls back to `en-US`.
    ///   - onProgress: Called once with zero completed chunks and then once per
    ///     finished chunk.
    /// - Returns: Non-empty segments sorted by start offset. Segments with equal
    ///   start offsets keep microphone-before-system order.
    /// - Throws: An authorization error from `requestAuthorization()`,
    ///   `MeetingTranscriptionError.noAudioToTranscribe` when neither channel
    ///   has audio, `MeetingTranscriptionError.recognizerUnavailable` when no
    ///   requested locale has an available recognizer, `CancellationError` on
    ///   cancellation, or any error raised while reading the audio files.
    func transcribeMeeting(
        micURL: URL?,
        systemURL: URL?,
        chunkSeconds: TimeInterval = 30,
        overlapSeconds: TimeInterval = 0,
        locales: [Locale] = [Locale(identifier: "en-US")],
        onProgress: (@Sendable (MeetingTranscriptionProgress) -> Void)? = nil
    ) async throws -> [TranscriptSegment] {
        try await requestAuthorization()

        let micPlan = try makeChannelPlan(for: micURL, chunkSeconds: chunkSeconds, overlapSeconds: overlapSeconds)
        let systemPlan = try makeChannelPlan(for: systemURL, chunkSeconds: chunkSeconds, overlapSeconds: overlapSeconds)

        let totalChunks = (micPlan?.chunks.count ?? 0) + (systemPlan?.chunks.count ?? 0)
        guard totalChunks > 0 else {
            throw MeetingTranscriptionError.noAudioToTranscribe
        }

        let effectiveLocales = locales.isEmpty ? [Self.fallbackLocale] : locales

        // Fail loudly when nothing can be recognized at all; otherwise every
        // chunk would silently come back empty.
        let hasUsableRecognizer = effectiveLocales.contains { locale in
            SFSpeechRecognizer(locale: locale)?.isAvailable == true
        }
        guard hasUsableRecognizer else {
            let identifiers = effectiveLocales.map { $0.identifier }.joined(separator: ", ")
            throw MeetingTranscriptionError.recognizerUnavailable(locale: identifiers)
        }

        let counter = ProgressCounter(total: totalChunks, onProgress: onProgress)
        await counter.emitInitial()

        async let micSegments: [TranscriptSegment] = {
            guard let plan = micPlan else { return [] }
            return try await self.transcribeChannel(
                url: plan.url,
                chunks: plan.chunks,
                speaker: .microphone,
                locales: effectiveLocales,
                counter: counter
            )
        }()
        async let systemSegments: [TranscriptSegment] = {
            guard let plan = systemPlan else { return [] }
            return try await self.transcribeChannel(
                url: plan.url,
                chunks: plan.chunks,
                speaker: .system,
                locales: effectiveLocales,
                counter: counter
            )
        }()

        let combined = try await micSegments + systemSegments

        // Both channels use the same chunk grid, so equal start offsets are the
        // norm. Break ties by original position instead of relying on the sort
        // being stable.
        return combined.enumerated()
            .filter { $0.element.text.isEmpty == false }
            .sorted { lhs, rhs in
                if lhs.element.startOffset != rhs.element.startOffset {
                    return lhs.element.startOffset < rhs.element.startOffset
                }
                return lhs.offset < rhs.offset
            }
            .map { $0.element }
    }

    // MARK: - Chunk planning

    /// Builds the chunk plan for one channel, or returns nil when the URL is
    /// nil, the file does not exist, or the file yields no chunks.
    private func makeChannelPlan(
        for url: URL?,
        chunkSeconds: TimeInterval,
        overlapSeconds: TimeInterval
    ) throws -> ChannelPlan? {
        guard let url, FileManager.default.fileExists(atPath: url.path) else { return nil }
        let chunks = try planChunks(for: url, chunkSeconds: chunkSeconds, overlapSeconds: overlapSeconds)
        return chunks.isEmpty ? nil : ChannelPlan(url: url, chunks: chunks)
    }

    /// Splits the audio file into consecutive (optionally overlapping) chunks.
    ///
    /// Inputs are sanitized so that degenerate values cannot trap in the
    /// integer conversions or explode into one chunk per audio frame.
    private func planChunks(for url: URL, chunkSeconds: TimeInterval, overlapSeconds: TimeInterval) throws -> [ChunkPlan] {
        let audioFile = try AVAudioFile(forReading: url)
        let sampleRate = audioFile.processingFormat.sampleRate
        let totalFrames = audioFile.length
        guard sampleRate > 0, totalFrames > 0 else { return [] }

        // A chunk must fit in AVAudioFrameCount (UInt32); clamping here also
        // makes infinite inputs safe to convert to an integer.
        let maxChunkFrames = Double(AVAudioFrameCount.max)

        // `chunkSeconds > 0` is false for NaN as well as for zero/negatives.
        let requestedChunkSeconds = chunkSeconds > 0 ? chunkSeconds : Self.fallbackChunkSeconds
        let chunkFrames = AVAudioFramePosition(min(max(1, requestedChunkSeconds * sampleRate), maxChunkFrames))

        let requestedOverlapFrames = overlapSeconds > 0 ? overlapSeconds * sampleRate : 0
        let overlapFrames = AVAudioFramePosition(min(requestedOverlapFrames, maxChunkFrames))

        // An overlap that swallows the whole chunk would advance one frame at a
        // time; treat it as "no overlap" instead.
        let step = overlapFrames < chunkFrames ? chunkFrames - overlapFrames : chunkFrames

        var plans: [ChunkPlan] = []
        var start: AVAudioFramePosition = 0
        while start < totalFrames {
            let end = min(start + chunkFrames, totalFrames)
            plans.append(ChunkPlan(
                index: plans.count,
                startFrame: start,
                frameCount: AVAudioFrameCount(end - start),
                startOffset: Double(start) / sampleRate,
                endOffset: Double(end) / sampleRate
            ))
            // Stop once a chunk reaches the end so overlap never produces a
            // trailing chunk that is fully contained in the previous one.
            if end >= totalFrames { break }
            start += step
        }
        return plans
    }

    // MARK: - Per-channel transcription

    /// Transcribes every chunk of one channel in order and returns the
    /// non-empty results as segments labeled with `speaker`.
    ///
    /// Progress is incremented for every chunk, including chunks that produce
    /// no text. Cancellation is checked before each chunk.
    private func transcribeChannel(
        url: URL,
        chunks: [ChunkPlan],
        speaker: TranscriptSpeaker,
        locales: [Locale],
        counter: ProgressCounter
    ) async throws -> [TranscriptSegment] {
        // Open the file once per channel rather than once per chunk.
        let audioFile = try AVAudioFile(forReading: url)

        var segments: [TranscriptSegment] = []
        segments.reserveCapacity(chunks.count)

        for plan in chunks {
            try Task.checkCancellation()
            let buffer = try readChunkBuffer(from: audioFile, startFrame: plan.startFrame, frameCount: plan.frameCount)
            let text = await transcribeBufferWithLocaleFallback(buffer: buffer, locales: locales)
            await counter.increment()
            let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
            if trimmed.isEmpty { continue }
            segments.append(TranscriptSegment(
                speaker: speaker.rawValue,
                startOffset: plan.startOffset,
                endOffset: plan.endOffset,
                text: trimmed
            ))
        }
        return segments
    }

    /// Reads `frameCount` frames starting at `startFrame` from an already-open
    /// audio file into a freshly allocated PCM buffer.
    ///
    /// - Throws: An `NSError` when the buffer cannot be allocated, or the error
    ///   raised by the read itself.
    private func readChunkBuffer(
        from audioFile: AVAudioFile,
        startFrame: AVAudioFramePosition,
        frameCount: AVAudioFrameCount
    ) throws -> AVAudioPCMBuffer {
        audioFile.framePosition = startFrame
        guard let buffer = AVAudioPCMBuffer(pcmFormat: audioFile.processingFormat, frameCapacity: frameCount) else {
            throw NSError(domain: "MeetingTranscriptionService", code: 1, userInfo: [NSLocalizedDescriptionKey: "Unable to allocate audio buffer."])
        }
        try audioFile.read(into: buffer, frameCount: frameCount)
        return buffer
    }

    /// Tries each locale in order and returns the first non-empty, trimmed
    /// transcript, or an empty string when no locale produced text.
    ///
    /// A locale whose recognizer is missing or unavailable is skipped. A failed
    /// attempt is retried once after a short pause before moving on. Once the
    /// surrounding task is cancelled no further attempts are made.
    private func transcribeBufferWithLocaleFallback(buffer: AVAudioPCMBuffer, locales: [Locale]) async -> String {
        for locale in locales {
            if Task.isCancelled { return "" }
            guard let recognizer = SFSpeechRecognizer(locale: locale), recognizer.isAvailable else { continue }
            do {
                let text = try await transcribeBuffer(buffer: buffer, recognizer: recognizer)
                let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
                if trimmed.isEmpty == false { return trimmed }
            } catch {
                // One transient retry before moving on to the next locale.
                do {
                    try await Task.sleep(nanoseconds: 500_000_000)
                } catch {
                    // The sleep only throws on cancellation: stop instead of
                    // starting more recognition work.
                    return ""
                }
                if let text = try? await transcribeBuffer(buffer: buffer, recognizer: recognizer) {
                    let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
                    if trimmed.isEmpty == false { return trimmed }
                }
            }
        }
        return ""
    }

    /// Runs a single recognition request over `buffer` and returns the final
    /// transcription.
    ///
    /// - Returns: The formatted best transcription, or an empty string when the
    ///   recognizer reports one of the "no speech" error codes.
    /// - Throws: Any other error reported by the recognition task.
    private func transcribeBuffer(buffer: AVAudioPCMBuffer, recognizer: SFSpeechRecognizer) async throws -> String {
        let request = SFSpeechAudioBufferRecognitionRequest()
        request.shouldReportPartialResults = false
        if #available(macOS 13.0, *) {
            request.addsPunctuation = true
        }

        return try await withCheckedThrowingContinuation { continuation in
            // The result handler can be invoked more than once, but a checked
            // continuation must be resumed exactly once.
            var hasResumed = false
            let lock = NSLock()
            func resumeOnce(with result: Result<String, Error>) {
                lock.lock()
                defer { lock.unlock() }
                if hasResumed { return }
                hasResumed = true
                switch result {
                case .success(let text):
                    continuation.resume(returning: text)
                case .failure(let error):
                    continuation.resume(throwing: error)
                }
            }

            let task = recognizer.recognitionTask(with: request) { result, error in
                if let error {
                    let nsError = error as NSError
                    // "No speech detected" is a normal empty-chunk outcome, not a failure.
                    let isNoSpeech = nsError.domain == MeetingTranscriptionService.speechErrorDomain
                        && MeetingTranscriptionService.noSpeechErrorCodes.contains(nsError.code)
                    resumeOnce(with: isNoSpeech ? .success("") : .failure(error))
                    return
                }
                if let result, result.isFinal {
                    resumeOnce(with: .success(result.bestTranscription.formattedString))
                }
            }

            // Keep the task referenced until all audio has been handed over.
            withExtendedLifetime(task) {
                request.append(buffer)
                request.endAudio()
            }
        }
    }
}

extension Array where Element == TranscriptSegment {
    /// Produces one line per segment, in array order, joined by newlines.
    ///
    /// Each line looks like `[00:12] You: Hello everyone.`; the timestamp is
    /// the segment's start offset rounded down to whole seconds, shown as
    /// `MM:SS`, or `HH:MM:SS` once it reaches one hour.
    func renderedTimelineText() -> String {
        /// Formats a second count as a zero-padded clock string.
        func timestamp(for seconds: TimeInterval) -> String {
            let wholeSeconds = Int(seconds.rounded(.down))
            let (hours, withinHour) = wholeSeconds.quotientAndRemainder(dividingBy: 3600)
            let minutes = withinHour / 60
            let secs = wholeSeconds % 60
            guard hours > 0 else {
                return String(format: "%02d:%02d", minutes, secs)
            }
            return String(format: "%02d:%02d:%02d", hours, minutes, secs)
        }

        var lines: [String] = []
        lines.reserveCapacity(count)
        for segment in self {
            lines.append("[\(timestamp(for: segment.startOffset))] \(segment.speaker): \(segment.text)")
        }
        return lines.joined(separator: "\n")
    }
}