| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316 |
- import Foundation
- import AVFoundation
- import Speech
/// One attributed span of recognized speech.
///
/// Both offsets are expressed in seconds measured from the start of the
/// recording. The type is `Codable` and `Hashable`, so it can be serialized
/// and used as a set member or dictionary key.
struct TranscriptSegment: Codable, Hashable {
    /// Label of the channel the text is attributed to.
    let speaker: String

    /// Where the segment begins, in seconds from the start of the recording.
    let startOffset: TimeInterval

    /// Where the segment ends, in seconds from the start of the recording.
    let endOffset: TimeInterval

    /// The transcript text for this span.
    let text: String
}
/// Channel labels applied to segments when per-channel transcripts are merged.
///
/// The raw value is the human-readable label stored in a segment's `speaker`.
enum TranscriptSpeaker: String {
    /// Audio captured from the microphone channel.
    case microphone = "You"

    /// Audio captured from the system channel.
    case system = "Meeting"
}
/// Immutable snapshot of transcription progress, intended for UI status updates.
struct MeetingTranscriptionProgress: Sendable {
    /// Number of chunks planned across all channels.
    let totalChunks: Int

    /// Number of chunks processed so far.
    let completedChunks: Int
}
/// Failures surfaced by meeting transcription, each with a user-facing message.
enum MeetingTranscriptionError: Error, LocalizedError {
    /// Speech recognition permission was not granted.
    case authorizationDenied
    /// Speech recognition is restricted on this machine.
    case authorizationRestricted
    /// No recognizer is available for the given locale description.
    case recognizerUnavailable(locale: String)
    /// There was nothing to transcribe.
    case noAudioToTranscribe

    /// Localized-description text shown for each failure.
    var errorDescription: String? {
        let message: String
        switch self {
        case .noAudioToTranscribe:
            message = "No audio was available to transcribe."
        case .recognizerUnavailable(let locale):
            message = "Speech recognizer is unavailable for \(locale)."
        case .authorizationRestricted:
            message = "Speech recognition is restricted on this Mac."
        case .authorizationDenied:
            message = "Speech recognition permission denied. Enable it in System Settings and try again."
        }
        return message
    }
}
/// Transcribes meeting audio by running Apple Speech on per-channel files
/// in fixed-size chunks, falling back across a list of locales per chunk.
final class MeetingTranscriptionService {
    /// Chunk length used when the caller passes a non-finite or non-positive
    /// `chunkSeconds` (matches the default of `transcribeMeeting`).
    private static let fallbackChunkSeconds: TimeInterval = 30

    /// One planned slice of an audio file.
    private struct ChunkPlan {
        /// Zero-based position of the chunk within its channel.
        let index: Int
        /// First frame of the chunk in the file.
        let startFrame: AVAudioFramePosition
        /// Number of frames to read for the chunk.
        let frameCount: AVAudioFrameCount
        /// Chunk start, in seconds from the start of the file.
        let startOffset: TimeInterval
        /// Chunk end, in seconds from the start of the file.
        let endOffset: TimeInterval
    }

    /// A channel's audio file together with its non-empty chunk plan.
    private struct ChannelPlan {
        let url: URL
        let chunks: [ChunkPlan]
    }

    /// Shared progress counter used across concurrent channels.
    private actor ProgressCounter {
        private let total: Int
        private var completed: Int = 0
        private let onProgress: (@Sendable (MeetingTranscriptionProgress) -> Void)?

        init(total: Int, onProgress: (@Sendable (MeetingTranscriptionProgress) -> Void)?) {
            self.total = total
            self.onProgress = onProgress
        }

        /// Reports the starting state (zero completed chunks).
        func emitInitial() {
            onProgress?(MeetingTranscriptionProgress(totalChunks: total, completedChunks: 0))
        }

        /// Records one more completed chunk and reports the new state.
        func increment() {
            completed += 1
            onProgress?(MeetingTranscriptionProgress(totalChunks: total, completedChunks: completed))
        }
    }

    /// Bridges one recognition task to a checked continuation.
    ///
    /// Guarantees the continuation is resumed exactly once, whether the
    /// recognizer's callback or a task cancellation gets there first, and lets
    /// a cancellation stop the underlying recognition task.
    private final class RecognitionSession: @unchecked Sendable {
        // All mutable state below is guarded by `lock`.
        private let lock = NSLock()
        private var continuation: CheckedContinuation<String, Error>?
        private var task: SFSpeechRecognitionTask?
        private var isFinished = false

        /// Registers the continuation. Returns `false` (after resuming the
        /// continuation with `CancellationError`) when the session was
        /// cancelled before it could start.
        func start(_ continuation: CheckedContinuation<String, Error>) -> Bool {
            lock.lock()
            if isFinished {
                lock.unlock()
                continuation.resume(throwing: CancellationError())
                return false
            }
            self.continuation = continuation
            lock.unlock()
            return true
        }

        /// Remembers the running task so `cancel()` can stop it. Returns
        /// `false` (after cancelling `task`) when the session already finished.
        func attach(_ task: SFSpeechRecognitionTask) -> Bool {
            lock.lock()
            let alreadyFinished = isFinished
            if alreadyFinished == false {
                self.task = task
            }
            lock.unlock()
            if alreadyFinished {
                task.cancel()
                return false
            }
            return true
        }

        /// Resumes the continuation with `result`; later calls are ignored.
        func finish(with result: Result<String, Error>) {
            lock.lock()
            let pending = isFinished ? nil : continuation
            isFinished = true
            continuation = nil
            task = nil
            lock.unlock()
            // Resume outside the lock so the awaiting code never runs under it.
            pending?.resume(with: result)
        }

        /// Stops the recognition task (if any) and fails the continuation
        /// with `CancellationError`.
        func cancel() {
            lock.lock()
            let running = task
            lock.unlock()
            running?.cancel()
            finish(with: .failure(CancellationError()))
        }
    }

    /// Ensures speech recognition is authorized, prompting the user when the
    /// status has not been determined yet.
    ///
    /// - Throws: `MeetingTranscriptionError.authorizationRestricted` when the
    ///   status is restricted (before or after the prompt), and
    ///   `MeetingTranscriptionError.authorizationDenied` for every other
    ///   non-authorized status.
    func requestAuthorization() async throws {
        var status = SFSpeechRecognizer.authorizationStatus()
        if status == .notDetermined {
            status = await withCheckedContinuation { continuation in
                SFSpeechRecognizer.requestAuthorization { continuation.resume(returning: $0) }
            }
        }
        switch status {
        case .authorized:
            return
        case .restricted:
            throw MeetingTranscriptionError.authorizationRestricted
        case .denied, .notDetermined:
            throw MeetingTranscriptionError.authorizationDenied
        @unknown default:
            throw MeetingTranscriptionError.authorizationDenied
        }
    }

    /// Transcribes the mic and system channel audio (either may be nil) and
    /// returns a flat, time-ordered list of transcript segments labeled with
    /// the speaker channel.
    ///
    /// The two channels are transcribed concurrently. Segments that start at
    /// the same offset keep a fixed order: microphone before system.
    ///
    /// - Parameters:
    ///   - micURL: Audio file for the microphone channel; skipped when nil,
    ///     missing on disk, or empty.
    ///   - systemURL: Audio file for the system channel; skipped likewise.
    ///   - chunkSeconds: Length of each chunk. Non-finite or non-positive
    ///     values fall back to 30 seconds.
    ///   - overlapSeconds: Overlap between consecutive chunks. Ignored when it
    ///     is not smaller than the chunk length.
    ///   - locales: Locales tried in order for each chunk; an empty list is
    ///     treated as `en-US`.
    ///   - onProgress: Called with the initial state and after every
    ///     completed chunk.
    /// - Returns: Non-empty segments sorted by `startOffset`.
    /// - Throws: Authorization errors from `requestAuthorization()`,
    ///   `MeetingTranscriptionError.noAudioToTranscribe` when neither channel
    ///   yields a chunk, audio file read errors, and `CancellationError`.
    func transcribeMeeting(
        micURL: URL?,
        systemURL: URL?,
        chunkSeconds: TimeInterval = 30,
        overlapSeconds: TimeInterval = 0,
        locales: [Locale] = [Locale(identifier: "en-US")],
        onProgress: (@Sendable (MeetingTranscriptionProgress) -> Void)? = nil
    ) async throws -> [TranscriptSegment] {
        try await requestAuthorization()

        let micPlan = try makeChannelPlan(for: micURL, chunkSeconds: chunkSeconds, overlapSeconds: overlapSeconds)
        let systemPlan = try makeChannelPlan(for: systemURL, chunkSeconds: chunkSeconds, overlapSeconds: overlapSeconds)

        let totalChunks = (micPlan?.chunks.count ?? 0) + (systemPlan?.chunks.count ?? 0)
        guard totalChunks > 0 else {
            throw MeetingTranscriptionError.noAudioToTranscribe
        }

        let counter = ProgressCounter(total: totalChunks, onProgress: onProgress)
        await counter.emitInitial()

        let effectiveLocales = locales.isEmpty ? [Locale(identifier: "en-US")] : locales

        async let micSegments: [TranscriptSegment] = {
            guard let plan = micPlan else { return [] }
            return try await self.transcribeChannel(
                url: plan.url,
                chunks: plan.chunks,
                speaker: .microphone,
                locales: effectiveLocales,
                counter: counter
            )
        }()
        async let systemSegments: [TranscriptSegment] = {
            guard let plan = systemPlan else { return [] }
            return try await self.transcribeChannel(
                url: plan.url,
                chunks: plan.chunks,
                speaker: .system,
                locales: effectiveLocales,
                counter: counter
            )
        }()

        let combined = try await micSegments + systemSegments

        // Both channels are chunked on the same grid, so equal start offsets
        // are the norm. Break ties by position in `combined` (mic first) so
        // the merged order never depends on the sort being stable.
        return combined.enumerated()
            .filter { $0.element.text.isEmpty == false }
            .sorted { lhs, rhs in
                if lhs.element.startOffset != rhs.element.startOffset {
                    return lhs.element.startOffset < rhs.element.startOffset
                }
                return lhs.offset < rhs.offset
            }
            .map { $0.element }
    }

    // MARK: - Chunk planning

    /// Builds the chunk plan for one channel, or returns nil when the channel
    /// has no URL, the file does not exist, or the file yields no chunks.
    private func makeChannelPlan(
        for url: URL?,
        chunkSeconds: TimeInterval,
        overlapSeconds: TimeInterval
    ) throws -> ChannelPlan? {
        guard let url, FileManager.default.fileExists(atPath: url.path) else { return nil }
        let chunks = try planChunks(for: url, chunkSeconds: chunkSeconds, overlapSeconds: overlapSeconds)
        return chunks.isEmpty ? nil : ChannelPlan(url: url, chunks: chunks)
    }

    /// Splits the audio file at `url` into consecutive chunks of
    /// `chunkSeconds`, each starting `chunkSeconds - overlapSeconds` after the
    /// previous one. The final chunk is shortened to end at the file's end.
    ///
    /// Returns an empty plan when the file reports a non-positive sample rate
    /// or no frames.
    private func planChunks(for url: URL, chunkSeconds: TimeInterval, overlapSeconds: TimeInterval) throws -> [ChunkPlan] {
        let audioFile = try AVAudioFile(forReading: url)
        let sampleRate = audioFile.processingFormat.sampleRate
        guard sampleRate > 0 else { return [] }
        let totalFrames = audioFile.length
        guard totalFrames > 0 else { return [] }

        // A zero, negative, or NaN duration would otherwise degrade to
        // one-frame chunks (millions of them); use the default length instead.
        let usableChunkSeconds = (chunkSeconds.isFinite && chunkSeconds > 0)
            ? chunkSeconds
            : Self.fallbackChunkSeconds

        // Cap at what a single buffer can hold so the `AVAudioFrameCount`
        // (UInt32) conversion below cannot trap.
        let chunkSamples = AVAudioFramePosition(
            min(max(1, usableChunkSeconds * sampleRate), Double(AVAudioFrameCount.max))
        )

        let requestedOverlap = overlapSeconds * sampleRate
        let overlapSamples: AVAudioFramePosition = (requestedOverlap.isFinite && requestedOverlap > 0)
            ? AVAudioFramePosition(min(requestedOverlap, Double(chunkSamples)))
            : 0

        // An overlap that swallows the whole chunk would leave a one-frame
        // step; treat it as "no overlap" instead.
        let step = overlapSamples < chunkSamples ? chunkSamples - overlapSamples : chunkSamples

        var plans: [ChunkPlan] = []
        var start: AVAudioFramePosition = 0
        var index = 0
        while start < totalFrames {
            let end = min(start + chunkSamples, totalFrames)
            plans.append(ChunkPlan(
                index: index,
                startFrame: start,
                frameCount: AVAudioFrameCount(end - start),
                startOffset: Double(start) / sampleRate,
                endOffset: Double(end) / sampleRate
            ))
            index += 1
            // Stop once a chunk reaches the end, so an overlapping step does
            // not emit a trailing chunk that is fully covered already.
            if end >= totalFrames { break }
            start += step
        }
        return plans
    }

    // MARK: - Per-channel transcription

    /// Transcribes every chunk of one channel in order and returns the
    /// non-empty results as segments labeled with `speaker`.
    ///
    /// - Throws: `CancellationError` when the task is cancelled, and any
    ///   error raised while reading a chunk from the file.
    private func transcribeChannel(
        url: URL,
        chunks: [ChunkPlan],
        speaker: TranscriptSpeaker,
        locales: [Locale],
        counter: ProgressCounter
    ) async throws -> [TranscriptSegment] {
        var segments: [TranscriptSegment] = []
        segments.reserveCapacity(chunks.count)
        for plan in chunks {
            try Task.checkCancellation()
            let buffer = try readChunkBuffer(url: url, startFrame: plan.startFrame, frameCount: plan.frameCount)
            let text = await transcribeBufferWithLocaleFallback(buffer: buffer, locales: locales)
            // A cancelled chunk comes back empty; do not count it as completed.
            try Task.checkCancellation()
            await counter.increment()
            let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
            if trimmed.isEmpty { continue }
            segments.append(TranscriptSegment(
                speaker: speaker.rawValue,
                startOffset: plan.startOffset,
                endOffset: plan.endOffset,
                text: trimmed
            ))
        }
        return segments
    }

    /// Reads `frameCount` frames starting at `startFrame` from the file at
    /// `url` into a newly allocated buffer in the file's processing format.
    ///
    /// - Throws: An `NSError` in the "MeetingTranscriptionService" domain when
    ///   the buffer cannot be allocated, or any error from opening or reading
    ///   the file.
    private func readChunkBuffer(url: URL, startFrame: AVAudioFramePosition, frameCount: AVAudioFrameCount) throws -> AVAudioPCMBuffer {
        let audioFile = try AVAudioFile(forReading: url)
        audioFile.framePosition = startFrame
        guard let buffer = AVAudioPCMBuffer(pcmFormat: audioFile.processingFormat, frameCapacity: frameCount) else {
            throw NSError(domain: "MeetingTranscriptionService", code: 1, userInfo: [NSLocalizedDescriptionKey: "Unable to allocate audio buffer."])
        }
        try audioFile.read(into: buffer, frameCount: frameCount)
        return buffer
    }

    /// Tries each locale in order and returns the first non-empty transcript.
    ///
    /// Locales without an available recognizer are skipped. A failed attempt
    /// is retried once after a short pause before moving to the next locale.
    /// Returns an empty string when nothing was recognized or when the task
    /// is cancelled.
    private func transcribeBufferWithLocaleFallback(buffer: AVAudioPCMBuffer, locales: [Locale]) async -> String {
        for locale in locales {
            if Task.isCancelled { return "" }
            guard let recognizer = SFSpeechRecognizer(locale: locale), recognizer.isAvailable else { continue }
            do {
                let text = try await transcribeBuffer(buffer: buffer, recognizer: recognizer)
                let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
                if trimmed.isEmpty == false { return trimmed }
            } catch {
                // Do not retry or try further locales once cancelled.
                if Task.isCancelled { return "" }
                // One transient retry before moving on to the next locale.
                do {
                    try await Task.sleep(nanoseconds: 500_000_000)
                } catch {
                    // The sleep only throws on cancellation.
                    return ""
                }
                if let text = try? await transcribeBuffer(buffer: buffer, recognizer: recognizer) {
                    let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
                    if trimmed.isEmpty == false { return trimmed }
                }
                continue
            }
        }
        return ""
    }

    /// Runs one recognition request over `buffer` and returns the final
    /// transcript's formatted string.
    ///
    /// Cancelling the surrounding task cancels the recognition task and makes
    /// this call throw `CancellationError`.
    ///
    /// - Throws: The recognizer's error, except for the codes treated as
    ///   "no speech" (see below), which yield an empty string instead.
    private func transcribeBuffer(buffer: AVAudioPCMBuffer, recognizer: SFSpeechRecognizer) async throws -> String {
        let request = SFSpeechAudioBufferRecognitionRequest()
        request.shouldReportPartialResults = false
        if #available(macOS 13.0, *) {
            request.addsPunctuation = true
        }

        let session = RecognitionSession()
        return try await withTaskCancellationHandler {
            try await withCheckedThrowingContinuation { (continuation: CheckedContinuation<String, Error>) in
                // Already cancelled: the session has resumed the continuation.
                guard session.start(continuation) else { return }

                let task = recognizer.recognitionTask(with: request) { result, error in
                    if let error {
                        let nsError = error as NSError
                        // Codes 203 and 1110 in kAFAssistantErrorDomain are
                        // treated as a normal empty-chunk outcome ("no speech
                        // detected"). NOTE(review): these codes are not
                        // publicly documented; confirm against current OS
                        // behavior.
                        if nsError.domain == "kAFAssistantErrorDomain" && (nsError.code == 203 || nsError.code == 1110) {
                            session.finish(with: .success(""))
                        } else {
                            session.finish(with: .failure(error))
                        }
                        return
                    }
                    if let result, result.isFinal {
                        session.finish(with: .success(result.bestTranscription.formattedString))
                    }
                }

                // Cancelled while the task was being created: nothing to feed.
                guard session.attach(task) else { return }
                request.append(buffer)
                request.endAudio()
            }
        } onCancel: {
            session.cancel()
        }
    }
}
extension Array where Element == TranscriptSegment {
    /// Renders the segments as a human-readable timeline, one line per
    /// segment, for example:
    /// `[00:12] You: Hello everyone.`
    ///
    /// The timestamp is the segment's start offset rounded down to whole
    /// seconds, shown as `MM:SS`, or as `HH:MM:SS` once it reaches an hour.
    func renderedTimelineText() -> String {
        /// Formats a second count as a clock-style timestamp.
        func timestamp(for seconds: TimeInterval) -> String {
            let wholeSeconds = Int(seconds.rounded(.down))
            let (hours, withinHour) = wholeSeconds.quotientAndRemainder(dividingBy: 3600)
            let (minutes, secs) = withinHour.quotientAndRemainder(dividingBy: 60)
            guard hours > 0 else {
                return String(format: "%02d:%02d", minutes, secs)
            }
            return String(format: "%02d:%02d:%02d", hours, minutes, secs)
        }

        var lines: [String] = []
        lines.reserveCapacity(count)
        for segment in self {
            lines.append("[\(timestamp(for: segment.startOffset))] \(segment.speaker): \(segment.text)")
        }
        return lines.joined(separator: "\n")
    }
}
|