// No description.
//
// MeetingTranscriptionService.swift (12 KB)
  1. import Foundation
  2. import AVFoundation
  3. import Speech
  /// One span of transcript text, tagged with the channel label it belongs to
  /// and the time range it covers.
  ///
  /// Both offsets are expressed in seconds from the start of the recording.
  struct TranscriptSegment: Hashable, Codable {
      /// Label of the channel the text is attributed to.
      let speaker: String

      /// Where the span begins, in seconds from the start of the recording.
      let startOffset: TimeInterval

      /// Where the span ends, in seconds from the start of the recording.
      let endOffset: TimeInterval

      /// The transcript text for this span.
      let text: String
  }
  /// Speaker labels applied to transcript text when the per-channel
  /// transcripts are merged. The raw value is the label itself.
  enum TranscriptSpeaker: String {
      /// Label for the microphone channel.
      case microphone = "You"

      /// Label for the system-audio channel.
      case system = "Meeting"
  }
  /// A point-in-time progress reading, intended for UI status updates.
  struct MeetingTranscriptionProgress: Sendable {
      /// Number of chunks the whole job consists of.
      let totalChunks: Int

      /// Number of chunks that have been processed so far.
      let completedChunks: Int
  }
  /// Failures reported by the meeting transcription pipeline.
  enum MeetingTranscriptionError: Error {
      /// Speech recognition permission was not granted.
      case authorizationDenied

      /// Speech recognition is restricted on this machine.
      case authorizationRestricted

      /// No speech recognizer is usable for `locale` (a human-readable locale description).
      case recognizerUnavailable(locale: String)

      /// There was no audio to run recognition on.
      case noAudioToTranscribe
  }

  // MARK: - LocalizedError

  extension MeetingTranscriptionError: LocalizedError {
      /// User-facing explanation for each failure.
      var errorDescription: String? {
          switch self {
          case .authorizationDenied: return "Speech recognition permission denied. Enable it in System Settings and try again."
          case .authorizationRestricted: return "Speech recognition is restricted on this Mac."
          case .recognizerUnavailable(let locale): return "Speech recognizer is unavailable for \(locale)."
          case .noAudioToTranscribe: return "No audio was available to transcribe."
          }
      }
  }
  /// Transcribes meeting audio by running Apple Speech on per-channel files
  /// in fixed-size chunks, falling back across a list of locales per chunk.
  final class MeetingTranscriptionService {
      // MARK: - Constants

      /// Chunk length used when the caller passes a non-positive or NaN `chunkSeconds`.
      private static let fallbackChunkSeconds: TimeInterval = 30

      /// Pause before the single retry of a failed recognition attempt.
      private static let retryDelayNanoseconds: UInt64 = 500_000_000

      /// Error domain whose codes in `emptyResultErrorCodes` mean "nothing recognized".
      private static let assistantErrorDomain = "kAFAssistantErrorDomain"

      /// Error codes that are reported as an empty transcript instead of a failure.
      /// 203 is "no speech detected"; 1110 is handled the same way.
      private static let emptyResultErrorCodes: Set<Int> = [203, 1110]

      // MARK: - Private types

      /// One window of a channel's audio file.
      private struct ChunkPlan {
          let index: Int
          let startFrame: AVAudioFramePosition
          let frameCount: AVAudioFrameCount
          let startOffset: TimeInterval
          let endOffset: TimeInterval
      }

      /// Everything needed to transcribe one channel.
      private struct ChannelPlan {
          let url: URL
          let chunks: [ChunkPlan]
          let speaker: TranscriptSpeaker
      }

      /// Shared progress counter used across concurrent channels.
      private actor ProgressCounter {
          private let total: Int
          private var completed: Int = 0
          private let onProgress: (@Sendable (MeetingTranscriptionProgress) -> Void)?

          init(total: Int, onProgress: (@Sendable (MeetingTranscriptionProgress) -> Void)?) {
              self.total = total
              self.onProgress = onProgress
          }

          /// Reports "0 of total" so observers can show the job size immediately.
          func emitInitial() {
              onProgress?(MeetingTranscriptionProgress(totalChunks: total, completedChunks: 0))
          }

          /// Records one more finished chunk and reports the new count.
          func increment() {
              completed += 1
              onProgress?(MeetingTranscriptionProgress(totalChunks: total, completedChunks: completed))
          }
      }

      /// Thread-safe holder that resumes a recognition continuation at most once
      /// and lets task cancellation reach the underlying recognition task.
      private final class RecognitionState: @unchecked Sendable {
          private let lock = NSLock()
          private var continuation: CheckedContinuation<String, Error>?
          private var task: SFSpeechRecognitionTask?
          private var isSettled = false

          /// Stores the continuation. If the state was already settled (cancellation
          /// arrived first) the continuation is failed immediately and `false` is returned.
          func begin(with continuation: CheckedContinuation<String, Error>) -> Bool {
              lock.lock()
              if isSettled {
                  lock.unlock()
                  continuation.resume(throwing: CancellationError())
                  return false
              }
              self.continuation = continuation
              lock.unlock()
              return true
          }

          /// Remembers the running task so a later cancellation can stop it.
          /// If the state is already settled the task is cancelled right away.
          func attach(_ task: SFSpeechRecognitionTask) {
              lock.lock()
              let alreadySettled = isSettled
              if !alreadySettled { self.task = task }
              lock.unlock()
              if alreadySettled { task.cancel() }
          }

          /// Resumes the continuation with `result` unless that already happened.
          /// Drops the task reference so the task/handler/state cycle is broken.
          func settle(with result: Result<String, Error>, cancellingTask: Bool) {
              lock.lock()
              let pendingContinuation = continuation
              let pendingTask = task
              continuation = nil
              task = nil
              isSettled = true
              lock.unlock()
              if cancellingTask { pendingTask?.cancel() }
              pendingContinuation?.resume(with: result)
          }
      }

      // MARK: - Authorization

      /// Ensures speech recognition is authorized, prompting if the status is undetermined.
      ///
      /// - Throws: `MeetingTranscriptionError.authorizationDenied` or
      ///   `.authorizationRestricted` when recognition may not be used.
      func requestAuthorization() async throws {
          switch SFSpeechRecognizer.authorizationStatus() {
          case .authorized:
              return
          case .notDetermined:
              let status: SFSpeechRecognizerAuthorizationStatus = await withCheckedContinuation { continuation in
                  SFSpeechRecognizer.requestAuthorization { continuation.resume(returning: $0) }
              }
              guard status == .authorized else { throw MeetingTranscriptionError.authorizationDenied }
          case .denied:
              throw MeetingTranscriptionError.authorizationDenied
          case .restricted:
              throw MeetingTranscriptionError.authorizationRestricted
          @unknown default:
              throw MeetingTranscriptionError.authorizationDenied
          }
      }

      // MARK: - Entry point

      /// Transcribes the mic and system channel audio (either may be nil) and
      /// returns a flat, time-ordered list of transcript segments labeled with
      /// the speaker channel.
      ///
      /// - Parameters:
      ///   - micURL: Audio file for the microphone channel; skipped when nil or missing on disk.
      ///   - systemURL: Audio file for the system channel; skipped when nil or missing on disk.
      ///   - chunkSeconds: Length of each chunk. Non-positive or NaN values fall back to 30 s.
      ///   - overlapSeconds: Overlap between consecutive chunks, capped at half a chunk.
      ///   - locales: Locales tried in order for every chunk; an empty list means `en-US`.
      ///   - onProgress: Called with the initial state and after every finished chunk.
      /// - Returns: Non-empty segments sorted by start offset; ties keep microphone before system.
      /// - Throws: An authorization error, `noAudioToTranscribe` when neither channel has frames,
      ///   `recognizerUnavailable` when no locale has an available recognizer, audio file
      ///   errors, or `CancellationError` when the surrounding task is cancelled.
      func transcribeMeeting(
          micURL: URL?,
          systemURL: URL?,
          chunkSeconds: TimeInterval = 30,
          overlapSeconds: TimeInterval = 0,
          locales: [Locale] = [Locale(identifier: "en-US")],
          onProgress: (@Sendable (MeetingTranscriptionProgress) -> Void)? = nil
      ) async throws -> [TranscriptSegment] {
          try await requestAuthorization()

          let micPlan = try makeChannelPlan(
              for: micURL,
              speaker: .microphone,
              chunkSeconds: chunkSeconds,
              overlapSeconds: overlapSeconds
          )
          let systemPlan = try makeChannelPlan(
              for: systemURL,
              speaker: .system,
              chunkSeconds: chunkSeconds,
              overlapSeconds: overlapSeconds
          )

          let totalChunks = (micPlan?.chunks.count ?? 0) + (systemPlan?.chunks.count ?? 0)
          guard totalChunks > 0 else {
              throw MeetingTranscriptionError.noAudioToTranscribe
          }

          let effectiveLocales = locales.isEmpty ? [Locale(identifier: "en-US")] : locales

          // Fail loudly instead of silently producing an empty transcript when
          // none of the requested locales can be recognized right now.
          let hasUsableRecognizer = effectiveLocales.contains { locale in
              SFSpeechRecognizer(locale: locale)?.isAvailable == true
          }
          guard hasUsableRecognizer else {
              let description = effectiveLocales.map(\.identifier).joined(separator: ", ")
              throw MeetingTranscriptionError.recognizerUnavailable(locale: description)
          }

          let counter = ProgressCounter(total: totalChunks, onProgress: onProgress)
          await counter.emitInitial()

          async let micSegments = self.transcribeChannel(micPlan, locales: effectiveLocales, counter: counter)
          async let systemSegments = self.transcribeChannel(systemPlan, locales: effectiveLocales, counter: counter)
          let combined = try await micSegments + systemSegments

          // Both channels share chunk boundaries, so equal start offsets are the
          // norm; break ties on original position to keep the order deterministic.
          return combined
              .enumerated()
              .filter { !$0.element.text.isEmpty }
              .sorted { lhs, rhs in
                  if lhs.element.startOffset != rhs.element.startOffset {
                      return lhs.element.startOffset < rhs.element.startOffset
                  }
                  return lhs.offset < rhs.offset
              }
              .map(\.element)
      }

      // MARK: - Chunk planning

      /// Builds the plan for one channel, or nil when the URL is nil, the file
      /// does not exist, or the file contains no frames.
      private func makeChannelPlan(
          for url: URL?,
          speaker: TranscriptSpeaker,
          chunkSeconds: TimeInterval,
          overlapSeconds: TimeInterval
      ) throws -> ChannelPlan? {
          guard let url, FileManager.default.fileExists(atPath: url.path) else { return nil }
          let chunks = try planChunks(for: url, chunkSeconds: chunkSeconds, overlapSeconds: overlapSeconds)
          return chunks.isEmpty ? nil : ChannelPlan(url: url, chunks: chunks, speaker: speaker)
      }

      /// Splits the file at `url` into windows of `chunkSeconds`, each starting
      /// `chunkSeconds - overlapSeconds` after the previous one.
      private func planChunks(for url: URL, chunkSeconds: TimeInterval, overlapSeconds: TimeInterval) throws -> [ChunkPlan] {
          let audioFile = try AVAudioFile(forReading: url)
          let sampleRate = audioFile.processingFormat.sampleRate
          guard sampleRate > 0 else { return [] }
          let totalFrames = audioFile.length
          guard totalFrames > 0 else { return [] }

          // `> 0` is false for NaN, so NaN and non-positive lengths use the fallback.
          let usableChunkSeconds = chunkSeconds > 0 ? chunkSeconds : Self.fallbackChunkSeconds
          // Cap at what a single buffer can hold; this also keeps the integer
          // conversions below from trapping on infinite or huge inputs.
          let cappedChunkFrames = min(usableChunkSeconds * sampleRate, Double(AVAudioFrameCount.max))
          let chunkFrames = AVAudioFramePosition(max(1, cappedChunkFrames))

          // An overlap as long as the chunk would advance one frame at a time and
          // plan one chunk per frame, so the overlap is capped at half a chunk.
          let requestedOverlapFrames = overlapSeconds * sampleRate
          let overlapFrames: AVAudioFramePosition = requestedOverlapFrames > 0
              ? AVAudioFramePosition(min(requestedOverlapFrames, Double(chunkFrames / 2)))
              : 0
          let step = max(AVAudioFramePosition(1), chunkFrames - overlapFrames)

          var plans: [ChunkPlan] = []
          var start: AVAudioFramePosition = 0
          while start < totalFrames {
              let end = min(start + chunkFrames, totalFrames)
              plans.append(ChunkPlan(
                  index: plans.count,
                  startFrame: start,
                  frameCount: AVAudioFrameCount(end - start),
                  startOffset: Double(start) / sampleRate,
                  endOffset: Double(end) / sampleRate
              ))
              // The chunk that reaches the end of the file is the last one.
              if end >= totalFrames { break }
              start += step
          }
          return plans
      }

      // MARK: - Per-channel transcription

      /// Transcribes every chunk of one channel in order and returns the non-empty
      /// results. A nil plan yields an empty list.
      private func transcribeChannel(
          _ plan: ChannelPlan?,
          locales: [Locale],
          counter: ProgressCounter
      ) async throws -> [TranscriptSegment] {
          guard let plan else { return [] }

          // Opened and created once per channel rather than once per chunk.
          let audioFile = try AVAudioFile(forReading: plan.url)
          let recognizers = locales.compactMap { SFSpeechRecognizer(locale: $0) }

          var segments: [TranscriptSegment] = []
          segments.reserveCapacity(plan.chunks.count)
          for chunk in plan.chunks {
              try Task.checkCancellation()
              let buffer = try readChunkBuffer(from: audioFile, startFrame: chunk.startFrame, frameCount: chunk.frameCount)
              let text = try await transcribeBufferWithLocaleFallback(buffer: buffer, recognizers: recognizers)
              await counter.increment()
              guard !text.isEmpty else { continue }
              segments.append(TranscriptSegment(
                  speaker: plan.speaker.rawValue,
                  startOffset: chunk.startOffset,
                  endOffset: chunk.endOffset,
                  text: text
              ))
          }
          return segments
      }

      /// Reads `frameCount` frames starting at `startFrame` into a fresh buffer.
      private func readChunkBuffer(
          from audioFile: AVAudioFile,
          startFrame: AVAudioFramePosition,
          frameCount: AVAudioFrameCount
      ) throws -> AVAudioPCMBuffer {
          guard let buffer = AVAudioPCMBuffer(pcmFormat: audioFile.processingFormat, frameCapacity: frameCount) else {
              throw NSError(domain: "MeetingTranscriptionService", code: 1, userInfo: [NSLocalizedDescriptionKey: "Unable to allocate audio buffer."])
          }
          audioFile.framePosition = startFrame
          try audioFile.read(into: buffer, frameCount: frameCount)
          return buffer
      }

      /// Tries each available recognizer in order and returns the first non-empty,
      /// trimmed transcript, or an empty string when none produced text.
      ///
      /// A failed attempt is retried once after a short pause before moving on.
      ///
      /// - Throws: `CancellationError` only; recognition failures are absorbed.
      private func transcribeBufferWithLocaleFallback(
          buffer: AVAudioPCMBuffer,
          recognizers: [SFSpeechRecognizer]
      ) async throws -> String {
          for recognizer in recognizers where recognizer.isAvailable {
              try Task.checkCancellation()
              do {
                  let text = try await transcribeBuffer(buffer: buffer, recognizer: recognizer)
                      .trimmingCharacters(in: .whitespacesAndNewlines)
                  if !text.isEmpty { return text }
              } catch is CancellationError {
                  throw CancellationError()
              } catch {
                  // One transient retry before moving on to the next locale.
                  // The sleep itself throws if the task is cancelled meanwhile.
                  try await Task.sleep(nanoseconds: Self.retryDelayNanoseconds)
                  if let retried = try? await transcribeBuffer(buffer: buffer, recognizer: recognizer) {
                      let text = retried.trimmingCharacters(in: .whitespacesAndNewlines)
                      if !text.isEmpty { return text }
                  }
              }
          }
          // A cancelled chunk must not be reported as a finished, empty one.
          try Task.checkCancellation()
          return ""
      }

      /// Runs one recognition request over `buffer` and returns the final transcript.
      ///
      /// - Throws: The recognizer's error, or `CancellationError` when the
      ///   surrounding task is cancelled (the recognition task is cancelled too).
      private func transcribeBuffer(buffer: AVAudioPCMBuffer, recognizer: SFSpeechRecognizer) async throws -> String {
          let request = SFSpeechAudioBufferRecognitionRequest()
          request.shouldReportPartialResults = false
          if #available(macOS 13.0, *) {
              request.addsPunctuation = true
          }

          let state = RecognitionState()
          return try await withTaskCancellationHandler {
              try await withCheckedThrowingContinuation { (continuation: CheckedContinuation<String, Error>) in
                  // Cancellation may already have arrived; then there is nothing to start.
                  guard state.begin(with: continuation) else { return }

                  let task = recognizer.recognitionTask(with: request) { result, error in
                      if let error {
                          let nsError = error as NSError
                          let isEmptyResult = nsError.domain == MeetingTranscriptionService.assistantErrorDomain
                              && MeetingTranscriptionService.emptyResultErrorCodes.contains(nsError.code)
                          // "No speech detected" is a normal empty-chunk outcome.
                          let outcome: Result<String, Error> = isEmptyResult ? .success("") : .failure(error)
                          state.settle(with: outcome, cancellingTask: false)
                          return
                      }
                      if let result, result.isFinal {
                          state.settle(with: .success(result.bestTranscription.formattedString), cancellingTask: false)
                      }
                  }
                  state.attach(task)
                  request.append(buffer)
                  request.endAudio()
              }
          } onCancel: {
              state.settle(with: .failure(CancellationError()), cancellingTask: true)
          }
      }
  }
  extension Array where Element == TranscriptSegment {
      /// Renders segments as a human-readable timeline like:
      /// `[00:12] You: Hello everyone.`
      ///
      /// Each segment becomes one line, in array order. The timestamp is the
      /// segment's start offset rounded down to whole seconds, shown as `mm:ss`,
      /// or `hh:mm:ss` once it reaches an hour. Offsets that are negative or not
      /// finite are shown as `00:00`.
      ///
      /// - Returns: The lines joined with `"\n"`; an empty string for an empty array.
      func renderedTimelineText() -> String {
          // Upper bound that keeps the Double-to-Int conversion below in range.
          let largestRenderableSeconds = TimeInterval(Int32.max)

          func timestamp(for seconds: TimeInterval) -> String {
              // Int(_:) traps on NaN, infinities and out-of-range values, and a
              // negative offset would produce fields like "-1", so clamp first.
              let safeSeconds = seconds.isFinite ? Swift.min(Swift.max(seconds, 0), largestRenderableSeconds) : 0
              let total = Int(safeSeconds.rounded(.down))
              let h = total / 3600
              let m = (total % 3600) / 60
              let s = total % 60
              if h > 0 {
                  return String(format: "%02d:%02d:%02d", h, m, s)
              }
              return String(format: "%02d:%02d", m, s)
          }

          return map { segment in
              "[\(timestamp(for: segment.startOffset))] \(segment.speaker): \(segment.text)"
          }.joined(separator: "\n")
      }
  }
  283. }