瀏覽代碼

Improve transcript completeness across chunk boundaries.

Use shorter overlapping chunks, preserve successful recognition locale across chunks, and remove overlap-induced duplicate prefixes to reduce dropped speech in generated transcripts.

Co-authored-by: Cursor <cursoragent@cursor.com>
huzaifahayat12 1 月之前
父節點
當前提交
3db1d72c98
共有 1 個文件被更改，包括 58 次插入和 10 次删除
  1. 58 10
      meetings_app/Transcription/MeetingTranscriptionService.swift

+ 58 - 10
meetings_app/Transcription/MeetingTranscriptionService.swift

@@ -99,8 +99,8 @@ final class MeetingTranscriptionService {
99 99
     func transcribeMeeting(
100 100
         micURL: URL?,
101 101
         systemURL: URL?,
102
-        chunkSeconds: TimeInterval = 30,
103
-        overlapSeconds: TimeInterval = 0,
102
+        chunkSeconds: TimeInterval = 20,
103
+        overlapSeconds: TimeInterval = 1.5,
104 104
         locales: [Locale] = [],
105 105
         onProgress: (@Sendable (MeetingTranscriptionProgress) -> Void)? = nil
106 106
     ) async throws -> [TranscriptSegment] {
@@ -200,19 +200,33 @@ final class MeetingTranscriptionService {
200 200
     ) async throws -> [TranscriptSegment] {
201 201
         var segments: [TranscriptSegment] = []
202 202
         segments.reserveCapacity(chunks.count)
203
+        var preferredLocale: Locale?
203 204
 
204 205
         for plan in chunks {
205 206
             try Task.checkCancellation()
206 207
             let buffer = try readChunkBuffer(url: url, startFrame: plan.startFrame, frameCount: plan.frameCount)
207
-            let text = await transcribeBufferWithLocaleFallback(buffer: buffer, locales: locales)
208
+            let result = await transcribeBufferWithLocaleFallback(
209
+                buffer: buffer,
210
+                locales: locales,
211
+                preferredLocale: preferredLocale
212
+            )
208 213
             await counter.increment()
209
-            let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
214
+            preferredLocale = result.locale ?? preferredLocale
215
+            let trimmed = result.text.trimmingCharacters(in: .whitespacesAndNewlines)
210 216
             if trimmed.isEmpty { continue }
217
+            let textForSegment: String
218
+            if let previous = segments.last {
219
+                textForSegment = removeRepeatedPrefix(from: trimmed, previousText: previous.text)
220
+            } else {
221
+                textForSegment = trimmed
222
+            }
223
+            let normalized = textForSegment.trimmingCharacters(in: .whitespacesAndNewlines)
224
+            if normalized.isEmpty { continue }
211 225
             segments.append(TranscriptSegment(
212 226
                 speaker: speaker.rawValue,
213 227
                 startOffset: plan.startOffset,
214 228
                 endOffset: plan.endOffset,
215
-                text: trimmed
229
+                text: normalized
216 230
             ))
217 231
         }
218 232
         return segments
@@ -228,24 +242,36 @@ final class MeetingTranscriptionService {
228 242
         return buffer
229 243
     }
230 244
 
/// Transcribes `buffer`, trying candidate locales in order until one yields non-empty text.
///
/// `preferredLocale`, when non-nil, is tried first; the entries of `locales` follow in
/// their given order, skipping any whose `identifier` is already queued. A locale is
/// skipped when its `SFSpeechRecognizer` cannot be created or reports itself unavailable.
/// When a recognition attempt throws, it is retried once after a 0.5 s pause before the
/// next locale is tried; an attempt that succeeds with empty text is not retried.
///
/// Cancellation is honored: no further locale is attempted once the current task is
/// cancelled, and a cancelled retry pause ends the search immediately instead of firing
/// the retry right away.
///
/// - Parameters:
///   - buffer: The audio to recognize.
///   - locales: Fallback locales, in priority order.
///   - preferredLocale: A locale to try before `locales`, or `nil`.
/// - Returns: The whitespace-trimmed text and the locale that produced it, or
///   `("", nil)` when no locale produced text or the task was cancelled.
private func transcribeBufferWithLocaleFallback(
    buffer: AVAudioPCMBuffer,
    locales: [Locale],
    preferredLocale: Locale?
) async -> (text: String, locale: Locale?) {
    var orderedLocales: [Locale] = []
    if let preferredLocale {
        orderedLocales.append(preferredLocale)
    }
    for locale in locales where !orderedLocales.contains(where: { $0.identifier == locale.identifier }) {
        orderedLocales.append(locale)
    }

    for locale in orderedLocales {
        // This function cannot throw, so report "nothing recognized" and let the
        // caller's own cancellation check surface the cancellation.
        if Task.isCancelled { return ("", nil) }

        guard let recognizer = SFSpeechRecognizer(locale: locale), recognizer.isAvailable else { continue }
        do {
            let text = try await transcribeBuffer(buffer: buffer, recognizer: recognizer)
            let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
            if !trimmed.isEmpty { return (trimmed, locale) }
        } catch {
            // One transient retry before moving on to the next locale.
            do {
                try await Task.sleep(nanoseconds: 500_000_000)
            } catch {
                // The pause only throws on cancellation; stop instead of retrying.
                return ("", nil)
            }
            if let text = try? await transcribeBuffer(buffer: buffer, recognizer: recognizer) {
                let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
                if !trimmed.isEmpty { return (trimmed, locale) }
            }
        }
    }
    return ("", nil)
}
250 276
 
251 277
     private func preferredLocalesForGlobalRecognition() -> [Locale] {
@@ -290,6 +316,28 @@ final class MeetingTranscriptionService {
290 316
         identifier.replacingOccurrences(of: "_", with: "-").lowercased()
291 317
     }
292 318
 
/// Removes leading words of `text` that repeat the trailing words of `previousText`.
///
/// Both strings are split on whitespace and newlines. The longest run of up to
/// `maxOverlapWords` words that ends `previousText` and also starts `text` is dropped
/// from `text`. Words are compared case-insensitively and without leading or trailing
/// punctuation, so "meeting." at the start of `text` matches "Meeting" at the end of
/// `previousText`.
///
/// - Parameters:
///   - text: The text to strip a repeated prefix from.
///   - previousText: The text whose trailing words are looked for at the start of `text`.
///   - maxOverlapWords: Upper bound on how many words may be treated as overlap.
///     Defaults to 12; values below 1 disable the removal.
/// - Returns: The remaining words joined by single spaces when an overlap is found
///   (possibly an empty string); otherwise `text` unchanged.
private func removeRepeatedPrefix(
    from text: String,
    previousText: String,
    maxOverlapWords: Int = 12
) -> String {
    let separators = CharacterSet.whitespacesAndNewlines
    let currentWords = text.components(separatedBy: separators).filter { !$0.isEmpty }
    let previousWords = previousText.components(separatedBy: separators).filter { !$0.isEmpty }

    // Also covers the "either side has no words" case, where the minimum is 0.
    let maxOverlap = min(maxOverlapWords, currentWords.count, previousWords.count)
    guard maxOverlap > 0 else { return text }

    // Comparison form of a word: lowercased, with surrounding punctuation removed.
    // A punctuation-only token keeps its lowercased form so that unrelated symbols
    // do not all collapse to the same empty key.
    func comparisonKey(_ word: String) -> String {
        let lowered = word.lowercased()
        let stripped = lowered.trimmingCharacters(in: .punctuationCharacters)
        return stripped.isEmpty ? lowered : stripped
    }

    // Normalize once; only the last/first `maxOverlap` words can ever take part.
    let previousKeys = previousWords.suffix(maxOverlap).map(comparisonKey)
    let currentKeys = currentWords.prefix(maxOverlap).map(comparisonKey)

    // Longest candidate first, so the largest repeated run wins.
    for overlap in stride(from: maxOverlap, through: 1, by: -1) {
        if previousKeys.suffix(overlap).elementsEqual(currentKeys.prefix(overlap)) {
            return currentWords.dropFirst(overlap).joined(separator: " ")
        }
    }
    return text
}
340
+
293 341
     private func transcribeBuffer(buffer: AVAudioPCMBuffer, recognizer: SFSpeechRecognizer) async throws -> String {
294 342
         let request = SFSpeechAudioBufferRecognitionRequest()
295 343
         request.shouldReportPartialResults = false