소스 검색

Improve transcript completeness across chunk boundaries.

Use shorter overlapping chunks, preserve successful recognition locale across chunks, and remove overlap-induced duplicate prefixes to reduce dropped speech in generated transcripts.

Co-authored-by: Cursor <cursoragent@cursor.com>
huzaifahayat12 1 개월 전
부모
커밋
3db1d72c98
1개의 파일 변경, 58줄 추가, 10줄 삭제
  1. 58 10
      meetings_app/Transcription/MeetingTranscriptionService.swift

+ 58 - 10
meetings_app/Transcription/MeetingTranscriptionService.swift

@@ -99,8 +99,8 @@ final class MeetingTranscriptionService {
99
     func transcribeMeeting(
99
     func transcribeMeeting(
100
         micURL: URL?,
100
         micURL: URL?,
101
         systemURL: URL?,
101
         systemURL: URL?,
102
-        chunkSeconds: TimeInterval = 30,
103
-        overlapSeconds: TimeInterval = 0,
102
+        chunkSeconds: TimeInterval = 20,
103
+        overlapSeconds: TimeInterval = 1.5,
104
         locales: [Locale] = [],
104
         locales: [Locale] = [],
105
         onProgress: (@Sendable (MeetingTranscriptionProgress) -> Void)? = nil
105
         onProgress: (@Sendable (MeetingTranscriptionProgress) -> Void)? = nil
106
     ) async throws -> [TranscriptSegment] {
106
     ) async throws -> [TranscriptSegment] {
@@ -200,19 +200,33 @@ final class MeetingTranscriptionService {
200
     ) async throws -> [TranscriptSegment] {
200
     ) async throws -> [TranscriptSegment] {
201
         var segments: [TranscriptSegment] = []
201
         var segments: [TranscriptSegment] = []
202
         segments.reserveCapacity(chunks.count)
202
         segments.reserveCapacity(chunks.count)
203
+        var preferredLocale: Locale?
203
 
204
 
204
         for plan in chunks {
205
         for plan in chunks {
205
             try Task.checkCancellation()
206
             try Task.checkCancellation()
206
             let buffer = try readChunkBuffer(url: url, startFrame: plan.startFrame, frameCount: plan.frameCount)
207
             let buffer = try readChunkBuffer(url: url, startFrame: plan.startFrame, frameCount: plan.frameCount)
207
-            let text = await transcribeBufferWithLocaleFallback(buffer: buffer, locales: locales)
208
+            let result = await transcribeBufferWithLocaleFallback(
209
+                buffer: buffer,
210
+                locales: locales,
211
+                preferredLocale: preferredLocale
212
+            )
208
             await counter.increment()
213
             await counter.increment()
209
-            let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
214
+            preferredLocale = result.locale ?? preferredLocale
215
+            let trimmed = result.text.trimmingCharacters(in: .whitespacesAndNewlines)
210
             if trimmed.isEmpty { continue }
216
             if trimmed.isEmpty { continue }
217
+            let textForSegment: String
218
+            if let previous = segments.last {
219
+                textForSegment = removeRepeatedPrefix(from: trimmed, previousText: previous.text)
220
+            } else {
221
+                textForSegment = trimmed
222
+            }
223
+            let normalized = textForSegment.trimmingCharacters(in: .whitespacesAndNewlines)
224
+            if normalized.isEmpty { continue }
211
             segments.append(TranscriptSegment(
225
             segments.append(TranscriptSegment(
212
                 speaker: speaker.rawValue,
226
                 speaker: speaker.rawValue,
213
                 startOffset: plan.startOffset,
227
                 startOffset: plan.startOffset,
214
                 endOffset: plan.endOffset,
228
                 endOffset: plan.endOffset,
215
-                text: trimmed
229
+                text: normalized
216
             ))
230
             ))
217
         }
231
         }
218
         return segments
232
         return segments
@@ -228,24 +242,36 @@ final class MeetingTranscriptionService {
228
         return buffer
242
         return buffer
229
     }
243
     }
230
 
244
 
231
-    private func transcribeBufferWithLocaleFallback(buffer: AVAudioPCMBuffer, locales: [Locale]) async -> String {
232
-        for locale in locales {
245
+    private func transcribeBufferWithLocaleFallback(
246
+        buffer: AVAudioPCMBuffer,
247
+        locales: [Locale],
248
+        preferredLocale: Locale?
249
+    ) async -> (text: String, locale: Locale?) {
250
+        var orderedLocales: [Locale] = []
251
+        if let preferredLocale {
252
+            orderedLocales.append(preferredLocale)
253
+        }
254
+        for locale in locales where orderedLocales.contains(where: { $0.identifier == locale.identifier }) == false {
255
+            orderedLocales.append(locale)
256
+        }
257
+
258
+        for locale in orderedLocales {
233
             guard let recognizer = SFSpeechRecognizer(locale: locale), recognizer.isAvailable else { continue }
259
             guard let recognizer = SFSpeechRecognizer(locale: locale), recognizer.isAvailable else { continue }
234
             do {
260
             do {
235
                 let text = try await transcribeBuffer(buffer: buffer, recognizer: recognizer)
261
                 let text = try await transcribeBuffer(buffer: buffer, recognizer: recognizer)
236
                 let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
262
                 let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
237
-                if trimmed.isEmpty == false { return trimmed }
263
+                if trimmed.isEmpty == false { return (trimmed, locale) }
238
             } catch {
264
             } catch {
239
                 // One transient retry before moving on to the next locale.
265
                 // One transient retry before moving on to the next locale.
240
                 try? await Task.sleep(nanoseconds: 500_000_000)
266
                 try? await Task.sleep(nanoseconds: 500_000_000)
241
                 if let text = try? await transcribeBuffer(buffer: buffer, recognizer: recognizer) {
267
                 if let text = try? await transcribeBuffer(buffer: buffer, recognizer: recognizer) {
242
                     let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
268
                     let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
243
-                    if trimmed.isEmpty == false { return trimmed }
269
+                    if trimmed.isEmpty == false { return (trimmed, locale) }
244
                 }
270
                 }
245
                 continue
271
                 continue
246
             }
272
             }
247
         }
273
         }
248
-        return ""
274
+        return ("", nil)
249
     }
275
     }
250
 
276
 
251
     private func preferredLocalesForGlobalRecognition() -> [Locale] {
277
     private func preferredLocalesForGlobalRecognition() -> [Locale] {
@@ -290,6 +316,28 @@ final class MeetingTranscriptionService {
290
         identifier.replacingOccurrences(of: "_", with: "-").lowercased()
316
         identifier.replacingOccurrences(of: "_", with: "-").lowercased()
291
     }
317
     }
292
 
318
 
319
+    /// Removes repeated leading words from a chunk that overlap with the
320
+    /// trailing words of the previous chunk.
321
+    private func removeRepeatedPrefix(from text: String, previousText: String) -> String {
322
+        let separators = CharacterSet.whitespacesAndNewlines
323
+        let currentWords = text.components(separatedBy: separators).filter { $0.isEmpty == false }
324
+        let previousWords = previousText.components(separatedBy: separators).filter { $0.isEmpty == false }
325
+        guard currentWords.isEmpty == false, previousWords.isEmpty == false else { return text }
326
+
327
+        let maxOverlap = min(12, currentWords.count, previousWords.count)
328
+        guard maxOverlap > 0 else { return text }
329
+
330
+        for overlap in stride(from: maxOverlap, through: 1, by: -1) {
331
+            let previousSuffix = Array(previousWords.suffix(overlap)).map { $0.lowercased() }
332
+            let currentPrefix = Array(currentWords.prefix(overlap)).map { $0.lowercased() }
333
+            if previousSuffix == currentPrefix {
334
+                let remainder = currentWords.dropFirst(overlap)
335
+                return remainder.joined(separator: " ")
336
+            }
337
+        }
338
+        return text
339
+    }
340
+
293
     private func transcribeBuffer(buffer: AVAudioPCMBuffer, recognizer: SFSpeechRecognizer) async throws -> String {
341
     private func transcribeBuffer(buffer: AVAudioPCMBuffer, recognizer: SFSpeechRecognizer) async throws -> String {
294
         let request = SFSpeechAudioBufferRecognitionRequest()
342
         let request = SFSpeechAudioBufferRecognitionRequest()
295
         request.shouldReportPartialResults = false
343
         request.shouldReportPartialResults = false