|
|
@@ -99,8 +99,8 @@ final class MeetingTranscriptionService {
|
|
99
|
99
|
func transcribeMeeting(
|
|
100
|
100
|
micURL: URL?,
|
|
101
|
101
|
systemURL: URL?,
|
|
102
|
|
- chunkSeconds: TimeInterval = 30,
|
|
103
|
|
- overlapSeconds: TimeInterval = 0,
|
|
|
102
|
+ chunkSeconds: TimeInterval = 20,
|
|
|
103
|
+ overlapSeconds: TimeInterval = 1.5,
|
|
104
|
104
|
locales: [Locale] = [],
|
|
105
|
105
|
onProgress: (@Sendable (MeetingTranscriptionProgress) -> Void)? = nil
|
|
106
|
106
|
) async throws -> [TranscriptSegment] {
|
|
|
@@ -200,19 +200,33 @@ final class MeetingTranscriptionService {
|
|
200
|
200
|
) async throws -> [TranscriptSegment] {
|
|
201
|
201
|
var segments: [TranscriptSegment] = []
|
|
202
|
202
|
segments.reserveCapacity(chunks.count)
|
|
|
203
|
+ var preferredLocale: Locale?
|
|
203
|
204
|
|
|
204
|
205
|
for plan in chunks {
|
|
205
|
206
|
try Task.checkCancellation()
|
|
206
|
207
|
let buffer = try readChunkBuffer(url: url, startFrame: plan.startFrame, frameCount: plan.frameCount)
|
|
207
|
|
- let text = await transcribeBufferWithLocaleFallback(buffer: buffer, locales: locales)
|
|
|
208
|
+ let result = await transcribeBufferWithLocaleFallback(
|
|
|
209
|
+ buffer: buffer,
|
|
|
210
|
+ locales: locales,
|
|
|
211
|
+ preferredLocale: preferredLocale
|
|
|
212
|
+ )
|
|
208
|
213
|
await counter.increment()
|
|
209
|
|
- let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
|
|
|
214
|
+ preferredLocale = result.locale ?? preferredLocale
|
|
|
215
|
+ let trimmed = result.text.trimmingCharacters(in: .whitespacesAndNewlines)
|
|
210
|
216
|
if trimmed.isEmpty { continue }
|
|
|
217
|
+ let textForSegment: String
|
|
|
218
|
+ if let previous = segments.last {
|
|
|
219
|
+ textForSegment = removeRepeatedPrefix(from: trimmed, previousText: previous.text)
|
|
|
220
|
+ } else {
|
|
|
221
|
+ textForSegment = trimmed
|
|
|
222
|
+ }
|
|
|
223
|
+ let normalized = textForSegment.trimmingCharacters(in: .whitespacesAndNewlines)
|
|
|
224
|
+ if normalized.isEmpty { continue }
|
|
211
|
225
|
segments.append(TranscriptSegment(
|
|
212
|
226
|
speaker: speaker.rawValue,
|
|
213
|
227
|
startOffset: plan.startOffset,
|
|
214
|
228
|
endOffset: plan.endOffset,
|
|
215
|
|
- text: trimmed
|
|
|
229
|
+ text: normalized
|
|
216
|
230
|
))
|
|
217
|
231
|
}
|
|
218
|
232
|
return segments
|
|
|
@@ -228,24 +242,36 @@ final class MeetingTranscriptionService {
|
|
228
|
242
|
return buffer
|
|
229
|
243
|
}
|
|
230
|
244
|
|
|
231
|
|
- private func transcribeBufferWithLocaleFallback(buffer: AVAudioPCMBuffer, locales: [Locale]) async -> String {
|
|
232
|
|
- for locale in locales {
|
|
|
245
|
/// Transcribes `buffer`, trying candidate locales in order until one yields
/// non-empty text.
///
/// `preferredLocale`, when present, is tried first. The entries of `locales`
/// follow in their given order, skipping any whose identifier is already
/// queued. A locale whose recognizer cannot be created or is unavailable is
/// skipped. When a transcription attempt throws, it is retried once after a
/// short pause before moving on to the next locale.
///
/// - Parameters:
///   - buffer: The audio to transcribe.
///   - locales: Fallback locales, in priority order.
///   - preferredLocale: A locale to try before `locales`, if any.
/// - Returns: The trimmed text together with the locale that produced it, or
///   `("", nil)` when no locale produced text or the task was cancelled.
private func transcribeBufferWithLocaleFallback(
    buffer: AVAudioPCMBuffer,
    locales: [Locale],
    preferredLocale: Locale?
) async -> (text: String, locale: Locale?) {
    var orderedLocales: [Locale] = []
    if let preferredLocale {
        orderedLocales.append(preferredLocale)
    }
    for locale in locales where !orderedLocales.contains(where: { $0.identifier == locale.identifier }) {
        orderedLocales.append(locale)
    }

    for locale in orderedLocales {
        // This function cannot throw, so report "no result" on cancellation
        // instead of walking the remaining locales.
        if Task.isCancelled { return ("", nil) }

        guard let recognizer = SFSpeechRecognizer(locale: locale), recognizer.isAvailable else { continue }
        do {
            let text = try await transcribeBuffer(buffer: buffer, recognizer: recognizer)
            let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
            if !trimmed.isEmpty { return (trimmed, locale) }
        } catch {
            // One transient retry before moving on to the next locale.
            do {
                try await Task.sleep(nanoseconds: 500_000_000)
            } catch {
                // The sleep only fails when the task is cancelled; a retry
                // would be wasted work.
                return ("", nil)
            }
            if let text = try? await transcribeBuffer(buffer: buffer, recognizer: recognizer) {
                let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
                if !trimmed.isEmpty { return (trimmed, locale) }
            }
        }
    }
    return ("", nil)
}
|
|
250
|
276
|
|
|
251
|
277
|
private func preferredLocalesForGlobalRecognition() -> [Locale] {
|
|
|
@@ -290,6 +316,28 @@ final class MeetingTranscriptionService {
|
|
290
|
316
|
identifier.replacingOccurrences(of: "_", with: "-").lowercased()
|
|
291
|
317
|
}
|
|
292
|
318
|
|
|
|
319
|
/// Removes repeated leading words from a chunk that overlap with the
/// trailing words of the previous chunk.
///
/// Words are compared case-insensitively and with leading/trailing
/// punctuation ignored, so `"report."` at the end of the previous chunk
/// matches `"Report"` at the start of this one. The longest matching overlap
/// (up to 12 words) is removed.
///
/// - Parameters:
///   - text: The text of the current chunk.
///   - previousText: The text of the chunk that precedes it.
/// - Returns: `text` unchanged when no overlap is found; otherwise the
///   remaining words of `text` joined by single spaces (possibly empty).
private func removeRepeatedPrefix(from text: String, previousText: String) -> String {
    // Upper bound on how many boundary words are checked for repetition.
    let maxOverlapWords = 12

    // Comparison key for a word: lowercased, with surrounding punctuation
    // stripped. Punctuation-only tokens keep their text so they do not all
    // collapse to the same empty key.
    func comparisonKey(_ word: String) -> String {
        let stripped = word.trimmingCharacters(in: .punctuationCharacters)
        return (stripped.isEmpty ? word : stripped).lowercased()
    }

    let separators = CharacterSet.whitespacesAndNewlines
    let currentWords = text.components(separatedBy: separators).filter { !$0.isEmpty }
    let previousWords = previousText.components(separatedBy: separators).filter { !$0.isEmpty }

    let maxOverlap = min(maxOverlapWords, currentWords.count, previousWords.count)
    guard maxOverlap > 0 else { return text }

    // Normalise only the words that can take part in an overlap, once.
    let previousTail = previousWords.suffix(maxOverlap).map(comparisonKey)
    let currentHead = currentWords.prefix(maxOverlap).map(comparisonKey)

    // Try the longest overlap first so the whole repeated phrase is dropped.
    for overlap in stride(from: maxOverlap, through: 1, by: -1)
    where previousTail.suffix(overlap).elementsEqual(currentHead.prefix(overlap)) {
        return currentWords.dropFirst(overlap).joined(separator: " ")
    }
    return text
}
|
|
|
340
|
+
|
|
293
|
341
|
private func transcribeBuffer(buffer: AVAudioPCMBuffer, recognizer: SFSpeechRecognizer) async throws -> String {
|
|
294
|
342
|
let request = SFSpeechAudioBufferRecognitionRequest()
|
|
295
|
343
|
request.shouldReportPartialResults = false
|