pietrop / slate-transcript-editor

A React component to make correcting automated transcriptions of audio and video easier and faster. Using the SlateJs editor.
https://pietrop.github.io/slate-transcript-editor
Other
73 stars 33 forks source link

avoid using stt-align-node! #101

Open Niceblueman opened 7 months ago

Niceblueman commented 7 months ago

Hi! 👋

Firstly, thanks for your work on this project! 🙂

Today I used patch-package to patch slate-transcript-editor@0.1.6-alpha.19 for the project I'm working on.

Here is the diff that solved my problem:

diff --git a/node_modules/slate-transcript-editor/util/export-adapters/slate-to-dpe/update-timestamps/plain-text-align-to-slate.js b/node_modules/slate-transcript-editor/util/export-adapters/slate-to-dpe/update-timestamps/plain-text-align-to-slate.js
index be190f0..25a9686 100644
--- a/node_modules/slate-transcript-editor/util/export-adapters/slate-to-dpe/update-timestamps/plain-text-align-to-slate.js
+++ b/node_modules/slate-transcript-editor/util/export-adapters/slate-to-dpe/update-timestamps/plain-text-align-to-slate.js
@@ -1,8 +1,81 @@
-import { alignSTT } from 'stt-align-node';
+// import { alignSTT } from 'stt-align-node';
 import { shortTimecode } from '../../../timecode-converter';
 import countWords from '../../../count-words';
 import generatePreviousTimingsUpToCurrent from '../../../dpe-to-slate/generate-previous-timings-up-to-current';
+function alignSTT(sttWords, transcriptText, start, end) { // Local stand-in for the alignSTT import from 'stt-align-node' that this patch comments out.
+  const sttWordsList = sttWords.words; // expects an object exposing a `words` array
+  const opCodes = calculateDiff(sttWordsList, transcriptText); // NOTE(review): receives the raw transcriptText, not the split list built on the next line; if it is a string it is indexed per character. Confirm intent.
+  const transcriptWords = convertRefTextToList(transcriptText); // whitespace-split entries of the reference text
+  const alignedResults = alignRefTextWithSTT(
+    opCodes,
+    sttWordsList,
+    transcriptWords,
+    start, // forwarded unchanged; alignRefTextWithSTT uses start/end as slice bounds on its result
+    end
+  );
+  return alignedResults;
+}
+
+// Builds a list of [tag, i1, i2, j1, j2] op codes from one forward pass over array1 and array2, comparing elements with ===.
+function calculateDiff(array1, array2) {
+  const opCodes = [];
+
+  // Walk both inputs together while each still has elements left
+  let i = 0; // cursor into array1
+  let j = 0; // cursor into array2
+
+  while (i < array1.length && j < array2.length) {
+    if (array1[i] === array2[j]) {
+      opCodes.push(['equal', i, i + 1, j, j + 1]);
+      i++;
+      j++;
+    } else {
+      opCodes.push(['delete', i, i + 1, j, j]); // every mismatch is recorded as a delete of array1[i]; j does not advance and there is no lookahead
+      i++;
+    }
+  }

+  // Leftover array1 elements are all recorded as deletes
+  while (i < array1.length) {
+    opCodes.push(['delete', i, i + 1, j, j]);
+    i++;
+  }
+
+  // Leftover array2 elements are all recorded as inserts
+  while (j < array2.length) {
+    opCodes.push(['insert', i, i, j, j + 1]);
+    j++;
+  }
+
+  return opCodes;
+}
+
+// Splits text on runs of whitespace; leading or trailing whitespace yields empty-string entries at the ends of the list.
+function convertRefTextToList(text) {
+  return text.split(/\s+/);
+}
+
+// Turns op codes into one flat result list built from sttWords/refWords entries, then slices that list with start and end.
+function alignRefTextWithSTT(opCodes, sttWords, refWords, start, end) {
+  let alignedResults = [];
+
+  for (const op of opCodes) {
+    const [tag, i1, i2, j1, j2] = op; // i1/i2 are used as sttWords bounds, j1/j2 as refWords bounds
+
+    if (tag === 'equal') {
+      alignedResults.push(...sttWords.slice(i1, i2)); // sttWords entries are pushed unchanged
+    } else if (tag === 'delete') {
+      alignedResults.push(...Array(i2 - i1).fill('')); // one empty string per deleted sttWords entry
+    } else if (tag === 'insert') {
+      alignedResults.push(...refWords.slice(j1, j2)); // refWords entries are pushed unchanged
+    }
+  }
+
+  // Keep only positions start (inclusive) to end (exclusive) of the combined list
+  alignedResults = alignedResults.slice(start, end);
+
+  return alignedResults;
+}
 const createSlateContentFromSlateJsParagraphs = (currentContent, newEntities) => {
   // Update entites to block structure.
   const updatedBlockArray = [];

This issue body was partially generated by patch-package.