joegesualdo / vtt-to-json

Convert WebVTT file to JSON
29 stars 5 forks source link

parse data with one extra unnecessary element #3

Open xixiaofinland opened 7 years ago

xixiaofinland commented 7 years ago

Hi Joe,

I use the code below to read a vtt file, but the second element in the JSON result is wrong.

The second element is {"start":160,"end":3600,"part":"\r 2\r","words":[{"word":""},{"word":"2"}]}

I have tried multiple vtt files, all of which are correctly formatted. They all show the same issue.

parsed JSON result

 [{"start":160,"end":3600,"part":"Before you even start, I'm not denying evolution, okay?  \r","words":[{"word":"before"},{"word":"you"},{"word":"even"},{"word":"start"},{"word":"i'm"},{"word":"not"
},{"word":"denying"},{"word":"evolution"},{"word":"okay"},{"word":""},{"word":""}]},{"start":160,"end":3600,"part":"\r 2\r","words":[{"word":""},{"word":"2"}]},{"start":3600,"end":5500,"part":"I'm just saying t
hat it's one of the possibilities.","words":[{"word":"i'm"},{"word":"just"},{"word":"saying"},{"word":"that"},{"word":"it's"},{"word":"one"},{"word":"of"},{"word":"the"},{"word":"possibilities"}]}]

code snippet

// Read the subtitle file and convert it to JSON. `resolve` and `reject` are
// expected to come from an enclosing Promise executor that is not part of
// this snippet.
fs.readFile("./video.vtt", "utf8", (err, data) => {
  if (err) {
    // Stop here: without the return, vttToJson would still be invoked with
    // undefined data after the promise had already been rejected.
    reject(err);
    return;
  }
  vttToJson(data)
    .then(result => {
      console.log(`json result: ${JSON.stringify(result)}`);
      resolve(result);
    })
    // Forward parser failures instead of leaving an unhandled rejection.
    .catch(reject);
});

video.vtt

WEBVTT

1
00:00:00.160 --> 00:00:03.600
Before you even start, I'm not denying evolution, okay?  

2
00:00:03.600 --> 00:00:05.500
I'm just saying that it's one of the possibilities.
nitank commented 7 years ago

I got the same issue, so I have to remove the second object every time in my controller. It would be better to fix it inside this module.

xixiaofinland commented 7 years ago

I debugged the code, and the logic inside the vttArray.forEach((line, index) => { part seems a bit redundant and weird. So I came up with my own version of vtt.js. This version is meant to fix the bug rather than be a big refactor, so the other code sections are not touched at all. It works well in my environment.

How to use it: put vtt.js in the project folder, then:

const vttToJson = require("./vtt");
// Convert the VTT text and hand the parsed cues to the surrounding promise;
// `data` and `resolve` are supplied by the caller's enclosing scope.
vttToJson(data).then(function(parsed) {
  resolve(parsed);
});

the vtt.js code

/**
 * Parse the text of a WebVTT file into a list of cues.
 *
 * @param {string} vttString - Full contents of a .vtt file. CRLF, LF and CR
 *   line endings are all accepted.
 * @returns {Promise<Array<{start: number, end: number, part: string,
 *   words: Array<{word: string, time: (number|undefined)}>}>>} Resolves with
 *   one entry per cue: `start`/`end` in milliseconds, `part` holding the cue
 *   text with markup tags removed, and `words` holding the cleaned words
 *   (each with the inline timestamp attached to it, when there is one).
 *   The promise rejects if parsing throws (e.g. `vttString` is not a string).
 */
function vttToJson(vttString) {
  return new Promise(resolve => {
    // Anything thrown inside the executor becomes a rejection automatically.
    const cues = parseVttCues(vttString).map(cue => ({
      start: cue.start,
      end: cue.end,
      // Strip markup such as <c> or inline timestamps from the visible text.
      part: cue.part.replace(/<\/?[^>]+(>|$)/g, ""),
      words: extractCueWords(cue.part)
    }));
    resolve(cues);
  });
}

// Splits VTT text into raw cues ({start, end, part}); `part` still has markup.
function parseVttCues(vttString) {
  const cues = [];
  // The cue currently being collected, or null when between cues. Tracking
  // the object itself (rather than testing `start`) keeps cues that begin
  // at 0 ms, and resetting it prevents repeated blank lines from pushing
  // the same cue more than once.
  let open = null;

  // Split on every line-terminator style so no "\r" leaks into the text and
  // blank lines are recognised in LF-only files as well.
  vttString.split(/\r\n|\r|\n/).forEach(line => {
    if (line.trim() === "") {
      // A blank line terminates the current cue.
      if (open) {
        cues.push(open);
        open = null;
      }
      return;
    }

    if (line.includes("-->")) {
      // Timing line. If the previous cue was not closed by a blank line,
      // keep it instead of silently overwriting it.
      if (open) {
        cues.push(open);
      }
      const sides = line.split("-->");
      open = {
        start: timeString2ms(sides[0].trim().split(/\s+/).pop()),
        // Only the first token after the arrow is the end time; anything
        // that follows (cue settings) is ignored.
        end: timeString2ms(sides[1].trim().split(/\s+/)[0]),
        part: ""
      };
      return;
    }

    // Text line: lines outside a cue (header, cue identifiers) are skipped.
    if (open) {
      open.part = open.part.length === 0 ? line : `${open.part} ${line}`;
    }
  });

  // The file may end without a trailing blank line.
  if (open) {
    cues.push(open);
  }
  return cues;
}

// Turns one cue's raw text into [{word, time}] entries.
function extractCueWords(part) {
  // Rewrite inline timestamp tags like <00:00:01.000> into an "==00:00:01.000"
  // marker so the timestamp stays attached to the word that precedes it.
  const marked = part.replace(/(<([0-9:.>]+)>)/gi, tag => {
    return `==${tag.replace("<", "").replace(">", "")}`;
  });
  // Remove every remaining markup tag.
  const plain = marked.replace(/<\/?[^>]+(>|$)/g, "");

  const words = [];
  plain
    .split(/\s+/)
    // Runs of whitespace must not produce empty word entries.
    .filter(token => token !== "")
    .forEach(token => {
      if (token.includes("==")) {
        const pair = token.split("==");
        const key = pair[0];
        // A marker with no word in front of it carries nothing to record.
        if (key === "" || key === "##") {
          return;
        }
        words.push({
          word: cleanWord(key),
          time: timeString2ms(pair[1])
        });
      } else {
        words.push({
          word: cleanWord(token),
          time: undefined
        });
      }
    });
  return words;
}

// helpers
//   http://codereview.stackexchange.com/questions/45335/milliseconds-to-time-string-time-string-to-milliseconds
/**
 * Convert a clock-style time string to milliseconds.
 *
 * Accepted shapes are "HH:MM:SS", "MM:SS" and "SS", each optionally followed
 * by "." and a fractional part. The fraction is read as a decimal fraction of
 * a second with millisecond precision, so ".5" is 500 ms and ".2567" is
 * truncated to 256 ms. Three-digit fractions (the WebVTT form) are taken
 * as-is.
 *
 * @param {string} a - The time string to convert.
 * @param {*} [b] - Ignored; kept only so the signature stays unchanged.
 * @returns {number} The time in milliseconds, or NaN when a clock field is
 *   not numeric.
 */
function timeString2ms(a, b) {
  const pieces = a.split(".");
  const fraction = pieces[1];

  // Pad/truncate to exactly three digits so the value is always milliseconds.
  // A missing or non-numeric fraction counts as 0.
  const millis =
    fraction === undefined ? 0 : Number((fraction + "00").slice(0, 3)) || 0;

  const clock = pieces[0].split(":");
  let seconds;
  if (clock[2]) {
    // HH:MM:SS
    seconds = Number(clock[0]) * 3600 + Number(clock[1]) * 60 + Number(clock[2]);
  } else if (clock[1]) {
    // MM:SS
    seconds = Number(clock[0]) * 60 + Number(clock[1]);
  } else {
    // SS
    seconds = Number(clock[0]);
  }
  return millis + seconds * 1e3;
}

/**
 * Normalise a single word: keep only ASCII letters, digits, apostrophes and
 * dashes, then lower-case what is left.
 *
 * @param {string} word - Raw token taken from the cue text.
 * @returns {string} The cleaned, lower-cased word (possibly empty).
 */
function cleanWord(word) {
  const stripped = word.replace(/[^0-9a-z'-]/gi, "");
  return stripped.toLowerCase();
}

/**
 * Make a shallow copy of a value.
 *
 * Primitives, null and undefined are returned unchanged. For objects, a new
 * object with the same prototype (or a new array, for arrays) receives every
 * own enumerable string-keyed property of the source; nested values are
 * shared, not copied.
 *
 * @param {*} obj - The value to copy.
 * @returns {*} The shallow copy, or `obj` itself when it is not an object.
 */
function clone(obj) {
  if (obj === null || typeof obj !== "object") {
    return obj;
  }
  // Build the target from the prototype instead of calling obj.constructor()
  // without `new`, which throws for classes and null-prototype objects and
  // can run arbitrary constructor code.
  const copy = Array.isArray(obj)
    ? []
    : Object.create(Object.getPrototypeOf(obj));
  // Object.keys yields own enumerable string keys, so it does not rely on
  // obj.hasOwnProperty being present and unshadowed.
  Object.keys(obj).forEach(key => {
    copy[key] = obj[key];
  });
  return copy;
}

// Expose vttToJson as this CommonJS module's only export.
module.exports = vttToJson;
xixiaofinland commented 7 years ago

@nitank the author's version does not parse vtt correctly in many other circumstances; you can give my version a try if you want.

joegesualdo commented 6 years ago

@Xixiao007 Please consider opening a PR.