AlexPoint / SubtitlesParser

Multi formats subtitles parser in C#
MIT License
134 stars 40 forks source link

Stream is not in a valid Youtube XML format #37

Open nazar322 opened 2 years ago

nazar322 commented 2 years ago

Calling YtXmlFormatParser.ParseStream throws System.ArgumentException: 'Stream is not in a valid Youtube XML format'.

The code is as follows

// Parser under test: the YouTube XML format parser from SubtitlesParser.
var ytSubtitlesParser = new SubtitlesParser.Classes.Parsers.YtXmlFormatParser();
List<SubtitlesParser.Classes.SubtitleItem> subtitleItems;

// Encode the caption text as UTF-8 and expose it to the parser as an in-memory stream.
var subtitleBytes = Encoding.UTF8.GetBytes(subtitles);
using (var stream = new MemoryStream(subtitleBytes))
{
    // The same encoding (UTF-8) is passed to the parser as was used to build the stream.
    subtitleItems = ytSubtitlesParser.ParseStream(stream, Encoding.UTF8);
}

The YouTube captions that reproduce the problem are attached: yt-video-oPnDOxMXlUc.zip

nazar322 commented 2 years ago
/// <summary>
/// Matches one caption paragraph of the form
/// <c>&lt;p t="START" d="DURATION"&gt;TEXT&lt;/p&gt;</c>. Built once and reused,
/// because constructing a <see cref="Regex"/> on every call is wasted work.
/// <see cref="RegexOptions.Singleline"/> lets the text group span line breaks.
/// </summary>
private static readonly Regex YouTubeParagraphExpression = new Regex(
    "<p\\st=\"(?<timestamp>\\d+)\"\\sd=\"(?<duration>\\d+)\">(?<text>.*?)</p>",
    RegexOptions.Singleline);

/// <summary>
/// Converts YouTube XML captions into SRT text.
/// </summary>
/// <param name="ytSubtitles">
/// Caption XML containing <c>&lt;p t="..." d="..."&gt;</c> elements, where <c>t</c> is the
/// start offset and <c>d</c> the duration, both in milliseconds.
/// </param>
/// <returns>
/// One SRT cue per matched paragraph: a 1-based sequence number, a
/// <c>HH:MM:SS,mmm --> HH:MM:SS,mmm</c> time line, the caption text and a blank line.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="ytSubtitles"/> is null.</exception>
/// <exception cref="FormatException">No caption paragraph was found in the input.</exception>
private string ConvertYouTubeXmlToSrtFormat(string ytSubtitles)
{
    if (ytSubtitles == null)
    {
        throw new ArgumentNullException(nameof(ytSubtitles));
    }

    var matchedSubtitles = YouTubeParagraphExpression.Matches(ytSubtitles);

    if (matchedSubtitles.Count == 0)
    {
        // FormatException derives from Exception, so existing catch blocks keep working.
        throw new FormatException("Failed to extract subtitles");
    }

    // Rough pre-size: about 50 characters per cue.
    var srtWriter = new StringBuilder(matchedSubtitles.Count * 50);
    var sequenceNumber = 0;

    foreach (Match matchedSubtitle in matchedSubtitles)
    {
        sequenceNumber++;
        srtWriter.AppendLine(sequenceNumber.ToString(System.Globalization.CultureInfo.InvariantCulture));

        var start = ParseCaptionMilliseconds(matchedSubtitle.Groups["timestamp"].Value);
        var end = start + ParseCaptionMilliseconds(matchedSubtitle.Groups["duration"].Value);
        srtWriter.AppendLine($"{FormatSrtTime(start)} --> {FormatSrtTime(end)}");

        // The text is lifted out of the XML by regex, so entities such as &amp; or &#39;
        // are still escaped; decode them because SRT carries plain text.
        srtWriter.AppendLine(System.Net.WebUtility.HtmlDecode(matchedSubtitle.Groups["text"].Value));

        srtWriter.AppendLine();
    }

    return srtWriter.ToString();
}

/// <summary>Parses a run of decimal digits as a millisecond count, independent of the current culture.</summary>
private static TimeSpan ParseCaptionMilliseconds(string digits)
{
    var milliseconds = long.Parse(
        digits,
        System.Globalization.NumberStyles.None,
        System.Globalization.CultureInfo.InvariantCulture);

    return TimeSpan.FromMilliseconds(milliseconds);
}

/// <summary>
/// Formats a time as <c>HH:MM:SS,mmm</c>. Hours are computed from days and hours together,
/// because the TimeSpan "hh" specifier only yields 00-23 and would wrap at 24 hours.
/// </summary>
private static string FormatSrtTime(TimeSpan time)
{
    var totalHours = (time.Days * 24) + time.Hours;

    return string.Format(
        System.Globalization.CultureInfo.InvariantCulture,
        "{0:00}:{1:00}:{2:00},{3:000}",
        totalHours,
        time.Minutes,
        time.Seconds,
        time.Milliseconds);
}