/// <summary>
/// Downloads a subtitle file and splits its text into sentences.
/// </summary>
/// <remarks>
/// The downloaded bytes are written to <c>file.txt</c> in the application base
/// directory and parsed from there. A sentence ends at a <c>'.'</c> or <c>'?'</c>;
/// text after the last terminator is not returned. A failure while parsing is
/// logged to the console and whatever was collected up to that point is returned.
/// </remarks>
/// <param name="token">Value sent in the <c>authorization</c> request header.</param>
/// <param name="vttUrl">Address the subtitle file is downloaded from.</param>
/// <returns>The extracted sentences after <c>RemoveSmallTalks</c> has processed them.</returns>
private static List<string> ExtractVttContent(string token, string vttUrl)
{
    List<string> sentences = new List<string>();

    byte[] content;
    using (var client = new WebClient())
    {
        client.Headers.Clear();
        client.Headers.Add("authorization", token);
        content = client.DownloadData(vttUrl);
    }

    string fileName = "file.txt";
    string path = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, fileName);

    // Creates or overwrites the file and always releases the handle, even when the write fails.
    File.WriteAllBytes(path, content);

    SubtitlesParser.Classes.Parsers.SubParser parser = new SubtitlesParser.Classes.Parsers.SubParser();
    using (var fileStream = new FileStream(path, FileMode.Open, FileAccess.Read))
    {
        try
        {
            var mostLikelyFormat = parser.GetMostLikelyFormat(fileStream.Name);
            var items = parser.ParseStream(fileStream, Encoding.UTF8, mostLikelyFormat);

            // The buffer starts with a single space, so the first sentence carries a leading space.
            var sentence = new StringBuilder(" ");
            foreach (var item in items)
            {
                foreach (var line in item.Lines)
                {
                    foreach (var character in line)
                    {
                        sentence.Append(character);
                        if (character == '.' || character == '?')
                        {
                            sentences.Add(sentence.ToString());
                            sentence.Clear();
                        }
                    }

                    // Subtitle lines are joined with a space so words do not run together.
                    sentence.Append(' ');
                }
            }

            // Remove small talk from the transcript sentences.
            RemoveSmallTalks(sentences);
        }
        catch (Exception ex)
        {
            // Best effort: keep returning what was collected, but do not hide the failure.
            Console.WriteLine("Parsing of file {0}: FAILURE\n{1}", path, ex);
        }
    }

    return sentences;
}
/// <summary>
/// Scans every file in a folder with the subtitle parser and collects a
/// <c>Sample</c> for each subtitle line that contains the first vocabulary word.
/// </summary>
/// <remarks>
/// Files are processed in parallel. Punctuation is stripped from each line, the
/// line is split on single spaces, and the words are compared case-insensitively.
/// The text of every matching sequence is echoed to the console. A file that
/// fails to parse is logged and skipped.
/// NOTE(review): only <c>vocab[0]</c> is searched; the remaining entries are
/// ignored — confirm whether that is intended.
/// </remarks>
/// <param name="vocab">Vocabulary words; only the first entry is used.</param>
/// <param name="SubtilesFolder">Folder whose files are scanned.</param>
/// <returns>One sample per matching subtitle line; empty when the folder has no files.</returns>
public static IEnumerable<Sample> SearchVocabularyInSubtitles(string[] vocab, string SubtilesFolder)
{
    var samplesWithVocab = new List<Sample>();

    // List<T> is not safe for concurrent writers; every Add below goes through this gate.
    var samplesGate = new object();

    string[] files = System.IO.Directory.GetFiles(SubtilesFolder);
    if (files.Length == 0)
    {
        // Nothing to scan; also avoids indexing into an empty array below.
        return samplesWithVocab;
    }

    Console.WriteLine($"Files : {files[0]}");

    Parallel.ForEach(files, currentFile =>
    {
        string fileName = System.IO.Path.GetFullPath(currentFile);
        var parser = new SubtitlesParser.Classes.Parsers.SubParser();
        using (var fileStream = File.OpenRead(fileName))
        {
            try
            {
                var mostLikelyFormat = parser.GetMostLikelyFormat(fileName);
                var items = parser.ParseStream(fileStream, Encoding.UTF8, mostLikelyFormat);
                foreach (var sequence in items)
                {
                    foreach (var sentence in sequence.Lines)
                    {
                        // Strip punctuation.
                        var sentenceStripped = Regex.Replace(sentence, @"[^\w\s]", "");

                        // Look for the vocabulary word in the line, ignoring case.
                        var words = sentenceStripped.Split(" ");
                        if (!words.Contains(vocab[0], StringComparer.OrdinalIgnoreCase))
                        {
                            continue;
                        }

                        // NOTE(review): TimeSpan(long) interprets its argument as ticks;
                        // confirm the unit of StartTime/EndTime produced by the parser.
                        var sample = new Sample
                        {
                            StartTime = new TimeSpan(sequence.StartTime),
                            EndTime = new TimeSpan(sequence.EndTime),
                            SubFileName = fileName
                        };
                        lock (samplesGate)
                        {
                            samplesWithVocab.Add(sample);
                        }

                        // Echo the whole sequence, each line followed by a space.
                        Console.WriteLine(string.Concat(sequence.Lines.Select(line => line + " ")));
                    }
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine("Parsing of file {0}: FAILURE\n{1}", fileName, ex);
            }
        }
    });

    return samplesWithVocab;
}
/// <summary>
/// Console entry point: parses every file returned by
/// <c>BrowseTestSubtitlesFiles</c> and prints a success or failure report for each,
/// then waits for a line of input before exiting.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    var subParser = new SubtitlesParser.Classes.Parsers.SubParser();

    foreach (var path in BrowseTestSubtitlesFiles())
    {
        var shortName = Path.GetFileName(path);
        using (var input = File.OpenRead(path))
        {
            try
            {
                var format = subParser.GetMostLikelyFormat(shortName);
                var parsed = subParser.ParseStream(input, Encoding.UTF8, format);

                // An empty result goes through the same failure report as a parser error.
                if (!parsed.Any())
                {
                    throw new ArgumentException("Not items found!");
                }

                // Items whose start or end time is not positive are counted as corrupted.
                var total = parsed.Count;
                var corrupted = parsed.Count(entry => entry.StartTime <= 0 || entry.EndTime <= 0);
                var corruptedPercent = (corrupted * 100) / total;

                Console.WriteLine(
                    "Parsing of file {0}: SUCCESS ({1} items - {2}% corrupted)",
                    shortName,
                    total,
                    corruptedPercent);
                Console.WriteLine("----------------");
            }
            catch (Exception failure)
            {
                Console.WriteLine("Parsing of file {0}: FAILURE\n{1}", shortName, failure);
            }
        }

        Console.WriteLine("----------------------");
    }

    Console.ReadLine();
}
/// <summary>
/// Runs the subtitle parser over each test file supplied by
/// <c>BrowseTestSubtitlesFiles</c>, writing one report per file to the console,
/// and blocks on a final read so the output stays visible.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    var subtitleParser = new SubtitlesParser.Classes.Parsers.SubParser();
    var candidates = BrowseTestSubtitlesFiles();

    foreach (var candidate in candidates)
    {
        var displayName = Path.GetFileName(candidate);

        using (var source = File.OpenRead(candidate))
        {
            try
            {
                var detectedFormat = subtitleParser.GetMostLikelyFormat(displayName);
                var entries = subtitleParser.ParseStream(source, Encoding.UTF8, detectedFormat);

                var hasEntries = entries.Any();
                if (hasEntries == false)
                {
                    // Caught just below, so an empty file is reported as a failure.
                    throw new ArgumentException("Not items found!");
                }

                // Share of entries with a non-positive start or end time, as a whole percentage.
                var suspectCount = entries
                    .Where(e => e.StartTime <= 0 || e.EndTime <= 0)
                    .Count();
                Console.WriteLine(
                    "Parsing of file {0}: SUCCESS ({1} items - {2}% corrupted)",
                    displayName,
                    entries.Count,
                    (suspectCount * 100) / entries.Count);

                Console.WriteLine("----------------");
            }
            catch (Exception error)
            {
                Console.WriteLine("Parsing of file {0}: FAILURE\n{1}", displayName, error);
            }
        }

        Console.WriteLine("----------------------");
    }

    Console.ReadLine();
}