/// <summary>
/// Asynchronously reads the training documents and builds one token list per
/// sentence, marking the final token of each sentence as a sentence end.
/// </summary>
/// <remarks>
/// Only lines starting with <c># text =</c> are used; everything after that marker,
/// trimmed, is treated as the sentence text (presumably the sentence-text comment of
/// CoNLL-U style corpora; confirm against the training data). Sentences for which
/// the tokenizer produces no tokens are skipped rather than raising an exception.
/// </remarks>
/// <param name="trainDocuments">Paths of the files to read, processed in order.</param>
/// <param name="ConvertCase">
/// <c>ForceUpper</c> / <c>ForceLower</c> convert each sentence with the invariant culture
/// before tokenization; any other value leaves the text unchanged.
/// </param>
/// <param name="sentenceDetector">Detector whose tokenizer is used to split each sentence.</param>
/// <returns>One list of tokens per non-empty sentence, in file and line order.</returns>
private static async Task<List<List<SentenceDetector.SentenceDetectorToken>>> ReadCorpusAsync(List<string> trainDocuments, EnumCase ConvertCase, SentenceDetector sentenceDetector)
{
    const string textPrefix = "# text =";

    var corpus = new List<List<SentenceDetector.SentenceDetectorToken>>();

    foreach (var file in trainDocuments)
    {
        var lines = await File.ReadAllLinesAsync(file);

        foreach (var line in lines)
        {
            // The marker is a fixed file-format token, so compare ordinally instead of
            // using the current culture's linguistic rules.
            if (!line.StartsWith(textPrefix, System.StringComparison.Ordinal))
            {
                continue;
            }

            // The first '=' on the line is the one inside the marker, so the text is
            // everything after the prefix.
            var sentence = line.Substring(textPrefix.Length).Trim();

            if (ConvertCase == EnumCase.ForceUpper)
            {
                sentence = sentence.ToUpperInvariant();
            }
            else if (ConvertCase == EnumCase.ForceLower)
            {
                sentence = sentence.ToLowerInvariant();
            }

            var tokens = sentenceDetector.SentenceDetectorTokenizer(sentence)
                                         .Select(t => new SentenceDetector.SentenceDetectorToken(t.Value, t.Begin, t.End))
                                         .ToList();

            // A blank "# text =" line (or text the tokenizer drops entirely) yields no
            // tokens; there is nothing to mark as a sentence end, so skip it.
            if (tokens.Count == 0)
            {
                continue;
            }

            tokens[tokens.Count - 1].IsSentenceEnd = true;
            corpus.Add(tokens);
        }
    }

    return corpus;
}
/// <summary>
/// Reads the training documents synchronously and builds one token list per
/// sentence, marking the final token of each sentence as a sentence end.
/// </summary>
/// <remarks>
/// Only lines starting with <c># text =</c> are used; everything after that marker,
/// trimmed, is treated as the sentence text (presumably the sentence-text comment of
/// CoNLL-U style corpora; confirm against the training data). Sentences for which
/// the tokenizer produces no tokens are skipped rather than raising an exception.
/// </remarks>
/// <param name="trainDocuments">Paths of the files to read, processed in order.</param>
/// <param name="ConvertCase">
/// <c>ForceUpper</c> / <c>ForceLower</c> convert each sentence with the invariant culture
/// before tokenization; any other value leaves the text unchanged.
/// </param>
/// <param name="sentenceDetector">Detector whose tokenizer is used to split each sentence.</param>
/// <returns>One list of tokens per non-empty sentence, in file and line order.</returns>
private static List<List<SentenceDetector.SentenceDetectorToken>> ReadCorpus(List<string> trainDocuments, EnumCase ConvertCase, SentenceDetector sentenceDetector)
{
    const string textPrefix = "# text =";

    var corpus = new List<List<SentenceDetector.SentenceDetectorToken>>();

    foreach (var file in trainDocuments)
    {
        foreach (var line in File.ReadAllLines(file))
        {
            // The marker is a fixed file-format token, so compare ordinally instead of
            // using the current culture's linguistic rules.
            if (!line.StartsWith(textPrefix, System.StringComparison.Ordinal))
            {
                continue;
            }

            // The first '=' on the line is the one inside the marker, so the text is
            // everything after the prefix.
            var sentence = line.Substring(textPrefix.Length).Trim();

            if (ConvertCase == EnumCase.ForceUpper)
            {
                sentence = sentence.ToUpperInvariant();
            }
            else if (ConvertCase == EnumCase.ForceLower)
            {
                sentence = sentence.ToLowerInvariant();
            }

            var tokens = sentenceDetector.SentenceDetectorTokenizer(sentence)
                                         .Select(t => new SentenceDetector.SentenceDetectorToken(t.Value, t.Begin, t.End))
                                         .ToList();

            // A blank "# text =" line (or text the tokenizer drops entirely) yields no
            // tokens; there is nothing to mark as a sentence end, so skip it.
            if (tokens.Count == 0)
            {
                continue;
            }

            tokens[tokens.Count - 1].IsSentenceEnd = true;
            corpus.Add(tokens);
        }
    }

    return corpus;
}