/// <summary>
/// Builds a pipeline for <paramref name="language"/> made of a <c>FastTokenizer</c> and,
/// when one can be loaded, a sentence detector.
/// </summary>
/// <param name="language">
/// Language assigned to the pipeline and passed to the tokenizer. <c>Language.Any</c> is
/// mapped to English when loading the sentence detector.
/// </param>
/// <returns>
/// The pipeline. It contains only the tokenizer when no sentence detector could be loaded.
/// </returns>
public static async Task<Pipeline> TokenizerForAsync(Language language)
{
    var p = new Pipeline() { Language = language };
    p.Add(new FastTokenizer(language));

    // Uses the English sentence detector as the default for Language.Any.
    var requested = (language == Language.Any) ? Language.English : language;
    var alreadyEnglish = requested == Language.English;

    IProcess sd = null;
    try
    {
        sd = await SentenceDetector.FromStoreAsync(requested, -1, "");
        p.Add(sd);
    }
    catch
    {
        if (alreadyEnglish)
        {
            // The first attempt was the English model itself, so there is nothing left to fall back to.
            Logger.LogWarning("Could not find sentence detector model for language {LANGUAGE}. Continuing without one", Language.English);
        }
        else
        {
            Logger.LogWarning("Could not find sentence detector model for language {LANGUAGE}. Falling back to english model", language);
        }
    }

    // Only retry with English when the first attempt was for a different language;
    // repeating the identical load would just fail again and log a misleading message.
    if (sd is null && !alreadyEnglish)
    {
        try
        {
            sd = await SentenceDetector.FromStoreAsync(Language.English, -1, "");
            p.Add(sd);
        }
        catch
        {
            Logger.LogWarning("Could not find sentence detector model for language {LANGUAGE}. Continuing without one", Language.English);
        }
    }

    return p;
}
/// <summary>
/// Builds a single pipeline covering several languages. For each language, in order, a
/// <c>FastTokenizer</c> is added, optionally followed by a sentence detector and a tagger
/// loaded from the store.
/// </summary>
/// <param name="languages">Languages to add processes for, in enumeration order.</param>
/// <param name="sentenceDetector">When true, a sentence detector is loaded and added for each language.</param>
/// <param name="tagger">When true, an <c>AveragePerceptronTagger</c> is loaded and added for each language.</param>
/// <returns>A pipeline over all collected processes, with its language set to <c>Language.Any</c>.</returns>
public static async Task<Pipeline> For(IEnumerable<Language> languages, bool sentenceDetector = true, bool tagger = true)
{
    var steps = new List<IProcess>();

    foreach (var current in languages)
    {
        steps.Add(new FastTokenizer(current));

        if (sentenceDetector)
        {
            var detector = await SentenceDetector.FromStoreAsync(current, -1, "");
            steps.Add(detector);
        }

        if (tagger)
        {
            var posTagger = await AveragePerceptronTagger.FromStoreAsync(current, -1, "");
            steps.Add(posTagger);
        }
    }

    return new Pipeline(steps) { Language = Language.Any };
}
/// <summary>
/// Asynchronously reads the given corpus files and converts every line starting with
/// "# text =" into a tokenized sentence.
/// </summary>
/// <param name="trainDocuments">Paths of the files to read; all their lines are concatenated in order.</param>
/// <param name="ConvertCase">
/// <c>ForceUpper</c> / <c>ForceLower</c> convert each sentence with the invariant culture
/// before tokenizing; any other value leaves the text untouched.
/// </param>
/// <param name="sentenceDetector">Detector whose <c>SentenceDetectorTokenizer</c> is used to split each sentence into tokens.</param>
/// <returns>
/// One token list per sentence, with the last token of each list flagged as a sentence end.
/// Sentences that produce no tokens are skipped.
/// </returns>
private static async Task<List<List<SentenceDetector.SentenceDetectorToken>>> ReadCorpusAsync(List<string> trainDocuments, EnumCase ConvertCase, SentenceDetector sentenceDetector)
{
    var allLines = new List<string>();
    foreach (var file in trainDocuments)
    {
        allLines.AddRange(await File.ReadAllLinesAsync(file));
    }

    // The marker is a fixed ASCII prefix, so compare ordinally rather than with the current culture.
    // Everything after the first '=' is the sentence text.
    var sentences = allLines.Where(l => l.StartsWith("# text =", System.StringComparison.Ordinal))
                            .Select(l => l.Split(new char[] { '=' }, 2).Last().Trim());

    if (ConvertCase == EnumCase.ForceUpper)
    {
        sentences = sentences.Select(s => s.ToUpperInvariant());
    }
    else if (ConvertCase == EnumCase.ForceLower)
    {
        sentences = sentences.Select(s => s.ToLowerInvariant());
    }

    var corpus = new List<List<SentenceDetector.SentenceDetectorToken>>();
    foreach (var s in sentences)
    {
        var tk = sentenceDetector.SentenceDetectorTokenizer(s)
                                 .Select(t => new SentenceDetector.SentenceDetectorToken(t.Value, t.Begin, t.End))
                                 .ToList();

        // An empty sentence (e.g. a bare "# text =" line) has no last token to flag;
        // calling Last() on it would throw InvalidOperationException, so skip it.
        if (tk.Count == 0)
        {
            continue;
        }

        tk.Last().IsSentenceEnd = true;
        corpus.Add(tk);
    }

    return corpus;
}
/// <summary>
/// Trains sentence detectors from the "*-train.conllu" files found under
/// <paramref name="udSource"/>, once per language and per case mode (original, upper, lower).
/// </summary>
/// <param name="udSource">Root directory searched recursively for "*-train.conllu" and "*-dev.conllu" files.</param>
/// <param name="languagesDirectory">
/// Root directory under which "&lt;Language&gt;/Resources/sentence-detector[-tag].bin" and the matching
/// ".score" file are written for every model whose training score is above 90.
/// </param>
/// <returns>A task that completes when all languages have been trained in all three case modes.</returns>
public static async Task Train(string udSource, string languagesDirectory)
{
    // Groups corpus files by the language code at the start of the file name:
    // underscores are treated like dashes and the first dash-separated part is the key
    // (e.g. "en_ewt-ud-train" -> "en").
    Dictionary<string, List<string>> GroupByLanguageCode(IEnumerable<string> files)
    {
        return files.GroupBy(f => Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First())
                    .ToDictionary(g => g.Key, g => g.ToList());
    }

    var trainFiles = Directory.GetFiles(udSource, "*-train.conllu", SearchOption.AllDirectories);
    var testFiles = Directory.GetFiles(udSource, "*-dev.conllu", SearchOption.AllDirectories);

    var trainFilesPerLanguage = GroupByLanguageCode(trainFiles);

    // Not consumed yet: kept for the evaluation step described in the TODO below.
    var testFilesPerLanguage = GroupByLanguageCode(testFiles);

    var languages = new List<(Language language, string lang)>();
    foreach (var lang in trainFilesPerLanguage.Keys)
    {
        try
        {
            var language = Languages.CodeToEnum(lang);
            languages.Add((language, lang));
        }
        catch
        {
            // Codes that cannot be mapped to a Language value are skipped, not fatal.
            Logger.LogWarning($"Unknown language {lang}");
        }
    }

    Logger.LogInformation($"Found these languages for training: {string.Join(", ", languages.Select(l => l.language))}");

    foreach (var forceCase in new[] { EnumCase.Original, EnumCase.ForceUpper, EnumCase.ForceLower })
    {
        // All languages are trained concurrently for one case mode before moving to the next mode.
        await Task.WhenAll(languages.Select(async v =>
        {
            // Presumably yields so the synchronous part of each lambda does not run inline
            // while the task list is still being built.
            await Task.Yield();

            var (language, lang) = v;

            var modelTag = forceCase == EnumCase.ForceUpper ? "upper"
                         : forceCase == EnumCase.ForceLower ? "lower"
                         : "";

            var sentenceDetector = new SentenceDetector(language, 0, modelTag);

            var trainDocuments = await ReadCorpusAsync(trainFilesPerLanguage[lang], ConvertCase: forceCase, sentenceDetector: sentenceDetector);

            //TODO: Implement test
            //if(testFilesPerLanguage.TryGetValue(lang, out var testFile))
            //{
            //    var testDocuments = ReadUniversalDependencyCorpus(testFile, ConvertCase: forceCase, sentenceDetector: sentenceDetector);
            //}

            Logger.LogInformation($"Now training {lang} in mode {forceCase} using files {string.Join(", ", trainFilesPerLanguage[lang])}");
            var scoreTest = sentenceDetector.Train(trainDocuments);
            Logger.LogInformation($"Finished training {lang} in mode {forceCase}");

            await sentenceDetector.StoreAsync();

            if (scoreTest > 90)
            {
                //Prepare models for new nuget-based distribution
                var resDir = Path.Combine(languagesDirectory, language.ToString(), "Resources");
                Directory.CreateDirectory(resDir);

                var tagSuffix = string.IsNullOrEmpty(modelTag) ? "" : "-" + modelTag;
                var modelPath = Path.Combine(resDir, $"sentence-detector{tagSuffix}.bin");
                var scorePath = Path.Combine(resDir, $"sentence-detector{tagSuffix}.score");

                // FileMode.Create truncates an existing file. File.OpenWrite would keep the old
                // length, leaving stale trailing bytes whenever the new model is smaller.
                using (var f = new FileStream(modelPath, FileMode.Create, FileAccess.Write, FileShare.None))
                {
                    await sentenceDetector.StoreAsync(f);
                }

                await File.WriteAllTextAsync(scorePath, $"{scoreTest:0.0}%");
            }
        }).ToArray());
    }
}
/// <summary>
/// Reads the given corpus files and converts every line starting with "# text =" into a
/// tokenized sentence.
/// </summary>
/// <param name="trainDocuments">Paths of the files to read; their lines are processed in order.</param>
/// <param name="ConvertCase">
/// <c>ForceUpper</c> / <c>ForceLower</c> convert each sentence with the invariant culture
/// before tokenizing; any other value leaves the text untouched.
/// </param>
/// <param name="sentenceDetector">Detector whose <c>SentenceDetectorTokenizer</c> is used to split each sentence into tokens.</param>
/// <returns>
/// One token list per sentence, with the last token of each list flagged as a sentence end.
/// Sentences that produce no tokens are skipped.
/// </returns>
private static List<List<SentenceDetector.SentenceDetectorToken>> ReadCorpus(List<string> trainDocuments, EnumCase ConvertCase, SentenceDetector sentenceDetector)
{
    var allLines = trainDocuments.SelectMany(f => File.ReadAllLines(f));

    // The marker is a fixed ASCII prefix, so compare ordinally rather than with the current culture.
    // Everything after the first '=' is the sentence text.
    var sentences = allLines.Where(l => l.StartsWith("# text =", System.StringComparison.Ordinal))
                            .Select(l => l.Split(new char[] { '=' }, 2).Last().Trim());

    if (ConvertCase == EnumCase.ForceUpper)
    {
        sentences = sentences.Select(s => s.ToUpperInvariant());
    }
    else if (ConvertCase == EnumCase.ForceLower)
    {
        sentences = sentences.Select(s => s.ToLowerInvariant());
    }

    var corpus = new List<List<SentenceDetector.SentenceDetectorToken>>();
    foreach (var s in sentences)
    {
        var tk = sentenceDetector.SentenceDetectorTokenizer(s)
                                 .Select(t => new SentenceDetector.SentenceDetectorToken(t.Value, t.Begin, t.End))
                                 .ToList();

        // An empty sentence (e.g. a bare "# text =" line) has no last token to flag;
        // calling Last() on it would throw InvalidOperationException, so skip it.
        if (tk.Count == 0)
        {
            continue;
        }

        tk.Last().IsSentenceEnd = true;
        corpus.Add(tk);
    }

    return corpus;
}
/// <summary>
/// Passes <paramref name="transcript"/> to <c>SentenceDetector.SentenceDetect</c> and returns its result unchanged.
/// </summary>
/// <param name="transcript">Text handed to the sentence detector.</param>
/// <returns>The array produced by <c>SentenceDetector.SentenceDetect</c>.</returns>
public static string[] DetectSentences(string transcript) => SentenceDetector.SentenceDetect(transcript);
/// <summary>
/// Returns the cached sentence detector, resolving it from "en-sent.bin" on first use
/// and storing it in <c>_sentenceDetector</c> for later calls.
/// </summary>
/// <returns>The detector held in <c>_sentenceDetector</c>.</returns>
private SentenceDetector GetSentenceDetector()
{
    if (_sentenceDetector is null)
    {
        _sentenceDetector = ResolveOpenNlpTool<SentenceModel, SentenceDetectorME>("en-sent.bin");
    }

    return _sentenceDetector;
}