Example #1
0
        /// <summary>
        /// Builds a pipeline for <paramref name="language"/> made of a <see cref="FastTokenizer"/> and,
        /// when a model can be loaded, a sentence detector.
        /// </summary>
        /// <remarks>
        /// <see cref="Language.Any"/> uses the English sentence detector. If the model for the requested
        /// language cannot be loaded, the English model is tried once as a fallback. If no model can be
        /// loaded at all, a warning is logged and the pipeline is returned without a sentence detector.
        /// </remarks>
        /// <param name="language">Language assigned to the pipeline and used to select the models.</param>
        /// <returns>The configured pipeline.</returns>
        public static async Task<Pipeline> TokenizerForAsync(Language language)
        {
            var pipeline = new Pipeline()
            {
                Language = language
            };

            pipeline.Add(new FastTokenizer(language));

            // Uses the english sentence detector as a default
            var detectorLanguage = (language == Language.Any) ? Language.English : language;
            var primaryIsEnglish = detectorLanguage == Language.English;

            IProcess detector = null;

            try
            {
                detector = await SentenceDetector.FromStoreAsync(detectorLanguage, -1, "");

                pipeline.Add(detector);
            }
            catch
            {
                if (primaryIsEnglish)
                {
                    // English was the model that just failed, so there is nothing left to fall back to.
                    Logger.LogWarning("Could not find sentence detector model for language {LANGUAGE}. Continuing without one", Language.English);
                }
                else
                {
                    Logger.LogWarning("Could not find sentence detector model for language {LANGUAGE}. Falling back to english model", language);
                }
            }

            // Retrying English right after English failed would only repeat the same failure.
            if (detector is null && !primaryIsEnglish)
            {
                try
                {
                    detector = await SentenceDetector.FromStoreAsync(Language.English, -1, "");

                    pipeline.Add(detector);
                }
                catch
                {
                    Logger.LogWarning("Could not find sentence detector model for language {LANGUAGE}. Continuing without one", Language.English);
                }
            }

            return pipeline;
        }
        /// <summary>
        /// Builds a single pipeline holding, for every entry of <paramref name="languages"/> in order,
        /// a <see cref="FastTokenizer"/> followed by the optional sentence detector and tagger for that language.
        /// </summary>
        /// <param name="languages">Languages to add processes for.</param>
        /// <param name="sentenceDetector">When true, a sentence detector is loaded from the store for each language.</param>
        /// <param name="tagger">When true, an <see cref="AveragePerceptronTagger"/> is loaded from the store for each language.</param>
        /// <returns>A pipeline over all created processes, with its language set to <see cref="Language.Any"/>.</returns>
        public static async Task<Pipeline> For(IEnumerable<Language> languages, bool sentenceDetector = true, bool tagger = true)
        {
            var steps = new List<IProcess>();

            foreach (var current in languages)
            {
                // Tokenizer first, then the optional models, loaded one at a time.
                steps.Add(new FastTokenizer(current));

                if (sentenceDetector)
                {
                    var detector = await SentenceDetector.FromStoreAsync(current, -1, "");
                    steps.Add(detector);
                }

                if (tagger)
                {
                    var posTagger = await AveragePerceptronTagger.FromStoreAsync(current, -1, "");
                    steps.Add(posTagger);
                }
            }

            var pipeline = new Pipeline(steps);
            pipeline.Language = Language.Any;

            return pipeline;
        }
Example #3
0
        /// <summary>
        /// Reads the given files and turns every "# text =" line into a list of sentence detector tokens,
        /// marking the last token of each line as a sentence end.
        /// </summary>
        /// <param name="trainDocuments">Paths of the files to read.</param>
        /// <param name="ConvertCase">
        /// <c>ForceUpper</c> / <c>ForceLower</c> convert each sentence with the invariant culture before tokenizing;
        /// any other value leaves the text unchanged.
        /// </param>
        /// <param name="sentenceDetector">Detector whose <c>SentenceDetectorTokenizer</c> is used to split each sentence.</param>
        /// <returns>One token list per sentence; sentences that produce no tokens are skipped.</returns>
        private static async Task<List<List<SentenceDetector.SentenceDetectorToken>>> ReadCorpusAsync(List<string> trainDocuments, EnumCase ConvertCase, SentenceDetector sentenceDetector)
        {
            var allLines = new List<string>();

            foreach (var file in trainDocuments)
            {
                allLines.AddRange(await File.ReadAllLinesAsync(file));
            }

            // The marker is a fixed ASCII prefix, so compare ordinally instead of with the current culture.
            var sentences = allLines.Where(l => l.StartsWith("# text =", System.StringComparison.Ordinal))
                                    .Select(l => l.Split(new char[] { '=' }, 2).Last().Trim())
                                    .ToList();

            if (ConvertCase == EnumCase.ForceUpper)
            {
                sentences = sentences.Select(s => s.ToUpperInvariant()).ToList();
            }
            if (ConvertCase == EnumCase.ForceLower)
            {
                sentences = sentences.Select(s => s.ToLowerInvariant()).ToList();
            }

            var corpus = new List<List<SentenceDetector.SentenceDetectorToken>>(sentences.Count);

            foreach (var sentence in sentences)
            {
                var tokens = sentenceDetector.SentenceDetectorTokenizer(sentence)
                                             .Select(t => new SentenceDetector.SentenceDetectorToken(t.Value, t.Begin, t.End))
                                             .ToList();

                // An empty "# text =" line yields no tokens; Last() would throw on it.
                if (tokens.Count == 0)
                {
                    continue;
                }

                tokens.Last().IsSentenceEnd = true;
                corpus.Add(tokens);
            }

            return corpus;
        }
Example #4
0
        /// <summary>
        /// Trains one sentence detector per language and per casing mode (original, upper, lower) from the
        /// "*-train.conllu" files found under <paramref name="udSource"/>.
        /// </summary>
        /// <remarks>
        /// The language code is taken from the file name (the part before the first '-' or '_'). Codes that
        /// <c>Languages.CodeToEnum</c> rejects are logged and skipped. Every trained model is saved through
        /// <c>StoreAsync()</c>. Models whose training score is above 90 are also written to
        /// "&lt;languagesDirectory&gt;/&lt;Language&gt;/Resources" together with a ".score" file.
        /// </remarks>
        /// <param name="udSource">Root directory that is searched recursively for corpus files.</param>
        /// <param name="languagesDirectory">Root directory receiving the per-language "Resources" folders.</param>
        /// <returns>A task that completes when all languages have been trained in all casing modes.</returns>
        public static async Task Train(string udSource, string languagesDirectory)
        {
            // Groups corpus files by the language code at the start of the file name.
            Dictionary<string, List<string>> GroupByLanguageCode(IEnumerable<string> files)
            {
                return files.GroupBy(f => Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First())
                            .ToDictionary(g => g.Key, g => g.ToList());
            }

            var trainFiles = Directory.GetFiles(udSource, "*-train.conllu", SearchOption.AllDirectories);
            var testFiles  = Directory.GetFiles(udSource, "*-dev.conllu", SearchOption.AllDirectories);

            var trainFilesPerLanguage = GroupByLanguageCode(trainFiles);
            // Only referenced by the not-yet-implemented test step below.
            var testFilesPerLanguage  = GroupByLanguageCode(testFiles);

            var languages = new List<(Language language, string lang)>();

            foreach (var lang in trainFilesPerLanguage.Keys)
            {
                try
                {
                    var language = Languages.CodeToEnum(lang);
                    languages.Add((language, lang));
                }
                catch
                {
                    Logger.LogWarning($"Unknown language {lang}");
                }
            }

            Logger.LogInformation($"Found these languages for training: {string.Join(", ", languages.Select(l => l.language))}");

            foreach (var forceCase in new [] { EnumCase.Original, EnumCase.ForceUpper, EnumCase.ForceLower })
            {
                // All languages of one casing mode are trained concurrently; modes run one after another.
                await Task.WhenAll(languages.Select(async v =>
                {
                    // Yield so the training work below does not run inline while the tasks are being created.
                    await Task.Yield();

                    var (language, lang) = v;

                    var modelTag = (forceCase == EnumCase.ForceUpper ? "upper" : (forceCase == EnumCase.ForceLower ? "lower" : ""));
                    var sentenceDetector = new SentenceDetector(language, 0, modelTag);

                    var trainDocuments = await ReadCorpusAsync(trainFilesPerLanguage[lang], ConvertCase: forceCase, sentenceDetector: sentenceDetector);

                    //TODO: Implement test
                    //if(testFilesPerLanguage.TryGetValue(lang, out var testFile))
                    //{
                    //    var testDocuments = ReadUniversalDependencyCorpus(testFile, ConvertCase: forceCase, sentenceDetector: sentenceDetector);
                    //}

                    Logger.LogInformation($"Now training {lang} in mode {forceCase} using files {string.Join(", ", trainFilesPerLanguage[lang])}");
                    var scoreTest = sentenceDetector.Train(trainDocuments);
                    Logger.LogInformation($"Finished training {lang} in mode {forceCase}");
                    await sentenceDetector.StoreAsync();

                    if (scoreTest > 90)
                    {
                        //Prepare models for new nuget-based distribution
                        var resDir = Path.Combine(languagesDirectory, language.ToString(), "Resources");

                        Directory.CreateDirectory(resDir);

                        var baseName = $"sentence-detector{(string.IsNullOrEmpty(modelTag) ? "" : "-" + modelTag)}";

                        // File.Create truncates an existing file. File.OpenWrite would keep the old length and
                        // leave stale trailing bytes behind when the new model is smaller than the previous one.
                        using (var f = File.Create(Path.Combine(resDir, baseName + ".bin")))
                        {
                            await sentenceDetector.StoreAsync(f);
                        }

                        // Invariant formatting keeps the score file identical regardless of the machine's culture.
                        await File.WriteAllTextAsync(Path.Combine(resDir, baseName + ".score"), System.FormattableString.Invariant($"{scoreTest:0.0}%"));
                    }
                }).ToArray());
            }
        }
Example #5
0
        /// <summary>
        /// Reads the given files and turns every "# text =" line into a list of sentence detector tokens,
        /// marking the last token of each line as a sentence end.
        /// </summary>
        /// <param name="trainDocuments">Paths of the files to read.</param>
        /// <param name="ConvertCase">
        /// <c>ForceUpper</c> / <c>ForceLower</c> convert each sentence with the invariant culture before tokenizing;
        /// any other value leaves the text unchanged.
        /// </param>
        /// <param name="sentenceDetector">Detector whose <c>SentenceDetectorTokenizer</c> is used to split each sentence.</param>
        /// <returns>One token list per sentence; sentences that produce no tokens are skipped.</returns>
        private static List<List<SentenceDetector.SentenceDetectorToken>> ReadCorpus(List<string> trainDocuments, EnumCase ConvertCase, SentenceDetector sentenceDetector)
        {
            var allLines = trainDocuments.SelectMany(f => File.ReadAllLines(f));

            // The marker is a fixed ASCII prefix, so compare ordinally instead of with the current culture.
            var sentences = allLines.Where(l => l.StartsWith("# text =", System.StringComparison.Ordinal))
                                    .Select(l => l.Split(new char[] { '=' }, 2).Last().Trim())
                                    .ToList();

            if (ConvertCase == EnumCase.ForceUpper)
            {
                sentences = sentences.Select(s => s.ToUpperInvariant()).ToList();
            }
            if (ConvertCase == EnumCase.ForceLower)
            {
                sentences = sentences.Select(s => s.ToLowerInvariant()).ToList();
            }

            var corpus = new List<List<SentenceDetector.SentenceDetectorToken>>(sentences.Count);

            foreach (var sentence in sentences)
            {
                var tokens = sentenceDetector.SentenceDetectorTokenizer(sentence)
                                             .Select(t => new SentenceDetector.SentenceDetectorToken(t.Value, t.Begin, t.End))
                                             .ToList();

                // An empty "# text =" line yields no tokens; Last() would throw on it.
                if (tokens.Count == 0)
                {
                    continue;
                }

                tokens.Last().IsSentenceEnd = true;
                corpus.Add(tokens);
            }

            return corpus;
        }
Example #6
0
 /// <summary>
 /// Passes <paramref name="transcript"/> to <c>SentenceDetector.SentenceDetect</c> and returns its result.
 /// </summary>
 /// <param name="transcript">Text to split.</param>
 /// <returns>The array returned by <c>SentenceDetector.SentenceDetect</c>.</returns>
 public static string[] DetectSentences(string transcript)
 {
     string[] sentences = SentenceDetector.SentenceDetect(transcript);

     return sentences;
 }
Example #7
0
 /// <summary>
 /// Returns the cached sentence detector, creating it from "en-sent.bin" on first use.
 /// </summary>
 /// <returns>The instance stored in <c>_sentenceDetector</c>.</returns>
 private SentenceDetector GetSentenceDetector()
 {
     // Resolve the tool only once; later calls reuse the stored instance.
     if (_sentenceDetector == null)
     {
         _sentenceDetector = ResolveOpenNlpTool<SentenceModel, SentenceDetectorME>("en-sent.bin");
     }

     return _sentenceDetector;
 }