コード例 #1
0
        /// <summary>
        /// Trains and stores an <see cref="AveragePerceptronEntityRecognizer"/> from the
        /// <c>*.bz2</c> files found directly under <paramref name="basePath"/>.
        /// </summary>
        /// <param name="basePath">Directory that is scanned for <c>*.bz2</c> files.</param>
        /// <param name="language">Language to train for; only files whose path contains <c>-&lt;code&gt;-</c> are read.</param>
        /// <param name="version">Version passed to the recognizer being trained.</param>
        /// <param name="tag">Tag passed to the recognizer being trained.</param>
        public static async Task TrainAsync(string basePath, Language language, int version, string tag)
        {
            var languageMarker = "-" + Languages.EnumToCode(language) + "-";

            var corpus = new List<IDocument>();

            foreach (var path in Directory.EnumerateFiles(basePath, "*.bz2"))
            {
                // Only keep the files that belong to the requested language
                if (path.Contains(languageMarker))
                {
                    corpus.AddRange(ReadFile(path));
                }
            }

            // The documents are POS-tagged before the entity recognizer is trained on them
            var tagger = await AveragePerceptronTagger.FromStoreAsync(language, -1, "");

            using (new Measure(Logger, "Tagging documents"))
            {
                Parallel.ForEach(corpus, document => tagger.Predict(document));
            }

            var entityTypes = new string[] { "Person", "Organization", "Location" };
            var recognizer  = new AveragePerceptronEntityRecognizer(language, version, tag, entityTypes, ignoreCase: false);

            recognizer.Train(corpus);

            await recognizer.StoreAsync();
        }
コード例 #2
0
        /// <summary>
        /// Sample entry point: registers the English models, sets up console logging,
        /// loads a few models and then runs the entity-recognition demonstrations.
        /// </summary>
        private static async Task Main()
        {
            // Register the built-in English models
            Catalyst.Models.English.Register();

            //Storage.Current = new OnlineRepositoryStorage(new DiskStorage("catalyst-models"));

            Console.OutputEncoding = Encoding.UTF8;

            var loggerFactory = LoggerFactory.Create(lb => lb.AddConsole());
            ApplicationLogging.SetLoggerFactory(loggerFactory);

            // Catalyst currently supports 3 different types of models for Named Entity Recognition (NER):
            // - Gazetteer-like (i.e. [Spotter](https://github.com/curiosity-ai/catalyst/blob/master/Catalyst/src/Models/EntityRecognition/Spotter.cs))
            // - Regex-like (i.e. [PatternSpotter](https://github.com/curiosity-ai/catalyst/blob/master/Catalyst/src/Models/EntityRecognition/PatternSpotter.cs))
            // - Perceptron (i.e. [AveragePerceptronEntityRecognizer](https://github.com/curiosity-ai/catalyst/blob/master/Catalyst/src/Models/EntityRecognition/AveragePerceptronEntityRecognizer.cs))

            //var s = typeof(Catalyst.Models.English).Assembly.GetManifestResourceStream($"{typeof(Catalyst.Models.English).Assembly.GetName().Name}.Resources.sentence-detector.bin");
            //foreach(var name in typeof(Catalyst.Models.English).Assembly.GetManifestResourceNames())
            //{
            //    Console.WriteLine(name);
            //}

            // The loaded models below are not used afterwards; the calls are kept as in the sample
            var sentenceDetector = await SentenceDetector.FromStoreAsync(Language.English, -1, "");

            var freshTagger = new AveragePerceptronTagger(Language.English, 0, "");
            await freshTagger.LoadDataAsync();

            var storedTagger = await AveragePerceptronTagger.FromStoreAsync(Language.English, -1, "");

            await DemonstrateAveragePerceptronEntityRecognizerAndPatternSpotter();

            DemonstrateSpotter();
        }
コード例 #3
0
        /// <summary>
        /// Builds a pipeline for <paramref name="language"/> starting from the tokenizer pipeline
        /// and, when requested, appends a part-of-speech tagger loaded from the store.
        /// </summary>
        /// <param name="language">Language of the pipeline.</param>
        /// <param name="sentenceDetector">Forwarded to <c>TokenizerForAsync</c>.</param>
        /// <param name="tagger">Whether to add the tagger; it is never added for <c>Language.Any</c> or <c>Language.Unknown</c>.</param>
        /// <returns>The assembled pipeline.</returns>
        public static async Task<Pipeline> ForAsync(Language language, bool sentenceDetector = true, bool tagger = true)
        {
            var pipeline = await TokenizerForAsync(language, sentenceDetector);

            var isConcreteLanguage = language != Language.Any && language != Language.Unknown;

            if (!tagger || !isConcreteLanguage)
            {
                return pipeline;
            }

            var posTagger = await AveragePerceptronTagger.FromStoreAsync(language, 0, "");
            pipeline.Add(posTagger);

            return pipeline;
        }
コード例 #4
0
        /// <summary>
        /// Builds a pipeline for <paramref name="language"/> consisting of a <see cref="FastTokenizer"/>
        /// followed by the optional sentence detector and tagger, both loaded from the store.
        /// </summary>
        /// <param name="language">Language of the pipeline and of the loaded models.</param>
        /// <param name="sentenceDetector">Whether to add a <see cref="SentenceDetector"/>.</param>
        /// <param name="tagger">Whether to add an <see cref="AveragePerceptronTagger"/>.</param>
        /// <returns>The assembled pipeline.</returns>
        public static async Task<Pipeline> ForAsync(Language language, bool sentenceDetector = true, bool tagger = true)
        {
            var pipeline = new Pipeline(language);
            pipeline.Add(new FastTokenizer(language));

            if (sentenceDetector)
            {
                var detector = await SentenceDetector.FromStoreAsync(language, 0, "");
                pipeline.Add(detector);
            }

            if (tagger)
            {
                var posTagger = await AveragePerceptronTagger.FromStoreAsync(language, 0, "");
                pipeline.Add(posTagger);
            }

            return pipeline;
        }
コード例 #5
0
        /// <summary>
        /// Synchronous counterpart of the pipeline factory: creates a pipeline with a
        /// <see cref="FastTokenizer"/> and the optional sentence detector and tagger,
        /// waiting on each store load via <c>WaitResult()</c>.
        /// </summary>
        /// <param name="language">Language of the pipeline and of the loaded models.</param>
        /// <param name="sentenceDetector">Whether to add a <see cref="SentenceDetector"/>.</param>
        /// <param name="tagger">Whether to add an <see cref="AveragePerceptronTagger"/>.</param>
        /// <returns>The assembled pipeline.</returns>
        public static Pipeline For(Language language, bool sentenceDetector = true, bool tagger = true)
        {
            var pipeline = new Pipeline(language);
            pipeline.Add(new FastTokenizer(language));

            if (sentenceDetector)
            {
                var detector = SentenceDetector.FromStoreAsync(language, -1, "").WaitResult();
                pipeline.Add(detector);
            }

            if (tagger)
            {
                var posTagger = AveragePerceptronTagger.FromStoreAsync(language, -1, "").WaitResult();
                pipeline.Add(posTagger);
            }

            return pipeline;
        }
コード例 #6
0
        /// <summary>
        /// Builds a single pipeline covering several languages by concatenating, per language,
        /// the processes of its tokenizer pipeline and (optionally) its tagger.
        /// </summary>
        /// <param name="languages">Languages to include, processed in enumeration order.</param>
        /// <param name="sentenceDetector">Forwarded to <c>TokenizerForAsync</c> for every language.</param>
        /// <param name="tagger">Whether to add an <see cref="AveragePerceptronTagger"/> for every language.</param>
        /// <returns>A pipeline whose <c>Language</c> is set to <c>Language.Any</c>.</returns>
        public static async Task<Pipeline> ForManyAsync(IEnumerable<Language> languages, bool sentenceDetector = true, bool tagger = true)
        {
            var steps = new List<IProcess>();

            foreach (var language in languages)
            {
                var tokenizerPipeline = await TokenizerForAsync(language, sentenceDetector);
                steps.AddRange(tokenizerPipeline.Processes);

                if (!tagger)
                {
                    continue;
                }

                var posTagger = await AveragePerceptronTagger.FromStoreAsync(language, -1, "");
                steps.Add(posTagger);
            }

            var combined = new Pipeline(steps);
            combined.Language = Language.Any;

            return combined;
        }
コード例 #7
0
        /// <summary>
        /// Trains an <see cref="AveragePerceptronEntityRecognizer"/> from the <c>*.bz2</c> files found
        /// directly under <paramref name="basePath"/>, stores it, and additionally writes the model to
        /// <c>&lt;languagesDirectory&gt;/&lt;language&gt;/Resources/wikiner.bin</c>.
        /// </summary>
        /// <param name="basePath">Directory that is scanned for <c>*.bz2</c> files.</param>
        /// <param name="language">Language to train for; only files whose path contains <c>-&lt;code&gt;-</c> are read.</param>
        /// <param name="version">Version passed to the recognizer being trained.</param>
        /// <param name="tag">Tag passed to the recognizer being trained.</param>
        /// <param name="languagesDirectory">Root directory under which the per-language <c>Resources</c> folder is created.</param>
        public static async Task TrainAsync(string basePath, Language language, int version, string tag, string languagesDirectory)
        {
            var langMarker = "-" + Languages.EnumToCode(language) + "-";
            var files      = Directory.EnumerateFiles(basePath, "*.bz2").Where(f => f.Contains(langMarker));

            var documents = new List<IDocument>();

            foreach (var f in files)
            {
                documents.AddRange(ReadFile(f));
            }

            // The documents are POS-tagged before the entity recognizer is trained on them
            var pos = await AveragePerceptronTagger.FromStoreAsync(language, -1, "");

            using (new Measure(Logger, "Tagging documents"))
            {
                Parallel.ForEach(documents, doc => pos.Predict(doc));
            }

            var aper = new AveragePerceptronEntityRecognizer(language, version, tag, new string[] { "Person", "Organization", "Location" }, ignoreCase: false);

            aper.Train(documents);

            await aper.StoreAsync();

            //Prepare model for new nuget-based distribution

            var resDir = Path.Combine(languagesDirectory, language.ToString(), "Resources");

            Directory.CreateDirectory(resDir);

            // File.Create truncates an existing file. File.OpenWrite does not, so a previously
            // written, larger model would leave stale bytes after the new content.
            using (var f = File.Create(Path.Combine(resDir, "wikiner.bin")))
            {
                await aper.StoreAsync(f);
            }
        }
コード例 #8
0
        /// <summary>
        /// Evaluates <paramref name="Tagger"/> on the sentences of <paramref name="testDocuments"/>.
        /// The POS tags present on the tokens before prediction are used as the reference; they are
        /// restored on every token once the sentence has been scored.
        /// </summary>
        /// <remarks>
        /// Sentences with at least one wrong tag are appended to <c>mistakes.txt</c>. Accuracy,
        /// throughput and precision/recall/F1 are written to the logger.
        /// </remarks>
        /// <param name="testDocuments">Documents whose spans are evaluated (in parallel).</param>
        /// <param name="Tagger">The tagger under test.</param>
        /// <returns>Token-level accuracy as a percentage (0-100).</returns>
        private static double TestTagger(List<IDocument> testDocuments, AveragePerceptronTagger Tagger)
        {
            var sentences = testDocuments.SelectMany(d => d.Spans).ToList();
            int correct = 0, total = 0;
            var sw = new System.Diagnostics.Stopwatch();

            sw.Start();

            int TP = 0, FN = 0, FP = 0;

            Parallel.ForEach(sentences, s =>
            {
                var tags = s.Tokens.Select(t => t.POS).ToArray();
                Tagger.Predict(s);
                var pred = s.Tokens.Select(t => t.POS).ToArray();
                int correctOnSentence = tags.Zip(pred, (t, p) => t == p ? 1 : 0).Sum();

                // Count into per-sentence locals and merge them atomically below. Incrementing the
                // shared counters directly from several threads loses updates.
                int _TP = 0, _FN = 0, _FP = 0;

                for (int m = 0; m < tags.Length; m++)
                {
                    if (tags[m] == pred[m])
                    {
                        _TP++;
                    }
                    else
                    {
                        //Same if we are not evaluating per-tag precision / recall
                        _FP++;
                        _FN++;
                    }
                }

                Interlocked.Add(ref TP, _TP);
                Interlocked.Add(ref FN, _FN);
                Interlocked.Add(ref FP, _FP);

                if (correctOnSentence < s.TokensCount)
                {
                    // Render the sentence, annotating each wrong token with [P:predicted C:correct]
                    var sb = new StringBuilder();

                    for (int m = 0; m < tags.Length; m++)
                    {
                        sb.Append(s[m].Value);
                        if (tags[m] != pred[m])
                        {
                            sb.Append("[").Append("P:").Append(pred[m]).Append(" C:").Append(tags[m]).Append("]");
                        }
                        sb.Append(" ");
                    }
                    sb.AppendLine();

                    // Serialize writers so that concurrent appends do not collide on the file
                    lock (lockMistake)
                    {
                        File.AppendAllText("mistakes.txt", sb.ToString());
                    }
                }

                Interlocked.Add(ref correct, correctOnSentence);
                Interlocked.Add(ref total, s.TokensCount);

                // Put the reference tags back so the documents can be reused for another evaluation
                for (int i = 0; i < s.TokensCount; i++)
                {
                    s[i].POS = tags[i];
                }
            });
            sw.Stop();

            Logger.LogInformation($"POS: {Math.Round(100D * correct / total, 2)}% at a rate of {Math.Round(1000D * total / sw.ElapsedMilliseconds, 0) } tokens/second");

            var precision = (double)TP / (TP + FP);
            var recall    = (double)TP / (TP + FN);

            Logger.LogInformation($"F1={100 * 2 * (precision * recall) / (precision + recall):0.00}% P={100 * precision:0.00}% R={100 * recall:0.00}% ");

            return 100D * correct / total;
        }
コード例 #9
0
        /// <summary>
        /// Trains POS taggers and dependency parsers for every language that has
        /// <c>*-train.conllu</c> files under <paramref name="udSource"/>, keeps the best of several
        /// randomized attempts in the store, and finally re-evaluates the stored models.
        /// </summary>
        /// <remarks>
        /// Languages are trained in parallel. A language is only trained and evaluated when it has
        /// both train and dev files and its code is recognized by <c>Languages.CodeToEnum</c>.
        /// </remarks>
        /// <param name="udSource">Root directory searched recursively for <c>*-train.conllu</c> and <c>*-dev.conllu</c> files.</param>
        /// <param name="ontonotesSource">
        /// Optional root directory searched recursively for <c>*.parse.ddg</c> files that are merged into
        /// the English training data. When null or blank, no additional corpus is merged.
        /// </param>
        public static void Train(string udSource, string ontonotesSource)
        {
            var trainFiles = Directory.GetFiles(udSource, "*-train.conllu", SearchOption.AllDirectories);
            var testFiles  = Directory.GetFiles(udSource, "*-dev.conllu", SearchOption.AllDirectories);

            List<string> trainFilesOntonotesEnglish = null;

            if (!string.IsNullOrWhiteSpace(ontonotesSource))
            {
                // Files containing "sel_" are only kept when the number after the first '_' is below 3654
                trainFilesOntonotesEnglish = Directory.GetFiles(ontonotesSource, "*.parse.ddg", SearchOption.AllDirectories)
                                             .Where(fn => !fn.Contains("sel_") || int.Parse(Path.GetFileNameWithoutExtension(fn).Split(new char[] { '_', '.' }).Skip(1).First()) < 3654)
                                             .ToList();
            }

            // The language code is the part of the file name before the first '_' or '-'
            var trainFilesPerLanguage = trainFiles.Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f }).GroupBy(f => f.lang).ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList());
            var testFilesPerLanguage  = testFiles.Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f }).GroupBy(f => f.lang).ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList());
            var languages             = trainFilesPerLanguage.Keys.ToList();

            Logger.LogInformation($"Found these languages for training: {string.Join(", ", languages)}");

            int attempts = 5;

            Parallel.ForEach(languages, lang =>
            {
                Language language;
                try
                {
                    language = Languages.CodeToEnum(lang);
                }
                catch
                {
                    Logger.LogWarning($"Unknown language {lang}");
                    return;
                }

                var arcNames = new HashSet<string>();

                if (trainFilesPerLanguage.TryGetValue(lang, out var langTrainFiles) && testFilesPerLanguage.TryGetValue(lang, out var langTestFiles))
                {
                    var trainDocuments = ReadCorpus(langTrainFiles, arcNames, language);
                    var testDocuments  = ReadCorpus(langTestFiles, arcNames, language);

                    // The Ontonotes list is null when no ontonotesSource was given
                    if (language == Language.English && trainFilesOntonotesEnglish != null)
                    {
                        //Merge with Ontonotes 5.0 corpus
                        trainDocuments.AddRange(ReadCorpus(trainFilesOntonotesEnglish, arcNames, language, isOntoNotes: true));
                    }

                    double bestScore = double.MinValue;

                    for (int i = 0; i < attempts; i++)
                    {
                        var tagger = new AveragePerceptronTagger(language, 0);
                        tagger.Train(trainDocuments.AsEnumerable(), (int)(5 + ThreadSafeRandom.Next(15)));

                        // The train-set evaluation is run for its logged output only
                        TestTagger(trainDocuments, tagger);
                        var scoreTest = TestTagger(testDocuments, tagger);

                        if (scoreTest > bestScore)
                        {
                            Logger.LogInformation($"\n>>>>> {lang}: NEW POS BEST: {scoreTest:0.0}%");
                            try
                            {
                                tagger.StoreAsync().Wait();
                            }
                            catch (Exception E)
                            {
                                Logger.LogError(E, $"\n>>>>> {lang}: Failed to store model");
                            }
                            bestScore = scoreTest;
                        }
                        else
                        {
                            Logger.LogInformation($"\n>>>>> {lang}: POS BEST IS STILL : {bestScore:0.0}%");
                        }
                    }

                    bestScore = double.MinValue;

                    for (int i = 0; i < attempts; i++)
                    {
                        var parser = new AveragePerceptronDependencyParser(language, 0 /*, arcNames.ToList()*/);
                        try
                        {
                            parser.Train(trainDocuments.AsEnumerable(), (int)(5 + ThreadSafeRandom.Next(10)), (float)(1D - ThreadSafeRandom.NextDouble() * ThreadSafeRandom.NextDouble()));
                        }
                        catch (Exception E)
                        {
                            Logger.LogInformation("FAIL: " + E.Message);
                            continue;
                        }

                        // The corpus is re-read from disk before the parser is evaluated
                        trainDocuments = ReadCorpus(langTrainFiles, arcNames, language);
                        testDocuments  = ReadCorpus(langTestFiles, arcNames, language);

                        if (language == Language.English && trainFilesOntonotesEnglish != null)
                        {
                            //Merge with Ontonotes 5.0 corpus
                            trainDocuments.AddRange(ReadCorpus(trainFilesOntonotesEnglish, arcNames, language, isOntoNotes: true));
                        }

                        // The train-set evaluation is run for its logged output only
                        TestParser(trainDocuments, parser);
                        var scoreTest = TestParser(testDocuments, parser);

                        if (scoreTest > bestScore)
                        {
                            Logger.LogInformation($"\n>>>>> {lang}: NEW DEP BEST: {scoreTest:0.0}%");
                            try
                            {
                                parser.StoreAsync().Wait();
                            }
                            catch (Exception E)
                            {
                                Logger.LogError(E, $"\n>>>>> {lang}: Failed to store model");
                            }
                            bestScore = scoreTest;
                        }
                        else
                        {
                            Logger.LogInformation($"\n>>>>> {lang}: DEP BEST IS STILL : {bestScore:0.0}%");
                        }
                    }
                }
            });

            foreach (var lang in languages)
            {
                Language language;
                try
                {
                    language = Languages.CodeToEnum(lang);
                }
                catch
                {
                    Logger.LogInformation($"Unknown language {lang}");
                    // Skip only this language; returning here would drop the report for all remaining ones
                    continue;
                }

                // Languages without both train and dev files were not trained above, so there is nothing to report
                if (!trainFilesPerLanguage.TryGetValue(lang, out var langTrainFiles) || !testFilesPerLanguage.TryGetValue(lang, out var langTestFiles))
                {
                    continue;
                }

                var arcNames = new HashSet<string>();

                var trainDocuments = ReadCorpus(langTrainFiles, arcNames, language);
                var testDocuments  = ReadCorpus(langTestFiles, arcNames, language);

                if (language == Language.English && trainFilesOntonotesEnglish != null)
                {
                    //Merge with Ontonotes 5.0 corpus
                    var ontonotesDocuments = ReadCorpus(trainFilesOntonotesEnglish, arcNames, language, isOntoNotes: true);
                    trainDocuments.AddRange(ontonotesDocuments);
                }

                var storedTagger = AveragePerceptronTagger.FromStoreAsync(language, 0, "").WaitResult();
                Logger.LogInformation($"\n{lang} - TAGGER / TRAIN");
                TestTagger(trainDocuments, storedTagger);

                Logger.LogInformation($"\n{lang} - TAGGER / TEST");
                TestTagger(testDocuments, storedTagger);

                // The corpus is re-read from disk before the parser is evaluated
                trainDocuments = ReadCorpus(langTrainFiles, arcNames, language);
                testDocuments  = ReadCorpus(langTestFiles, arcNames, language);

                var storedParser = AveragePerceptronDependencyParser.FromStoreAsync(language, 0, "").WaitResult();
                Logger.LogInformation($"\n{lang} - PARSER / TRAIN");
                TestParser(trainDocuments, storedParser);

                Logger.LogInformation($"\n{lang} - PARSER / TEST");
                TestParser(testDocuments, storedParser);
            }
        }
コード例 #10
0
        /// <summary>
        /// Trains POS taggers and dependency parsers for every language that has both
        /// <c>*-train.conllu</c> and <c>*-dev.conllu</c> files under <paramref name="udSource"/>,
        /// keeps the best of several randomized attempts, and finally re-evaluates the stored models.
        /// </summary>
        /// <remarks>
        /// Models scoring above 80% on the test set are additionally written to
        /// <c>&lt;languagesDirectory&gt;/&lt;language&gt;/Resources/</c> as <c>tagger.bin</c> / <c>parser.bin</c>
        /// together with a <c>.score</c> file.
        /// </remarks>
        /// <param name="udSource">Root directory searched recursively for the <c>.conllu</c> files.</param>
        /// <param name="ontonotesSource">
        /// Optional root directory searched recursively for <c>*.parse.ddg</c> files that are merged into
        /// the English data. When null or blank, no additional corpus is merged.
        /// </param>
        /// <param name="languagesDirectory">Root directory under which the per-language <c>Resources</c> folders are created.</param>
        public static async Task Train(string udSource, string ontonotesSource, string languagesDirectory)
        {
            var trainFiles = Directory.GetFiles(udSource, "*-train.conllu", SearchOption.AllDirectories);
            var testFiles  = Directory.GetFiles(udSource, "*-dev.conllu", SearchOption.AllDirectories);

            List<string> trainFilesOntonotesEnglish = null;

            if (!string.IsNullOrWhiteSpace(ontonotesSource))
            {
                // Files containing "sel_" are only kept when the number after the first '_' is below 3654
                trainFilesOntonotesEnglish = Directory.GetFiles(ontonotesSource, "*.parse.ddg", SearchOption.AllDirectories)
                                             .Where(fn => !fn.Contains("sel_") || int.Parse(Path.GetFileNameWithoutExtension(fn).Split(new char[] { '_', '.' }).Skip(1).First()) < 3654)
                                             .ToList();
            }

            // The language code is the part of the file name before the first '_' or '-'
            var trainFilesPerLanguage = trainFiles.Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f }).GroupBy(f => f.lang).ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList());
            var testFilesPerLanguage  = testFiles.Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f }).GroupBy(f => f.lang).ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList());

            var languages = new List<(Language language, string lang)>();

            foreach (var lang in trainFilesPerLanguage.Keys.Intersect(testFilesPerLanguage.Keys))
            {
                try
                {
                    var language = Languages.CodeToEnum(lang);
                    languages.Add((language, lang));
                }
                catch
                {
                    Logger.LogWarning($"Unknown language {lang}");
                }
            }

            Logger.LogInformation($"Found these languages for training: {string.Join(", ", languages.Select(l => l.language))}");

            int attempts = 5;

            await Task.WhenAll(languages.Select(async v =>
            {
                // Yield so that the tasks of all languages are created before any of them starts working
                await Task.Yield();

                var (language, lang) = (v.language, v.lang);

                var arcNames = new HashSet<string>();

                if (trainFilesPerLanguage.TryGetValue(lang, out var langTrainFiles) && testFilesPerLanguage.TryGetValue(lang, out var langTestFiles))
                {
                    var trainDocuments = await ReadCorpusAsync(langTrainFiles, arcNames, language);
                    var testDocuments  = await ReadCorpusAsync(langTestFiles, arcNames, language);

                    // The Ontonotes list is null when no ontonotesSource was given
                    if (language == Language.English && trainFilesOntonotesEnglish != null)
                    {
                        //Merge with Ontonotes 5.0 corpus
                        // NOTE(review): the first testToTrain files go to the TRAIN set and the rest to the
                        // TEST set, which looks inverted given how testToTrain is computed - confirm intent.
                        var testToTrain = (int)((float)trainFilesOntonotesEnglish.Count * testDocuments.Count / trainDocuments.Count);

                        trainDocuments.AddRange(await ReadCorpusAsync(trainFilesOntonotesEnglish.Take(testToTrain).ToList(), arcNames, language, isOntoNotes: true));
                        testDocuments.AddRange(await ReadCorpusAsync(trainFilesOntonotesEnglish.Skip(testToTrain).ToList(), arcNames, language, isOntoNotes: true));
                    }

                    double bestScore = double.MinValue;

                    for (int i = 0; i < attempts; i++)
                    {
                        await Task.Run(async () =>
                        {
                            var tagger = new AveragePerceptronTagger(language, 0);
                            tagger.Train(trainDocuments, (5 + ThreadSafeRandom.Next(15)));

                            // The train-set evaluation is run for its logged output only
                            TestTagger(trainDocuments, tagger);
                            var scoreTest = TestTagger(testDocuments, tagger);

                            if (scoreTest > bestScore)
                            {
                                Logger.LogInformation($"\n>>>>> {language}: NEW POS BEST: {scoreTest:0.0}%");
                                await tagger.StoreAsync();

                                if (scoreTest > 80)
                                {
                                    //Prepare models for new nuget-based distribution
                                    var resDir = Path.Combine(languagesDirectory, language.ToString(), "Resources");
                                    Directory.CreateDirectory(resDir);

                                    // File.Create truncates an existing file; File.OpenWrite would leave stale
                                    // trailing bytes behind when the previous model was larger.
                                    using (var f = File.Create(Path.Combine(resDir, "tagger.bin")))
                                    {
                                        await tagger.StoreAsync(f);
                                    }
                                    await File.WriteAllTextAsync(Path.Combine(resDir, "tagger.score"), $"{scoreTest:0.0}%");
                                }

                                bestScore = scoreTest;
                            }
                            else
                            {
                                Logger.LogInformation($"\n>>>>> {language}: POS BEST IS STILL : {bestScore:0.0}%");
                            }
                        });
                    }

                    bestScore = double.MinValue;

                    for (int i = 0; i < attempts; i++)
                    {
                        await Task.Run(async () =>
                        {
                            var parser = new AveragePerceptronDependencyParser(language, 0 /*, arcNames.ToList()*/);
                            try
                            {
                                parser.Train(trainDocuments, (5 + ThreadSafeRandom.Next(10)), (float)(1D - ThreadSafeRandom.NextDouble() * ThreadSafeRandom.NextDouble()));
                            }
                            catch (Exception E)
                            {
                                // The exception must be the first argument; otherwise it is treated as a
                                // message-format argument and never logged.
                                Logger.LogError(E, "FAIL");
                                return;
                            }

                            // The corpus is re-read from disk before the parser is evaluated
                            trainDocuments = await ReadCorpusAsync(langTrainFiles, arcNames, language);
                            testDocuments  = await ReadCorpusAsync(langTestFiles, arcNames, language);

                            if (language == Language.English && trainFilesOntonotesEnglish != null)
                            {
                                //Merge with Ontonotes 5.0 corpus
                                var testToTrain = (int)((float)trainFilesOntonotesEnglish.Count * testDocuments.Count / trainDocuments.Count);

                                trainDocuments.AddRange(await ReadCorpusAsync(trainFilesOntonotesEnglish.Take(testToTrain).ToList(), arcNames, language, isOntoNotes: true));
                                testDocuments.AddRange(await ReadCorpusAsync(trainFilesOntonotesEnglish.Skip(testToTrain).ToList(), arcNames, language, isOntoNotes: true));
                            }

                            // The train-set evaluation is run for its logged output only
                            TestParser(trainDocuments, parser);
                            var scoreTest = TestParser(testDocuments, parser);

                            if (scoreTest > bestScore)
                            {
                                Logger.LogInformation($"\n>>>>> {language}: NEW DEP BEST: {scoreTest:0.0}%");

                                // NOTE(review): unlike the tagger, the parser is never saved with the parameterless
                                // StoreAsync(), yet it is loaded via FromStoreAsync below - confirm this is intended.
                                if (scoreTest > 80)
                                {
                                    //Prepare models for new nuget-based distribution
                                    var resDir = Path.Combine(languagesDirectory, language.ToString(), "Resources");
                                    Directory.CreateDirectory(resDir);

                                    // File.Create truncates an existing file (see the tagger above)
                                    using (var f = File.Create(Path.Combine(resDir, "parser.bin")))
                                    {
                                        await parser.StoreAsync(f);
                                    }
                                    await File.WriteAllTextAsync(Path.Combine(resDir, "parser.score"), $"{scoreTest:0.0}%");
                                }

                                bestScore = scoreTest;
                            }
                            else
                            {
                                Logger.LogInformation($"\n>>>>> {language}: DEP BEST IS STILL : {bestScore:0.0}%");
                            }
                        });
                    }
                }
            }));

            foreach (var (language, lang) in languages)
            {
                var arcNames = new HashSet<string>();

                var trainDocuments = await ReadCorpusAsync(trainFilesPerLanguage[lang], arcNames, language);

                var testDocuments = await ReadCorpusAsync(testFilesPerLanguage[lang], arcNames, language);

                if (language == Language.English && trainFilesOntonotesEnglish != null)
                {
                    //Merge with Ontonotes 5.0 corpus
                    var testToTrain = (int)((float)trainFilesOntonotesEnglish.Count * testDocuments.Count / trainDocuments.Count);

                    trainDocuments.AddRange(await ReadCorpusAsync(trainFilesOntonotesEnglish.Take(testToTrain).ToList(), arcNames, language, isOntoNotes: true));
                    testDocuments.AddRange(await ReadCorpusAsync(trainFilesOntonotesEnglish.Skip(testToTrain).ToList(), arcNames, language, isOntoNotes: true));
                }

                var tagger = await AveragePerceptronTagger.FromStoreAsync(language, 0, "");

                Logger.LogInformation($"\n{lang} - TAGGER / TRAIN");
                TestTagger(trainDocuments, tagger);

                Logger.LogInformation($"\n{lang} - TAGGER / TEST");
                TestTagger(testDocuments, tagger);

                // The corpus is re-read from disk before the parser is evaluated
                trainDocuments = await ReadCorpusAsync(trainFilesPerLanguage[lang], arcNames, language);

                testDocuments = await ReadCorpusAsync(testFilesPerLanguage[lang], arcNames, language);

                var parser = await AveragePerceptronDependencyParser.FromStoreAsync(language, 0, "");

                Logger.LogInformation($"\n{lang} - PARSER / TRAIN");
                TestParser(trainDocuments, parser);

                Logger.LogInformation($"\n{lang} - PARSER / TEST");
                TestParser(testDocuments, parser);
            }
        }