public static async Task TrainAsync(string basePath, Language language, int version, string tag)
{
    // Trains an averaged-perceptron entity recognizer (Person/Organization/Location)
    // from the bz2 corpus files matching the requested language, then stores it.
    var languageMarker = "-" + Languages.EnumToCode(language) + "-";
    var matchingFiles = Directory.EnumerateFiles(basePath, "*.bz2").Where(f => f.Contains(languageMarker));

    var docs = new List<IDocument>();
    foreach (var file in matchingFiles)
    {
        docs.AddRange(ReadFile(file));
    }

    // POS-tag every document first; the recognizer is trained on tagged tokens.
    var tagger = await AveragePerceptronTagger.FromStoreAsync(language, -1, "");
    using (var m = new Measure(Logger, "Tagging documents"))
    {
        Parallel.ForEach(docs, doc => tagger.Predict(doc));
    }

    var recognizer = new AveragePerceptronEntityRecognizer(language, version, tag, new string[] { "Person", "Organization", "Location" }, ignoreCase: false);
    recognizer.Train(docs);
    await recognizer.StoreAsync();
}
private static async Task Main()
{
    // Initialize the English built-in models.
    Catalyst.Models.English.Register();
    //Storage.Current = new OnlineRepositoryStorage(new DiskStorage("catalyst-models"));

    Console.OutputEncoding = Encoding.UTF8;
    ApplicationLogging.SetLoggerFactory(LoggerFactory.Create(lb => lb.AddConsole()));

    // Catalyst currently supports 3 different types of models for Named Entity Recognition (NER):
    // - Gazetteer-like (i.e. [Spotter](https://github.com/curiosity-ai/catalyst/blob/master/Catalyst/src/Models/EntityRecognition/Spotter.cs))
    // - Regex-like     (i.e. [PatternSpotter](https://github.com/curiosity-ai/catalyst/blob/master/Catalyst/src/Models/EntityRecognition/PatternSpotter.cs))
    // - Perceptron     (i.e. [AveragePerceptronEntityRecognizer](https://github.com/curiosity-ai/catalyst/blob/master/Catalyst/src/Models/EntityRecognition/AveragePerceptronEntityRecognizer.cs))

    //var s = typeof(Catalyst.Models.English).Assembly.GetManifestResourceStream($"{typeof(Catalyst.Models.English).Assembly.GetName().Name}.Resources.sentence-detector.bin");
    //foreach(var name in typeof(Catalyst.Models.English).Assembly.GetManifestResourceNames())
    //{
    //    Console.WriteLine(name);
    //}

    // Load the sentence detector and POS tagger models ahead of the demos.
    // NOTE(review): sd/a/p are not used below in this method — presumably kept
    // as a load/warm-up sanity check; confirm before removing.
    var sd = await SentenceDetector.FromStoreAsync(Language.English, -1, "");
    var a = new AveragePerceptronTagger(Language.English, 0, "");
    await a.LoadDataAsync();
    var p = await AveragePerceptronTagger.FromStoreAsync(Language.English, -1, "");

    await DemonstrateAveragePerceptronEntityRecognizerAndPatternSpotter();
    DemonstrateSpotter();
}
public static async Task<Pipeline> ForAsync(Language language, bool sentenceDetector = true, bool tagger = true)
{
    // Builds the base tokenization pipeline, then optionally appends a POS tagger.
    var pipeline = await TokenizerForAsync(language, sentenceDetector);

    // No tagger models exist for the Any/Unknown pseudo-languages.
    bool taggerAvailable = tagger && language != Language.Any && language != Language.Unknown;
    if (taggerAvailable)
    {
        pipeline.Add(await AveragePerceptronTagger.FromStoreAsync(language, 0, ""));
    }

    return pipeline;
}
public static async Task<Pipeline> ForAsync(Language language, bool sentenceDetector = true, bool tagger = true)
{
    // Assembles a pipeline from scratch: tokenizer, then the optional
    // sentence detector and POS tagger loaded from the model store.
    var pipeline = new Pipeline(language);
    pipeline.Add(new FastTokenizer(language));

    if (sentenceDetector)
    {
        pipeline.Add(await SentenceDetector.FromStoreAsync(language, 0, ""));
    }

    if (tagger)
    {
        pipeline.Add(await AveragePerceptronTagger.FromStoreAsync(language, 0, ""));
    }

    return pipeline;
}
public static Pipeline For(Language language, bool sentenceDetector = true, bool tagger = true)
{
    // Synchronous counterpart of ForAsync: blocks on the async model loads
    // via WaitResult(). Prefer ForAsync in async call sites.
    // NOTE(review): this overload requests model version -1 while ForAsync
    // uses version 0 — confirm the discrepancy is intentional.
    var pipeline = new Pipeline(language);
    pipeline.Add(new FastTokenizer(language));

    if (sentenceDetector)
    {
        pipeline.Add(SentenceDetector.FromStoreAsync(language, -1, "").WaitResult());
    }

    if (tagger)
    {
        pipeline.Add(AveragePerceptronTagger.FromStoreAsync(language, -1, "").WaitResult());
    }

    return pipeline;
}
public static async Task<Pipeline> ForManyAsync(IEnumerable<Language> languages, bool sentenceDetector = true, bool tagger = true)
{
    // Collects the per-language processes, then wraps them all in a single
    // pipeline marked Language.Any so it dispatches per document language.
    var processes = new List<IProcess>();

    foreach (var language in languages)
    {
        var tokenizerPipeline = await TokenizerForAsync(language, sentenceDetector);
        processes.AddRange(tokenizerPipeline.Processes);

        if (tagger)
        {
            processes.Add(await AveragePerceptronTagger.FromStoreAsync(language, -1, ""));
        }
    }

    return new Pipeline(processes) { Language = Language.Any };
}
public static async Task TrainAsync(string basePath, Language language, int version, string tag, string languagesDirectory)
{
    // Trains an averaged-perceptron entity recognizer (Person/Organization/Location)
    // from the bz2 corpus files matching the requested language, stores it in the
    // default model store, and additionally exports it for the nuget distribution.
    var langMarker = "-" + Languages.EnumToCode(language) + "-";
    var files = Directory.EnumerateFiles(basePath, "*.bz2").Where(f => f.Contains(langMarker));

    var documents = new List<IDocument>();
    foreach (var f in files)
    {
        documents.AddRange(ReadFile(f));
    }

    // POS-tag everything first; the recognizer is trained on tagged tokens.
    var pos = await AveragePerceptronTagger.FromStoreAsync(language, -1, "");
    using (var m = new Measure(Logger, "Tagging documents"))
    {
        Parallel.ForEach(documents, doc => pos.Predict(doc));
    }

    var aper = new AveragePerceptronEntityRecognizer(language, version, tag, new string[] { "Person", "Organization", "Location" }, ignoreCase: false);
    aper.Train(documents);
    await aper.StoreAsync();

    //Prepare model for new nuget-based distribution
    var resDir = Path.Combine(languagesDirectory, language.ToString(), "Resources");
    Directory.CreateDirectory(resDir);
    // BUG FIX: File.Create truncates a pre-existing file; File.OpenWrite does not,
    // so a smaller re-trained model would leave stale trailing bytes in wikiner.bin.
    using (var f = File.Create(Path.Combine(resDir, "wikiner.bin")))
    {
        await aper.StoreAsync(f);
    }
}
private static double TestTagger(List<IDocument> testDocuments, AveragePerceptronTagger Tagger)
{
    // Evaluates the tagger on gold-tagged documents: reports token accuracy,
    // throughput, and micro-averaged P/R/F1; logs mistaken sentences to
    // mistakes.txt; restores the gold tags so documents can be reused.
    // Returns the accuracy as a percentage.
    var sentences = testDocuments.SelectMany(d => d.Spans).ToList();
    int correct = 0, total = 0;
    var sw = new System.Diagnostics.Stopwatch();
    sw.Start();
    int TP = 0, FN = 0, FP = 0;
    Parallel.ForEach(sentences, s =>
    {
        var tags = s.Tokens.Select(t => t.POS).ToArray();   // gold tags
        Tagger.Predict(s);
        var pred = s.Tokens.Select(t => t.POS).ToArray();   // predicted tags
        int correctOnSentence = tags.Zip(pred, (t, p) => t == p ? 1 : 0).Sum();

        // BUG FIX: the shared TP/FP/FN counters were previously incremented
        // directly inside the parallel body (an unsynchronized data race),
        // while the local _TP/_FN/_FP accumulators handed to Interlocked.Add
        // were never incremented (always added zero). Accumulate locally and
        // publish once per sentence.
        int _TP = 0, _FN = 0, _FP = 0;
        for (int m = 0; m < tags.Length; m++)
        {
            if (tags[m] == pred[m]) { _TP++; }
            else { _FP++; _FN++; } //Same if we are not evaluating per-tag precision / recall
        }
        Interlocked.Add(ref TP, _TP);
        Interlocked.Add(ref FN, _FN);
        Interlocked.Add(ref FP, _FP);

        if (correctOnSentence < s.TokensCount)
        {
            var sb = new StringBuilder();
            for (int m = 0; m < tags.Length; m++)
            {
                sb.Append(s[m].Value);
                if (tags[m] != pred[m])
                {
                    sb.Append("[").Append("P:").Append(pred[m]).Append(" C:").Append(tags[m]).Append("]");
                }
                sb.Append(" ");
            }
            sb.AppendLine();
            lock (lockMistake)
            {
                File.AppendAllText("mistakes.txt", sb.ToString());
            }
        }

        Interlocked.Add(ref correct, correctOnSentence);
        Interlocked.Add(ref total, s.TokensCount);

        // Restore the gold tags so later evaluations see unmodified documents.
        for (int i = 0; i < s.TokensCount; i++) { s[i].POS = tags[i]; }
    });
    sw.Stop();

    Logger.LogInformation($"POS: {Math.Round(100D * correct / total, 2)}% at a rate of {Math.Round(1000D * total / sw.ElapsedMilliseconds, 0) } tokens/second");
    var precision = (double)TP / (TP + FP);
    var recall = (double)TP / (TP + FN);
    Logger.LogInformation($"F1={100 * 2 * (precision * recall) / (precision + recall):0.00}% P={100 * precision:0.00}% R={100 * recall:0.00}% ");
    return(100D * correct / total);
}
public static void Train(string udSource, string ontonotesSource)
{
    // Trains one POS tagger and one dependency parser per language found in
    // the Universal Dependencies corpus (random-restart, best-of-N), storing
    // each new best model, then reloads the stored models and reports
    // train/test scores for every language.
    var trainFiles = Directory.GetFiles(udSource, "*-train.conllu", SearchOption.AllDirectories);
    var testFiles = Directory.GetFiles(udSource, "*-dev.conllu", SearchOption.AllDirectories);

    List<string> trainFilesOntonotesEnglish = null;
    if (!string.IsNullOrWhiteSpace(ontonotesSource))
    {
        // For "sel_" files, only keep those whose first numeric suffix is below 3654.
        trainFilesOntonotesEnglish = Directory.GetFiles(ontonotesSource, "*.parse.ddg", SearchOption.AllDirectories)
                                              .Where(fn => !fn.Contains("sel_") || int.Parse(Path.GetFileNameWithoutExtension(fn).Split(new char[] { '_', '.' }).Skip(1).First()) < 3654)
                                              .ToList();
    }

    // Group corpus files by the language-code prefix of their file names.
    var trainFilesPerLanguage = trainFiles.Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f }).GroupBy(f => f.lang).ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList());
    var testFilesPerLanguage = testFiles.Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f }).GroupBy(f => f.lang).ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList());

    var languages = trainFilesPerLanguage.Keys.ToList();

    Logger.LogInformation($"Found these languages for training: {string.Join(", ", languages)}");

    int N_training = 5;

    Parallel.ForEach(languages, lang =>
    {
        Language language;
        try { language = Languages.CodeToEnum(lang); }
        catch { Logger.LogWarning($"Unknown language {lang}"); return; }

        var arcNames = new HashSet<string>();

        if (trainFilesPerLanguage.TryGetValue(lang, out var langTrainFiles) && testFilesPerLanguage.TryGetValue(lang, out var langTestFiles))
        {
            var trainDocuments = ReadCorpus(langTrainFiles, arcNames, language);
            var testDocuments = ReadCorpus(langTestFiles, arcNames, language);

            if (language == Language.English)
            {
                //Merge with Ontonotes 5.0 corpus
                trainDocuments.AddRange(ReadCorpus(trainFilesOntonotesEnglish, arcNames, language, isOntoNotes: true));
            }

            // Random-restart training for the POS tagger: keep the model with
            // the best score on the dev set.
            double bestScore = double.MinValue;
            for (int i = 0; i < N_training; i++)
            {
                var Tagger = new AveragePerceptronTagger(language, 0);
                Tagger.Train(trainDocuments.AsEnumerable(), 5 + ThreadSafeRandom.Next(15));
                var scoreTrain = TestTagger(trainDocuments, Tagger);
                var scoreTest = TestTagger(testDocuments, Tagger);
                if (scoreTest > bestScore)
                {
                    Logger.LogInformation($"\n>>>>> {lang}: NEW POS BEST: {scoreTest:0.0}%");
                    try { Tagger.StoreAsync().Wait(); }
                    catch (Exception E) { Logger.LogError(E, $"\n>>>>> {lang}: Failed to store model"); }
                    bestScore = scoreTest;
                }
                else
                {
                    Logger.LogInformation($"\n>>>>> {lang}: POS BEST IS STILL : {bestScore:0.0}%");
                }
            }

            // Random-restart training for the dependency parser.
            bestScore = double.MinValue;
            for (int i = 0; i < N_training; i++)
            {
                var Parser = new AveragePerceptronDependencyParser(language, 0 /*, arcNames.ToList()*/);
                try
                {
                    Parser.Train(trainDocuments.AsEnumerable(), 5 + ThreadSafeRandom.Next(10), (float)(1D - ThreadSafeRandom.NextDouble() * ThreadSafeRandom.NextDouble()));
                }
                catch (Exception E)
                {
                    Logger.LogInformation("FAIL: " + E.Message);
                    continue;
                }

                // Re-read the corpora before evaluating.
                // NOTE(review): presumably parser training mutates the documents — confirm.
                trainDocuments = ReadCorpus(langTrainFiles, arcNames, language);
                testDocuments = ReadCorpus(langTestFiles, arcNames, language);
                if (language == Language.English)
                {
                    //Merge with Ontonotes 5.0 corpus
                    trainDocuments.AddRange(ReadCorpus(trainFilesOntonotesEnglish, arcNames, language, isOntoNotes: true));
                }

                var scoreTrain = TestParser(trainDocuments, Parser);
                var scoreTest = TestParser(testDocuments, Parser);
                if (scoreTest > bestScore)
                {
                    Logger.LogInformation($"\n>>>>> {lang}: NEW DEP BEST: {scoreTest:0.0}%");
                    try { Parser.StoreAsync().Wait(); }
                    catch (Exception E) { Logger.LogError(E, $"\n>>>>> {lang}: Failed to store model"); }
                    bestScore = scoreTest;
                }
                else
                {
                    Logger.LogInformation($"\n>>>>> {lang}: DEP BEST IS STILL : {bestScore:0.0}%");
                }
                Parser = null;
            }
        }
    });

    // Final evaluation pass: reload the stored (best) models and score them.
    foreach (var lang in languages)
    {
        Language language;
        try { language = Languages.CodeToEnum(lang); }
        catch
        {
            // BUG FIX: this previously did 'return', which aborted the whole
            // evaluation loop at the first unknown language; 'continue' skips
            // only that language (matching the training pass above).
            Logger.LogWarning($"Unknown language {lang}");
            continue;
        }

        var arcNames = new HashSet<string>();
        var trainDocuments = ReadCorpus(trainFilesPerLanguage[lang], arcNames, language);
        var testDocuments = ReadCorpus(testFilesPerLanguage[lang], arcNames, language);
        if (language == Language.English)
        {
            //Merge with Ontonotes 5.0 corpus
            var ontonotesDocuments = ReadCorpus(trainFilesOntonotesEnglish, arcNames, language, isOntoNotes: true);
            trainDocuments.AddRange(ontonotesDocuments);
        }

        var Tagger = AveragePerceptronTagger.FromStoreAsync(language, 0, "").WaitResult();
        Logger.LogInformation($"\n{lang} - TAGGER / TRAIN");
        TestTagger(trainDocuments, Tagger);
        Logger.LogInformation($"\n{lang} - TAGGER / TEST");
        TestTagger(testDocuments, Tagger);

        // Re-read: TestTagger/TestParser runs above may have touched the documents.
        trainDocuments = ReadCorpus(trainFilesPerLanguage[lang], arcNames, language);
        testDocuments = ReadCorpus(testFilesPerLanguage[lang], arcNames, language);

        var Parser = AveragePerceptronDependencyParser.FromStoreAsync(language, 0, "").WaitResult();
        Logger.LogInformation($"\n{lang} - PARSER / TRAIN");
        TestParser(trainDocuments, Parser);
        Logger.LogInformation($"\n{lang} - PARSER / TEST");
        TestParser(testDocuments, Parser);
    }
}
public static async Task Train(string udSource, string ontonotesSource, string languagesDirectory)
{
    // Async variant of the UD trainer: trains POS taggers and dependency
    // parsers per language (random-restart, best-of-N attempts), exports
    // models scoring above 80% into the per-language Resources folder for the
    // nuget-based distribution, then reloads and re-scores the stored models.
    var trainFiles = Directory.GetFiles(udSource, "*-train.conllu", SearchOption.AllDirectories);
    var testFiles = Directory.GetFiles(udSource, "*-dev.conllu", SearchOption.AllDirectories);

    List<string> trainFilesOntonotesEnglish = null;
    if (!string.IsNullOrWhiteSpace(ontonotesSource))
    {
        // For "sel_" files, only keep those whose first numeric suffix is below 3654.
        trainFilesOntonotesEnglish = Directory.GetFiles(ontonotesSource, "*.parse.ddg", SearchOption.AllDirectories)
                                              .Where(fn => !fn.Contains("sel_") || int.Parse(Path.GetFileNameWithoutExtension(fn).Split(new char[] { '_', '.' }).Skip(1).First()) < 3654)
                                              .ToList();
    }

    // Group corpus files by the language-code prefix of their file names.
    var trainFilesPerLanguage = trainFiles.Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f }).GroupBy(f => f.lang).ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList());
    var testFilesPerLanguage = testFiles.Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f }).GroupBy(f => f.lang).ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList());

    // Only languages that have both train and dev data, and a known enum value.
    var languages = new List<(Language language, string lang)>();
    foreach (var lang in trainFilesPerLanguage.Keys.Intersect(testFilesPerLanguage.Keys))
    {
        try
        {
            var language = Languages.CodeToEnum(lang);
            languages.Add((language, lang));
        }
        catch
        {
            Logger.LogWarning($"Unknown language {lang}");
        }
    }

    Logger.LogInformation($"Found these languages for training: {string.Join(", ", languages.Select(l => l.language))}");

    int attempts = 5;

    await Task.WhenAll(languages.Select(async v =>
    {
        await Task.Yield();
        var (language, lang) = (v.language, v.lang);
        var arcNames = new HashSet<string>();

        if (trainFilesPerLanguage.TryGetValue(lang, out var langTrainFiles) && testFilesPerLanguage.TryGetValue(lang, out var langTestFiles))
        {
            var trainDocuments = await ReadCorpusAsync(langTrainFiles, arcNames, language);
            var testDocuments = await ReadCorpusAsync(langTestFiles, arcNames, language);

            if (language == Language.English)
            {
                //Merge with Ontonotes 5.0 corpus, split in the same train/test proportion as the UD data
                var testToTrain = (int)((float)trainFilesOntonotesEnglish.Count * testDocuments.Count / trainDocuments.Count);
                trainDocuments.AddRange(await ReadCorpusAsync(trainFilesOntonotesEnglish.Take(testToTrain).ToList(), arcNames, language, isOntoNotes: true));
                testDocuments.AddRange(await ReadCorpusAsync(trainFilesOntonotesEnglish.Skip(testToTrain).ToList(), arcNames, language, isOntoNotes: true));
            }

            // Random-restart training for the POS tagger.
            double bestScore = double.MinValue;
            for (int i = 0; i < attempts; i++)
            {
                await Task.Run(async () =>
                {
                    var tagger = new AveragePerceptronTagger(language, 0);
                    tagger.Train(trainDocuments, 5 + ThreadSafeRandom.Next(15));
                    var scoreTrain = TestTagger(trainDocuments, tagger);
                    var scoreTest = TestTagger(testDocuments, tagger);
                    if (scoreTest > bestScore)
                    {
                        Logger.LogInformation($"\n>>>>> {language}: NEW POS BEST: {scoreTest:0.0}%");
                        await tagger.StoreAsync();
                        if (scoreTest > 80)
                        {
                            //Prepare models for new nuget-based distribution
                            var resDir = Path.Combine(languagesDirectory, language.ToString(), "Resources");
                            Directory.CreateDirectory(resDir);
                            // BUG FIX: File.Create truncates a pre-existing file;
                            // File.OpenWrite left stale trailing bytes when a smaller
                            // model was written over a larger one.
                            using (var f = File.Create(Path.Combine(resDir, "tagger.bin")))
                            {
                                await tagger.StoreAsync(f);
                            }
                            await File.WriteAllTextAsync(Path.Combine(resDir, "tagger.score"), $"{scoreTest:0.0}%");
                        }
                        bestScore = scoreTest;
                    }
                    else
                    {
                        Logger.LogInformation($"\n>>>>> {language}: POS BEST IS STILL : {bestScore:0.0}%");
                    }
                });
            }

            // Random-restart training for the dependency parser.
            bestScore = double.MinValue;
            for (int i = 0; i < attempts; i++)
            {
                await Task.Run(async () =>
                {
                    var parser = new AveragePerceptronDependencyParser(language, 0 /*, arcNames.ToList()*/);
                    try
                    {
                        parser.Train(trainDocuments, 5 + ThreadSafeRandom.Next(10), (float)(1D - ThreadSafeRandom.NextDouble() * ThreadSafeRandom.NextDouble()));
                    }
                    catch (Exception E)
                    {
                        // BUG FIX: arguments were reversed (LogError("FAIL", E)),
                        // which treated the exception as an unused format argument
                        // and lost the stack trace.
                        Logger.LogError(E, "FAIL");
                        return;
                    }

                    // Re-read the corpora before evaluating.
                    // NOTE(review): presumably parser training mutates the documents — confirm.
                    trainDocuments = await ReadCorpusAsync(langTrainFiles, arcNames, language);
                    testDocuments = await ReadCorpusAsync(langTestFiles, arcNames, language);
                    if (language == Language.English)
                    {
                        //Merge with Ontonotes 5.0 corpus
                        var testToTrain = (int)((float)trainFilesOntonotesEnglish.Count * testDocuments.Count / trainDocuments.Count);
                        trainDocuments.AddRange(await ReadCorpusAsync(trainFilesOntonotesEnglish.Take(testToTrain).ToList(), arcNames, language, isOntoNotes: true));
                        testDocuments.AddRange(await ReadCorpusAsync(trainFilesOntonotesEnglish.Skip(testToTrain).ToList(), arcNames, language, isOntoNotes: true));
                    }

                    var scoreTrain = TestParser(trainDocuments, parser);
                    var scoreTest = TestParser(testDocuments, parser);
                    if (scoreTest > bestScore)
                    {
                        Logger.LogInformation($"\n>>>>> {language}: NEW DEP BEST: {scoreTest:0.0}%");
                        // NOTE(review): unlike the tagger above, the parser is never
                        // stored to the default model store (no parser.StoreAsync()
                        // without a stream) — confirm this is intentional.
                        if (scoreTest > 80)
                        {
                            //Prepare models for new nuget-based distribution
                            var resDir = Path.Combine(languagesDirectory, language.ToString(), "Resources");
                            Directory.CreateDirectory(resDir);
                            using (var f = File.Create(Path.Combine(resDir, "parser.bin")))
                            {
                                await parser.StoreAsync(f);
                            }
                            await File.WriteAllTextAsync(Path.Combine(resDir, "parser.score"), $"{scoreTest:0.0}%");
                        }
                        bestScore = scoreTest;
                    }
                    else
                    {
                        Logger.LogInformation($"\n>>>>> {language}: DEP BEST IS STILL : {bestScore:0.0}%");
                    }
                    parser = null;
                });
            }
        }
    }));

    // Final evaluation pass: reload the stored models and score them.
    foreach (var (language, lang) in languages)
    {
        var arcNames = new HashSet<string>();
        var trainDocuments = await ReadCorpusAsync(trainFilesPerLanguage[lang], arcNames, language);
        var testDocuments = await ReadCorpusAsync(testFilesPerLanguage[lang], arcNames, language);
        if (language == Language.English)
        {
            //Merge with Ontonotes 5.0 corpus
            var testToTrain = (int)((float)trainFilesOntonotesEnglish.Count * testDocuments.Count / trainDocuments.Count);
            trainDocuments.AddRange(await ReadCorpusAsync(trainFilesOntonotesEnglish.Take(testToTrain).ToList(), arcNames, language, isOntoNotes: true));
            testDocuments.AddRange(await ReadCorpusAsync(trainFilesOntonotesEnglish.Skip(testToTrain).ToList(), arcNames, language, isOntoNotes: true));
        }

        var tagger = await AveragePerceptronTagger.FromStoreAsync(language, 0, "");
        Logger.LogInformation($"\n{lang} - TAGGER / TRAIN");
        TestTagger(trainDocuments, tagger);
        Logger.LogInformation($"\n{lang} - TAGGER / TEST");
        TestTagger(testDocuments, tagger);

        // Re-read: the evaluation runs above may have touched the documents.
        trainDocuments = await ReadCorpusAsync(trainFilesPerLanguage[lang], arcNames, language);
        testDocuments = await ReadCorpusAsync(testFilesPerLanguage[lang], arcNames, language);

        var parser = await AveragePerceptronDependencyParser.FromStoreAsync(language, 0, "");
        Logger.LogInformation($"\n{lang} - PARSER / TRAIN");
        TestParser(trainDocuments, parser);
        Logger.LogInformation($"\n{lang} - PARSER / TEST");
        TestParser(testDocuments, parser);
    }
}