/// <summary>
/// Converts per-language JSON n-gram frequency profiles into the binary model data
/// used by <see cref="LanguageDetector"/> and persists the result.
/// </summary>
/// <param name="pathToFiles">Directory containing one JSON profile per language; each
/// file name (without extension) must be a code accepted by Languages.CodeToEnum.</param>
public static void TransformJsonDataInModelData(string pathToFiles)
{
    var files = Directory.GetFiles(pathToFiles);
    var ld = new LanguageDetector(version: 0);

    foreach (var f in files)
    {
        var language = Languages.CodeToEnum(Path.GetFileNameWithoutExtension(f));
        ld.Data.Languages.Add(language);

        string json = File.ReadAllText(f);
        var jsonProfile = JsonConvert.DeserializeObject<JsonLanguageProfile>(json);

        foreach (var word in jsonProfile.freq.Keys)
        {
            // Guard first: the previous version hashed every word and inserted an
            // empty probability dictionary even for words outside the supported
            // n-gram length range, leaving dead entries in the model.
            if (word.Length < 1 || word.Length > ld.NGramLength) { continue; }

            int hash = GetHash(word.ToLowerInvariant().Trim().AsSpan());

            // Single lookup instead of ContainsKey + indexer.
            if (!ld.Data.WordLanguageProbabilities.TryGetValue(hash, out var perLanguage))
            {
                perLanguage = new Dictionary<Language, double>();
                ld.Data.WordLanguageProbabilities[hash] = perLanguage;
            }

            // Probability = word count / total n-gram count for this word length.
            double prob = (double)jsonProfile.freq[word] / jsonProfile.n_words[word.Length - 1];
            perLanguage[language] = prob;
        }
    }

    ld.StoreAsync().Wait();
}
/// <summary>
/// Detects the language of the document and stores it in the document's Language
/// property. Documents that are empty or already have a language assigned are
/// left untouched.
/// </summary>
public void Process(IDocument document)
{
    //Don't try to identify documents that already have their language set or is empty
    if (document.Length == 0 || (document.Language != Language.Unknown && document.Language != Language.Any))
    {
        return;
    }

    IDocument tempDocument = document;

    if (document.SpansCount == 0) // Have to tokenize temporarily the document
    {
        // Only the first 200 characters are used for the prediction.
        tempDocument = document.Length > 200
                        ? new Document(document.Value.Substring(0, 200))
                        : new Document(document.Value);
        Tokenizer.Process(tempDocument);
    }

    var tag = Model.PredictMax(tempDocument, 200);

    // PredictMax can yield a null label (no recognizable tokens); map that to
    // Unknown instead of letting Languages.CodeToEnum throw on a null code.
    // This matches the null-label handling used by the sibling Process overload.
    if (tag.label is null)
    {
        document.Language = Language.Unknown;
    }
    else
    {
        document.Language = Languages.CodeToEnum(tag.label);
    }
}
/// <summary>
/// Assigns a detected language to <paramref name="document"/>. Empty documents and
/// documents whose language is already known are skipped. Any prediction failure
/// results in <see cref="Language.Unknown"/>.
/// </summary>
public void Process(IDocument document)
{
    //Don't try to identify documents that already have their language set or is empty
    bool alreadyClassified = document.Language != Language.Unknown && document.Language != Language.Any;
    if (document.Length == 0 || alreadyClassified)
    {
        return;
    }

    IDocument tempDocument = Prepare(document);

    try
    {
        var tag = Model.PredictMax(tempDocument, 200);
        document.Language = tag.label is null ? Language.Unknown : Languages.CodeToEnum(tag.label);
    }
    catch
    {
        // Best-effort: any failure during prediction leaves the document as Unknown.
        document.Language = Language.Unknown;
    }
}
/// <summary>
/// Evaluates the FastText language detector against a tab-separated sentence corpus
/// ("id\t3-letter-code\tsentence" per line) and prints F1/precision/recall plus throughput.
/// </summary>
public static void Test(string pathToSentences)
{
    var vectorizer = FastTextLanguageDetector.FromStoreAsync(Language.Any, 0, null).WaitResult();

    var pipe = new Pipeline();
    pipe.Add(new SpaceTokenizer());

    // Build one labeled document per valid corpus line.
    var docs = File.ReadAllLines(pathToSentences).Shuffle()
                    .Where(txt => !string.IsNullOrWhiteSpace(txt))
                    .Select(txt => txt.Split('\t'))
                    .Where(s => s.Length == 3 && Languages.IsValid3LetterCode(s[1]))
                    .Select(s =>
                    {
                        var doc = new Document(s[2]);
                        doc.Labels.Add(Languages.EnumToCode(Languages.ThreeLetterCodeToEnum(s[1])));
                        return (doc as IDocument);
                    });

    docs = pipe.Process(docs).WithCaching(Language.Any, 0, "language-detector-corpus", 100_000).ToList();

    int TP = 0, FP = 0, FN = 0;
    int k = 0;
    var sw = Stopwatch.StartNew();

    Parallel.ForEach(docs, (doc) =>
    {
        // k is shared across worker threads: the original k++ was a data race and
        // could under-count; use Interlocked like the other counters.
        Interlocked.Increment(ref k);
        vectorizer.Process(doc);
        if (doc.Language == Languages.CodeToEnum(doc.Labels.First()))
        {
            Interlocked.Increment(ref TP);
        }
        else
        {
            // A misclassification is both a false positive (for the predicted
            // language) and a false negative (for the true language).
            Interlocked.Increment(ref FP);
            Interlocked.Increment(ref FN);
        }
    });

    sw.Stop();

    // Precision uses false positives, recall uses false negatives. The original
    // computed both as TP/(TP+FN); FP == FN by construction here so the printed
    // numbers are unchanged, but the formulas are now correct.
    var precision = (double)TP / (double)(TP + FP);
    var recall = (double)TP / (double)(TP + FN);
    var f1 = 2 * (precision * recall) / (precision + recall);
    Console.WriteLine($"F1= {f1 * 100:0.0}% P= {precision * 100:0.0}% R={recall * 100:0.0}% in {sw.Elapsed.TotalSeconds:0.00}s or {k / sw.Elapsed.TotalSeconds} doc/s");
}
/// <summary>
/// Returns the per-language prediction scores for the document. If the model fails
/// to produce a prediction, a single entry mapping <see cref="Language.Unknown"/>
/// to 1 is returned instead.
/// </summary>
public Dictionary<Language, float> Predict(IDocument document)
{
    var prepared = Prepare(document);

    try
    {
        var rawPredictions = Model.Predict(prepared);
        return rawPredictions.ToDictionary(kv => Languages.CodeToEnum(kv.Key), kv => kv.Value);
    }
    catch
    {
        // Fallback: certain-Unknown when prediction is not possible.
        var fallback = new Dictionary<Language, float>();
        fallback[Language.Unknown] = 1f;
        return fallback;
    }
}
/// <summary>
/// Trains a SentenceDetector model for every language found in the Universal
/// Dependencies source directory, once per casing mode (Original / ForceUpper /
/// ForceLower), and stores each trained model.
/// </summary>
/// <param name="udSource">Root directory searched recursively for *-train.conllu
/// and *-dev.conllu files.</param>
public static void Train(string udSource)
{
    var trainFiles = Directory.GetFiles(udSource, "*-train.conllu", SearchOption.AllDirectories);
    var testFiles = Directory.GetFiles(udSource, "*-dev.conllu", SearchOption.AllDirectories);

    // Group corpus files by the language prefix of their file names
    // (e.g. "en-something-train.conllu" -> "en").
    var trainFilesPerLanguage = trainFiles.Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f }).GroupBy(f => f.lang).ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList());
    var testFilesPerLanguage = testFiles.Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f }).GroupBy(f => f.lang).ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList());

    var languages = trainFilesPerLanguage.Keys.ToList();

    Console.WriteLine($"Found these languages for training: {string.Join(", ", languages)}");

    //TODO: need to fix the storage model first - maybe join all in one model
    foreach (var forceCase in new EnumCase[] { EnumCase.Original, EnumCase.ForceUpper, EnumCase.ForceLower })
    {
        // NOTE(review): the result of ParallelAsync.ForEachAsync is neither awaited
        // nor waited on — if it returns a Task, training for one casing mode may still
        // be running when the next iteration (or this method) proceeds. Confirm the
        // helper blocks internally, or wait on its result.
        ParallelAsync.ForEachAsync(languages, new ParallelOptions(), async lang =>
        {
            Language language;
            try
            {
                language = Languages.CodeToEnum(lang);
            }
            catch
            {
                // Unknown code: skip this language only.
                Console.WriteLine($"Unknown language {lang}");
                return;
            }

            // Model tag distinguishes the casing-mode variants in storage.
            var modelTag = (forceCase == EnumCase.ForceUpper ? "upper" : (forceCase == EnumCase.ForceLower ? "lower" : ""));
            var sentenceDetector = new SentenceDetector(language, 0, modelTag);
            var trainDocuments = ReadCorpus(trainFilesPerLanguage[lang], ConvertCase: forceCase, sentenceDetector: sentenceDetector);

            //TODO: Implement test
            //if(testFilesPerLanguage.TryGetValue(lang, out var testFile))
            //{
            //    var testDocuments = ReadUniversalDependencyCorpus(testFile, ConvertCase: forceCase, sentenceDetector: sentenceDetector);
            //}

            Console.WriteLine($"Now training {lang} in mode {forceCase} using files {string.Join(", ", trainFilesPerLanguage[lang])}");
            sentenceDetector.Train(trainDocuments);
            await sentenceDetector.StoreAsync();
        });
    }
}
/// <summary>
/// Trains part-of-speech taggers and dependency parsers for every language found in
/// the Universal Dependencies source, merging the OntoNotes 5.0 corpus for English
/// when available. The best-scoring model of several randomized attempts is stored
/// per language. A final pass reloads the stored models and logs train/test scores.
/// </summary>
/// <param name="udSource">Root directory searched recursively for *-train.conllu and
/// *-dev.conllu files.</param>
/// <param name="ontonotesSource">Optional root directory of the OntoNotes corpus;
/// ignored when null or whitespace.</param>
public static void Train(string udSource, string ontonotesSource)
{
    var trainFiles = Directory.GetFiles(udSource, "*-train.conllu", SearchOption.AllDirectories);
    var testFiles = Directory.GetFiles(udSource, "*-dev.conllu", SearchOption.AllDirectories);

    List<string> trainFilesOntonotesEnglish = null;

    if (!string.IsNullOrWhiteSpace(ontonotesSource))
    {
        // "sel_" files with an index >= 3654 are excluded from training.
        trainFilesOntonotesEnglish = Directory.GetFiles(ontonotesSource, "*.parse.ddg", SearchOption.AllDirectories)
                                              .Where(fn => !fn.Contains("sel_") || int.Parse(Path.GetFileNameWithoutExtension(fn).Split(new char[] { '_', '.' }).Skip(1).First()) < 3654)
                                              .ToList();
    }

    // Group corpus files by the language prefix of their file names.
    var trainFilesPerLanguage = trainFiles.Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f }).GroupBy(f => f.lang).ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList());
    var testFilesPerLanguage = testFiles.Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f }).GroupBy(f => f.lang).ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList());

    var languages = trainFilesPerLanguage.Keys.ToList();

    Logger.LogInformation($"Found these languages for training: {string.Join(", ", languages)}");

    int N_training = 5;

    Parallel.ForEach(languages, lang =>
    {
        Language language;
        try
        {
            language = Languages.CodeToEnum(lang);
        }
        catch
        {
            Logger.LogWarning($"Unknown language {lang}");
            return; // per-item lambda: skips only this language
        }

        var arcNames = new HashSet<string>();

        if (trainFilesPerLanguage.TryGetValue(lang, out var langTrainFiles) && testFilesPerLanguage.TryGetValue(lang, out var langTestFiles))
        {
            var trainDocuments = ReadCorpus(langTrainFiles, arcNames, language);
            var testDocuments = ReadCorpus(langTestFiles, arcNames, language);

            if (language == Language.English)
            {
                //Merge with Ontonotes 5.0 corpus
                trainDocuments.AddRange(ReadCorpus(trainFilesOntonotesEnglish, arcNames, language, isOntoNotes: true));
            }

            // Tagger: N_training attempts with randomized epoch counts; store the best.
            double bestScore = double.MinValue;
            for (int i = 0; i < N_training; i++)
            {
                var Tagger = new AveragePerceptronTagger(language, 0);
                Tagger.Train(trainDocuments.AsEnumerable(), (int)(5 + ThreadSafeRandom.Next(15)));
                var scoreTrain = TestTagger(trainDocuments, Tagger);
                var scoreTest = TestTagger(testDocuments, Tagger);

                if (scoreTest > bestScore)
                {
                    Logger.LogInformation($"\n>>>>> {lang}: NEW POS BEST: {scoreTest:0.0}%");
                    try
                    {
                        Tagger.StoreAsync().Wait();
                    }
                    catch (Exception E)
                    {
                        Logger.LogError(E, $"\n>>>>> {lang}: Failed to store model");
                    }
                    bestScore = scoreTest;
                }
                else
                {
                    Logger.LogInformation($"\n>>>>> {lang}: POS BEST IS STILL : {bestScore:0.0}%");
                }
            }

            // Parser: same scheme; the corpus is re-read between attempts
            // (presumably because training mutates the documents — the original
            // did the same).
            bestScore = double.MinValue;
            for (int i = 0; i < N_training; i++)
            {
                var Parser = new AveragePerceptronDependencyParser(language, 0 /*, arcNames.ToList()*/);
                try
                {
                    Parser.Train(trainDocuments.AsEnumerable(), (int)(5 + ThreadSafeRandom.Next(10)), (float)(1D - ThreadSafeRandom.NextDouble() * ThreadSafeRandom.NextDouble()));
                }
                catch (Exception E)
                {
                    Logger.LogInformation("FAIL: " + E.Message);
                    continue;
                }

                trainDocuments = ReadCorpus(langTrainFiles, arcNames, language);
                testDocuments = ReadCorpus(langTestFiles, arcNames, language);

                if (language == Language.English)
                {
                    //Merge with Ontonotes 5.0 corpus
                    trainDocuments.AddRange(ReadCorpus(trainFilesOntonotesEnglish, arcNames, language, isOntoNotes: true));
                }

                var scoreTrain = TestParser(trainDocuments, Parser);
                var scoreTest = TestParser(testDocuments, Parser);

                if (scoreTest > bestScore)
                {
                    Logger.LogInformation($"\n>>>>> {lang}: NEW DEP BEST: {scoreTest:0.0}%");
                    try
                    {
                        Parser.StoreAsync().Wait();
                    }
                    catch (Exception E)
                    {
                        Logger.LogError(E, $"\n>>>>> {lang}: Failed to store model");
                    }
                    bestScore = scoreTest;
                }
                else
                {
                    Logger.LogInformation($"\n>>>>> {lang}: DEP BEST IS STILL : {bestScore:0.0}%");
                }
                Parser = null;
            }
        }
    });

    // Final evaluation pass: reload the stored models and log train/test scores.
    foreach (var lang in languages)
    {
        Language language;
        try
        {
            language = Languages.CodeToEnum(lang);
        }
        catch
        {
            // BUG FIX: the original used 'return;' here, which aborted the whole
            // evaluation loop at the first unknown language; 'continue' skips
            // only that language. Log level aligned with the loop above.
            Logger.LogWarning($"Unknown language {lang}");
            continue;
        }

        var arcNames = new HashSet<string>();

        var trainDocuments = ReadCorpus(trainFilesPerLanguage[lang], arcNames, language);
        var testDocuments = ReadCorpus(testFilesPerLanguage[lang], arcNames, language);

        if (language == Language.English)
        {
            //Merge with Ontonotes 5.0 corpus
            var ontonotesDocuments = ReadCorpus(trainFilesOntonotesEnglish, arcNames, language, isOntoNotes: true);
            trainDocuments.AddRange(ontonotesDocuments);
        }

        var Tagger = AveragePerceptronTagger.FromStoreAsync(language, 0, "").WaitResult();
        Logger.LogInformation($"\n{lang} - TAGGER / TRAIN");
        TestTagger(trainDocuments, Tagger);
        Logger.LogInformation($"\n{lang} - TAGGER / TEST");
        TestTagger(testDocuments, Tagger);

        // Re-read before parser evaluation, as above.
        trainDocuments = ReadCorpus(trainFilesPerLanguage[lang], arcNames, language);
        testDocuments = ReadCorpus(testFilesPerLanguage[lang], arcNames, language);

        var Parser = AveragePerceptronDependencyParser.FromStoreAsync(language, 0, "").WaitResult();
        Logger.LogInformation($"\n{lang} - PARSER / TRAIN");
        TestParser(trainDocuments, Parser);
        Logger.LogInformation($"\n{lang} - PARSER / TEST");
        TestParser(testDocuments, Parser);
    }
}
/// <summary>
/// Trains a sentence-detector model for every known language in the Universal
/// Dependencies source, once per casing mode (Original / ForceUpper / ForceLower).
/// Models scoring above 90% are additionally exported to the per-language
/// Resources directory for the nuget-based distribution.
/// </summary>
/// <param name="udSource">Root directory searched recursively for *-train.conllu
/// and *-dev.conllu files.</param>
/// <param name="languagesDirectory">Root of the per-language output tree; models go
/// to &lt;languagesDirectory&gt;/&lt;Language&gt;/Resources.</param>
public static async Task Train(string udSource, string languagesDirectory)
{
    var trainFiles = Directory.GetFiles(udSource, "*-train.conllu", SearchOption.AllDirectories);
    var testFiles = Directory.GetFiles(udSource, "*-dev.conllu", SearchOption.AllDirectories);

    // Group corpus files by the language prefix of their file names.
    var trainFilesPerLanguage = trainFiles.Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f }).GroupBy(f => f.lang).ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList());
    var testFilesPerLanguage = testFiles.Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f }).GroupBy(f => f.lang).ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList());

    var languages = new List<(Language language, string lang)>();

    // Keep only language codes that map to a known Language enum value.
    foreach (var lang in trainFilesPerLanguage.Keys)
    {
        try
        {
            var language = Languages.CodeToEnum(lang);
            languages.Add((language, lang));
        }
        catch
        {
            Logger.LogWarning($"Unknown language {lang}");
        }
    }

    Logger.LogInformation($"Found these languages for training: {string.Join(", ", languages.Select(l => l.language))}");

    foreach (var forceCase in new[] { EnumCase.Original, EnumCase.ForceUpper, EnumCase.ForceLower })
    {
        // One training task per language, run concurrently for the current casing mode.
        await Task.WhenAll(languages.Select(async v =>
        {
            await Task.Yield();
            var (language, lang) = (v.language, v.lang);
            // Model tag distinguishes the casing-mode variants in storage.
            var modelTag = (forceCase == EnumCase.ForceUpper ? "upper" : (forceCase == EnumCase.ForceLower ? "lower" : ""));
            var sentenceDetector = new SentenceDetector(language, 0, modelTag);
            var trainDocuments = await ReadCorpusAsync(trainFilesPerLanguage[lang], ConvertCase: forceCase, sentenceDetector: sentenceDetector);

            //TODO: Implement test
            //if(testFilesPerLanguage.TryGetValue(lang, out var testFile))
            //{
            //    var testDocuments = ReadUniversalDependencyCorpus(testFile, ConvertCase: forceCase, sentenceDetector: sentenceDetector);
            //}

            Logger.LogInformation($"Now training {lang} in mode {forceCase} using files {string.Join(", ", trainFilesPerLanguage[lang])}");
            var scoreTest = sentenceDetector.Train(trainDocuments);
            Logger.LogInformation($"Finished training {lang} in mode {forceCase}");
            await sentenceDetector.StoreAsync();

            if (scoreTest > 90)
            {
                //Prepare models for new nuget-based distribution
                var resDir = Path.Combine(languagesDirectory, language.ToString(), "Resources");
                Directory.CreateDirectory(resDir);
                using (var f = File.OpenWrite(Path.Combine(resDir, $"sentence-detector{(string.IsNullOrEmpty(modelTag) ? "" : "-" + modelTag)}.bin")))
                {
                    await sentenceDetector.StoreAsync(f);
                }
                await File.WriteAllTextAsync(Path.Combine(resDir, $"sentence-detector{(string.IsNullOrEmpty(modelTag) ? "" : "-" + modelTag)}.score"), $"{scoreTest:0.0}%");
            }
        }).ToArray());
    }
}
/// <summary>
/// Converts spaCy lookups-data JSON files (lemma lookup tables and lexeme
/// cluster/probability tables) into binary Lookups resources, one output file per
/// language, written under &lt;languagesDirectory&gt;/&lt;Language&gt;/Resources.
/// Already-existing output files are skipped. All conversions run concurrently.
/// </summary>
internal static async Task RunAsync(string spacyLookupsData, string languagesDirectory)
{
    var rootLangFolder = Path.Combine(spacyLookupsData, @"spacy_lookups_data\data\");
    if (!Directory.Exists(rootLangFolder)) { throw new Exception("data directory not found"); }

    //TODO Handle rules data

    var tasks = new List<Task>();

    // First two characters of each file name are taken as the language code.
    foreach (var (file, language) in Directory.GetFiles(rootLangFolder, "*_lemma_lookup*.json").Select(f => (file: f, language: Languages.CodeToEnum(Path.GetFileName(f).Substring(0, 2)))))
    {
        tasks.Add(Task.Run(async () =>
        {
            Console.WriteLine($"\n\n\nBegin processing {file}\n\n");
            var name = Path.GetFileNameWithoutExtension(file);
            var resDir = Path.Combine(languagesDirectory, language.ToString(), "Resources");
            Directory.CreateDirectory(resDir);
            var outputFile = Path.Combine(resDir, name + ".bin");
            if (File.Exists(outputFile)) { Console.WriteLine("Skipping..."); return; }

            var map = JsonConvert.DeserializeObject<Dictionary<string, string>>(FixWordsAsArrays(await File.ReadAllTextAsync(file)));

            // All lemma strings are packed into one shared character buffer; each
            // entry stores a (length, offset) pair into that buffer.
            var buffer = new char[map.Values.Sum(k => k.Length)];
            var bufferLength = 0;
            var entries = new Dictionary<ulong, Lookups.Entry>();
            int count = 0;

            // Longest values first, so later (shorter) values can be found as
            // substrings of already-written ones via IndexOf and reuse that space.
            foreach (var (k, v) in map.OrderByDescending(kv => kv.Value.Length).ThenBy(kv => kv.Value))
            {
                var keyHash = Lookups.Hash(k);
                var invKeyHash = Lookups.InvariantHash(k);
                var index = buffer.AsSpan(0, bufferLength).IndexOf(v);
                if (index < 0)
                {
                    // Not present yet: append to the buffer.
                    // NOTE(review): (byte)v.Length silently truncates values longer
                    // than 255 characters — confirm lemma lengths are always < 256.
                    v.AsSpan().CopyTo(buffer.AsSpan(bufferLength, v.Length));
                    entries.TryAdd(keyHash, new Lookups.Entry((byte)v.Length, (uint)bufferLength));
                    if (invKeyHash != keyHash) { entries.TryAdd(invKeyHash, new Lookups.Entry((byte)v.Length, (uint)bufferLength)); }
                    bufferLength += v.Length;
                    Console.Write("+");
                }
                else
                {
                    // Reuse the existing occurrence inside the buffer.
                    entries.TryAdd(keyHash, new Lookups.Entry((byte)v.Length, (uint)index));
                    if (invKeyHash != keyHash) { entries.TryAdd(invKeyHash, new Lookups.Entry((byte)v.Length, (uint)index)); }
                    //Console.Write(".");
                }
                count++;
                if (count % 1000 == 0) { Console.WriteLine($"\nAt {count} of {map.Count}"); }
            }

            // Trim the buffer to the characters actually used before serializing.
            Array.Resize(ref buffer, bufferLength);
            var lookup = new Lookups(name, language, new string(buffer), entries);
            using (var f = File.OpenWrite(outputFile)) { await lookup.SerializeAsync(f); }
            Console.WriteLine($"\n\n\nWrote {outputFile}\n\n");
        }));
    }

    foreach (var (file, language) in Directory.GetFiles(rootLangFolder, "*_lexeme_cluster*.json").Select(f => (file: f, language: Languages.CodeToEnum(Path.GetFileName(f).Substring(0, 2)))))
    {
        tasks.Add(Task.Run(async () =>
        {
            // Cluster and probability tables are merged into a single resource;
            // skip clusters that have no matching probability file.
            var probFile = file.Replace("_lexeme_cluster", "_lexeme_prob");
            if (!File.Exists(probFile)) { return; }
            Console.WriteLine($"\n\n\nBegin processing {file} + {probFile}\n\n");
            var name = Path.GetFileNameWithoutExtension(file);
            var resDir = Path.Combine(languagesDirectory, language.ToString(), "Resources");
            Directory.CreateDirectory(resDir);
            var outputFile = Path.Combine(resDir, name + "_prob.bin");
            if (File.Exists(outputFile)) { Console.WriteLine("Skipping..."); return; }

            var cluster = JsonConvert.DeserializeObject<Dictionary<string, uint>>(FixWordsAsArrays(await File.ReadAllTextAsync(file)));
            var prob = JsonConvert.DeserializeObject<Dictionary<string, float>>(FixWordsAsArrays(await File.ReadAllTextAsync(probFile)));
            var entries = new Dictionary<ulong, Lookups.Entry>();
            int count = 0;

            foreach (var (k, v) in cluster)
            {
                var keyHash = Lookups.Hash(k);
                var invKeyHash = Lookups.InvariantHash(k);
                // Words missing from the prob table get -25f (presumably a
                // log-probability floor — confirm against Lookups consumers).
                var probVal = prob.TryGetValue(k, out var p) ? p : -25f;
                entries.TryAdd(keyHash, new Lookups.Entry(probVal, v));
                if (invKeyHash != keyHash) { entries.TryAdd(invKeyHash, new Lookups.Entry(probVal, v)); }
                count++;
                if (count % 1000 == 0) { Console.WriteLine($"\nAt {count} of {cluster.Count}"); }
            }

            var lookup = new Lookups(name, language, null, entries);
            using (var f = File.OpenWrite(outputFile)) { await lookup.SerializeAsync(f); }
            Console.WriteLine($"\n\n\nWrote {outputFile}\n\n");
        }));
    }

    await Task.WhenAll(tasks);
}
/// <summary>
/// Reconstructs a <see cref="Document"/> from its JObject representation, restoring
/// language, value, UID, metadata, labels, and per-span token data (bounds, POS
/// tag, head, frequency, entity types, metadata and replacement).
/// </summary>
public static Document FromJObject(JObject jo)
{
    // Shared read-only list used for tokens that carry no entity types.
    var emptyEntityTypes = new List<EntityType>();
    var doc = new Document();
    doc.Language = Languages.CodeToEnum((string)jo[nameof(Language)]);
    doc.Value = (string)jo[nameof(Value)];
    // Invalid or missing UIDs fall back to the default value.
    doc.UID = UID128.TryParse((string)(jo[nameof(UID)]), out var uid) ? uid : default(UID128);

    var docmtd = jo[nameof(Metadata)];
    if (!(docmtd is null) && docmtd.HasValues)
    {
        doc.Metadata = new Dictionary<string, string>();
        foreach (JProperty md in docmtd)
        {
            doc.Metadata.Add(md.Name, (string)md.Value);
        }
    }

    if (jo.ContainsKey(nameof(Labels)))
    {
        doc.Labels = jo[nameof(Labels)].Select(jt => (string)jt).ToList();
    }

    // TokensData is a list of spans, each span a list of token objects.
    var td = jo[nameof(TokensData)];
    foreach (var sp in td)
    {
        var tokens = new List<(int begin, int end, PartOfSpeech tag, int head, float frequency, List<EntityType> entityType, IDictionary<string, string> metadata, string replacement)>();
        foreach (var tk in sp)
        {
            var ets = tk[nameof(EntityType)];
            var entityTypes = emptyEntityTypes;
            if (!(ets is null) && ets.HasValues)
            {
                entityTypes = new List<EntityType>();
                foreach (var et in ets)
                {
                    Dictionary<string, string> entityMetadata = null;
                    var etmtd = et[nameof(Metadata)];
                    if (!(etmtd is null) && etmtd.HasValues)
                    {
                        entityMetadata = new Dictionary<string, string>();
                        foreach (JProperty md in etmtd)
                        {
                            entityMetadata.Add(md.Name, (string)md.Value);
                        }
                    }
                    // TargetUID falls back to default when absent or unparsable.
                    entityTypes.Add(new EntityType((string)(et[nameof(EntityType.Type)]), (EntityTag)Enum.Parse(typeof(EntityTag), (string)(et[nameof(EntityType.Tag)])), entityMetadata, UID128.TryParse((string)(et[nameof(EntityType.TargetUID)]), out var uid2) ? uid2 : default(UID128)));
                }
            }

            IDictionary<string, string> metadata = null;
            var mtd = tk[nameof(Metadata)];
            if (!(mtd is null) && mtd.HasValues)
            {
                metadata = new Dictionary<string, string>();
                foreach (JProperty md in mtd)
                {
                    metadata.Add(md.Name, (string)md.Value);
                }
            }

            // Missing Tag / Head / Frequency fields fall back to NONE / -1 / 0.
            tokens.Add((((int)(tk[nameof(TokenData.Bounds)][0])), ((int)(tk[nameof(TokenData.Bounds)][1])), (PartOfSpeech)Enum.Parse(typeof(PartOfSpeech), (string)(tk[nameof(TokenData.Tag)] ?? nameof(PartOfSpeech.NONE))), ((int)(tk[nameof(TokenData.Head)] ?? "-1")), (((float)(tk[nameof(TokenData.Frequency)] ?? 0f))), entityTypes, metadata, (string)tk[nameof(TokenData.Replacement)]));
        }

        if (tokens.Any())
        {
            // Span bounds are derived from the first and last token of the span.
            var span = doc.AddSpan(tokens.First().begin, tokens.Last().end);
            foreach (var tk in tokens)
            {
                var token = span.AddToken(tk.begin, tk.end);
                token.POS = tk.tag;
                token.Head = tk.head;
                token.Frequency = tk.frequency;
                foreach (var et in tk.entityType)
                {
                    token.AddEntityType(et);
                }
                if (tk.metadata is object)
                {
                    foreach (var kv in tk.metadata)
                    {
                        token.Metadata.Add(kv.Key, kv.Value);
                    }
                }
            }
        }
    }
    return (doc);
}
/// <summary>
/// Trains POS taggers and dependency parsers for all languages present in both the
/// train and dev splits of the Universal Dependencies source, merging a slice of the
/// OntoNotes 5.0 corpus for English. The best of several randomized attempts is
/// stored; models scoring above 80% are additionally exported to the per-language
/// Resources directory for the nuget-based distribution. A final pass reloads the
/// stored models and logs train/test scores.
/// </summary>
public static async Task Train(string udSource, string ontonotesSource, string languagesDirectory)
{
    var trainFiles = Directory.GetFiles(udSource, "*-train.conllu", SearchOption.AllDirectories);
    var testFiles = Directory.GetFiles(udSource, "*-dev.conllu", SearchOption.AllDirectories);

    List<string> trainFilesOntonotesEnglish = null;

    if (!string.IsNullOrWhiteSpace(ontonotesSource))
    {
        // "sel_" files with an index >= 3654 are excluded.
        trainFilesOntonotesEnglish = Directory.GetFiles(ontonotesSource, "*.parse.ddg", SearchOption.AllDirectories)
                                              .Where(fn => !fn.Contains("sel_") || int.Parse(Path.GetFileNameWithoutExtension(fn).Split(new char[] { '_', '.' }).Skip(1).First()) < 3654)
                                              .ToList();
    }

    // Group corpus files by the language prefix of their file names.
    var trainFilesPerLanguage = trainFiles.Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f }).GroupBy(f => f.lang).ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList());
    var testFilesPerLanguage = testFiles.Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f }).GroupBy(f => f.lang).ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList());

    var languages = new List<(Language language, string lang)>();

    // Only languages that have both a train and a dev split are trained.
    foreach (var lang in trainFilesPerLanguage.Keys.Intersect(testFilesPerLanguage.Keys))
    {
        try
        {
            var language = Languages.CodeToEnum(lang);
            languages.Add((language, lang));
        }
        catch
        {
            Logger.LogWarning($"Unknown language {lang}");
        }
    }

    Logger.LogInformation($"Found these languages for training: {string.Join(", ", languages.Select(l => l.language))}");

    int attempts = 5;

    await Task.WhenAll(languages.Select(async v =>
    {
        await Task.Yield();
        var (language, lang) = (v.language, v.lang);
        var arcNames = new HashSet<string>();
        if (trainFilesPerLanguage.TryGetValue(lang, out var langTrainFiles) && testFilesPerLanguage.TryGetValue(lang, out var langTestFiles))
        {
            var trainDocuments = await ReadCorpusAsync(langTrainFiles, arcNames, language);
            var testDocuments = await ReadCorpusAsync(langTestFiles, arcNames, language);

            if (language == Language.English)
            {
                //Merge with Ontonotes 5.0 corpus
                // OntoNotes is split between train and test proportionally to the
                // existing test/train document ratio.
                var testToTrain = (int)((float)trainFilesOntonotesEnglish.Count * testDocuments.Count / trainDocuments.Count);
                trainDocuments.AddRange(await ReadCorpusAsync(trainFilesOntonotesEnglish.Take(testToTrain).ToList(), arcNames, language, isOntoNotes: true));
                testDocuments.AddRange(await ReadCorpusAsync(trainFilesOntonotesEnglish.Skip(testToTrain).ToList(), arcNames, language, isOntoNotes: true));
            }

            // Tagger: several attempts with randomized epoch counts; keep the best.
            double bestScore = double.MinValue;
            for (int i = 0; i < attempts; i++)
            {
                await Task.Run(async () =>
                {
                    var tagger = new AveragePerceptronTagger(language, 0);
                    tagger.Train(trainDocuments, (5 + ThreadSafeRandom.Next(15)));
                    var scoreTrain = TestTagger(trainDocuments, tagger);
                    var scoreTest = TestTagger(testDocuments, tagger);
                    if (scoreTest > bestScore)
                    {
                        Logger.LogInformation($"\n>>>>> {language}: NEW POS BEST: {scoreTest:0.0}%");
                        await tagger.StoreAsync();
                        if (scoreTest > 80)
                        {
                            //Prepare models for new nuget-based distribution
                            var resDir = Path.Combine(languagesDirectory, language.ToString(), "Resources");
                            Directory.CreateDirectory(resDir);
                            using (var f = File.OpenWrite(Path.Combine(resDir, "tagger.bin"))) { await tagger.StoreAsync(f); }
                            await File.WriteAllTextAsync(Path.Combine(resDir, "tagger.score"), $"{scoreTest:0.0}%");
                        }
                        bestScore = scoreTest;
                    }
                    else
                    {
                        Logger.LogInformation($"\n>>>>> {language}: POS BEST IS STILL : {bestScore:0.0}%");
                    }
                });
            }

            // Parser: same scheme; the corpus is re-read between attempts.
            bestScore = double.MinValue;
            for (int i = 0; i < attempts; i++)
            {
                await Task.Run(async () =>
                {
                    var parser = new AveragePerceptronDependencyParser(language, 0 /*, arcNames.ToList()*/);
                    try
                    {
                        parser.Train(trainDocuments, (5 + ThreadSafeRandom.Next(10)), (float)(1D - ThreadSafeRandom.NextDouble() * ThreadSafeRandom.NextDouble()));
                    }
                    catch (Exception E)
                    {
                        // NOTE(review): argument order differs from the LogError(E, message)
                        // pattern used elsewhere in this file — as written, E is passed as a
                        // format argument to the message "FAIL", so the exception details are
                        // likely not logged. Confirm and consider LogError(E, "FAIL").
                        Logger.LogError("FAIL", E);
                        return;
                    }
                    trainDocuments = await ReadCorpusAsync(langTrainFiles, arcNames, language);
                    testDocuments = await ReadCorpusAsync(langTestFiles, arcNames, language);
                    if (language == Language.English)
                    {
                        //Merge with Ontonotes 5.0 corpus
                        var testToTrain = (int)((float)trainFilesOntonotesEnglish.Count *testDocuments.Count / trainDocuments.Count);
                        trainDocuments.AddRange(await ReadCorpusAsync(trainFilesOntonotesEnglish.Take(testToTrain).ToList(), arcNames, language, isOntoNotes: true));
                        testDocuments.AddRange(await ReadCorpusAsync(trainFilesOntonotesEnglish.Skip(testToTrain).ToList(), arcNames, language, isOntoNotes: true));
                    }
                    var scoreTrain = TestParser(trainDocuments, parser);
                    var scoreTest = TestParser(testDocuments, parser);
                    if (scoreTest > bestScore)
                    {
                        Logger.LogInformation($"\n>>>>> {language}: NEW DEP BEST: {scoreTest:0.0}%");
                        if (scoreTest > 80)
                        {
                            //Prepare models for new nuget-based distribution
                            var resDir = Path.Combine(languagesDirectory, language.ToString(), "Resources");
                            Directory.CreateDirectory(resDir);
                            using (var f = File.OpenWrite(Path.Combine(resDir, "parser.bin"))) { await parser.StoreAsync(f); }
                            await File.WriteAllTextAsync(Path.Combine(resDir, "parser.score"), $"{scoreTest:0.0}%");
                        }
                        bestScore = scoreTest;
                    }
                    else
                    {
                        Logger.LogInformation($"\n>>>>> {language}: DEP BEST IS STILL : {bestScore:0.0}%");
                    }
                    parser = null;
                });
            }
        }
    }));

    // Final evaluation pass: reload the stored models and log train/test scores.
    foreach (var (language, lang) in languages)
    {
        var arcNames = new HashSet<string>();
        var trainDocuments = await ReadCorpusAsync(trainFilesPerLanguage[lang], arcNames, language);
        var testDocuments = await ReadCorpusAsync(testFilesPerLanguage[lang], arcNames, language);
        if (language == Language.English)
        {
            //Merge with Ontonotes 5.0 corpus
            var testToTrain = (int)((float)trainFilesOntonotesEnglish.Count * testDocuments.Count / trainDocuments.Count);
            trainDocuments.AddRange(await ReadCorpusAsync(trainFilesOntonotesEnglish.Take(testToTrain).ToList(), arcNames, language, isOntoNotes: true));
            testDocuments.AddRange(await ReadCorpusAsync(trainFilesOntonotesEnglish.Skip(testToTrain).ToList(), arcNames, language, isOntoNotes: true));
        }
        var tagger = await AveragePerceptronTagger.FromStoreAsync(language, 0, "");
        Logger.LogInformation($"\n{lang} - TAGGER / TRAIN");
        TestTagger(trainDocuments, tagger);
        Logger.LogInformation($"\n{lang} - TAGGER / TEST");
        TestTagger(testDocuments, tagger);
        // Re-read before parser evaluation.
        trainDocuments = await ReadCorpusAsync(trainFilesPerLanguage[lang], arcNames, language);
        testDocuments = await ReadCorpusAsync(testFilesPerLanguage[lang], arcNames, language);
        var parser = await AveragePerceptronDependencyParser.FromStoreAsync(language, 0, "");
        Logger.LogInformation($"\n{lang} - PARSER / TRAIN");
        TestParser(trainDocuments, parser);
        Logger.LogInformation($"\n{lang} - PARSER / TEST");
        TestParser(testDocuments, parser);
    }
}