/// <summary>
/// Trains and stores one sentence detector per language and per casing mode, using the
/// <c>*-train.conllu</c> files found (recursively) under <paramref name="udSource"/>.
/// </summary>
/// <remarks>
/// The language code of a file is whatever precedes the first '-' or '_' in its file name.
/// Codes rejected by <c>Languages.CodeToEnum</c> are reported on the console and skipped.
/// The method blocks until all models of one casing mode have been trained and stored before
/// it starts the next mode, so failures raised while training surface to the caller.
/// </remarks>
/// <param name="udSource">Root directory that is searched recursively for corpus files.</param>
public static void Train(string udSource)
{
    // Groups corpus files by the language code encoded at the start of the file name.
    Dictionary<string, List<string>> GroupByLanguage(string[] files)
    {
        return files.GroupBy(f => Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First())
                    .ToDictionary(g => g.Key, g => g.ToList());
    }

    var trainFilesPerLanguage = GroupByLanguage(Directory.GetFiles(udSource, "*-train.conllu", SearchOption.AllDirectories));

    // Not consumed yet: kept for the evaluation step sketched in the TODO inside the loop.
    var testFilesPerLanguage = GroupByLanguage(Directory.GetFiles(udSource, "*-dev.conllu", SearchOption.AllDirectories));

    var languages = trainFilesPerLanguage.Keys.ToList();

    Console.WriteLine($"Found these languages for training: {string.Join(", ", languages)}");

    // Need to fix the storage model first - maybe join all casing modes in one model.
    foreach (var forceCase in new EnumCase[] { EnumCase.Original, EnumCase.ForceUpper, EnumCase.ForceLower })
    {
        // The signature is synchronous, so the returned awaitable is waited on here; previously it was
        // discarded, which let the method return before any model had been trained or stored.
        // NOTE(review): assumes ParallelAsync.ForEachAsync returns an awaitable (e.g. Task) - confirm.
        ParallelAsync.ForEachAsync(languages, new ParallelOptions(), async lang =>
        {
            Language language;
            try
            {
                language = Languages.CodeToEnum(lang);
            }
            catch
            {
                // Best effort: a corpus with an unrecognized code must not stop the other languages.
                Console.WriteLine($"Unknown language {lang}");
                return;
            }

            var modelTag = forceCase == EnumCase.ForceUpper ? "upper"
                         : forceCase == EnumCase.ForceLower ? "lower"
                         : "";

            var sentenceDetector = new SentenceDetector(language, 0, modelTag);

            var trainDocuments = ReadCorpus(trainFilesPerLanguage[lang], ConvertCase: forceCase, sentenceDetector: sentenceDetector);

            //TODO: Implement test
            //if(testFilesPerLanguage.TryGetValue(lang, out var testFile))
            //{
            //    var testDocuments = ReadUniversalDependencyCorpus(testFile, ConvertCase: forceCase, sentenceDetector: sentenceDetector);
            //}

            Console.WriteLine($"Now training {lang} in mode {forceCase} using files {string.Join(", ", trainFilesPerLanguage[lang])}");

            sentenceDetector.Train(trainDocuments);

            await sentenceDetector.StoreAsync();
        }).GetAwaiter().GetResult();
    }
}
/// <summary>
/// Trains and stores one sentence detector per language and per casing mode, using the
/// <c>*-train.conllu</c> files found (recursively) under <paramref name="udSource"/>, and
/// additionally exports well-scoring models below <paramref name="languagesDirectory"/>.
/// </summary>
/// <remarks>
/// The language code of a file is whatever precedes the first '-' or '_' in its file name.
/// Codes rejected by <c>Languages.CodeToEnum</c> are logged as warnings and skipped.
/// All languages of one casing mode are trained concurrently; the next mode starts only after
/// the previous one has completed. A model is exported to
/// <c>{languagesDirectory}/{language}/Resources</c>, together with a <c>.score</c> file,
/// only when the score returned by training is greater than 90.
/// </remarks>
/// <param name="udSource">Root directory that is searched recursively for corpus files.</param>
/// <param name="languagesDirectory">Root directory receiving the per-language <c>Resources</c> folders.</param>
/// <returns>A task that completes when every model has been trained and stored.</returns>
public static async Task Train(string udSource, string languagesDirectory)
{
    // Minimum training score (written to the .score file as a percentage) required for export.
    const int MinimumScoreForExport = 90;

    // Groups corpus files by the language code encoded at the start of the file name.
    Dictionary<string, List<string>> GroupByLanguage(string[] files)
    {
        return files.GroupBy(f => Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First())
                    .ToDictionary(g => g.Key, g => g.ToList());
    }

    var trainFilesPerLanguage = GroupByLanguage(Directory.GetFiles(udSource, "*-train.conllu", SearchOption.AllDirectories));

    // Not consumed yet: kept for the evaluation step sketched in the TODO inside the loop.
    var testFilesPerLanguage = GroupByLanguage(Directory.GetFiles(udSource, "*-dev.conllu", SearchOption.AllDirectories));

    var languages = new List<(Language language, string lang)>();

    foreach (var lang in trainFilesPerLanguage.Keys)
    {
        try
        {
            var language = Languages.CodeToEnum(lang);
            languages.Add((language, lang));
        }
        catch
        {
            // Best effort: a corpus with an unrecognized code must not stop the other languages.
            Logger.LogWarning($"Unknown language {lang}");
        }
    }

    Logger.LogInformation($"Found these languages for training: {string.Join(", ", languages.Select(l => l.language))}");

    foreach (var forceCase in new[] { EnumCase.Original, EnumCase.ForceUpper, EnumCase.ForceLower })
    {
        await Task.WhenAll(languages.Select(async v =>
        {
            // Yield immediately so the synchronous part of each lambda does not run inline
            // while the Select is being enumerated.
            await Task.Yield();

            var (language, lang) = (v.language, v.lang);

            var modelTag = forceCase == EnumCase.ForceUpper ? "upper"
                         : forceCase == EnumCase.ForceLower ? "lower"
                         : "";

            var sentenceDetector = new SentenceDetector(language, 0, modelTag);

            var trainDocuments = await ReadCorpusAsync(trainFilesPerLanguage[lang], ConvertCase: forceCase, sentenceDetector: sentenceDetector);

            //TODO: Implement test
            //if(testFilesPerLanguage.TryGetValue(lang, out var testFile))
            //{
            //    var testDocuments = ReadUniversalDependencyCorpus(testFile, ConvertCase: forceCase, sentenceDetector: sentenceDetector);
            //}

            Logger.LogInformation($"Now training {lang} in mode {forceCase} using files {string.Join(", ", trainFilesPerLanguage[lang])}");

            var scoreTest = sentenceDetector.Train(trainDocuments);

            Logger.LogInformation($"Finished training {lang} in mode {forceCase}");

            await sentenceDetector.StoreAsync();

            if (scoreTest > MinimumScoreForExport)
            {
                //Prepare models for new nuget-based distribution
                var resDir = Path.Combine(languagesDirectory, language.ToString(), "Resources");
                Directory.CreateDirectory(resDir);

                var baseName = $"sentence-detector{(string.IsNullOrEmpty(modelTag) ? "" : "-" + modelTag)}";

                // FileMode.Create truncates an existing file. File.OpenWrite would keep the old length,
                // leaving stale trailing bytes whenever the new model is smaller than the previous one.
                using (var f = new FileStream(Path.Combine(resDir, baseName + ".bin"), FileMode.Create, FileAccess.Write, FileShare.None))
                {
                    await sentenceDetector.StoreAsync(f);
                }

                // Invariant culture keeps the decimal separator stable regardless of the machine's locale.
                var scoreText = System.FormattableString.Invariant($"{scoreTest:0.0}%");
                await File.WriteAllTextAsync(Path.Combine(resDir, baseName + ".score"), scoreText);
            }
        }).ToArray());
    }
}