Ejemplo n.º 1
0
        /// <summary>
        /// Trains and stores a sentence detector for every language that has at least one
        /// <c>*-train.conllu</c> file below <paramref name="udSource"/>. Each language is
        /// trained once per casing mode (original, forced upper case, forced lower case).
        /// </summary>
        /// <param name="udSource">Root directory that is searched recursively for corpus files.</param>
        /// <remarks>
        /// The language code is taken from the file name: the part before the first
        /// <c>-</c> or <c>_</c>. Codes that <c>Languages.CodeToEnum</c> rejects are reported
        /// on the console and skipped. The method blocks until every model has been stored.
        /// </remarks>
        public static void Train(string udSource)
        {
            var trainFiles = Directory.GetFiles(udSource, "*-train.conllu", SearchOption.AllDirectories);
            var testFiles  = Directory.GetFiles(udSource, "*-dev.conllu", SearchOption.AllDirectories);

            // Group the corpus files by the language code that prefixes each file name.
            var trainFilesPerLanguage = trainFiles
                .Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f })
                .GroupBy(f => f.lang)
                .ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList());

            // Not consumed yet; kept for the evaluation step described in the TODO below.
            var testFilesPerLanguage = testFiles
                .Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f })
                .GroupBy(f => f.lang)
                .ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList());

            var languages = trainFilesPerLanguage.Keys.ToList();

            Console.WriteLine($"Found these languages for training: {string.Join(", ", languages)}");

            // Need to fix the storage model first - maybe join all in one model.
            foreach (var forceCase in new EnumCase[] { EnumCase.Original, EnumCase.ForceUpper, EnumCase.ForceLower })
            {
                // Block until this casing pass has finished. Previously the returned task was
                // dropped, so the method returned before any model was trained or stored and
                // failures inside the lambda were never observed.
                // NOTE(review): assumes ParallelAsync.ForEachAsync returns an awaitable Task - confirm.
                ParallelAsync.ForEachAsync(languages, new ParallelOptions(), async lang =>
                {
                    Language language;
                    try
                    {
                        language = Languages.CodeToEnum(lang);
                    }
                    catch
                    {
                        // Best effort: a corpus with an unrecognised code is skipped, not fatal.
                        Console.WriteLine($"Unknown language {lang}");
                        return;
                    }

                    // The model tag distinguishes the stored models of the three casing modes.
                    var modelTag         = (forceCase == EnumCase.ForceUpper ? "upper" : (forceCase == EnumCase.ForceLower ? "lower" : ""));
                    var sentenceDetector = new SentenceDetector(language, 0, modelTag);

                    var trainDocuments = ReadCorpus(trainFilesPerLanguage[lang], ConvertCase: forceCase, sentenceDetector: sentenceDetector);

                    //TODO: Implement test
                    //if(testFilesPerLanguage.TryGetValue(lang, out var testFile))
                    //{
                    //    var testDocuments = ReadUniversalDependencyCorpus(testFile, ConvertCase: forceCase, sentenceDetector: sentenceDetector);
                    //}

                    Console.WriteLine($"Now training {lang} in mode {forceCase} using files {string.Join(", ", trainFilesPerLanguage[lang])}");
                    sentenceDetector.Train(trainDocuments);
                    await sentenceDetector.StoreAsync();
                }).GetAwaiter().GetResult();
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Trains and stores a sentence detector for every recognised language that has at
        /// least one <c>*-train.conllu</c> file below <paramref name="udSource"/>, once per
        /// casing mode (original, forced upper case, forced lower case). Models whose training
        /// score exceeds 90 are additionally exported, together with a <c>.score</c> file, to
        /// <c>{languagesDirectory}/{language}/Resources</c>.
        /// </summary>
        /// <param name="udSource">Root directory that is searched recursively for corpus files.</param>
        /// <param name="languagesDirectory">Root directory that receives the per-language resource folders.</param>
        /// <returns>A task that completes when all languages have been processed in all casing modes.</returns>
        /// <remarks>
        /// The language code is taken from the file name: the part before the first
        /// <c>-</c> or <c>_</c>. Codes that <c>Languages.CodeToEnum</c> rejects are logged as
        /// warnings and skipped.
        /// </remarks>
        public static async Task Train(string udSource, string languagesDirectory)
        {
            var trainFiles = Directory.GetFiles(udSource, "*-train.conllu", SearchOption.AllDirectories);
            var testFiles  = Directory.GetFiles(udSource, "*-dev.conllu", SearchOption.AllDirectories);

            // Group the corpus files by the language code that prefixes each file name.
            var trainFilesPerLanguage = trainFiles
                .Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f })
                .GroupBy(f => f.lang)
                .ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList());

            // Not consumed yet; kept for the evaluation step described in the TODO below.
            var testFilesPerLanguage = testFiles
                .Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f })
                .GroupBy(f => f.lang)
                .ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList());

            // Resolve the language codes up front so unknown ones are reported only once,
            // not once per casing mode.
            var languages = new List<(Language language, string lang)>();

            foreach (var lang in trainFilesPerLanguage.Keys)
            {
                try
                {
                    var language = Languages.CodeToEnum(lang);
                    languages.Add((language, lang));
                }
                catch
                {
                    // Best effort: a corpus with an unrecognised code is skipped, not fatal.
                    Logger.LogWarning($"Unknown language {lang}");
                }
            }

            Logger.LogInformation($"Found these languages for training: {string.Join(", ", languages.Select(l => l.language))}");

            foreach (var forceCase in new [] { EnumCase.Original, EnumCase.ForceUpper, EnumCase.ForceLower })
            {
                // Casing modes run one after another; the languages within a mode run concurrently.
                await Task.WhenAll(languages.Select(async v =>
                {
                    // Yield first so the Select call only schedules the work and does not
                    // run the synchronous part of each lambda inline.
                    await Task.Yield();

                    var (language, lang) = v;

                    // The model tag distinguishes the stored models of the three casing modes.
                    var modelTag = (forceCase == EnumCase.ForceUpper ? "upper" : (forceCase == EnumCase.ForceLower ? "lower" : ""));
                    var sentenceDetector = new SentenceDetector(language, 0, modelTag);

                    var trainDocuments = await ReadCorpusAsync(trainFilesPerLanguage[lang], ConvertCase: forceCase, sentenceDetector: sentenceDetector);

                    //TODO: Implement test
                    //if(testFilesPerLanguage.TryGetValue(lang, out var testFile))
                    //{
                    //    var testDocuments = ReadUniversalDependencyCorpus(testFile, ConvertCase: forceCase, sentenceDetector: sentenceDetector);
                    //}

                    Logger.LogInformation($"Now training {lang} in mode {forceCase} using files {string.Join(", ", trainFilesPerLanguage[lang])}");
                    var scoreTest = sentenceDetector.Train(trainDocuments);
                    Logger.LogInformation($"Finished training {lang} in mode {forceCase}");
                    await sentenceDetector.StoreAsync();

                    if (scoreTest > 90)
                    {
                        //Prepare models for new nuget-based distribution
                        var resDir = Path.Combine(languagesDirectory, language.ToString(), "Resources");

                        Directory.CreateDirectory(resDir);

                        // Shared stem of the model file and its accompanying score file.
                        var resourceName = $"sentence-detector{(string.IsNullOrEmpty(modelTag) ? "" : "-" + modelTag)}";

                        // FileMode.Create truncates an existing file. File.OpenWrite does not, so a
                        // model shorter than the one from a previous run would keep stale trailing
                        // bytes and be corrupted.
                        using (var f = new FileStream(Path.Combine(resDir, resourceName + ".bin"), FileMode.Create, FileAccess.Write, FileShare.None))
                        {
                            await sentenceDetector.StoreAsync(f);
                        }
                        await File.WriteAllTextAsync(Path.Combine(resDir, resourceName + ".score"), $"{scoreTest:0.0}%");
                    }
                }).ToArray());
            }
        }