Exemple #1
0
        /// <summary>
        /// Trains a part-of-speech tagger and a dependency parser for every language found under
        /// <paramref name="udSource"/>, stores the best-scoring model of each kind, and finally
        /// re-evaluates the stored models on the train and dev documents.
        /// </summary>
        /// <remarks>
        /// Languages are processed in parallel. For each language both model kinds are trained
        /// several times with randomised training arguments; a model is stored only when its score
        /// on the dev documents beats the best score seen so far for that language.
        /// </remarks>
        /// <param name="udSource">Directory searched recursively for <c>*-train.conllu</c> (training) and <c>*-dev.conllu</c> (evaluation) files.</param>
        /// <param name="ontonotesSource">Optional directory searched recursively for <c>*.parse.ddg</c> files that are merged into the English training documents. Null or blank disables the merge.</param>
        public static void Train(string udSource, string ontonotesSource)
        {
            // Groups corpus files by language key: the part of the file name before the first '-' or '_'.
            Dictionary<string, List<string>> GroupByLanguage(IEnumerable<string> files)
            {
                return files.Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f })
                            .GroupBy(f => f.lang)
                            .ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList());
            }

            var trainFiles = Directory.GetFiles(udSource, "*-train.conllu", SearchOption.AllDirectories);
            var testFiles  = Directory.GetFiles(udSource, "*-dev.conllu", SearchOption.AllDirectories);

            // Stays null when no OntoNotes directory was supplied; every use below is guarded.
            List<string> trainFilesOntonotesEnglish = null;

            if (!string.IsNullOrWhiteSpace(ontonotesSource))
            {
                // Keep every file whose path has no "sel_" marker; for "sel_" files keep only those whose
                // second name token (split on '_' and '.') parses to a number below 3654.
                trainFilesOntonotesEnglish = Directory.GetFiles(ontonotesSource, "*.parse.ddg", SearchOption.AllDirectories)
                                             .Where(fn => !fn.Contains("sel_") || int.Parse(Path.GetFileNameWithoutExtension(fn).Split(new char[] { '_', '.' }).Skip(1).First()) < 3654)
                                             .ToList();
            }

            var trainFilesPerLanguage = GroupByLanguage(trainFiles);
            var testFilesPerLanguage  = GroupByLanguage(testFiles);
            var languages             = trainFilesPerLanguage.Keys.ToList();

            Logger.LogInformation($"Found these languages for training: {string.Join(", ", languages)}");

            // Number of randomised training runs per model kind and language.
            const int attempts = 5;

            Parallel.ForEach(languages, lang =>
            {
                Language language;
                try
                {
                    language = Languages.CodeToEnum(lang);
                }
                catch
                {
                    Logger.LogWarning($"Unknown language {lang}");
                    return; // inside the Parallel.ForEach body this only skips the current language
                }

                // A language needs both a training and a dev corpus to be trained and scored.
                if (!trainFilesPerLanguage.TryGetValue(lang, out var langTrainFiles) || !testFilesPerLanguage.TryGetValue(lang, out var langTestFiles))
                {
                    return;
                }

                var arcNames = new HashSet<string>();

                var trainDocuments = ReadCorpus(langTrainFiles, arcNames, language);
                var testDocuments  = ReadCorpus(langTestFiles, arcNames, language);

                if (language == Language.English && trainFilesOntonotesEnglish != null)
                {
                    //Merge with Ontonotes 5.0 corpus
                    trainDocuments.AddRange(ReadCorpus(trainFilesOntonotesEnglish, arcNames, language, isOntoNotes: true));
                }

                // ---- Part-of-speech tagger ----
                double bestScore = double.MinValue;

                for (int i = 0; i < attempts; i++)
                {
                    var tagger = new AveragePerceptronTagger(language, 0);
                    tagger.Train(trainDocuments.AsEnumerable(), (int)(5 + ThreadSafeRandom.Next(15)));

                    // The train-set result is not used for model selection; the call is kept as in the original flow.
                    TestTagger(trainDocuments, tagger);
                    var scoreTest = TestTagger(testDocuments, tagger);

                    if (scoreTest > bestScore)
                    {
                        Logger.LogInformation($"\n>>>>> {lang}: NEW POS BEST: {scoreTest:0.0}%");
                        try
                        {
                            tagger.StoreAsync().Wait();
                        }
                        catch (Exception E)
                        {
                            Logger.LogError(E, $"\n>>>>> {lang}: Failed to store model");
                        }
                        bestScore = scoreTest;
                    }
                    else
                    {
                        Logger.LogInformation($"\n>>>>> {lang}: POS BEST IS STILL : {bestScore:0.0}%");
                    }
                }

                // ---- Dependency parser ----
                bestScore = double.MinValue;

                for (int i = 0; i < attempts; i++)
                {
                    var parser = new AveragePerceptronDependencyParser(language, 0 /*, arcNames.ToList()*/);
                    try
                    {
                        parser.Train(trainDocuments.AsEnumerable(), (int)(5 + ThreadSafeRandom.Next(10)), (float)(1D - ThreadSafeRandom.NextDouble() * ThreadSafeRandom.NextDouble()));
                    }
                    catch (Exception E)
                    {
                        Logger.LogInformation("FAIL: " + E.Message);
                        continue;
                    }

                    // The corpus is re-read before scoring (and therefore before the next training run).
                    // NOTE(review): presumably because training/testing modifies the documents - confirm.
                    trainDocuments = ReadCorpus(langTrainFiles, arcNames, language);
                    testDocuments  = ReadCorpus(langTestFiles, arcNames, language);

                    if (language == Language.English && trainFilesOntonotesEnglish != null)
                    {
                        //Merge with Ontonotes 5.0 corpus
                        trainDocuments.AddRange(ReadCorpus(trainFilesOntonotesEnglish, arcNames, language, isOntoNotes: true));
                    }

                    TestParser(trainDocuments, parser);
                    var scoreTest = TestParser(testDocuments, parser);

                    if (scoreTest > bestScore)
                    {
                        Logger.LogInformation($"\n>>>>> {lang}: NEW DEP BEST: {scoreTest:0.0}%");
                        try
                        {
                            parser.StoreAsync().Wait();
                        }
                        catch (Exception E)
                        {
                            Logger.LogError(E, $"\n>>>>> {lang}: Failed to store model");
                        }
                        bestScore = scoreTest;
                    }
                    else
                    {
                        Logger.LogInformation($"\n>>>>> {lang}: DEP BEST IS STILL : {bestScore:0.0}%");
                    }
                }
            });

            // Final pass: load the stored models and report their scores, one language at a time.
            foreach (var lang in languages)
            {
                Language language;
                try
                {
                    language = Languages.CodeToEnum(lang);
                }
                catch
                {
                    Logger.LogInformation($"Unknown language {lang}");
                    continue; // skip only this language; the remaining ones must still be evaluated
                }

                // Languages without a dev corpus were not trained above, so there is nothing to evaluate.
                if (!testFilesPerLanguage.TryGetValue(lang, out var langTestFiles))
                {
                    continue;
                }

                var langTrainFiles = trainFilesPerLanguage[lang];
                var arcNames       = new HashSet<string>();

                var trainDocuments = ReadCorpus(langTrainFiles, arcNames, language);
                var testDocuments  = ReadCorpus(langTestFiles, arcNames, language);

                if (language == Language.English && trainFilesOntonotesEnglish != null)
                {
                    //Merge with Ontonotes 5.0 corpus
                    var ontonotesDocuments = ReadCorpus(trainFilesOntonotesEnglish, arcNames, language, isOntoNotes: true);
                    trainDocuments.AddRange(ontonotesDocuments);
                }

                var tagger = AveragePerceptronTagger.FromStoreAsync(language, 0, "").WaitResult();
                Logger.LogInformation($"\n{lang} - TAGGER / TRAIN");
                TestTagger(trainDocuments, tagger);

                Logger.LogInformation($"\n{lang} - TAGGER / TEST");
                TestTagger(testDocuments, tagger);

                // Re-read the corpus for the parser evaluation (the OntoNotes documents are not merged here).
                trainDocuments = ReadCorpus(langTrainFiles, arcNames, language);
                testDocuments  = ReadCorpus(langTestFiles, arcNames, language);

                var parser = AveragePerceptronDependencyParser.FromStoreAsync(language, 0, "").WaitResult();
                Logger.LogInformation($"\n{lang} - PARSER / TRAIN");
                TestParser(trainDocuments, parser);

                Logger.LogInformation($"\n{lang} - PARSER / TEST");
                TestParser(testDocuments, parser);
            }
        }
Exemple #2
0
        /// <summary>
        /// Trains a part-of-speech tagger and a dependency parser for every known language that has
        /// both training and dev files under <paramref name="udSource"/>, saves the best models, and
        /// finally re-evaluates the stored models on the train and test documents.
        /// </summary>
        /// <remarks>
        /// Each language runs as its own task. Every model kind is trained several times with
        /// randomised training arguments; a model is saved only when its test score beats the best
        /// score seen so far for that language. Models scoring above 80 are additionally written to
        /// <c>{languagesDirectory}/{Language}/Resources</c> together with a <c>.score</c> text file.
        /// </remarks>
        /// <param name="udSource">Directory searched recursively for <c>*-train.conllu</c> (training) and <c>*-dev.conllu</c> (evaluation) files.</param>
        /// <param name="ontonotesSource">Optional directory searched recursively for <c>*.parse.ddg</c> files that are merged into the English documents. Null or blank disables the merge.</param>
        /// <param name="languagesDirectory">Root directory under which the per-language <c>Resources</c> folders are created.</param>
        /// <returns>A task that completes when training and the final evaluation have finished.</returns>
        public static async Task Train(string udSource, string ontonotesSource, string languagesDirectory)
        {
            // Groups corpus files by language key: the part of the file name before the first '-' or '_'.
            Dictionary<string, List<string>> GroupByLanguage(IEnumerable<string> files)
            {
                return files.Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f })
                            .GroupBy(f => f.lang)
                            .ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList());
            }

            var trainFiles = Directory.GetFiles(udSource, "*-train.conllu", SearchOption.AllDirectories);
            var testFiles  = Directory.GetFiles(udSource, "*-dev.conllu", SearchOption.AllDirectories);

            // Stays null when no OntoNotes directory was supplied; every use below is guarded.
            List<string> trainFilesOntonotesEnglish = null;

            if (!string.IsNullOrWhiteSpace(ontonotesSource))
            {
                // Keep every file whose path has no "sel_" marker; for "sel_" files keep only those whose
                // second name token (split on '_' and '.') parses to a number below 3654.
                trainFilesOntonotesEnglish = Directory.GetFiles(ontonotesSource, "*.parse.ddg", SearchOption.AllDirectories)
                                             .Where(fn => !fn.Contains("sel_") || int.Parse(Path.GetFileNameWithoutExtension(fn).Split(new char[] { '_', '.' }).Skip(1).First()) < 3654)
                                             .ToList();
            }

            // Index at which the OntoNotes file list is cut: files before it are added to the training
            // documents, files from it onwards to the test documents. Only call when the list is non-null.
            // NOTE(review): the index is scaled by test/train, so the smaller share ends up in the
            // training set - this looks inverted; confirm the intended split before changing it.
            int OntonotesSplitIndex(int trainCount, int testCount)
            {
                return (int)((float)trainFilesOntonotesEnglish.Count * testCount / trainCount);
            }

            var trainFilesPerLanguage = GroupByLanguage(trainFiles);
            var testFilesPerLanguage  = GroupByLanguage(testFiles);

            // Only languages with both a training and a dev corpus, and a code CodeToEnum accepts, are used.
            var languages = new List<(Language language, string lang)>();

            foreach (var lang in trainFilesPerLanguage.Keys.Intersect(testFilesPerLanguage.Keys))
            {
                try
                {
                    var language = Languages.CodeToEnum(lang);
                    languages.Add((language, lang));
                }
                catch
                {
                    Logger.LogWarning($"Unknown language {lang}");
                }
            }

            Logger.LogInformation($"Found these languages for training: {string.Join(", ", languages.Select(l => l.language))}");

            // Number of randomised training runs per model kind and language.
            const int attempts = 5;

            await Task.WhenAll(languages.Select(async v =>
            {
                // Yield first so the Select returns a task immediately instead of running the
                // synchronous start of the first language inline.
                await Task.Yield();

                var (language, lang) = v;

                var arcNames = new HashSet<string>();

                // Both keys exist: 'languages' was built from the intersection of the two dictionaries.
                var langTrainFiles = trainFilesPerLanguage[lang];
                var langTestFiles  = testFilesPerLanguage[lang];

                var trainDocuments = await ReadCorpusAsync(langTrainFiles, arcNames, language);
                var testDocuments  = await ReadCorpusAsync(langTestFiles, arcNames, language);

                if (language == Language.English && trainFilesOntonotesEnglish != null)
                {
                    //Merge with Ontonotes 5.0 corpus
                    var testToTrain = OntonotesSplitIndex(trainDocuments.Count, testDocuments.Count);

                    trainDocuments.AddRange(await ReadCorpusAsync(trainFilesOntonotesEnglish.Take(testToTrain).ToList(), arcNames, language, isOntoNotes: true));
                    testDocuments.AddRange(await ReadCorpusAsync(trainFilesOntonotesEnglish.Skip(testToTrain).ToList(), arcNames, language, isOntoNotes: true));
                }

                // ---- Part-of-speech tagger ----
                double bestScore = double.MinValue;

                for (int i = 0; i < attempts; i++)
                {
                    await Task.Run(async () =>
                    {
                        var tagger = new AveragePerceptronTagger(language, 0);
                        tagger.Train(trainDocuments, (5 + ThreadSafeRandom.Next(15)));

                        // The train-set result is not used for model selection; the call is kept as in the original flow.
                        TestTagger(trainDocuments, tagger);
                        var scoreTest = TestTagger(testDocuments, tagger);

                        if (scoreTest > bestScore)
                        {
                            Logger.LogInformation($"\n>>>>> {language}: NEW POS BEST: {scoreTest:0.0}%");
                            await tagger.StoreAsync();

                            if (scoreTest > 80)
                            {
                                //Prepare models for new nuget-based distribution
                                var resDir = Path.Combine(languagesDirectory, language.ToString(), "Resources");
                                Directory.CreateDirectory(resDir);

                                // File.Create truncates an existing file; File.OpenWrite would leave stale
                                // trailing bytes behind when the new model is smaller than the previous one.
                                using (var f = File.Create(Path.Combine(resDir, "tagger.bin")))
                                {
                                    await tagger.StoreAsync(f);
                                }
                                await File.WriteAllTextAsync(Path.Combine(resDir, "tagger.score"), $"{scoreTest:0.0}%");
                            }

                            bestScore = scoreTest;
                        }
                        else
                        {
                            Logger.LogInformation($"\n>>>>> {language}: POS BEST IS STILL : {bestScore:0.0}%");
                        }
                    });
                }

                // ---- Dependency parser ----
                bestScore = double.MinValue;

                for (int i = 0; i < attempts; i++)
                {
                    await Task.Run(async () =>
                    {
                        var parser = new AveragePerceptronDependencyParser(language, 0 /*, arcNames.ToList()*/);
                        try
                        {
                            parser.Train(trainDocuments, (5 + ThreadSafeRandom.Next(10)), (float)(1D - ThreadSafeRandom.NextDouble() * ThreadSafeRandom.NextDouble()));
                        }
                        catch (Exception E)
                        {
                            // Exception goes first so it is logged as an exception rather than as a format argument.
                            Logger.LogError(E, "FAIL");
                            return;
                        }

                        // The corpus is re-read before scoring (and therefore before the next training run).
                        // NOTE(review): presumably because training/testing modifies the documents - confirm.
                        trainDocuments = await ReadCorpusAsync(langTrainFiles, arcNames, language);
                        testDocuments  = await ReadCorpusAsync(langTestFiles, arcNames, language);

                        if (language == Language.English && trainFilesOntonotesEnglish != null)
                        {
                            //Merge with Ontonotes 5.0 corpus
                            var testToTrain = OntonotesSplitIndex(trainDocuments.Count, testDocuments.Count);

                            trainDocuments.AddRange(await ReadCorpusAsync(trainFilesOntonotesEnglish.Take(testToTrain).ToList(), arcNames, language, isOntoNotes: true));
                            testDocuments.AddRange(await ReadCorpusAsync(trainFilesOntonotesEnglish.Skip(testToTrain).ToList(), arcNames, language, isOntoNotes: true));
                        }

                        TestParser(trainDocuments, parser);
                        var scoreTest = TestParser(testDocuments, parser);

                        if (scoreTest > bestScore)
                        {
                            Logger.LogInformation($"\n>>>>> {language}: NEW DEP BEST: {scoreTest:0.0}%");

                            // NOTE(review): unlike the tagger, the parser is never saved with the parameterless
                            // StoreAsync(), yet the evaluation pass below loads it via FromStoreAsync - confirm
                            // whether a store call is missing here.
                            if (scoreTest > 80)
                            {
                                //Prepare models for new nuget-based distribution
                                var resDir = Path.Combine(languagesDirectory, language.ToString(), "Resources");
                                Directory.CreateDirectory(resDir);

                                // File.Create truncates an existing file; File.OpenWrite would leave stale
                                // trailing bytes behind when the new model is smaller than the previous one.
                                using (var f = File.Create(Path.Combine(resDir, "parser.bin")))
                                {
                                    await parser.StoreAsync(f);
                                }
                                await File.WriteAllTextAsync(Path.Combine(resDir, "parser.score"), $"{scoreTest:0.0}%");
                            }

                            bestScore = scoreTest;
                        }
                        else
                        {
                            Logger.LogInformation($"\n>>>>> {language}: DEP BEST IS STILL : {bestScore:0.0}%");
                        }
                    });
                }
            }));

            // Final pass: load the stored models and report their scores, one language at a time.
            foreach (var (language, lang) in languages)
            {
                var arcNames = new HashSet<string>();

                var trainDocuments = await ReadCorpusAsync(trainFilesPerLanguage[lang], arcNames, language);
                var testDocuments  = await ReadCorpusAsync(testFilesPerLanguage[lang], arcNames, language);

                if (language == Language.English && trainFilesOntonotesEnglish != null)
                {
                    //Merge with Ontonotes 5.0 corpus
                    var testToTrain = OntonotesSplitIndex(trainDocuments.Count, testDocuments.Count);

                    trainDocuments.AddRange(await ReadCorpusAsync(trainFilesOntonotesEnglish.Take(testToTrain).ToList(), arcNames, language, isOntoNotes: true));
                    testDocuments.AddRange(await ReadCorpusAsync(trainFilesOntonotesEnglish.Skip(testToTrain).ToList(), arcNames, language, isOntoNotes: true));
                }

                var tagger = await AveragePerceptronTagger.FromStoreAsync(language, 0, "");

                Logger.LogInformation($"\n{lang} - TAGGER / TRAIN");
                TestTagger(trainDocuments, tagger);

                Logger.LogInformation($"\n{lang} - TAGGER / TEST");
                TestTagger(testDocuments, tagger);

                // Re-read the corpus for the parser evaluation (the OntoNotes documents are not merged here).
                trainDocuments = await ReadCorpusAsync(trainFilesPerLanguage[lang], arcNames, language);
                testDocuments  = await ReadCorpusAsync(testFilesPerLanguage[lang], arcNames, language);

                var parser = await AveragePerceptronDependencyParser.FromStoreAsync(language, 0, "");

                Logger.LogInformation($"\n{lang} - PARSER / TRAIN");
                TestParser(trainDocuments, parser);

                Logger.LogInformation($"\n{lang} - PARSER / TEST");
                TestParser(testDocuments, parser);
            }
        }