Пример #1
0
        /// <summary>
        /// Converts a directory of JSON language-profile files (one file per language, file name =
        /// language code) into the binary model data of a <see cref="LanguageDetector"/> and stores it.
        /// </summary>
        /// <param name="pathToFiles">Directory containing one JSON profile file per language.</param>
        public static void TransformJsonDataInModelData(string pathToFiles)
        {
            var files = Directory.GetFiles(pathToFiles);
            var ld    = new LanguageDetector(version: 0);

            foreach (var f in files)
            {
                // The file name (without extension) encodes the language code, e.g. "en.json"
                var language = Languages.CodeToEnum(Path.GetFileNameWithoutExtension(f));
                ld.Data.Languages.Add(language);

                string json        = File.ReadAllText(f);
                var    jsonProfile = JsonConvert.DeserializeObject <JsonLanguageProfile>(json);

                foreach (var word in jsonProfile.freq.Keys)
                {
                    // Only n-grams within the detector's supported length contribute a probability,
                    // so skip out-of-range words up front. The original created the (empty) inner
                    // dictionary before this check, polluting the model with entries that match nothing.
                    if (word.Length < 1 || word.Length > ld.NGramLength)
                    {
                        continue;
                    }

                    int hash = GetHash(word.ToLowerInvariant().Trim().AsSpan());

                    // TryGetValue avoids the double lookup of ContainsKey followed by the indexer
                    if (!ld.Data.WordLanguageProbabilities.TryGetValue(hash, out var perLanguage))
                    {
                        perLanguage = new Dictionary <Language, double>();
                        ld.Data.WordLanguageProbabilities[hash] = perLanguage;
                    }

                    // n_words[len - 1] holds the total count of n-grams of this length in the profile
                    perLanguage[language] = (double)jsonProfile.freq[word] / jsonProfile.n_words[word.Length - 1];
                }
            }

            ld.StoreAsync().Wait();
        }
        /// <summary>
        /// Detects the language of a document and writes the result to <see cref="IDocument.Language"/>.
        /// Documents that are empty or already have a language set are left untouched.
        /// </summary>
        /// <param name="document">Document to classify; mutated in place.</param>
        public void Process(IDocument document)
        {
            //Don't try to identify documents that already have their language set or is empty
            if (document.Length == 0 || (document.Language != Language.Unknown && document.Language != Language.Any))
            {
                return;
            }

            IDocument tempDocument = document;

            if (document.SpansCount == 0) // Have to tokenize temporarily the document
            {
                // Only the first 200 characters are needed for a prediction
                tempDocument = document.Length > 200
                               ? new Document(document.Value.Substring(0, 200))
                               : new Document(document.Value);
                Tokenizer.Process(tempDocument);
            }

            try
            {
                var tag = Model.PredictMax(tempDocument, 200);

                // PredictMax may return a null label when the model has no confident answer;
                // passing that to CodeToEnum would throw
                document.Language = tag.label is null ? Language.Unknown : Languages.CodeToEnum(tag.label);
            }
            catch
            {
                // Best-effort: any failure during prediction maps to Unknown
                document.Language = Language.Unknown;
            }
        }
        /// <summary>
        /// Runs language detection on <paramref name="document"/>, writing the detected language
        /// back to it. Skips documents that are empty or already classified; any prediction
        /// failure (or missing label) results in <see cref="Language.Unknown"/>.
        /// </summary>
        public void Process(IDocument document)
        {
            //Don't try to identify documents that already have their language set or is empty
            bool alreadyClassified = document.Language != Language.Unknown && document.Language != Language.Any;

            if (document.Length == 0 || alreadyClassified)
            {
                return;
            }

            var tempDocument = Prepare(document);

            try
            {
                var prediction = Model.PredictMax(tempDocument, 200);

                document.Language = prediction.label is null
                                    ? Language.Unknown
                                    : Languages.CodeToEnum(prediction.label);
            }
            catch
            {
                // Best-effort: any failure during prediction maps to Unknown
                document.Language = Language.Unknown;
            }
        }
        /// <summary>
        /// Evaluates the FastText language detector against a tab-separated sentence corpus
        /// (line format: id\t3-letter-language-code\ttext) and prints precision, recall, F1 and throughput.
        /// </summary>
        /// <param name="pathToSentences">Path to the tab-separated corpus file.</param>
        public static void Test(string pathToSentences)
        {
            var vectorizer = FastTextLanguageDetector.FromStoreAsync(Language.Any, 0, null).WaitResult();

            var pipe = new Pipeline();

            pipe.Add(new SpaceTokenizer());

            var docs = File.ReadAllLines(pathToSentences).Shuffle()
                       .Where(txt => !string.IsNullOrWhiteSpace(txt))
                       .Select(txt => txt.Split('\t'))
                       .Where(s => s.Length == 3 && Languages.IsValid3LetterCode(s[1]))
                       .Select(s =>
            {
                // The expected language (3-letter code) is stored as the document label
                var doc = new Document(s[2]);
                doc.Labels.Add(Languages.EnumToCode(Languages.ThreeLetterCodeToEnum(s[1])));
                return(doc as IDocument);
            });

            docs = pipe.Process(docs).WithCaching(Language.Any, 0, "language-detector-corpus", 100_000).ToList();

            int TP = 0, FP = 0, FN = 0;
            int k  = 0;
            var sw = Stopwatch.StartNew();

            Parallel.ForEach(docs, (doc) =>
            {
                // BUG FIX: plain k++ was a data race under Parallel.ForEach
                Interlocked.Increment(ref k);
                vectorizer.Process(doc);
                if (doc.Language == Languages.CodeToEnum(doc.Labels.First()))
                {
                    Interlocked.Increment(ref TP);
                }
                else
                {
                    // In single-label classification every miss is both a false positive
                    // (wrong language predicted) and a false negative (true language missed)
                    Interlocked.Increment(ref FP);
                    Interlocked.Increment(ref FN);
                }
            });

            sw.Stop();

            // BUG FIX: precision uses false positives; the original used FN in both formulas
            var precision = (double)TP / (double)(TP + FP);
            var recall    = (double)TP / (double)(TP + FN);

            var f1 = 2 * (precision * recall) / (precision + recall);

            Console.WriteLine($"F1= {f1 * 100:0.0}% P= {precision * 100:0.0}% R={recall * 100:0.0}% in {sw.Elapsed.TotalSeconds:0.00}s or {k / sw.Elapsed.TotalSeconds} doc/s");
        }
        /// <summary>
        /// Returns the per-language prediction scores for <paramref name="document"/>.
        /// Falls back to a single <see cref="Language.Unknown"/> entry with score 1 if prediction fails.
        /// </summary>
        public Dictionary <Language, float> Predict(IDocument document)
        {
            var preparedDocument = Prepare(document);

            try
            {
                return Model.Predict(preparedDocument)
                            .ToDictionary(kv => Languages.CodeToEnum(kv.Key), kv => kv.Value);
            }
            catch
            {
                // Best-effort fallback: report everything as Unknown with full confidence
                return new Dictionary <Language, float> { [Language.Unknown] = 1f };
            }
        }
Пример #6
0
        /// <summary>
        /// Trains sentence-detector models from Universal Dependencies CoNLL-U training files,
        /// one model per language and per casing mode (original, forced-upper, forced-lower).
        /// </summary>
        /// <param name="udSource">Root directory of the Universal Dependencies corpus.</param>
        public static void Train(string udSource)
        {
            var trainFiles = Directory.GetFiles(udSource, "*-train.conllu", SearchOption.AllDirectories);
            var testFiles  = Directory.GetFiles(udSource, "*-dev.conllu", SearchOption.AllDirectories);

            // File names look like "en_ewt-ud-train.conllu": normalize '_' to '-', take the
            // first token as the language code, and group the corpus files per language.
            var trainFilesPerLanguage = trainFiles.Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f }).GroupBy(f => f.lang).ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList())
            var testFilesPerLanguage  = testFiles.Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f }).GroupBy(f => f.lang).ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList());
            var languages             = trainFilesPerLanguage.Keys.ToList();

            Console.WriteLine($"Found these languages for training: {string.Join(", ", languages)}");
            foreach (var forceCase in new EnumCase[] { EnumCase.Original, EnumCase.ForceUpper, EnumCase.ForceLower }) //need to fix the storage model first - maybe join all in one model
            {
                // NOTE(review): the task returned by ForEachAsync is not awaited here, so the next
                // forceCase iteration may start before this one finishes — confirm this is intended.
                ParallelAsync.ForEachAsync(languages, new ParallelOptions(), async lang =>
                {
                    // Skip language codes the Languages enum does not know about
                    Language language;
                    try
                    {
                        language = Languages.CodeToEnum(lang);
                    }
                    catch
                    {
                        Console.WriteLine($"Unknown language {lang}");
                        return;
                    }

                    // The model tag distinguishes the casing variants in the model store ("" = original case)
                    var modelTag         = (forceCase == EnumCase.ForceUpper ? "upper" : (forceCase == EnumCase.ForceLower ? "lower" : ""));
                    var sentenceDetector = new SentenceDetector(language, 0, modelTag);

                    var trainDocuments = ReadCorpus(trainFilesPerLanguage[lang], ConvertCase: forceCase, sentenceDetector: sentenceDetector);

                    //TODO: Implement test
                    //if(testFilesPerLanguage.TryGetValue(lang, out var testFile))
                    //{
                    //    var testDocuments = ReadUniversalDependencyCorpus(testFile, ConvertCase: forceCase, sentenceDetector: sentenceDetector);
                    //}

                    Console.WriteLine($"Now training {lang} in mode {forceCase} using files {string.Join(", ", trainFilesPerLanguage[lang])}");
                    sentenceDetector.Train(trainDocuments);
                    await sentenceDetector.StoreAsync();
                });
            }
        }
Пример #7
0
        /// <summary>
        /// Trains part-of-speech taggers and dependency parsers for every language found in the
        /// Universal Dependencies corpus, optionally merging the OntoNotes 5.0 corpus for English.
        /// Each model type is trained <c>N_training</c> times with randomized hyper-parameters and
        /// only the best dev-set model is stored; the stored models are then re-loaded and
        /// re-scored on the train/test sets for a final report.
        /// </summary>
        /// <param name="udSource">Root directory of the Universal Dependencies corpus.</param>
        /// <param name="ontonotesSource">Optional root of the OntoNotes 5.0 corpus (may be null or empty).</param>
        public static void Train(string udSource, string ontonotesSource)
        {
            var trainFiles = Directory.GetFiles(udSource, "*-train.conllu", SearchOption.AllDirectories);
            var testFiles  = Directory.GetFiles(udSource, "*-dev.conllu", SearchOption.AllDirectories);

            List <string> trainFilesOntonotesEnglish = null;

            if (!string.IsNullOrWhiteSpace(ontonotesSource))
            {
                // "sel_NNNN" files with an index >= 3654 are excluded from the OntoNotes set
                trainFilesOntonotesEnglish = Directory.GetFiles(ontonotesSource, "*.parse.ddg", SearchOption.AllDirectories)
                                             .Where(fn => !fn.Contains("sel_") || int.Parse(Path.GetFileNameWithoutExtension(fn).Split(new char[] { '_', '.' }).Skip(1).First()) < 3654)
                                             .ToList();
            }

            // File names look like "en_ewt-ud-train.conllu": the leading token (after normalizing
            // '_' to '-') is the language code used to group the corpus files per language.
            var trainFilesPerLanguage = trainFiles.Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f }).GroupBy(f => f.lang).ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList());
            var testFilesPerLanguage  = testFiles.Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f }).GroupBy(f => f.lang).ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList());
            var languages             = trainFilesPerLanguage.Keys.ToList();

            Logger.LogInformation($"Found these languages for training: {string.Join(", ", languages)}");

            int N_training = 5;

            Parallel.ForEach(languages, lang =>
            {
                Language language;
                try
                {
                    language = Languages.CodeToEnum(lang);
                }
                catch
                {
                    Logger.LogWarning($"Unknown language {lang}");
                    return;
                }

                var arcNames = new HashSet <string>();

                if (trainFilesPerLanguage.TryGetValue(lang, out var langTrainFiles) && testFilesPerLanguage.TryGetValue(lang, out var langTestFiles))
                {
                    var trainDocuments = ReadCorpus(langTrainFiles, arcNames, language);
                    var testDocuments  = ReadCorpus(langTestFiles, arcNames, language);

                    if (language == Language.English && trainFilesOntonotesEnglish is object)
                    {
                        //Merge with Ontonotes 5.0 corpus (guarded: the list is null when no OntoNotes source was given)
                        trainDocuments.AddRange(ReadCorpus(trainFilesOntonotesEnglish, arcNames, language, isOntoNotes: true));
                    }

                    double bestScore = double.MinValue;

                    // Tagger: best-of-N with a randomized epoch count; only improvements are stored
                    for (int i = 0; i < N_training; i++)
                    {
                        var Tagger = new AveragePerceptronTagger(language, 0);
                        Tagger.Train(trainDocuments.AsEnumerable(), (int)(5 + ThreadSafeRandom.Next(15)));
                        var scoreTrain = TestTagger(trainDocuments, Tagger);
                        var scoreTest  = TestTagger(testDocuments, Tagger);
                        if (scoreTest > bestScore)
                        {
                            Logger.LogInformation($"\n>>>>> {lang}: NEW POS BEST: {scoreTest:0.0}%");
                            try
                            {
                                Tagger.StoreAsync().Wait();
                            }
                            catch (Exception E)
                            {
                                Logger.LogError(E, $"\n>>>>> {lang}: Failed to store model");
                            }
                            bestScore = scoreTest;
                        }
                        else
                        {
                            Logger.LogInformation($"\n>>>>> {lang}: POS BEST IS STILL : {bestScore:0.0}%");
                        }
                    }

                    // Parser: same best-of-N scheme with randomized epochs and learning rate
                    bestScore = double.MinValue;
                    for (int i = 0; i < N_training; i++)
                    {
                        var Parser = new AveragePerceptronDependencyParser(language, 0 /*, arcNames.ToList()*/);
                        try
                        {
                            Parser.Train(trainDocuments.AsEnumerable(), (int)(5 + ThreadSafeRandom.Next(10)), (float)(1D - ThreadSafeRandom.NextDouble() * ThreadSafeRandom.NextDouble()));
                        }
                        catch (Exception E)
                        {
                            Logger.LogInformation("FAIL: " + E.Message);
                            continue;
                        }

                        // Re-read the corpus before scoring: training mutates the documents
                        trainDocuments = ReadCorpus(langTrainFiles, arcNames, language);
                        testDocuments  = ReadCorpus(langTestFiles, arcNames, language);

                        if (language == Language.English && trainFilesOntonotesEnglish is object)
                        {
                            //Merge with Ontonotes 5.0 corpus
                            trainDocuments.AddRange(ReadCorpus(trainFilesOntonotesEnglish, arcNames, language, isOntoNotes: true));
                        }

                        var scoreTrain = TestParser(trainDocuments, Parser);
                        var scoreTest  = TestParser(testDocuments, Parser);

                        if (scoreTest > bestScore)
                        {
                            Logger.LogInformation($"\n>>>>> {lang}: NEW DEP BEST: {scoreTest:0.0}%");
                            try
                            {
                                Parser.StoreAsync().Wait();
                            }
                            catch (Exception E)
                            {
                                Logger.LogError(E, $"\n>>>>> {lang}: Failed to store model");
                            }
                            bestScore = scoreTest;
                        }
                        else
                        {
                            Logger.LogInformation($"\n>>>>> {lang}: DEP BEST IS STILL : {bestScore:0.0}%");
                        }
                        Parser = null;
                    }
                }
            });

            // Final report: reload the stored (best) models and score them on train and test sets
            foreach (var lang in languages)
            {
                Language language;
                try
                {
                    language = Languages.CodeToEnum(lang);
                }
                catch
                {
                    // BUG FIX: this used to 'return', which silently skipped the report for
                    // every remaining language after the first unknown one.
                    Logger.LogWarning($"Unknown language {lang}");
                    continue;
                }

                var arcNames = new HashSet <string>();

                var trainDocuments = ReadCorpus(trainFilesPerLanguage[lang], arcNames, language);
                var testDocuments  = ReadCorpus(testFilesPerLanguage[lang], arcNames, language);

                if (language == Language.English && trainFilesOntonotesEnglish is object)
                {
                    //Merge with Ontonotes 5.0 corpus
                    var ontonotesDocuments = ReadCorpus(trainFilesOntonotesEnglish, arcNames, language, isOntoNotes: true);
                    trainDocuments.AddRange(ontonotesDocuments);
                }

                var Tagger = AveragePerceptronTagger.FromStoreAsync(language, 0, "").WaitResult();
                Logger.LogInformation($"\n{lang} - TAGGER / TRAIN");
                TestTagger(trainDocuments, Tagger);

                Logger.LogInformation($"\n{lang} - TAGGER / TEST");
                TestTagger(testDocuments, Tagger);

                trainDocuments = ReadCorpus(trainFilesPerLanguage[lang], arcNames, language);
                testDocuments  = ReadCorpus(testFilesPerLanguage[lang], arcNames, language);

                var Parser = AveragePerceptronDependencyParser.FromStoreAsync(language, 0, "").WaitResult();
                Logger.LogInformation($"\n{lang} - PARSER / TRAIN");
                TestParser(trainDocuments, Parser);

                Logger.LogInformation($"\n{lang} - PARSER / TEST");
                TestParser(testDocuments, Parser);
            }
        }
Пример #8
0
        /// <summary>
        /// Trains sentence-detector models from Universal Dependencies CoNLL-U files, one model
        /// per language and per casing mode (original, forced-upper, forced-lower). Models whose
        /// training score exceeds 90% are additionally exported to the per-language Resources
        /// folder for the nuget-based distribution.
        /// </summary>
        /// <param name="udSource">Root directory of the Universal Dependencies corpus.</param>
        /// <param name="languagesDirectory">Root directory of the per-language nuget packages.</param>
        public static async Task Train(string udSource, string languagesDirectory)
        {
            var trainFiles = Directory.GetFiles(udSource, "*-train.conllu", SearchOption.AllDirectories);
            var testFiles  = Directory.GetFiles(udSource, "*-dev.conllu", SearchOption.AllDirectories);

            // File names look like "en_ewt-ud-train.conllu": the leading token (after normalizing
            // '_' to '-') is the language code used to group the corpus files per language.
            var trainFilesPerLanguage = trainFiles.Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f }).GroupBy(f => f.lang).ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList());
            var testFilesPerLanguage  = testFiles.Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f }).GroupBy(f => f.lang).ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList());

            var languages = new List <(Language language, string lang)>();

            // Skip language codes the Languages enum does not know about
            foreach (var lang in trainFilesPerLanguage.Keys)
            {
                try
                {
                    var language = Languages.CodeToEnum(lang);
                    languages.Add((language, lang));
                }
                catch
                {
                    Logger.LogWarning($"Unknown language {lang}");
                }
            }

            Logger.LogInformation($"Found these languages for training: {string.Join(", ", languages.Select(l => l.language))}");

            foreach (var forceCase in new [] { EnumCase.Original, EnumCase.ForceUpper, EnumCase.ForceLower })
            {
                // Train all languages of this casing mode concurrently
                await Task.WhenAll(languages.Select(async v =>
                {
                    await Task.Yield();

                    var(language, lang) = (v.language, v.lang);

                    // The model tag distinguishes the casing variants in the model store ("" = original case)
                    var modelTag = (forceCase == EnumCase.ForceUpper ? "upper" : (forceCase == EnumCase.ForceLower ? "lower" : ""));
                    var sentenceDetector = new SentenceDetector(language, 0, modelTag);

                    var trainDocuments = await ReadCorpusAsync(trainFilesPerLanguage[lang], ConvertCase: forceCase, sentenceDetector: sentenceDetector);

                    //TODO: Implement test
                    //if(testFilesPerLanguage.TryGetValue(lang, out var testFile))
                    //{
                    //    var testDocuments = ReadUniversalDependencyCorpus(testFile, ConvertCase: forceCase, sentenceDetector: sentenceDetector);
                    //}

                    Logger.LogInformation($"Now training {lang} in mode {forceCase} using files {string.Join(", ", trainFilesPerLanguage[lang])}");
                    var scoreTest = sentenceDetector.Train(trainDocuments);
                    Logger.LogInformation($"Finished training {lang} in mode {forceCase}");
                    await sentenceDetector.StoreAsync();

                    if (scoreTest > 90)
                    {
                        //Prepare models for new nuget-based distribution
                        var resDir = Path.Combine(languagesDirectory, language.ToString(), "Resources");

                        Directory.CreateDirectory(resDir);

                        // BUG FIX: File.Create truncates an existing file; File.OpenWrite did not,
                        // so a smaller model written over an older larger .bin left stale trailing
                        // bytes that corrupted the exported model.
                        using (var f = File.Create(Path.Combine(resDir, $"sentence-detector{(string.IsNullOrEmpty(modelTag) ? "" : "-" + modelTag)}.bin")))
                        {
                            await sentenceDetector.StoreAsync(f);
                        }
                        await File.WriteAllTextAsync(Path.Combine(resDir, $"sentence-detector{(string.IsNullOrEmpty(modelTag) ? "" : "-" + modelTag)}.score"), $"{scoreTest:0.0}%");
                    }
                }).ToArray());
            }
        }
Пример #9
0
        /// <summary>
        /// Converts spaCy lookups-data JSON files (lemma lookups and lexeme cluster/probability
        /// tables) into binary <c>Lookups</c> resources, written into the per-language Resources
        /// folders. Existing output files are skipped; all conversions run concurrently.
        /// </summary>
        /// <param name="spacyLookupsData">Checkout root of the spacy-lookups-data repository.</param>
        /// <param name="languagesDirectory">Root directory of the per-language resource folders.</param>
        /// <exception cref="Exception">Thrown when the expected data directory does not exist.</exception>
        internal static async Task RunAsync(string spacyLookupsData, string languagesDirectory)
        {
            var rootLangFolder = Path.Combine(spacyLookupsData, @"spacy_lookups_data\data\");

            if (!Directory.Exists(rootLangFolder))
            {
                throw new Exception("data directory not found");
            }

            //TODO Handle rules data

            var tasks = new List <Task>();

            // Pass 1: lemma lookup tables. The first two characters of the file name are the language code.
            foreach (var(file, language) in Directory.GetFiles(rootLangFolder, "*_lemma_lookup*.json").Select(f => (file:f, language: Languages.CodeToEnum(Path.GetFileName(f).Substring(0, 2)))))
            {
                tasks.Add(Task.Run(async() =>
                {
                    Console.WriteLine($"\n\n\nBegin processing {file}\n\n");

                    var name = Path.GetFileNameWithoutExtension(file);

                    var resDir = Path.Combine(languagesDirectory, language.ToString(), "Resources");
                    Directory.CreateDirectory(resDir);

                    var outputFile = Path.Combine(resDir, name + ".bin");

                    // Never overwrite an existing converted resource
                    if (File.Exists(outputFile))
                    {
                        Console.WriteLine("Skipping...");
                        return;
                    }

                    var map          = JsonConvert.DeserializeObject <Dictionary <string, string> >(FixWordsAsArrays(await File.ReadAllTextAsync(file)));
                    // All lemma strings are packed into one shared character buffer; each entry
                    // stores only (length, offset) into it. bufferLength tracks the used prefix.
                    var buffer       = new char[map.Values.Sum(k => k.Length)];
                    var bufferLength = 0;

                    var entries = new Dictionary <ulong, Lookups.Entry>();
                    int count   = 0;

                    // Longest lemmas first so shorter lemmas can be found as substrings of
                    // already-packed text and deduplicated via IndexOf below.
                    foreach (var(k, v) in map.OrderByDescending(kv => kv.Value.Length).ThenBy(kv => kv.Value))
                    {
                        // Each word is indexed under both its exact and its invariant hash
                        var keyHash    = Lookups.Hash(k);
                        var invKeyHash = Lookups.InvariantHash(k);

                        // Reuse an existing occurrence of the lemma text if it is already in the buffer
                        var index = buffer.AsSpan(0, bufferLength).IndexOf(v);

                        if (index < 0)
                        {
                            // Not found: append the lemma text to the buffer and point the entry at it.
                            // NOTE(review): the (byte) cast silently truncates lemma lengths > 255 — confirm
                            // lemmas are always shorter than 256 characters.
                            v.AsSpan().CopyTo(buffer.AsSpan(bufferLength, v.Length));
                            entries.TryAdd(keyHash, new Lookups.Entry((byte)v.Length, (uint)bufferLength));
                            if (invKeyHash != keyHash)
                            {
                                entries.TryAdd(invKeyHash, new Lookups.Entry((byte)v.Length, (uint)bufferLength));
                            }
                            bufferLength += v.Length;
                            Console.Write("+");
                        }
                        else
                        {
                            // Found: share the existing buffer region instead of storing a duplicate
                            entries.TryAdd(keyHash, new Lookups.Entry((byte)v.Length, (uint)index));
                            if (invKeyHash != keyHash)
                            {
                                entries.TryAdd(invKeyHash, new Lookups.Entry((byte)v.Length, (uint)index));
                            }
                            //Console.Write(".");
                        }
                        count++;
                        if (count % 1000 == 0)
                        {
                            Console.WriteLine($"\nAt {count} of {map.Count}");
                        }
                    }

                    // Trim the buffer to the actually-used prefix before serializing
                    Array.Resize(ref buffer, bufferLength);

                    var lookup = new Lookups(name, language, new string(buffer), entries);

                    using (var f = File.OpenWrite(outputFile))
                    {
                        await lookup.SerializeAsync(f);
                    }
                    Console.WriteLine($"\n\n\nWrote {outputFile}\n\n");
                }));
            }

            // Pass 2: lexeme cluster tables merged with their matching probability tables
            foreach (var(file, language) in Directory.GetFiles(rootLangFolder, "*_lexeme_cluster*.json").Select(f => (file: f, language: Languages.CodeToEnum(Path.GetFileName(f).Substring(0, 2)))))
            {
                tasks.Add(Task.Run(async() =>
                {
                    // Each cluster file must have a sibling probability file; skip otherwise
                    var probFile = file.Replace("_lexeme_cluster", "_lexeme_prob");

                    if (!File.Exists(probFile))
                    {
                        return;
                    }

                    Console.WriteLine($"\n\n\nBegin processing {file} + {probFile}\n\n");

                    var name = Path.GetFileNameWithoutExtension(file);

                    var resDir = Path.Combine(languagesDirectory, language.ToString(), "Resources");
                    Directory.CreateDirectory(resDir);

                    var outputFile = Path.Combine(resDir, name + "_prob.bin");

                    // Never overwrite an existing converted resource
                    if (File.Exists(outputFile))
                    {
                        Console.WriteLine("Skipping...");
                        return;
                    }

                    var cluster = JsonConvert.DeserializeObject <Dictionary <string, uint> >(FixWordsAsArrays(await File.ReadAllTextAsync(file)));
                    var prob    = JsonConvert.DeserializeObject <Dictionary <string, float> >(FixWordsAsArrays(await File.ReadAllTextAsync(probFile)));

                    var entries = new Dictionary <ulong, Lookups.Entry>();
                    int count   = 0;

                    foreach (var(k, v) in cluster)
                    {
                        // Each word is indexed under both its exact and its invariant hash
                        var keyHash    = Lookups.Hash(k);
                        var invKeyHash = Lookups.InvariantHash(k);

                        // Words without a probability entry get -25f (a very low log-probability default)
                        var probVal = prob.TryGetValue(k, out var p) ? p : -25f;

                        entries.TryAdd(keyHash, new Lookups.Entry(probVal, v));
                        if (invKeyHash != keyHash)
                        {
                            entries.TryAdd(invKeyHash, new Lookups.Entry(probVal, v));
                        }
                        count++;
                        if (count % 1000 == 0)
                        {
                            Console.WriteLine($"\nAt {count} of {cluster.Count}");
                        }
                    }

                    // No shared text buffer for cluster/prob lookups — entries carry the values directly
                    var lookup = new Lookups(name, language, null, entries);

                    using (var f = File.OpenWrite(outputFile))
                    {
                        await lookup.SerializeAsync(f);
                    }

                    Console.WriteLine($"\n\n\nWrote {outputFile}\n\n");
                }));
            }


            await Task.WhenAll(tasks);
        }
Пример #10
0
        /// <summary>
        /// Reconstructs a <see cref="Document"/> from its JSON (JObject) representation:
        /// language, text, UID, metadata, labels, and the full token structure (spans, tokens,
        /// part-of-speech tags, dependency heads, frequencies, entity types and replacements).
        /// </summary>
        /// <param name="jo">JSON object produced by the corresponding document serializer.</param>
        /// <returns>A new <see cref="Document"/> with all deserialized state applied.</returns>
        public static Document FromJObject(JObject jo)
        {
            // Shared sentinel for tokens without entity types, to avoid allocating per token
            var emptyEntityTypes = new List <EntityType>();

            var doc = new Document();

            doc.Language = Languages.CodeToEnum((string)jo[nameof(Language)]);
            doc.Value    = (string)jo[nameof(Value)];
            // A missing/invalid UID deserializes to the default (empty) UID128
            doc.UID      = UID128.TryParse((string)(jo[nameof(UID)]), out var uid) ? uid : default(UID128);

            // Document-level metadata (optional)
            var docmtd = jo[nameof(Metadata)];

            if (!(docmtd is null) && docmtd.HasValues)
            {
                doc.Metadata = new Dictionary <string, string>();
                foreach (JProperty md in docmtd)
                {
                    doc.Metadata.Add(md.Name, (string)md.Value);
                }
            }

            // Document labels (optional)
            if (jo.ContainsKey(nameof(Labels)))
            {
                doc.Labels = jo[nameof(Labels)].Select(jt => (string)jt).ToList();
            }

            // TokensData is an array of spans, each span an array of token objects
            var td = jo[nameof(TokensData)];

            foreach (var sp in td)
            {
                // Collect the span's tokens first; the span itself needs the first/last bounds
                var tokens = new List <(int begin, int end, PartOfSpeech tag, int head, float frequency, List <EntityType> entityType, IDictionary <string, string> metadata, string replacement)>();

                foreach (var tk in sp)
                {
                    // Entity types attached to this token (optional)
                    var ets         = tk[nameof(EntityType)];
                    var entityTypes = emptyEntityTypes;
                    if (!(ets is null) && ets.HasValues)
                    {
                        entityTypes = new List <EntityType>();
                        foreach (var et in ets)
                        {
                            // Per-entity metadata (optional)
                            Dictionary <string, string> entityMetadata = null;
                            var etmtd = et[nameof(Metadata)];
                            if (!(etmtd is null) && etmtd.HasValues)
                            {
                                entityMetadata = new Dictionary <string, string>();
                                foreach (JProperty md in etmtd)
                                {
                                    entityMetadata.Add(md.Name, (string)md.Value);
                                }
                            }

                            entityTypes.Add(new EntityType((string)(et[nameof(EntityType.Type)]),
                                                           (EntityTag)Enum.Parse(typeof(EntityTag), (string)(et[nameof(EntityType.Tag)])),
                                                           entityMetadata,
                                                           UID128.TryParse((string)(et[nameof(EntityType.TargetUID)]), out var uid2) ? uid2 : default(UID128)));
                        }
                    }

                    // Per-token metadata (optional)
                    IDictionary <string, string> metadata = null;

                    var mtd = tk[nameof(Metadata)];
                    if (!(mtd is null) && mtd.HasValues)
                    {
                        metadata = new Dictionary <string, string>();
                        foreach (JProperty md in mtd)
                        {
                            metadata.Add(md.Name, (string)md.Value);
                        }
                    }

                    // Missing Tag defaults to NONE, missing Head to -1, missing Frequency to 0
                    tokens.Add((((int)(tk[nameof(TokenData.Bounds)][0])),
                                ((int)(tk[nameof(TokenData.Bounds)][1])),
                                (PartOfSpeech)Enum.Parse(typeof(PartOfSpeech), (string)(tk[nameof(TokenData.Tag)] ?? nameof(PartOfSpeech.NONE))),
                                ((int)(tk[nameof(TokenData.Head)] ?? "-1")),
                                (((float)(tk[nameof(TokenData.Frequency)] ?? 0f))),
                                entityTypes,
                                metadata,
                                (string)tk[nameof(TokenData.Replacement)]));
                }

                if (tokens.Any())
                {
                    // Span bounds come from the first token's begin and the last token's end
                    var span = doc.AddSpan(tokens.First().begin, tokens.Last().end);

                    foreach (var tk in tokens)
                    {
                        var token = span.AddToken(tk.begin, tk.end);
                        token.POS       = tk.tag;
                        token.Head      = tk.head;
                        token.Frequency = tk.frequency;
                        foreach (var et in tk.entityType)
                        {
                            token.AddEntityType(et);
                        }

                        if (tk.metadata is object)
                        {
                            foreach (var kv in tk.metadata)
                            {
                                token.Metadata.Add(kv.Key, kv.Value);
                            }
                        }
                    }
                }
            }

            return(doc);
        }
Пример #11
0
        /// <summary>
        /// Trains part-of-speech taggers and dependency parsers for every language that has both
        /// "*-train.conllu" and "*-dev.conllu" files under <paramref name="udSource"/>, keeping the
        /// best-scoring model out of several randomized training attempts, then reloads the stored
        /// models and reports final train/test scores. For English, the OntoNotes 5.0 corpus (when
        /// supplied) is merged into both the train and the test sets.
        /// </summary>
        /// <param name="udSource">Root of the Universal Dependencies corpora, searched recursively.</param>
        /// <param name="ontonotesSource">Optional root of OntoNotes "*.parse.ddg" files; may be null or empty.</param>
        /// <param name="languagesDirectory">Output root where per-language "Resources" model and score files are written.</param>
        public static async Task Train(string udSource, string ontonotesSource, string languagesDirectory)
        {
            var trainFiles = Directory.GetFiles(udSource, "*-train.conllu", SearchOption.AllDirectories);
            var testFiles  = Directory.GetFiles(udSource, "*-dev.conllu", SearchOption.AllDirectories);

            List<string> trainFilesOntonotesEnglish = null;

            if (!string.IsNullOrWhiteSpace(ontonotesSource))
            {
                // Exclude "sel_NNNN" files numbered 3654 and above.
                // NOTE(review): the 3654 cut-off is inherited as-is — confirm against the corpus documentation.
                trainFilesOntonotesEnglish = Directory.GetFiles(ontonotesSource, "*.parse.ddg", SearchOption.AllDirectories)
                                             .Where(fn => !fn.Contains("sel_") || int.Parse(Path.GetFileNameWithoutExtension(fn).Split(new char[] { '_', '.' }).Skip(1).First()) < 3654)
                                             .ToList();
            }

            // The language code is the file-name prefix, e.g. "en" in "en_ewt-train.conllu".
            var trainFilesPerLanguage = trainFiles.Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f }).GroupBy(f => f.lang).ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList());
            var testFilesPerLanguage  = testFiles.Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f }).GroupBy(f => f.lang).ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList());

            var languages = new List<(Language language, string lang)>();

            // Keep only codes that appear in both train and test sets and map to a known Language.
            foreach (var lang in trainFilesPerLanguage.Keys.Intersect(testFilesPerLanguage.Keys))
            {
                try
                {
                    var language = Languages.CodeToEnum(lang);
                    languages.Add((language, lang));
                }
                catch
                {
                    Logger.LogWarning($"Unknown language {lang}");
                }
            }

            Logger.LogInformation($"Found these languages for training: {string.Join(", ", languages.Select(l => l.language))}");

            int attempts = 5;

            await Task.WhenAll(languages.Select(async v =>
            {
                await Task.Yield(); // let the scheduler interleave the per-language pipelines

                var (language, lang) = (v.language, v.lang);

                var arcNames = new HashSet<string>();

                if (trainFilesPerLanguage.TryGetValue(lang, out var langTrainFiles) && testFilesPerLanguage.TryGetValue(lang, out var langTestFiles))
                {
                    var trainDocuments = await ReadCorpusAsync(langTrainFiles, arcNames, language);
                    var testDocuments  = await ReadCorpusAsync(langTestFiles, arcNames, language);

                    // FIX: previously dereferenced trainFilesOntonotesEnglish unconditionally for English,
                    // throwing a NullReferenceException when no OntoNotes source was supplied.
                    if (language == Language.English && trainFilesOntonotesEnglish is object)
                    {
                        // Merge with the OntoNotes 5.0 corpus, split in the same test/train ratio as the UD data.
                        var testToTrain = (int)((float)trainFilesOntonotesEnglish.Count * testDocuments.Count / trainDocuments.Count);

                        trainDocuments.AddRange(await ReadCorpusAsync(trainFilesOntonotesEnglish.Take(testToTrain).ToList(), arcNames, language, isOntoNotes: true));
                        testDocuments.AddRange(await ReadCorpusAsync(trainFilesOntonotesEnglish.Skip(testToTrain).ToList(), arcNames, language, isOntoNotes: true));
                    }

                    // ---- POS tagger: keep the best of several randomized training runs ----
                    double bestScore = double.MinValue;

                    for (int i = 0; i < attempts; i++)
                    {
                        await Task.Run(async () =>
                        {
                            var tagger = new AveragePerceptronTagger(language, 0);
                            tagger.Train(trainDocuments, (5 + ThreadSafeRandom.Next(15))); // random epoch count in [5, 19]
                            _ = TestTagger(trainDocuments, tagger); // run for its logging side effect; score unused
                            var scoreTest = TestTagger(testDocuments, tagger);
                            if (scoreTest > bestScore)
                            {
                                Logger.LogInformation($"\n>>>>> {language}: NEW POS BEST: {scoreTest:0.0}%");
                                await tagger.StoreAsync();

                                if (scoreTest > 80)
                                {
                                    // Prepare models for the new nuget-based distribution.
                                    var resDir = Path.Combine(languagesDirectory, language.ToString(), "Resources");
                                    Directory.CreateDirectory(resDir);
                                    using (var f = File.OpenWrite(Path.Combine(resDir, "tagger.bin")))
                                    {
                                        await tagger.StoreAsync(f);
                                    }
                                    await File.WriteAllTextAsync(Path.Combine(resDir, "tagger.score"), $"{scoreTest:0.0}%");
                                }

                                bestScore = scoreTest;
                            }
                            else
                            {
                                Logger.LogInformation($"\n>>>>> {language}: POS BEST IS STILL : {bestScore:0.0}%");
                            }
                        });
                    }

                    // ---- Dependency parser: same keep-the-best loop ----
                    bestScore = double.MinValue;
                    for (int i = 0; i < attempts; i++)
                    {
                        await Task.Run(async () =>
                        {
                            var parser = new AveragePerceptronDependencyParser(language, 0 /*, arcNames.ToList()*/);
                            try
                            {
                                parser.Train(trainDocuments, (5 + ThreadSafeRandom.Next(10)), (float)(1D - ThreadSafeRandom.NextDouble() * ThreadSafeRandom.NextDouble()));
                            }
                            catch (Exception E)
                            {
                                Logger.LogError("FAIL", E);
                                return;
                            }

                            // Reload fresh corpora before scoring — presumably parser.Train mutates
                            // the documents in place; TODO confirm against ReadCorpusAsync/Train.
                            trainDocuments = await ReadCorpusAsync(langTrainFiles, arcNames, language);
                            testDocuments  = await ReadCorpusAsync(langTestFiles, arcNames, language);

                            if (language == Language.English && trainFilesOntonotesEnglish is object)
                            {
                                // Merge with the OntoNotes 5.0 corpus (same split as above).
                                var testToTrain = (int)((float)trainFilesOntonotesEnglish.Count * testDocuments.Count / trainDocuments.Count);

                                trainDocuments.AddRange(await ReadCorpusAsync(trainFilesOntonotesEnglish.Take(testToTrain).ToList(), arcNames, language, isOntoNotes: true));
                                testDocuments.AddRange(await ReadCorpusAsync(trainFilesOntonotesEnglish.Skip(testToTrain).ToList(), arcNames, language, isOntoNotes: true));
                            }

                            _ = TestParser(trainDocuments, parser); // run for its logging side effect; score unused
                            var scoreTest = TestParser(testDocuments, parser);

                            if (scoreTest > bestScore)
                            {
                                Logger.LogInformation($"\n>>>>> {language}: NEW DEP BEST: {scoreTest:0.0}%");

                                if (scoreTest > 80)
                                {
                                    // Prepare models for the new nuget-based distribution.
                                    var resDir = Path.Combine(languagesDirectory, language.ToString(), "Resources");
                                    Directory.CreateDirectory(resDir);
                                    using (var f = File.OpenWrite(Path.Combine(resDir, "parser.bin")))
                                    {
                                        await parser.StoreAsync(f);
                                    }
                                    await File.WriteAllTextAsync(Path.Combine(resDir, "parser.score"), $"{scoreTest:0.0}%");
                                }

                                bestScore = scoreTest;
                            }
                            else
                            {
                                Logger.LogInformation($"\n>>>>> {language}: DEP BEST IS STILL : {bestScore:0.0}%");
                            }
                        });
                    }
                }
            }));

            // Final pass: reload the stored models and report train/test scores for each language.
            foreach (var (language, lang) in languages)
            {
                var arcNames = new HashSet<string>();

                var trainDocuments = await ReadCorpusAsync(trainFilesPerLanguage[lang], arcNames, language);

                var testDocuments = await ReadCorpusAsync(testFilesPerLanguage[lang], arcNames, language);

                if (language == Language.English && trainFilesOntonotesEnglish is object)
                {
                    // Merge with the OntoNotes 5.0 corpus (same split as above).
                    var testToTrain = (int)((float)trainFilesOntonotesEnglish.Count * testDocuments.Count / trainDocuments.Count);

                    trainDocuments.AddRange(await ReadCorpusAsync(trainFilesOntonotesEnglish.Take(testToTrain).ToList(), arcNames, language, isOntoNotes: true));
                    testDocuments.AddRange(await ReadCorpusAsync(trainFilesOntonotesEnglish.Skip(testToTrain).ToList(), arcNames, language, isOntoNotes: true));
                }

                var tagger = await AveragePerceptronTagger.FromStoreAsync(language, 0, "");

                Logger.LogInformation($"\n{lang} - TAGGER / TRAIN");
                TestTagger(trainDocuments, tagger);

                Logger.LogInformation($"\n{lang} - TAGGER / TEST");
                TestTagger(testDocuments, tagger);

                // Reload clean corpora before parser evaluation — the tagger tests above
                // presumably tag the documents in place; TODO confirm.
                trainDocuments = await ReadCorpusAsync(trainFilesPerLanguage[lang], arcNames, language);

                testDocuments = await ReadCorpusAsync(testFilesPerLanguage[lang], arcNames, language);

                var parser = await AveragePerceptronDependencyParser.FromStoreAsync(language, 0, "");

                Logger.LogInformation($"\n{lang} - PARSER / TRAIN");
                TestParser(trainDocuments, parser);

                Logger.LogInformation($"\n{lang} - PARSER / TEST");
                TestParser(testDocuments, parser);
            }
        }