/// <summary>
/// Writes every value of <paramref name="dictionary"/> to <paramref name="path"/> through a
/// <c>SystemDataAnalysisWriter</c> that is disposed when the write finishes or fails.
/// </summary>
/// <param name="path">Path handed to the writer's constructor.</param>
/// <param name="dictionary">Analysis dictionary whose values are written.</param>
/// <param name="progress">Optional progress handle forwarded to the writer.</param>
public static void WriteAll(
     string path, WordAnalysisDictionary dictionary, IProgressHandle progress = null)
 {
     using (var analysisWriter = new SystemDataAnalysisWriter(path))
     {
         // Snapshot the values only after the writer has been opened, then hand
         // the whole array over in a single call.
         var entries = dictionary.Values.ToArray();

         analysisWriter.WriteAll(entries, progress);
     }
 }
Example #2
0
 /// <summary>
 /// Builds a project instance from components that have already been loaded or computed.
 /// Each argument is stored in the property of the same name; <c>PosList</c> is built from
 /// <paramref name="trainData"/> and <c>PluginData</c> starts as a new, empty instance.
 /// </summary>
 private WsdProject(
     WsdProjectInfo projectInfo,
     WordDictionary dictionary,
     TextData[] trainData,
     TextData[] testData,
     EmbeddingDictionary wordEmbeddings,
     EmbeddingDictionary meaningEmbeddings,
     WordAnalysisDictionary dataAnalysis,
     DictionaryStatistics dictionaryStatistics,
     DataStatistics dataStatistics,
     EmbeddingStatistics wordEmbeddingStatistics,
     EmbeddingStatistics meaningEmbeddingStatistics)
 {
     // Project description and source data.
     ProjectInfo = projectInfo;
     Dictionary = dictionary;
     TrainData = trainData;
     TestData = testData;

     // Embeddings.
     WordEmbeddings = wordEmbeddings;
     MeaningEmbeddings = meaningEmbeddings;

     // Analysis and statistics.
     DataAnalysis = dataAnalysis;
     DictionaryStatistics = dictionaryStatistics;
     DataStatistics = dataStatistics;
     WordEmbeddingStatistics = wordEmbeddingStatistics;
     MeaningEmbeddingStatistics = meaningEmbeddingStatistics;

     // Members derived here rather than passed in.
     PosList = new WsdPosList(trainData);
     PluginData = new PluginData();
 }
Example #3
0
        /// <summary>
        /// Fills this instance with example counts and baseline accuracies derived from
        /// <paramref name="dataAnalysis"/>. Words are split by how many meanings
        /// <paramref name="dictionary"/> lists for them: exactly one ("monosemantic") or more
        /// than one ("polysemantic"). Words the dictionary does not know, or lists with no
        /// meanings, fall into neither group.
        /// </summary>
        /// <param name="dictionary">Dictionary queried by word name for meanings and their encounter counts.</param>
        /// <param name="dataAnalysis">Per-word train and test encounter data.</param>
        /// <param name="progress">
        /// Optional progress handle. When supplied, <c>Scope(1)</c> is opened before the
        /// computation and disposed afterwards, even if the computation throws.
        /// </param>
        /// <returns>This instance, so the call can be chained after construction.</returns>
        public DataStatistics Compute(
            WordDictionary dictionary, WordAnalysisDictionary dataAnalysis, IProgressHandle progress = null)
        {
            var scope = progress?.Scope(1);

            try
            {
                // Classify the analysed words once instead of repeating the dictionary
                // lookup for every statistic below.
                var monosemanticWords = dataAnalysis.Values
                                        .Where(x => (dictionary.GetByName(x.Word)?.Meanings.Count ?? 0) == 1)
                                        .ToArray();

                var polysemanticWords = dataAnalysis.Values
                                        .Where(x => (dictionary.GetByName(x.Word)?.Meanings.Count ?? 0) > 1)
                                        .ToArray();

                MonosemanticTrainExamples = monosemanticWords
                                            .Sum(x => x.TrainEncounters.Values.Sum(y => y.Encounters));

                PolysemanticTrainExamples = polysemanticWords
                                            .Sum(x => x.TrainEncounters.Values.Sum(y => y.Encounters));

                MonosemanticTestExamples = monosemanticWords
                                           .Sum(x => x.TestEncounters.Values.Sum(y => y.Encounters));

                PolysemanticTestExamples = polysemanticWords
                                           .Sum(x => x.TestEncounters.Values.Sum(y => y.Encounters));

                // Polysemantic words that occur in the training data at least once,
                // and those that occur only in the test data.
                var trainedWords = polysemanticWords
                                   .Where(x => x.TrainEncounters.Count != 0)
                                   .ToArray();

                var testOnlyWords = polysemanticWords
                                    .Where(x => x.TrainEncounters.Count == 0)
                                    .ToArray();

                // Test encounters whose key was also seen in training for the same word.
                var learnableAnalysis = trainedWords
                                        .SelectMany(x => x.TestEncounters
                                                    .Where(y => x.TrainEncounters.ContainsKey(y.Key))
                                                    .Select(y => new
                {
                    x.Word,
                    y.Value.Meaning,
                    y.Value.PartOfSpeech,
                    y.Value.Encounters
                }))
                                        .ToArray();

                // Test encounters of trained words whose key never appeared in training.
                var nonLearnableAnalysis = trainedWords
                                           .SelectMany(x => x.TestEncounters
                                                       .Where(y => !x.TrainEncounters.ContainsKey(y.Key))
                                                       .Select(y => new
                {
                    x.Word,
                    y.Value.Meaning,
                    y.Value.PartOfSpeech,
                    y.Value.Encounters
                }))
                                           .ToArray();

                // All test encounters of words that have no training data at all.
                var testOnlyAnalysis = testOnlyWords
                                       .SelectMany(x => x.TestEncounters
                                                   .Select(y => new
                {
                    x.Word,
                    y.Value.Meaning,
                    y.Value.PartOfSpeech,
                    y.Value.Encounters
                }))
                                       .ToArray();

                // "Common" covers every test encounter of a trained word, i.e. the
                // learnable and non-learnable encounters together.
                CommonTestExamples = trainedWords
                                     .SelectMany(x => x.TestEncounters)
                                     .Sum(y => y.Value.Encounters);
                LearnableTestExamples    = learnableAnalysis.Sum(x => x.Encounters);
                NonLearnableTestExamples = nonLearnableAnalysis.Sum(x => x.Encounters);
                TestOnlyExamples         = testOnlyAnalysis.Sum(x => x.Encounters);

                // Dictionary baseline: guess the dictionary meaning with the highest
                // encounter count. Every word here has more than one dictionary meaning,
                // so First() always has an element to return.
                CorrectDictionaryBasedLearnableGuesses = learnableAnalysis
                                                         .Where(x => dictionary
                                                                .GetByName(x.Word)
                                                                .Meanings.Values
                                                                .OrderByDescending(y => y.Encounters)
                                                                .First()
                                                                .Meaning ==
                                                                x.Meaning)
                                                         .Sum(x => x.Encounters);

                CorrectDictionaryBasedNonLearnableGuesses = nonLearnableAnalysis
                                                            .Where(x => dictionary
                                                                   .GetByName(x.Word)
                                                                   .Meanings.Values
                                                                   .OrderByDescending(y => y.Encounters)
                                                                   .First()
                                                                   .Meaning ==
                                                                   x.Meaning)
                                                            .Sum(x => x.Encounters);

                CorrectDictionaryBasedTestOnlyGuesses = testOnlyAnalysis
                                                        .Where(x => dictionary
                                                               .GetByName(x.Word)
                                                               .Meanings.Values
                                                               .OrderByDescending(y => y.Encounters)
                                                               .First()
                                                               .Meaning ==
                                                               x.Meaning)
                                                        .Sum(x => x.Encounters);

                // Part-of-speech dictionary baseline: same guess, restricted to dictionary
                // meanings with the encounter's part of speech. The dictionary may hold no
                // meaning with that part of speech; Take(1).Any(...) then counts the guess
                // as wrong instead of throwing the way First(predicate) would.
                CorrectPosDictionaryBasedLearnableGuesses = learnableAnalysis
                                                            .Where(x => dictionary
                                                                   .GetByName(x.Word)
                                                                   .Meanings.Values
                                                                   .Where(y => y.PartOfSpeech == x.PartOfSpeech)
                                                                   .OrderByDescending(y => y.Encounters)
                                                                   .Take(1)
                                                                   .Any(y => y.Meaning == x.Meaning))
                                                            .Sum(x => x.Encounters);

                CorrectPosDictionaryBasedNonLearnableGuesses = nonLearnableAnalysis
                                                               .Where(x => dictionary
                                                                      .GetByName(x.Word)
                                                                      .Meanings.Values
                                                                      .Where(y => y.PartOfSpeech == x.PartOfSpeech)
                                                                      .OrderByDescending(y => y.Encounters)
                                                                      .Take(1)
                                                                      .Any(y => y.Meaning == x.Meaning))
                                                               .Sum(x => x.Encounters);

                CorrectPosDictionaryBasedTestOnlyGuesses = testOnlyAnalysis
                                                           .Where(x => dictionary
                                                                  .GetByName(x.Word)
                                                                  .Meanings.Values
                                                                  .Where(y => y.PartOfSpeech == x.PartOfSpeech)
                                                                  .OrderByDescending(y => y.Encounters)
                                                                  .Take(1)
                                                                  .Any(y => y.Meaning == x.Meaning))
                                                           .Sum(x => x.Encounters);

                // Training baseline: guess the meaning seen most often in training for the
                // word. An empty candidate list counts as a wrong guess rather than
                // dereferencing the result of FirstOrDefault().
                CorrectTrainingBasedLearnableGuesses = learnableAnalysis
                                                       .Where(x => dataAnalysis[x.Word]
                                                              .TrainEncounters
                                                              .Values
                                                              .OrderByDescending(y => y.Encounters)
                                                              .Take(1)
                                                              .Any(y => y.Meaning == x.Meaning))
                                                       .Sum(x => x.Encounters);

                CorrectPosTrainingBasedLearnableGuesses = learnableAnalysis
                                                          .Where(x => dataAnalysis[x.Word]
                                                                 .TrainEncounters
                                                                 .Values
                                                                 .Where(y => y.PartOfSpeech == x.PartOfSpeech)
                                                                 .OrderByDescending(y => y.Encounters)
                                                                 .Take(1)
                                                                 .Any(y => y.Meaning == x.Meaning))
                                                          .Sum(x => x.Encounters);

                // The ratios below divide by a value cast to double, so a zero denominator
                // produces NaN or infinity instead of throwing. The "All_" variants also
                // count every monosemantic test example as a correct guess.
                FirstSenseDictionaryBaseline = (CorrectDictionaryBasedLearnableGuesses +
                                                CorrectDictionaryBasedNonLearnableGuesses) /
                                               (double)CommonTestExamples;

                All_FirstSenseDictionaryBaseline = (CorrectDictionaryBasedLearnableGuesses +
                                                    CorrectDictionaryBasedNonLearnableGuesses +
                                                    CorrectDictionaryBasedTestOnlyGuesses +
                                                    MonosemanticTestExamples) /
                                                   (double)(PolysemanticTestExamples + MonosemanticTestExamples);

                FirstSensePosDictionaryBaseline = (CorrectPosDictionaryBasedLearnableGuesses +
                                                   CorrectPosDictionaryBasedNonLearnableGuesses) /
                                                  (double)CommonTestExamples;

                All_FirstSensePosDictionaryBaseline = (CorrectPosDictionaryBasedLearnableGuesses +
                                                       CorrectPosDictionaryBasedNonLearnableGuesses +
                                                       CorrectPosDictionaryBasedTestOnlyGuesses +
                                                       MonosemanticTestExamples) /
                                                      (double)(PolysemanticTestExamples +
                                                               MonosemanticTestExamples);

                FirstSenseBaseline = CorrectTrainingBasedLearnableGuesses /
                                     (double)CommonTestExamples;

                All_FirstSenseBaseline = (CorrectTrainingBasedLearnableGuesses + MonosemanticTestExamples) /
                                         (double)(PolysemanticTestExamples + MonosemanticTestExamples);

                FirstSensePosBaseline = CorrectPosTrainingBasedLearnableGuesses /
                                        (double)CommonTestExamples;

                All_FirstSensePosBaseline = (CorrectPosTrainingBasedLearnableGuesses + MonosemanticTestExamples) /
                                            (double)(PolysemanticTestExamples + MonosemanticTestExamples);

                BestCaseBaseline = (CorrectDictionaryBasedTestOnlyGuesses +
                                    LearnableTestExamples) /
                                   (double)PolysemanticTestExamples;

                All_BestCaseBaseline = (CorrectDictionaryBasedTestOnlyGuesses +
                                        LearnableTestExamples + MonosemanticTestExamples) /
                                       (double)(PolysemanticTestExamples + MonosemanticTestExamples);
            }
            finally
            {
                scope?.Dispose();
            }

            return(this);
        }
Example #4
0
        /// <summary>
        /// Builds a new project from the inputs described by <paramref name="info"/>, writes all
        /// project files under <paramref name="destinationPath"/> and returns the in-memory project.
        /// The steps are: read the dictionary, read train and test data (plain text or XML), analyze
        /// the data, compute statistics, read the embeddings, then save everything together with a
        /// project description file named after the destination directory.
        /// </summary>
        /// <param name="info">Locations and type of the input data; validated with <c>AssertIsValid</c>.</param>
        /// <param name="destinationPath">
        /// Directory that receives the project files. It must be identified as a directory and must
        /// not contain any files, including files in subdirectories.
        /// </param>
        /// <param name="progress">Progress handle; its message format is updated before each stage.</param>
        /// <returns>The newly created project.</returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="info"/> or <paramref name="progress"/> is null, or
        /// <paramref name="destinationPath"/> is null or empty.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// <paramref name="destinationPath"/> is not identified as a directory or already contains files.
        /// </exception>
        public static WsdProject CreateAndSave(
            WsdProjectCreateInfo info, string destinationPath, IProgressHandle progress)
        {
            if (info == null)
            {
                throw new ArgumentNullException(nameof(info));
            }

            if (string.IsNullOrEmpty(destinationPath))
            {
                throw new ArgumentNullException(nameof(destinationPath));
            }

            if (PathEx.Identify(destinationPath) != PathIdentity.Directory ||
                Directory.GetFiles(destinationPath, "*", SearchOption.AllDirectories).Length > 0)
            {
                throw new ArgumentException(ExceptionMessage.DestinationPathMustBeEmptyAndExisting);
            }

            if (progress == null)
            {
                throw new ArgumentNullException(nameof(progress));
            }

            info.AssertIsValid();

            // ---- Load and analyze the inputs ----

            progress.SetMessageFormat(MessageFormat.LoadingDictionary_Bytes);

            var dictionary = InputDictionaryReader.ReadAll(info.DictionaryPath, progress);

            progress.SetMessageFormat(MessageFormat.ComputingDictionaryStatistics);

            var dictionaryStatistics = new DictionaryStatistics().Compute(dictionary, progress);

            TextData[] trainData;
            TextData[] testData;

            if (info.DataType == InputDataType.PlainText)
            {
                progress.SetMessageFormat(MessageFormat.LoadingTrainData_Files);

                trainData = InputPlainTextDataReader.ReadAllFiles(info.TrainDataPath, progress);

                progress.SetMessageFormat(MessageFormat.LoadingTestData_Files);

                testData = InputPlainTextDataReader.ReadAllFiles(info.TestDataPath, progress);
            }
            else
            {
                // Any other data type is read as XML, which also needs the synset
                // mappings and the gold key files.
                progress.SetMessageFormat(MessageFormat.LoadingSynsetMappings_Bytes);

                var synsetMappings = InputSynsetMappingReader.ReadAll(info.SynsetMappingsPath, progress);

                progress.SetMessageFormat(MessageFormat.LoadingTrainData_Files);

                trainData = InputXmlDataReader.Read(
                    info.TrainDataPath, info.TrainGoldKeyPath, synsetMappings, dictionary,
                    out var trainXmlParseErrors, progress);

                // Parse errors do not abort creation; they are written to a text file
                // in the destination directory.
                if (trainXmlParseErrors != null && trainXmlParseErrors.Any())
                {
                    XmlParseErrorWriter.WriteAll(
                        Path.Combine(destinationPath, FileName.TrainXmlParseErrors + FileExtension.Text),
                        trainXmlParseErrors);
                }

                progress.SetMessageFormat(MessageFormat.LoadingTestData_Files);

                testData = InputXmlDataReader.Read(
                    info.TestDataPath, info.TestGoldKeyPath, synsetMappings, dictionary,
                    out var testXmlParseErrors, progress);

                if (testXmlParseErrors != null && testXmlParseErrors.Any())
                {
                    XmlParseErrorWriter.WriteAll(
                        Path.Combine(destinationPath, FileName.TestXmlParseErrors + FileExtension.Text),
                        testXmlParseErrors);
                }
            }

            progress.SetMessageFormat(MessageFormat.AnalyzingData_Files);

            var dataAnalysis = new WordAnalysisDictionary()
                               .Analyze(dictionary, trainData, testData, progress);

            progress.SetMessageFormat(MessageFormat.ComputingDataStatistics);

            var dataStatistics = new DataStatistics()
                                 .Compute(dictionary, dataAnalysis, progress);

            progress.SetMessageFormat(MessageFormat.LoadingWordEmbeddings_Bytes);

            var wordEmbeddings = InputEmbeddingReader.ReadAll(
                info.WordEmbeddingsPath, dataAnalysis.GetAllWordOccurrences(), progress);

            var wordEmbeddingStatistics = new EmbeddingStatistics().Compute(wordEmbeddings);

            // Meaning embeddings are optional. Without them the statistics object stays
            // in its freshly constructed state and is still saved below.
            EmbeddingDictionary meaningEmbeddings = null;

            var meaningEmbeddingStatistics = new EmbeddingStatistics();

            if (!string.IsNullOrWhiteSpace(info.MeaningEmbeddingsPath))
            {
                progress.SetMessageFormat(MessageFormat.LoadingMeaningEmbeddings_Bytes);

                meaningEmbeddings = InputEmbeddingReader.ReadAll(
                    info.MeaningEmbeddingsPath, dataAnalysis.GetAllMeaningOccurrences(), progress);

                meaningEmbeddingStatistics.Compute(meaningEmbeddings);
            }

            // ---- Describe the project layout ----

            // Path.GetFileName returns an empty string when the path ends with a directory
            // separator, which would leave the project (and its project file) without a
            // name. Trim trailing separators before taking the last path segment.
            var projectName = Path.GetFileName(
                destinationPath.TrimEnd(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar));

            var projectInfo = new WsdProjectInfo
            {
                ProjectName        = projectName,
                ProjectVersion     = CurrentProjectVersion,
                ApplicationVersion = typeof(WsdProject).Assembly.GetName().Version.ToString(),
                Dictionary         = FileName.Dictionary + FileExtension.WsdData,
                TrainData          = trainData.Select(x => new WsdProjectTextDataInfo
                {
                    Name = x.TextName,
                    Path = Path.Combine(FolderName.Train, x.TextName + FileExtension.WsdData)
                }).ToArray(),
                TestData = testData.Select(x => new WsdProjectTextDataInfo
                {
                    Name = x.TextName,
                    Path = Path.Combine(FolderName.Test, x.TextName + FileExtension.WsdData)
                }).ToArray(),
                WordEmbeddings    = FileName.WordEmbeddings + FileExtension.WsdData,
                MeaningEmbeddings = meaningEmbeddings != null
                    ? FileName.MeaningEmbeddings + FileExtension.WsdData
                    : string.Empty,
                DataAnalysis                = FileName.DataAnalysis + FileExtension.WsdData,
                DictionaryStatistics        = FileName.DictionaryStatistics + FileExtension.WsdData,
                DataStatistics              = FileName.DataStatistics + FileExtension.WsdData,
                WordEmbeddingsStatistics    = FileName.WordEmbeddingsStatistics + FileExtension.WsdData,
                MeaningEmbeddingsStatistics = FileName.MeaningEmbeddingsStatistics + FileExtension.WsdData
            };

            // ---- Save everything under the destination directory ----

            progress.SetMessageFormat(MessageFormat.SavingDictionary_Words);

            SystemDictionaryWriter.WriteAll(
                Path.Combine(destinationPath, projectInfo.Dictionary), dictionary, progress);

            progress.SetMessageFormat(MessageFormat.SavingTrainData_Files);

            // Single() pairs each file entry with its text by name and throws when two
            // texts share a name, so duplicates cannot silently overwrite each other.
            SystemDataWriter.WriteAllFiles(
                destinationPath,
                projectInfo.TrainData
                .Select(x => (x.Path, trainData.Single(y => y.TextName == x.Name).Data))
                .ToArray(),
                progress);

            progress.SetMessageFormat(MessageFormat.SavingTestData_Files);

            SystemDataWriter.WriteAllFiles(
                destinationPath,
                projectInfo.TestData
                .Select(x => (x.Path, testData.Single(y => y.TextName == x.Name).Data))
                .ToArray(),
                progress);

            progress.SetMessageFormat(MessageFormat.SavingWordEmbeddings_Embeddings);

            SystemEmbeddingWriter.WriteAll(
                Path.Combine(destinationPath, projectInfo.WordEmbeddings), wordEmbeddings, progress);

            if (meaningEmbeddings != null)
            {
                progress.SetMessageFormat(MessageFormat.SavingMeaningEmbeddings_Embeddings);

                SystemEmbeddingWriter.WriteAll(
                    Path.Combine(destinationPath, projectInfo.MeaningEmbeddings), meaningEmbeddings, progress);
            }

            progress.SetMessageFormat(MessageFormat.SavingDataAnalysis_Words);

            SystemDataAnalysisWriter.WriteAll(
                Path.Combine(destinationPath, projectInfo.DataAnalysis), dataAnalysis, progress);

            SystemJsonWriter.Write(
                Path.Combine(destinationPath, projectInfo.DictionaryStatistics), dictionaryStatistics);

            SystemJsonWriter.Write(
                Path.Combine(destinationPath, projectInfo.DataStatistics), dataStatistics);

            SystemJsonWriter.Write(
                Path.Combine(destinationPath, projectInfo.WordEmbeddingsStatistics), wordEmbeddingStatistics);

            SystemJsonWriter.Write(
                Path.Combine(destinationPath, projectInfo.MeaningEmbeddingsStatistics),
                meaningEmbeddingStatistics);

            // The project description is written last, after all the files it refers to.
            SystemJsonWriter.Write(
                Path.Combine(destinationPath, projectInfo.ProjectName + FileExtension.WsdProj),
                projectInfo);

            return(new WsdProject(
                       projectInfo, dictionary, trainData, testData, wordEmbeddings, meaningEmbeddings,
                       dataAnalysis, dictionaryStatistics, dataStatistics, wordEmbeddingStatistics,
                       meaningEmbeddingStatistics));
        }