/// <summary>
/// Writes all values of <paramref name="dictionary"/> through a
/// <c>SystemDataAnalysisWriter</c> constructed for <paramref name="path"/>.
/// </summary>
/// <param name="path">Path handed to the writer's constructor.</param>
/// <param name="dictionary">Analysis dictionary whose values are written.</param>
/// <param name="progress">Optional progress handle forwarded to the writer.</param>
public static void WriteAll(
    string path,
    WordAnalysisDictionary dictionary,
    IProgressHandle progress = null)
{
    using (var output = new SystemDataAnalysisWriter(path))
    {
        // Snapshot the values only after the writer exists, so the order of
        // side effects matches the single-expression form.
        var entries = dictionary.Values.ToArray();

        output.WriteAll(entries, progress);
    }
}
/// <summary>
/// Assembles a project instance from components that have already been loaded or computed.
/// A part-of-speech list is built from <paramref name="trainData"/> and an empty
/// <c>PluginData</c> container is created.
/// </summary>
private WsdProject(
    WsdProjectInfo projectInfo,
    WordDictionary dictionary,
    TextData[] trainData,
    TextData[] testData,
    EmbeddingDictionary wordEmbeddings,
    EmbeddingDictionary meaningEmbeddings,
    WordAnalysisDictionary dataAnalysis,
    DictionaryStatistics dictionaryStatistics,
    DataStatistics dataStatistics,
    EmbeddingStatistics wordEmbeddingStatistics,
    EmbeddingStatistics meaningEmbeddingStatistics)
{
    // Project description and core data.
    this.ProjectInfo = projectInfo;
    this.Dictionary = dictionary;
    this.TrainData = trainData;
    this.TestData = testData;

    // Embeddings.
    this.WordEmbeddings = wordEmbeddings;
    this.MeaningEmbeddings = meaningEmbeddings;

    // Analysis results and statistics.
    this.DataAnalysis = dataAnalysis;
    this.DictionaryStatistics = dictionaryStatistics;
    this.DataStatistics = dataStatistics;
    this.WordEmbeddingStatistics = wordEmbeddingStatistics;
    this.MeaningEmbeddingStatistics = meaningEmbeddingStatistics;

    // Derived state owned by the project itself.
    this.PosList = new WsdPosList(trainData);
    this.PluginData = new PluginData();
}
/// <summary>
/// Fills this instance with example counts and baseline accuracies derived from
/// <paramref name="dataAnalysis"/>, using <paramref name="dictionary"/> to decide how many
/// meanings each word has.
/// </summary>
/// <param name="dictionary">Dictionary queried by word name for its meanings.</param>
/// <param name="dataAnalysis">Per-word train and test encounter data.</param>
/// <param name="progress">Optional progress handle; a scope of size 1 is opened on it.</param>
/// <returns>This instance, so the call can be chained.</returns>
/// <remarks>
/// Words that the dictionary does not know (lookup returns null) count as having zero
/// meanings and therefore fall into neither the monosemantic nor the polysemantic group.
/// The baseline ratios are plain floating-point divisions: a zero denominator produces
/// NaN or infinity instead of throwing.
/// </remarks>
public DataStatistics Compute(
    WordDictionary dictionary,
    WordAnalysisDictionary dataAnalysis,
    IProgressHandle progress = null)
{
    var scope = progress?.Scope(1);

    try
    {
        // Evaluate the dictionary-based classification once per group instead of
        // repeating the same lookup filter for every query below.
        // NOTE(review): assumes dictionary.GetByName is a side-effect-free lookup - confirm.
        var monosemanticWords = dataAnalysis.Values
            .Where(x => (dictionary.GetByName(x.Word)?.Meanings.Count ?? 0) == 1)
            .ToArray();

        var polysemanticWords = dataAnalysis.Values
            .Where(x => (dictionary.GetByName(x.Word)?.Meanings.Count ?? 0) > 1)
            .ToArray();

        MonosemanticTrainExamples = monosemanticWords
            .Sum(x => x.TrainEncounters.Values.Sum(y => y.Encounters));

        PolysemanticTrainExamples = polysemanticWords
            .Sum(x => x.TrainEncounters.Values.Sum(y => y.Encounters));

        MonosemanticTestExamples = monosemanticWords
            .Sum(x => x.TestEncounters.Values.Sum(y => y.Encounters));

        PolysemanticTestExamples = polysemanticWords
            .Sum(x => x.TestEncounters.Values.Sum(y => y.Encounters));

        // Polysemantic words split by whether they have any train encounters.
        var trainedWords = polysemanticWords
            .Where(x => x.TrainEncounters.Count != 0)
            .ToArray();

        var untrainedWords = polysemanticWords
            .Where(x => x.TrainEncounters.Count == 0)
            .ToArray();

        // Test encounters whose key is also present among the word's train encounters.
        var learnableAnalysis = trainedWords
            .SelectMany(x => x.TestEncounters
                .Where(y => x.TrainEncounters.ContainsKey(y.Key))
                .Select(y => new
                {
                    x.Word,
                    y.Value.Meaning,
                    y.Value.PartOfSpeech,
                    y.Value.Encounters
                }))
            .ToArray();

        // Test encounters of trained words whose key is absent from the train encounters.
        var nonLearnableAnalysis = trainedWords
            .SelectMany(x => x.TestEncounters
                .Where(y => !x.TrainEncounters.ContainsKey(y.Key))
                .Select(y => new
                {
                    x.Word,
                    y.Value.Meaning,
                    y.Value.PartOfSpeech,
                    y.Value.Encounters
                }))
            .ToArray();

        // Test encounters of polysemantic words that have no train encounters at all.
        var testOnlyAnalysis = untrainedWords
            .SelectMany(x => x.TestEncounters
                .Select(y => new
                {
                    x.Word,
                    y.Value.Meaning,
                    y.Value.PartOfSpeech,
                    y.Value.Encounters
                }))
            .ToArray();

        // Every test encounter of a trained polysemantic word (learnable + non-learnable).
        CommonTestExamples = trainedWords
            .Sum(x => x.TestEncounters.Values.Sum(y => y.Encounters));
        LearnableTestExamples = learnableAnalysis.Sum(x => x.Encounters);
        NonLearnableTestExamples = nonLearnableAnalysis.Sum(x => x.Encounters);
        TestOnlyExamples = testOnlyAnalysis.Sum(x => x.Encounters);

        // Dictionary baseline: guess the dictionary meaning with the highest encounter count.
        // First() is safe here: every analysed word has more than one dictionary meaning.
        CorrectDictionaryBasedLearnableGuesses = learnableAnalysis
            .Where(x => dictionary
                .GetByName(x.Word)
                .Meanings.Values
                .OrderByDescending(y => y.Encounters)
                .First()
                .Meaning == x.Meaning)
            .Sum(x => x.Encounters);

        CorrectDictionaryBasedNonLearnableGuesses = nonLearnableAnalysis
            .Where(x => dictionary
                .GetByName(x.Word)
                .Meanings.Values
                .OrderByDescending(y => y.Encounters)
                .First()
                .Meaning == x.Meaning)
            .Sum(x => x.Encounters);

        CorrectDictionaryBasedTestOnlyGuesses = testOnlyAnalysis
            .Where(x => dictionary
                .GetByName(x.Word)
                .Meanings.Values
                .OrderByDescending(y => y.Encounters)
                .First()
                .Meaning == x.Meaning)
            .Sum(x => x.Encounters);

        // POS-aware dictionary baseline: same guess, restricted to meanings sharing the
        // occurrence's part of speech. The restricted set may be empty, in which case no
        // guess can be made; Take(1).Any(...) counts that as a miss instead of throwing.
        CorrectPosDictionaryBasedLearnableGuesses = learnableAnalysis
            .Where(x => dictionary
                .GetByName(x.Word)
                .Meanings.Values
                .Where(y => y.PartOfSpeech == x.PartOfSpeech)
                .OrderByDescending(y => y.Encounters)
                .Take(1)
                .Any(y => y.Meaning == x.Meaning))
            .Sum(x => x.Encounters);

        CorrectPosDictionaryBasedNonLearnableGuesses = nonLearnableAnalysis
            .Where(x => dictionary
                .GetByName(x.Word)
                .Meanings.Values
                .Where(y => y.PartOfSpeech == x.PartOfSpeech)
                .OrderByDescending(y => y.Encounters)
                .Take(1)
                .Any(y => y.Meaning == x.Meaning))
            .Sum(x => x.Encounters);

        CorrectPosDictionaryBasedTestOnlyGuesses = testOnlyAnalysis
            .Where(x => dictionary
                .GetByName(x.Word)
                .Meanings.Values
                .Where(y => y.PartOfSpeech == x.PartOfSpeech)
                .OrderByDescending(y => y.Encounters)
                .Take(1)
                .Any(y => y.Meaning == x.Meaning))
            .Sum(x => x.Encounters);

        // Training baseline: guess the meaning seen most often in the train data.
        CorrectTrainingBasedLearnableGuesses = learnableAnalysis
            .Where(x => dataAnalysis[x.Word]
                .TrainEncounters
                .Values
                .OrderByDescending(y => y.Encounters)
                .Take(1)
                .Any(y => y.Meaning == x.Meaning))
            .Sum(x => x.Encounters);

        // POS-aware training baseline; an empty POS-filtered set counts as a miss
        // rather than dereferencing the result of FirstOrDefault().
        CorrectPosTrainingBasedLearnableGuesses = learnableAnalysis
            .Where(x => dataAnalysis[x.Word]
                .TrainEncounters
                .Values
                .Where(y => y.PartOfSpeech == x.PartOfSpeech)
                .OrderByDescending(y => y.Encounters)
                .Take(1)
                .Any(y => y.Meaning == x.Meaning))
            .Sum(x => x.Encounters);

        // Baselines without the "All_" prefix are ratios over a subset of the
        // polysemantic test examples. "All_" variants additionally count every
        // monosemantic test example as a correct guess and divide by all test examples.
        FirstSenseDictionaryBaseline =
            (CorrectDictionaryBasedLearnableGuesses +
             CorrectDictionaryBasedNonLearnableGuesses) /
            (double)CommonTestExamples;

        All_FirstSenseDictionaryBaseline =
            (CorrectDictionaryBasedLearnableGuesses +
             CorrectDictionaryBasedNonLearnableGuesses +
             CorrectDictionaryBasedTestOnlyGuesses +
             MonosemanticTestExamples) /
            (double)(PolysemanticTestExamples + MonosemanticTestExamples);

        FirstSensePosDictionaryBaseline =
            (CorrectPosDictionaryBasedLearnableGuesses +
             CorrectPosDictionaryBasedNonLearnableGuesses) /
            (double)CommonTestExamples;

        All_FirstSensePosDictionaryBaseline =
            (CorrectPosDictionaryBasedLearnableGuesses +
             CorrectPosDictionaryBasedNonLearnableGuesses +
             CorrectPosDictionaryBasedTestOnlyGuesses +
             MonosemanticTestExamples) /
            (double)(PolysemanticTestExamples + MonosemanticTestExamples);

        FirstSenseBaseline =
            CorrectTrainingBasedLearnableGuesses /
            (double)CommonTestExamples;

        All_FirstSenseBaseline =
            (CorrectTrainingBasedLearnableGuesses +
             MonosemanticTestExamples) /
            (double)(PolysemanticTestExamples + MonosemanticTestExamples);

        FirstSensePosBaseline =
            CorrectPosTrainingBasedLearnableGuesses /
            (double)CommonTestExamples;

        All_FirstSensePosBaseline =
            (CorrectPosTrainingBasedLearnableGuesses +
             MonosemanticTestExamples) /
            (double)(PolysemanticTestExamples + MonosemanticTestExamples);

        BestCaseBaseline =
            (CorrectDictionaryBasedTestOnlyGuesses +
             LearnableTestExamples) /
            (double)PolysemanticTestExamples;

        All_BestCaseBaseline =
            (CorrectDictionaryBasedTestOnlyGuesses +
             LearnableTestExamples +
             MonosemanticTestExamples) /
            (double)(PolysemanticTestExamples + MonosemanticTestExamples);
    }
    finally
    {
        scope?.Dispose();
    }

    return this;
}
/// <summary>
/// Creates a project from the inputs referenced by <paramref name="info"/>: reads the
/// dictionary, train/test data and embeddings, computes analysis and statistics, writes
/// all resulting files under <paramref name="destinationPath"/>, and returns the project.
/// </summary>
/// <param name="info">Creation settings; validated with <c>AssertIsValid</c>.</param>
/// <param name="destinationPath">
/// Target directory. It must identify as a directory and contain no files
/// (checked recursively). The project name is taken from its last path segment.
/// </param>
/// <param name="progress">Progress handle; required.</param>
/// <returns>The newly created project.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="info"/> or <paramref name="progress"/> is null, or
/// <paramref name="destinationPath"/> is null or empty.
/// </exception>
/// <exception cref="ArgumentException">
/// <paramref name="destinationPath"/> is not a directory or already contains files.
/// </exception>
public static WsdProject CreateAndSave(
    WsdProjectCreateInfo info,
    string destinationPath,
    IProgressHandle progress)
{
    if (info == null)
    {
        throw new ArgumentNullException(nameof(info));
    }

    if (string.IsNullOrEmpty(destinationPath))
    {
        throw new ArgumentNullException(nameof(destinationPath));
    }

    // EnumerateFiles + Any stops at the first file found instead of materialising
    // the full recursive listing just to test whether it is empty.
    if (PathEx.Identify(destinationPath) != PathIdentity.Directory ||
        Directory.EnumerateFiles(destinationPath, "*", SearchOption.AllDirectories).Any())
    {
        throw new ArgumentException(ExceptionMessage.DestinationPathMustBeEmptyAndExisting);
    }

    if (progress == null)
    {
        throw new ArgumentNullException(nameof(progress));
    }

    info.AssertIsValid();

    progress.SetMessageFormat(MessageFormat.LoadingDictionary_Bytes);

    var dictionary = InputDictionaryReader.ReadAll(info.DictionaryPath, progress);

    progress.SetMessageFormat(MessageFormat.ComputingDictionaryStatistics);

    var dictionaryStatistics = new DictionaryStatistics().Compute(dictionary, progress);

    TextData[] trainData;
    TextData[] testData;

    if (info.DataType == InputDataType.PlainText)
    {
        progress.SetMessageFormat(MessageFormat.LoadingTrainData_Files);

        trainData = InputPlainTextDataReader.ReadAllFiles(info.TrainDataPath, progress);

        progress.SetMessageFormat(MessageFormat.LoadingTestData_Files);

        testData = InputPlainTextDataReader.ReadAllFiles(info.TestDataPath, progress);
    }
    else
    {
        progress.SetMessageFormat(MessageFormat.LoadingSynsetMappings_Bytes);

        var synsetMappings = InputSynsetMappingReader.ReadAll(info.SynsetMappingsPath, progress);

        progress.SetMessageFormat(MessageFormat.LoadingTrainData_Files);

        trainData = InputXmlDataReader.Read(
            info.TrainDataPath,
            info.TrainGoldKeyPath,
            synsetMappings,
            dictionary,
            out var trainXmlParseErrors,
            progress);

        // Parse errors are not fatal: they are saved next to the project files.
        if (trainXmlParseErrors != null && trainXmlParseErrors.Any())
        {
            XmlParseErrorWriter.WriteAll(
                Path.Combine(destinationPath, FileName.TrainXmlParseErrors + FileExtension.Text),
                trainXmlParseErrors);
        }

        progress.SetMessageFormat(MessageFormat.LoadingTestData_Files);

        testData = InputXmlDataReader.Read(
            info.TestDataPath,
            info.TestGoldKeyPath,
            synsetMappings,
            dictionary,
            out var testXmlParseErrors,
            progress);

        if (testXmlParseErrors != null && testXmlParseErrors.Any())
        {
            XmlParseErrorWriter.WriteAll(
                Path.Combine(destinationPath, FileName.TestXmlParseErrors + FileExtension.Text),
                testXmlParseErrors);
        }
    }

    progress.SetMessageFormat(MessageFormat.AnalyzingData_Files);

    var dataAnalysis = new WordAnalysisDictionary()
        .Analyze(dictionary, trainData, testData, progress);

    progress.SetMessageFormat(MessageFormat.ComputingDataStatistics);

    var dataStatistics = new DataStatistics()
        .Compute(dictionary, dataAnalysis, progress);

    progress.SetMessageFormat(MessageFormat.LoadingWordEmbeddings_Bytes);

    var wordEmbeddings = InputEmbeddingReader.ReadAll(
        info.WordEmbeddingsPath,
        dataAnalysis.GetAllWordOccurrences(),
        progress);

    var wordEmbeddingStatistics = new EmbeddingStatistics().Compute(wordEmbeddings);

    // Meaning embeddings are optional; without them the statistics object stays
    // in its freshly constructed state and no embeddings file is written.
    EmbeddingDictionary meaningEmbeddings = null;

    var meaningEmbeddingStatistics = new EmbeddingStatistics();

    if (!string.IsNullOrWhiteSpace(info.MeaningEmbeddingsPath))
    {
        progress.SetMessageFormat(MessageFormat.LoadingMeaningEmbeddings_Bytes);

        meaningEmbeddings = InputEmbeddingReader.ReadAll(
            info.MeaningEmbeddingsPath,
            dataAnalysis.GetAllMeaningOccurrences(),
            progress);

        meaningEmbeddingStatistics.Compute(meaningEmbeddings);
    }

    // Path.GetFileName returns an empty string when the path ends with a directory
    // separator (for example "projects/demo/"), which would leave the project unnamed
    // and name the project file by its extension only. Trim trailing separators first.
    var projectName = Path.GetFileName(
        destinationPath.TrimEnd(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar));

    var projectInfo = new WsdProjectInfo
    {
        ProjectName = projectName,
        ProjectVersion = CurrentProjectVersion,
        ApplicationVersion = typeof(WsdProject).Assembly.GetName().Version.ToString(),
        Dictionary = FileName.Dictionary + FileExtension.WsdData,
        TrainData = trainData.Select(x => new WsdProjectTextDataInfo
        {
            Name = x.TextName,
            Path = Path.Combine(FolderName.Train, x.TextName + FileExtension.WsdData)
        }).ToArray(),
        TestData = testData.Select(x => new WsdProjectTextDataInfo
        {
            Name = x.TextName,
            Path = Path.Combine(FolderName.Test, x.TextName + FileExtension.WsdData)
        }).ToArray(),
        WordEmbeddings = FileName.WordEmbeddings + FileExtension.WsdData,
        MeaningEmbeddings = meaningEmbeddings != null
            ? FileName.MeaningEmbeddings + FileExtension.WsdData
            : string.Empty,
        DataAnalysis = FileName.DataAnalysis + FileExtension.WsdData,
        DictionaryStatistics = FileName.DictionaryStatistics + FileExtension.WsdData,
        DataStatistics = FileName.DataStatistics + FileExtension.WsdData,
        WordEmbeddingsStatistics = FileName.WordEmbeddingsStatistics + FileExtension.WsdData,
        MeaningEmbeddingsStatistics = FileName.MeaningEmbeddingsStatistics + FileExtension.WsdData
    };

    progress.SetMessageFormat(MessageFormat.SavingDictionary_Words);

    SystemDictionaryWriter.WriteAll(
        Path.Combine(destinationPath, projectInfo.Dictionary),
        dictionary,
        progress);

    progress.SetMessageFormat(MessageFormat.SavingTrainData_Files);

    // Single() also guards against duplicate text names, which would otherwise
    // map two texts onto the same output file.
    SystemDataWriter.WriteAllFiles(
        destinationPath,
        projectInfo.TrainData
            .Select(x => (x.Path, trainData.Single(y => y.TextName == x.Name).Data))
            .ToArray(),
        progress);

    progress.SetMessageFormat(MessageFormat.SavingTestData_Files);

    SystemDataWriter.WriteAllFiles(
        destinationPath,
        projectInfo.TestData
            .Select(x => (x.Path, testData.Single(y => y.TextName == x.Name).Data))
            .ToArray(),
        progress);

    progress.SetMessageFormat(MessageFormat.SavingWordEmbeddings_Embeddings);

    SystemEmbeddingWriter.WriteAll(
        Path.Combine(destinationPath, projectInfo.WordEmbeddings),
        wordEmbeddings,
        progress);

    if (meaningEmbeddings != null)
    {
        progress.SetMessageFormat(MessageFormat.SavingMeaningEmbeddings_Embeddings);

        SystemEmbeddingWriter.WriteAll(
            Path.Combine(destinationPath, projectInfo.MeaningEmbeddings),
            meaningEmbeddings,
            progress);
    }

    progress.SetMessageFormat(MessageFormat.SavingDataAnalysis_Words);

    SystemDataAnalysisWriter.WriteAll(
        Path.Combine(destinationPath, projectInfo.DataAnalysis),
        dataAnalysis,
        progress);

    SystemJsonWriter.Write(
        Path.Combine(destinationPath, projectInfo.DictionaryStatistics),
        dictionaryStatistics);

    SystemJsonWriter.Write(
        Path.Combine(destinationPath, projectInfo.DataStatistics),
        dataStatistics);

    SystemJsonWriter.Write(
        Path.Combine(destinationPath, projectInfo.WordEmbeddingsStatistics),
        wordEmbeddingStatistics);

    SystemJsonWriter.Write(
        Path.Combine(destinationPath, projectInfo.MeaningEmbeddingsStatistics),
        meaningEmbeddingStatistics);

    // The project file itself is written last, after every file it references exists.
    SystemJsonWriter.Write(
        Path.Combine(destinationPath, projectInfo.ProjectName + FileExtension.WsdProj),
        projectInfo);

    return new WsdProject(
        projectInfo,
        dictionary,
        trainData,
        testData,
        wordEmbeddings,
        meaningEmbeddings,
        dataAnalysis,
        dictionaryStatistics,
        dataStatistics,
        wordEmbeddingStatistics,
        meaningEmbeddingStatistics);
}