public T[] ReadAll(IProgressHandle progress = null)
{
    var scope = progress?.Scope(BaseReader.BaseStream.Length);
    try
    {
        var list = new List<T>();
        while (!BaseReader.EndOfStream)
        {
            var data = Read();
            if (data != null)
            {
                list.Add(data);
            }

            scope?.TrySet(BaseReader.BaseStream.Position);
        }

        return list.ToArray();
    }
    finally
    {
        scope?.Dispose();
    }
}
public static T Read<T>(string path, IProgressHandle progress = null)
{
    if (string.IsNullOrEmpty(path))
    {
        throw new ArgumentNullException(nameof(path));
    }

    var scope = progress?.Scope(1);
    try
    {
        var data = File.ReadAllText(path);
        try
        {
            return JsonConvert.DeserializeObject<T>(data, new JsonSerializerSettings
            {
                Formatting = Formatting.Indented,
                TypeNameHandling = TypeNameHandling.Objects
            });
        }
        catch (Exception ex)
        {
            throw new Exception(ExceptionMessage.UnableToLoadProjectData, ex);
        }
    }
    finally
    {
        scope?.Dispose();
    }
}
public static void Write<T>(
    string path,
    T data,
    IProgressHandle progress = null,
    bool includeTypeNames = true)
{
    if (string.IsNullOrEmpty(path))
    {
        throw new ArgumentNullException(nameof(path));
    }

    if (data == null)
    {
        throw new ArgumentNullException(nameof(data));
    }

    var scope = progress?.Scope(1);
    try
    {
        var json = JsonConvert.SerializeObject(data, new JsonSerializerSettings
        {
            Formatting = Formatting.Indented,
            TypeNameHandling = includeTypeNames
                ? TypeNameHandling.Objects
                : TypeNameHandling.None
        });
        File.WriteAllText(path, json);
    }
    finally
    {
        scope?.Dispose();
    }
}
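// Illustrative usage sketch, not part of the original source: round-trips an object through the
// Read/Write helpers above. The method name is hypothetical.
private static void RoundTripExample<TData>(string path, IProgressHandle progress)
{
    // Read the previously saved object, then write it back with type names embedded in the JSON.
    var data = Read<TData>(path, progress);
    Write(path, data, progress, includeTypeNames: true);
}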
public static TextData[] ReadAllFiles(string path, IProgressHandle progress = null)
{
    var dataFiles = Directory.GetFiles(path);
    var scope = progress?.Scope(dataFiles.Length);
    try
    {
        var result = new List<TextData>();
        for (var i = 0; i < dataFiles.Length; i++)
        {
            var file = dataFiles[i];
            var textName = Path.GetFileNameWithoutExtension(file);
            result.Add(new TextData(textName, ReadAll(file)));

            // scope is null when no progress handle was supplied, so guard the update.
            scope?.TrySet(i + 1);
        }

        return result.ToArray();
    }
    finally
    {
        scope?.Dispose();
    }
}
public WordAnalysisDictionary Analyze(
    WordDictionary dictionary,
    TextData[] trainData,
    TextData[] testData,
    IProgressHandle progress = null)
{
    var max = trainData.Length + testData.Length;
    var scope = progress?.Scope(max);
    var counter = 0;
    try
    {
        foreach (var text in trainData)
        {
            foreach (var encounter in text.Data)
            {
                if (string.IsNullOrWhiteSpace(encounter.Word) ||
                    string.IsNullOrWhiteSpace(encounter.Meaning) ||
                    encounter.Word == RawWordEncounter.EmptyWord ||
                    encounter.Word == RawWordEncounter.EndOfSentence)
                {
                    continue;
                }

                var wordAnalysis = GetOrAdd(dictionary, encounter);
                wordAnalysis.TrainEncounters.AddEncounter(dictionary, encounter);
                wordAnalysis.AllEncounters.AddEncounter(dictionary, encounter);
            }

            scope?.TrySet(++counter);
        }

        foreach (var text in testData)
        {
            foreach (var encounter in text.Data)
            {
                if (string.IsNullOrWhiteSpace(encounter.Word) ||
                    string.IsNullOrWhiteSpace(encounter.Meaning) ||
                    encounter.Word == RawWordEncounter.EmptyWord ||
                    encounter.Word == RawWordEncounter.EndOfSentence)
                {
                    continue;
                }

                var wordAnalysis = GetOrAdd(dictionary, encounter);
                wordAnalysis.TestEncounters.AddEncounter(dictionary, encounter);
                wordAnalysis.AllEncounters.AddEncounter(dictionary, encounter);
            }

            scope?.TrySet(++counter);
        }
    }
    finally
    {
        scope?.Dispose();
    }

    return this;
}
public void Extract(
    IList<DataSetGroup> dataSetGroups,
    GenerationInfo info,
    IProgressHandle progress)
{
    using (var scope = progress.Scope(dataSetGroups.Count, MessageFormat.ExtractingValidationSet_Groups))
    {
        var counter = 0;
        foreach (var dataSetGroup in dataSetGroups)
        {
            scope.TrySet(counter++);

            var oldTrainSet = dataSetGroup.DataSets.GetByName(DataSetName.Train);
            if (oldTrainSet == null)
            {
                continue;
            }

            var validationSplit = oldTrainSet.Data
                .GroupBy(x => x.Word + x.Meaning)
                .Select(x =>
                {
                    var groupCount = x.Count();
                    var validationGroupCount = (int)Math.Ceiling(
                        groupCount * 0.01 * info.ValidationSetPercentage);
                    return new
                    {
                        TrainGroup = x.Skip(validationGroupCount).ToArray(),
                        ValidationGroup = x.Take(validationGroupCount).ToArray()
                    };
                })
                .ToArray();

            var trainExamples = validationSplit.SelectMany(x => x.TrainGroup).ToArray();
            var validationExamples = validationSplit.SelectMany(x => x.ValidationGroup).ToArray();

            if (trainExamples.Length > 0)
            {
                dataSetGroup.DataSets[DataSetName.Train] = new DataSet(DataSetName.Train, trainExamples);
            }
            else
            {
                dataSetGroup.DataSets.Remove(DataSetName.Train);
            }

            if (validationExamples.Length > 0)
            {
                dataSetGroup.DataSets[DataSetName.Validation] = new DataSet(
                    DataSetName.Validation, validationExamples);
            }
            else
            {
                dataSetGroup.DataSets.Remove(DataSetName.Validation);
            }
        }
    }
}
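// Illustrative sketch, not part of the original source: the per-group split used in Extract above,
// shown in isolation. With 37 examples in a group and ValidationSetPercentage = 10, the first
// ceil(37 * 0.01 * 10) = 4 examples go to validation and the remaining 33 stay in training.
private static (T[] Train, T[] Validation) SplitGroupExample<T>(
    IReadOnlyList<T> group, double validationSetPercentage)
{
    var validationCount = (int)Math.Ceiling(group.Count * 0.01 * validationSetPercentage);
    return (group.Skip(validationCount).ToArray(), group.Take(validationCount).ToArray());
}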
public void BeforeDataWritten(
    IList<DataSetGroup> dataSetGroups,
    WsdProject project,
    GenerationInfo info,
    IProgressHandle progress)
{
    var config = project.PluginData.GetData<StatisticsPlugin, StatisticsConfig>(string.Empty);
    if (!config.PluginEnabled)
    {
        return;
    }

    var dictionary = project.PluginData.GetData<StatisticsPlugin, WordDictionary>(string.Empty);
    var filePath = Path.Combine(
        info.DestinationFolder,
        FileName.DataSetStatistics + FileExtension.Csv);

    using (var streamWriter = new StreamWriter(filePath))
    using (var writer = new CsvWriter(streamWriter))
    using (var scope = progress.Scope(dataSetGroups.Count, MessageFormat.ComputingStatistics_Groups))
    {
        writer.WriteLine(
            "Group",
            "Train examples",
            "Validation examples",
            "Test examples",
            "Test-only examples",
            "Majority vote",
            "Train classes",
            "Test classes",
            "Train entropy",
            "Test entropy");

        for (var i = 0; i < dataSetGroups.Count; i++)
        {
            scope.TrySet(i);

            var dataSetGroup = dataSetGroups[i];
            var statistics = DataSetGroupStatistics.Compute(dictionary, dataSetGroup);
            if ((config.RequireTrainingSet && statistics.TrainExamples == 0) ||
                (config.RequireTestSet && statistics.TestExamples == 0) ||
                statistics.TrainExamples + statistics.ValidationExamples < config.MinimumTrainingValidationExamples)
            {
                continue;
            }

            writer.WriteLine(
                dataSetGroup.GroupName,
                statistics.TrainExamples,
                statistics.ValidationExamples,
                statistics.TestExamples,
                statistics.TestOnlyExamples,
                statistics.MajorityVote,
                statistics.TrainClasses,
                statistics.TestClasses,
                statistics.TrainEntropy,
                statistics.TestEntropy);
        }
    }

    if (config.AbortGenerationAfterStatisticsAreComputed)
    {
        throw new OperationCanceledException();
    }
}
public void Extract(
    IList<DataSetGroup> dataSetGroups,
    WsdProject project,
    GenerationInfo info,
    IProgressHandle progress)
{
    using (var scope = progress.Scope(dataSetGroups.Count, MessageFormat.ExtractingTestOnlySet_Groups))
    {
        var counter = 0;
        foreach (var dataSetGroup in dataSetGroups)
        {
            scope.TrySet(counter++);

            var oldTestSet = dataSetGroup.DataSets.GetByName(DataSetName.Test);
            if (oldTestSet == null)
            {
                continue;
            }

            var testExamples = oldTestSet.Data
                .Where(x => project.DataAnalysis[x.Word].TrainEncounters.Any())
                .ToArray();
            var testOnlyExamples = oldTestSet.Data
                .Where(x => !project.DataAnalysis[x.Word].TrainEncounters.Any())
                .ToArray();

            if (testExamples.Length > 0)
            {
                dataSetGroup.DataSets[DataSetName.Test] = new DataSet(DataSetName.Test, testExamples);
            }
            else
            {
                dataSetGroup.DataSets.Remove(DataSetName.Test);
            }

            if (testOnlyExamples.Length > 0)
            {
                dataSetGroup.DataSets[DataSetName.TestOnly] = new DataSet(DataSetName.TestOnly, testOnlyExamples);
            }
            else
            {
                dataSetGroup.DataSets.Remove(DataSetName.TestOnly);
            }
        }
    }
}
public void ShuffleData(IList<DataSetGroup> dataSetGroups, IProgressHandle progress)
{
    using (var scope = progress.Scope(dataSetGroups.Count, MessageFormat.ExtractingValidationSet_Groups))
    {
        var counter = 0;
        foreach (var dataSetGroup in dataSetGroups)
        {
            scope.TrySet(counter++);
            foreach (var dataSet in dataSetGroup.DataSets.Values)
            {
                dataSet.Data.Shuffle();
            }
        }
    }
}
public static PluginInfo[] LoadPlugins(IProgressHandle progress)
{
    var plugins = new List<PluginInfo>();
    var assemblyFiles = Directory.GetFiles(PluginDirectory, "*.dll", SearchOption.AllDirectories);

    using (var scope = progress.Scope(assemblyFiles.Length, MessageFormat.LoadingPlugins_Files))
    {
        var counter = 0;
        foreach (var assemblyFile in assemblyFiles)
        {
            try
            {
                var assembly = Assembly.LoadFile(assemblyFile);
                var pluginTypes = assembly.GetTypes()
                    .Where(x => typeof(IPlugin).IsAssignableFrom(x))
                    .ToArray();

                foreach (var pluginType in pluginTypes)
                {
                    try
                    {
                        var pluginInstance = (IPlugin)Activator.CreateInstance(pluginType);
                        var pluginComponents = pluginInstance.GetComponents() ?? new IPluginComponent[0];
                        plugins.Add(new PluginInfo(
                            true,
                            assemblyFile,
                            assembly.GetName(),
                            pluginInstance,
                            pluginComponents));
                    }
                    catch
                    {
                        // The type could not be instantiated; record the plugin entry without an instance.
                        plugins.Add(new PluginInfo(
                            true,
                            assemblyFile,
                            assembly.GetName(),
                            null,
                            null));
                    }
                }
            }
            catch
            {
                // Skip assemblies that cannot be loaded or reflected over.
            }

            scope.TrySet(++counter);
        }
    }

    return plugins.ToArray();
}
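// Illustrative usage sketch, not part of the original source: loads every plugin assembly found under
// PluginDirectory and reports how many plugin entries were discovered; "progress" is any
// IProgressHandle implementation available to the caller.
private static void LoadPluginsExample(IProgressHandle progress)
{
    var plugins = LoadPlugins(progress);
    Console.WriteLine($"Discovered {plugins.Length} plugin entries.");
}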
public void WriteAll(IList<T> list, IProgressHandle progress = null)
{
    var scope = progress?.Scope(list.Count);
    try
    {
        for (var i = 0; i < list.Count; i++)
        {
            Write(list[i]);
            scope?.TrySet(i + 1);
        }
    }
    finally
    {
        scope?.Dispose();
    }
}
public IList<GeneratedTextData> GenerateRecords(
    IList<TextData> data,
    WsdProject project,
    GenerationInfo info,
    IProgressHandle progress)
{
    var result = new List<GeneratedTextData>();
    using (var scope = progress.Scope(data.Count, MessageFormat.GeneratingRecords_Texts))
    {
        for (var i = 0; i < data.Count; i++)
        {
            scope.TrySet(i);
            result.Add(new GeneratedTextData(
                data[i].TextName,
                GenerateRecords(data[i].Data, project, info)));
        }
    }

    return result;
}
public DictionaryStatistics Compute(WordDictionary dictionary, IProgressHandle progress = null)
{
    var scope = progress?.Scope(1);
    try
    {
        WordCount = dictionary.Count;
        MonosemanticWordCount = dictionary.Values.Count(x => x.Meanings.Count == 1);
        PolysemanticWordCount = dictionary.Values.Count(x => x.Meanings.Count > 1);
        MaxMeaningsPerWord = dictionary.Values.MaxOrDefault(x => x.Meanings.Count);
        AverageMeaningsPerWord = dictionary.Values.AverageOrDefault(x => x.Meanings.Count);
        UniqueMeaningsCount = dictionary.Values.SelectMany(x => x.Meanings.Keys).Distinct().Count();
    }
    finally
    {
        scope?.Dispose();
    }

    return this;
}
public DataStatistics Compute(
    WordDictionary dictionary,
    WordAnalysisDictionary dataAnalysis,
    IProgressHandle progress = null)
{
    var scope = progress?.Scope(1);
    try
    {
        MonosemanticTrainExamples = dataAnalysis.Values
            .Where(x => (dictionary.GetByName(x.Word)?.Meanings.Count ?? 0) == 1)
            .Sum(x => x.TrainEncounters.Values.Sum(y => y.Encounters));
        PolysemanticTrainExamples = dataAnalysis.Values
            .Where(x => (dictionary.GetByName(x.Word)?.Meanings.Count ?? 0) > 1)
            .Sum(x => x.TrainEncounters.Values.Sum(y => y.Encounters));
        MonosemanticTestExamples = dataAnalysis.Values
            .Where(x => (dictionary.GetByName(x.Word)?.Meanings.Count ?? 0) == 1)
            .Sum(x => x.TestEncounters.Values.Sum(y => y.Encounters));
        PolysemanticTestExamples = dataAnalysis.Values
            .Where(x => (dictionary.GetByName(x.Word)?.Meanings.Count ?? 0) > 1)
            .Sum(x => x.TestEncounters.Values.Sum(y => y.Encounters));

        var commonAnalysis = dataAnalysis.Values
            .Where(x => (dictionary.GetByName(x.Word)?.Meanings.Count ?? 0) > 1)
            .Where(x => x.TrainEncounters.Count != 0)
            .SelectMany(x => x.TestEncounters
                .Select(y => new { x.Word, y.Value.Meaning, y.Value.PartOfSpeech, y.Value.Encounters }))
            .ToArray();
        var learnableAnalysis = dataAnalysis.Values
            .Where(x => (dictionary.GetByName(x.Word)?.Meanings.Count ?? 0) > 1)
            .Where(x => x.TrainEncounters.Count != 0)
            .SelectMany(x => x.TestEncounters
                .Where(y => x.TrainEncounters.ContainsKey(y.Key))
                .Select(y => new { x.Word, y.Value.Meaning, y.Value.PartOfSpeech, y.Value.Encounters }))
            .ToArray();
        var nonLearnableAnalysis = dataAnalysis.Values
            .Where(x => (dictionary.GetByName(x.Word)?.Meanings.Count ?? 0) > 1)
            .Where(x => x.TrainEncounters.Count != 0)
            .SelectMany(x => x.TestEncounters
                .Where(y => !x.TrainEncounters.ContainsKey(y.Key))
                .Select(y => new { x.Word, y.Value.Meaning, y.Value.PartOfSpeech, y.Value.Encounters }))
            .ToArray();
        var testOnlyAnalysis = dataAnalysis.Values
            .Where(x => (dictionary.GetByName(x.Word)?.Meanings.Count ?? 0) > 1)
            .Where(x => x.TrainEncounters.Count == 0)
            .SelectMany(x => x.TestEncounters
                .Select(y => new { x.Word, y.Value.Meaning, y.Value.PartOfSpeech, y.Value.Encounters }))
            .ToArray();

        CommonTestExamples = commonAnalysis.Sum(x => x.Encounters);
        LearnableTestExamples = learnableAnalysis.Sum(x => x.Encounters);
        NonLearnableTestExamples = nonLearnableAnalysis.Sum(x => x.Encounters);
        TestOnlyExamples = testOnlyAnalysis.Sum(x => x.Encounters);

        CorrectDictionaryBasedLearnableGuesses = learnableAnalysis
            .Where(x => dictionary
                .GetByName(x.Word)
                .Meanings.Values
                .OrderByDescending(y => y.Encounters)
                .First()
                .Meaning == x.Meaning)
            .Sum(x => x.Encounters);
        CorrectDictionaryBasedNonLearnableGuesses = nonLearnableAnalysis
            .Where(x => dictionary
                .GetByName(x.Word)
                .Meanings.Values
                .OrderByDescending(y => y.Encounters)
                .First()
                .Meaning == x.Meaning)
            .Sum(x => x.Encounters);
        CorrectDictionaryBasedTestOnlyGuesses = testOnlyAnalysis
            .Where(x => dictionary
                .GetByName(x.Word)
                .Meanings.Values
                .OrderByDescending(y => y.Encounters)
                .First()
                .Meaning == x.Meaning)
            .Sum(x => x.Encounters);
        CorrectPosDictionaryBasedLearnableGuesses = learnableAnalysis
            .Where(x => dictionary
                .GetByName(x.Word)
                .Meanings.Values
                .OrderByDescending(y => y.Encounters)
                .First(y => y.PartOfSpeech == x.PartOfSpeech)
                .Meaning == x.Meaning)
            .Sum(x => x.Encounters);
        CorrectPosDictionaryBasedNonLearnableGuesses = nonLearnableAnalysis
            .Where(x => dictionary
                .GetByName(x.Word)
                .Meanings.Values
                .OrderByDescending(y => y.Encounters)
                .First(y => y.PartOfSpeech == x.PartOfSpeech)
                .Meaning == x.Meaning)
            .Sum(x => x.Encounters);
        CorrectPosDictionaryBasedTestOnlyGuesses = testOnlyAnalysis
            .Where(x => dictionary
                .GetByName(x.Word)
                .Meanings.Values
                .OrderByDescending(y => y.Encounters)
                .First(y => y.PartOfSpeech == x.PartOfSpeech)
                .Meaning == x.Meaning)
            .Sum(x => x.Encounters);
        CorrectTrainingBasedLearnableGuesses = learnableAnalysis
            .Where(x => dataAnalysis[x.Word]
                .TrainEncounters
                .Values
                .OrderByDescending(y => y.Encounters)
                .FirstOrDefault()
                .Meaning == x.Meaning)
            .Sum(x => x.Encounters);
        CorrectPosTrainingBasedLearnableGuesses = learnableAnalysis
            .Where(x => dataAnalysis[x.Word]
                .TrainEncounters
                .Values
                .Where(y => y.PartOfSpeech == x.PartOfSpeech)
                .OrderByDescending(y => y.Encounters)
                .FirstOrDefault()
                .Meaning == x.Meaning)
            .Sum(x => x.Encounters);

        FirstSenseDictionaryBaseline =
            (CorrectDictionaryBasedLearnableGuesses + CorrectDictionaryBasedNonLearnableGuesses) /
            (double)CommonTestExamples;
        All_FirstSenseDictionaryBaseline =
            (CorrectDictionaryBasedLearnableGuesses +
             CorrectDictionaryBasedNonLearnableGuesses +
             CorrectDictionaryBasedTestOnlyGuesses +
             MonosemanticTestExamples) /
            (double)(PolysemanticTestExamples + MonosemanticTestExamples);
        FirstSensePosDictionaryBaseline =
            (CorrectPosDictionaryBasedLearnableGuesses + CorrectPosDictionaryBasedNonLearnableGuesses) /
            (double)CommonTestExamples;
        All_FirstSensePosDictionaryBaseline =
            (CorrectPosDictionaryBasedLearnableGuesses +
             CorrectPosDictionaryBasedNonLearnableGuesses +
             CorrectPosDictionaryBasedTestOnlyGuesses +
             MonosemanticTestExamples) /
            (double)(PolysemanticTestExamples + MonosemanticTestExamples);
        FirstSenseBaseline =
            CorrectTrainingBasedLearnableGuesses / (double)CommonTestExamples;
        All_FirstSenseBaseline =
            (CorrectTrainingBasedLearnableGuesses + MonosemanticTestExamples) /
            (double)(PolysemanticTestExamples + MonosemanticTestExamples);
        FirstSensePosBaseline =
            CorrectPosTrainingBasedLearnableGuesses / (double)CommonTestExamples;
        All_FirstSensePosBaseline =
            (CorrectPosTrainingBasedLearnableGuesses + MonosemanticTestExamples) /
            (double)(PolysemanticTestExamples + MonosemanticTestExamples);
        BestCaseBaseline =
            (CorrectDictionaryBasedTestOnlyGuesses + LearnableTestExamples) /
            (double)PolysemanticTestExamples;
        All_BestCaseBaseline =
            (CorrectDictionaryBasedTestOnlyGuesses + LearnableTestExamples + MonosemanticTestExamples) /
            (double)(PolysemanticTestExamples + MonosemanticTestExamples);
    }
    finally
    {
        scope?.Dispose();
    }

    return this;
}
public WordDictionary GetReorderedDictionary(
    WsdProject project,
    GenerationInfo info,
    IProgressHandle progress)
{
    using (progress.Scope(1, MessageFormat.ReorderingDictionary))
    {
        Func<IEnumerable<DictionaryMeaning>, string, IEnumerable<DictionaryMeaning>> meaningOrderFunc =
            (enumerable, word) =>
            {
                if (info.OrderMeanings == OrderMeanings.ByDictionary ||
                    info.OrderMeanings == OrderMeanings.ByTrainingSet)
                {
                    enumerable = enumerable
                        .OrderByDescending(z => z.Encounters);
                }
                else if (info.OrderMeanings == OrderMeanings.ByDictionaryAndTrainingSet)
                {
                    enumerable = enumerable
                        .OrderByDescending(z => z.Encounters)
                        .ThenByDescending(z => project.DataAnalysis.GetByName(word)?
                            .TrainEncounters
                            .GetByName(z.Meaning)?.Encounters ?? 0);
                }

                return enumerable.Select((z, i) => new DictionaryMeaning
                {
                    Id = i + 1,
                    Meaning = z.Meaning,
                    PartOfSpeech = z.PartOfSpeech,
                    Encounters = z.Encounters
                });
            };

        var result = (info.OrderMeanings == OrderMeanings.ByTrainingSet
                ? project.DataAnalysis
                    .Values
                    .Where(x => x.TrainEncounters.Any())
                    .Select(x => new DictionaryWord
                    {
                        Id = project.Dictionary[x.Word].Id,
                        Word = x.Word,
                        Meanings = x.TrainEncounters.Values
                            .OrderByDescending(y => y.Encounters)
                            .Select((y, i) => new DictionaryMeaning
                            {
                                Id = i + 1,
                                Meaning = y.Meaning,
                                PartOfSpeech = y.PartOfSpeech,
                                Encounters = y.Encounters
                            }).ToMeaningDictionary()
                    })
                : project.Dictionary.Values)
            .Select(x =>
            {
                var meanings = (IEnumerable<DictionaryMeaning>)x.Meanings.Values;
                if (info.OrderMeaningsStrategy == OrderMeaningsStrategy.GroupByWordAndPos)
                {
                    meanings = meanings
                        .GroupBy(y => y.PartOfSpeech)
                        .SelectMany(y => meaningOrderFunc.Invoke(y, x.Word));
                }
                else
                {
                    meanings = meaningOrderFunc.Invoke(meanings, x.Word);
                }

                return new DictionaryWord
                {
                    Id = x.Id,
                    Word = x.Word,
                    Meanings = meanings.ToMeaningDictionary()
                };
            })
            .Where(x => x.Meanings.Count > 0)
            .ToWordDictionary();

        return result;
    }
}
public IList<DataSetGroup> FormGroups(
    Dictionary<DataSetName, DataSetByText> dataSets,
    WsdProject project,
    GenerationInfo info,
    IProgressHandle progress)
{
    var dataSetGroups = new Dictionary<string, DataSetGroup>();
    using (var scope = progress.Scope(dataSets.Count, MessageFormat.FormingGroups_DataSets))
    {
        var counter = 0;
        foreach (var dataSet in dataSets.Values)
        {
            scope.TrySet(counter++);

            IEnumerable<(string groupName, IEnumerable<RawRecord> data)> dataByGroup;
            switch (info.SavingStrategy)
            {
                case SavingStrategy.SingleFile:
                {
                    dataByGroup = dataSet.Texts
                        .SelectMany(x => x.Data)
                        .GroupBy(x => string.Empty)
                        .Select(x => (x.Key, (IEnumerable<RawRecord>)x));
                    break;
                }

                case SavingStrategy.FilePerWord:
                {
                    dataByGroup = dataSet.Texts
                        .SelectMany(x => x.Data)
                        .GroupBy(x => x.Word + "__" + project.Dictionary.GetByName(x.Word).Id)
                        .Select(x => (x.Key, (IEnumerable<RawRecord>)x));
                    break;
                }

                case SavingStrategy.FilePerPos:
                {
                    dataByGroup = dataSet.Texts
                        .SelectMany(x => x.Data)
                        .GroupBy(x => x.Pos)
                        .Select(x => (x.Key, (IEnumerable<RawRecord>)x));
                    break;
                }

                case SavingStrategy.FilePerWordAndPos:
                {
                    dataByGroup = dataSet.Texts
                        .SelectMany(x => x.Data)
                        .GroupBy(x => x.Word + "__" + x.Pos + "__" + project.Dictionary.GetByName(x.Word).Id)
                        .Select(x => (x.Key, (IEnumerable<RawRecord>)x));
                    break;
                }

                case SavingStrategy.OriginalFiles:
                {
                    dataByGroup = dataSet.Texts
                        .Select(x => (x.TextName, (IEnumerable<RawRecord>)x.Data));
                    break;
                }

                default:
                {
                    throw new NotSupportedException(
                        $"Saving strategy {info.SavingStrategy} is not supported.");
                }
            }

            foreach (var (groupName, data) in dataByGroup)
            {
                if (!dataSetGroups.ContainsKey(groupName))
                {
                    dataSetGroups[groupName] = new DataSetGroup(groupName);
                }

                dataSetGroups[groupName].DataSets[dataSet.Name] = new DataSet(dataSet.Name, data.ToArray());
            }
        }

        return dataSetGroups.Values.ToArray();
    }
}
public static TextData[] Read(
    string dataPath,
    string goldKeyPath,
    SynsetDictionary synsetMappings,
    WordDictionary dictionary,
    out XmlParseError[] errors,
    IProgressHandle progress = null)
{
    var scope = progress?.Scope(1);
    try
    {
        var result = new List<TextData>();
        var serializer = new XmlSerializer(typeof(UefXmlData));
        var xmlParseErrors = new List<XmlParseError>();

        using (var reader = new StreamReader(dataPath))
        {
            var goldKeys = File.ReadAllLines(goldKeyPath)
                .Where(x => !string.IsNullOrWhiteSpace(x))
                .Select(x => x.Trim(' ').Split(' '))
                .Where(x => x.Length > 1)
                .DistinctBy(x => x[0])
                .ToDictionary(x => x[0], x => string.Join(" ", x.Skip(1)));

            var dataXml = (UefXmlData)serializer.Deserialize(reader);
            foreach (var text in dataXml.Texts)
            {
                var encounters = new List<RawWordEncounter>();
                foreach (var sentence in text.Sentences)
                {
                    for (var i = 0; i < sentence.Encounters.Length; i++)
                    {
                        var encounter = sentence.Encounters[i];
                        var encounterType = sentence.EnumTypes[i];
                        var rawWordEncounter = new RawWordEncounter
                        {
                            Word = encounter.Lemma,
                            Pos = encounter.Pos,
                            Meaning = string.Empty
                        };

                        if (encounterType == ItemChoiceType.instance)
                        {
                            var status = SynsetHelper.TryGetMeaning(
                                dictionary,
                                goldKeys,
                                synsetMappings,
                                encounter.Lemma,
                                encounter.Id,
                                out var meaning);
                            if (status == TryGetMeaningStatus.OK)
                            {
                                rawWordEncounter.Meaning = meaning;
                            }
                            else
                            {
                                xmlParseErrors.Add(new XmlParseError
                                {
                                    EncounterId = encounter.Id,
                                    Error = status
                                });
                            }
                        }

                        encounters.Add(rawWordEncounter);
                    }

                    encounters.Add(RawWordEncounter.EndOfSentenceEncounter);
                }

                result.Add(new TextData(text.Id, encounters.ToArray()));
            }
        }

        errors = xmlParseErrors.ToArray();
        return result.ToArray();
    }
    finally
    {
        scope?.Dispose();
    }
}