/// <summary>
/// Calls <c>Read()</c> repeatedly until <c>BaseReader.EndOfStream</c> is true and
/// collects every non-null result.
/// </summary>
/// <param name="progress">
/// Optional progress handle. When supplied, a scope sized to the base stream length is
/// opened and updated with the stream position after every read.
/// </param>
/// <returns>The non-null records in the order they were read.</returns>
public T[] ReadAll(IProgressHandle progress = null)
{
    using (var scope = progress?.Scope(BaseReader.BaseStream.Length))
    {
        var records = new List<T>();

        while (!BaseReader.EndOfStream)
        {
            var record = Read();
            if (record != null)
            {
                records.Add(record);
            }

            // Reported after every read, including reads that produced nothing.
            scope?.TrySet(BaseReader.BaseStream.Position);
        }

        return records.ToArray();
    }
}
/// <summary>
/// Creates an <c>InputDictionaryReader</c> for <paramref name="path"/>, reads all of its
/// records and converts them with <c>ToWordDictionary()</c>.
/// </summary>
/// <param name="path">Path handed to the reader.</param>
/// <param name="progress">Optional progress handle forwarded to the reader.</param>
/// <returns>The dictionary built from the records that were read.</returns>
public static WordDictionary ReadAll(string path, IProgressHandle progress = null)
{
    using (var dictionaryReader = new InputDictionaryReader(path))
    {
        var entries = dictionaryReader.ReadAll(progress);
        return entries.ToWordDictionary();
    }
}
/// <summary>
/// Creates a <c>SystemEmbeddingReader</c> for <paramref name="path"/>, reads all of its
/// records and converts them with <c>ToEmbeddingDictionary()</c>.
/// </summary>
/// <param name="path">Path handed to the reader.</param>
/// <param name="progress">Optional progress handle forwarded to the reader.</param>
/// <returns>The embedding dictionary built from the records that were read.</returns>
public static EmbeddingDictionary ReadAll(string path, IProgressHandle progress = null)
{
    using (var embeddingReader = new SystemEmbeddingReader(path))
    {
        var embeddings = embeddingReader.ReadAll(progress);
        return embeddings.ToEmbeddingDictionary();
    }
}
/// <summary>
/// Serializes <paramref name="data"/> to indented JSON and writes it to
/// <paramref name="path"/> with <c>File.WriteAllText</c>.
/// </summary>
/// <param name="path">Destination file path; must not be null or empty.</param>
/// <param name="data">Object to serialize; must not be null.</param>
/// <param name="progress">Optional progress handle; a scope of size 1 is held for the duration of the write.</param>
/// <param name="includeTypeNames">
/// When true the serializer uses <c>TypeNameHandling.Objects</c>, otherwise
/// <c>TypeNameHandling.None</c>.
/// </param>
/// <exception cref="ArgumentNullException">
/// <paramref name="path"/> is null or empty, or <paramref name="data"/> is null.
/// </exception>
public static void Write<T>(
    string path,
    T data,
    IProgressHandle progress = null,
    bool includeTypeNames = true)
{
    if (string.IsNullOrEmpty(path))
    {
        throw new ArgumentNullException(nameof(path));
    }

    if (data == null)
    {
        throw new ArgumentNullException(nameof(data));
    }

    using (progress?.Scope(1))
    {
        var settings = new JsonSerializerSettings
        {
            Formatting = Formatting.Indented,
            TypeNameHandling = includeTypeNames
                ? TypeNameHandling.Objects
                : TypeNameHandling.None
        };

        var serialized = JsonConvert.SerializeObject(data, settings);
        File.WriteAllText(path, serialized);
    }
}
/// <summary>
/// Creates an <c>InputPlainTextDataReader</c> for <paramref name="path"/> and returns
/// everything its <c>ReadAll</c> produces.
/// </summary>
/// <param name="path">Path handed to the reader.</param>
/// <param name="progress">Optional progress handle forwarded to the reader.</param>
/// <returns>The encounters read by the reader.</returns>
public static RawWordEncounter[] ReadAll(string path, IProgressHandle progress = null)
{
    using (var textReader = new InputPlainTextDataReader(path))
    {
        var encounters = textReader.ReadAll(progress);
        return encounters;
    }
}
/// <summary>
/// Reads every file returned by <c>Directory.GetFiles(path)</c> with <c>ReadAll</c> and
/// wraps each result in a <c>TextData</c> named after the file (without extension).
/// </summary>
/// <param name="path">Directory whose files are read.</param>
/// <param name="progress">
/// Optional progress handle. When supplied, a scope sized to the number of files is opened
/// and advanced after each file. Note that it is not forwarded to the per-file
/// <c>ReadAll</c> call.
/// </param>
/// <returns>One <c>TextData</c> per file, in the order the files were listed.</returns>
public static TextData[] ReadAllFiles(string path, IProgressHandle progress = null)
{
    var dataFiles = Directory.GetFiles(path);
    var scope = progress?.Scope(dataFiles.Length);

    try
    {
        var result = new List<TextData>();

        for (var i = 0; i < dataFiles.Length; i++)
        {
            var file = dataFiles[i];
            var textName = Path.GetFileNameWithoutExtension(file);

            result.Add(new TextData(textName, ReadAll(file)));

            // scope is null when no progress handle was passed, so the call must be
            // null-conditional; an unconditional call threw NullReferenceException
            // after the first file.
            scope?.TrySet(i + 1);
        }

        return result.ToArray();
    }
    finally
    {
        scope?.Dispose();
    }
}
/// <summary>
/// Reads the whole file at <paramref name="path"/> and deserializes its JSON content to
/// <typeparamref name="T"/>.
/// </summary>
/// <param name="path">Source file path; must not be null or empty.</param>
/// <param name="progress">Optional progress handle; a scope of size 1 is held for the duration of the read.</param>
/// <returns>The deserialized object.</returns>
/// <exception cref="ArgumentNullException"><paramref name="path"/> is null or empty.</exception>
/// <exception cref="Exception">
/// Deserialization failed; the original exception is attached as the inner exception.
/// Failures of the file read itself are not wrapped.
/// </exception>
public static T Read<T>(string path, IProgressHandle progress = null)
{
    if (string.IsNullOrEmpty(path))
    {
        throw new ArgumentNullException(nameof(path));
    }

    using (progress?.Scope(1))
    {
        var json = File.ReadAllText(path);

        try
        {
            // NOTE(review): TypeNameHandling.Objects lets the JSON payload name the types
            // to instantiate; confirm that only trusted files are loaded through here.
            var settings = new JsonSerializerSettings
            {
                Formatting = Formatting.Indented,
                TypeNameHandling = TypeNameHandling.Objects
            };

            return JsonConvert.DeserializeObject<T>(json, settings);
        }
        catch (Exception ex)
        {
            throw new Exception(ExceptionMessage.UnableToLoadProjectData, ex);
        }
    }
}
/// <summary>
/// Creates a <c>SystemDataAnalysisReader</c> for <paramref name="path"/>, reads all of its
/// records and converts them with <c>ToWordAnalysisDictionary()</c>.
/// </summary>
/// <param name="path">Path handed to the reader.</param>
/// <param name="progress">Optional progress handle forwarded to the reader.</param>
/// <returns>The analysis dictionary built from the records that were read.</returns>
public static WordAnalysisDictionary ReadAll(string path, IProgressHandle progress = null)
{
    using (var analysisReader = new SystemDataAnalysisReader(path))
    {
        var analyses = analysisReader.ReadAll(progress);
        return analyses.ToWordAnalysisDictionary();
    }
}
/// <summary>
/// Records every usable encounter of the training and test texts in this dictionary.
/// Training encounters go to <c>TrainEncounters</c>, test encounters to
/// <c>TestEncounters</c>, and both also go to <c>AllEncounters</c>.
/// </summary>
/// <remarks>
/// An encounter is ignored when its word or meaning is null/whitespace, or when its word
/// equals <c>RawWordEncounter.EmptyWord</c> or <c>RawWordEncounter.EndOfSentence</c>.
/// </remarks>
/// <param name="dictionary">Dictionary passed to <c>GetOrAdd</c> and <c>AddEncounter</c>.</param>
/// <param name="trainData">Training texts.</param>
/// <param name="testData">Test texts.</param>
/// <param name="progress">Optional progress handle; advanced by one after each text.</param>
/// <returns>This instance.</returns>
public WordAnalysisDictionary Analyze(
    WordDictionary dictionary,
    TextData[] trainData,
    TextData[] testData,
    IProgressHandle progress = null)
{
    // Computed up front (even without a progress handle) so that a null array fails
    // before any encounter is recorded.
    var totalTexts = trainData.Length + testData.Length;

    using (var scope = progress?.Scope(totalTexts))
    {
        for (var t = 0; t < trainData.Length; t++)
        {
            foreach (var encounter in trainData[t].Data)
            {
                var usable =
                    !string.IsNullOrWhiteSpace(encounter.Word) &&
                    !string.IsNullOrWhiteSpace(encounter.Meaning) &&
                    encounter.Word != RawWordEncounter.EmptyWord &&
                    encounter.Word != RawWordEncounter.EndOfSentence;

                if (usable)
                {
                    var analysis = GetOrAdd(dictionary, encounter);
                    analysis.TrainEncounters.AddEncounter(dictionary, encounter);
                    analysis.AllEncounters.AddEncounter(dictionary, encounter);
                }
            }

            scope?.TrySet(t + 1);
        }

        for (var t = 0; t < testData.Length; t++)
        {
            foreach (var encounter in testData[t].Data)
            {
                var usable =
                    !string.IsNullOrWhiteSpace(encounter.Word) &&
                    !string.IsNullOrWhiteSpace(encounter.Meaning) &&
                    encounter.Word != RawWordEncounter.EmptyWord &&
                    encounter.Word != RawWordEncounter.EndOfSentence;

                if (usable)
                {
                    var analysis = GetOrAdd(dictionary, encounter);
                    analysis.TestEncounters.AddEncounter(dictionary, encounter);
                    analysis.AllEncounters.AddEncounter(dictionary, encounter);
                }
            }

            // Progress continues counting where the training texts left off.
            scope?.TrySet(trainData.Length + t + 1);
        }
    }

    return this;
}
/// <summary>
/// Logs that this hook was called, using the <c>EventLogger</c> stored in the project's
/// plugin data for <c>LoggingPlugin</c>. The other arguments are not used.
/// </summary>
public void AfterValidationSetExtracted(
    IList<DataSetGroup> dataSetGroups,
    WsdProject project,
    GenerationInfo info,
    IProgressHandle progress)
{
    project.PluginData
        .GetData<LoggingPlugin, EventLogger>(string.Empty)
        .LogMessage("AfterValidationSetExtracted() called.");
}
/// <summary>
/// Creates an <c>InputEmbeddingReader</c> for <paramref name="path"/> and
/// <paramref name="occurrencesInData"/>, reads all of its records and converts them with
/// <c>ToEmbeddingDictionary()</c>.
/// </summary>
/// <param name="path">Path handed to the reader.</param>
/// <param name="occurrencesInData">Passed to the reader's constructor unchanged.</param>
/// <param name="progress">Optional progress handle forwarded to the reader.</param>
/// <returns>The embedding dictionary built from the records that were read.</returns>
public static EmbeddingDictionary ReadAll(
    string path,
    string[] occurrencesInData,
    IProgressHandle progress = null)
{
    using (var embeddingReader = new InputEmbeddingReader(path, occurrencesInData))
    {
        var embeddings = embeddingReader.ReadAll(progress);
        return embeddings.ToEmbeddingDictionary();
    }
}
/// <summary>
/// Logs that this hook was called, using the <c>EventLogger</c> stored in the project's
/// plugin data for <c>LoggingPlugin</c>. The other arguments are not used.
/// </summary>
public void AfterDictionaryReordered(
    WordDictionary reorderedDictionary,
    WsdProject project,
    GenerationInfo info,
    IProgressHandle progress)
{
    project.PluginData
        .GetData<LoggingPlugin, EventLogger>(string.Empty)
        .LogMessage("AfterDictionaryReordered() called.");
}
/// <summary>
/// Logs that this hook was called, using the <c>EventLogger</c> stored in the project's
/// plugin data for <c>LoggingPlugin</c>. The other arguments are not used.
/// </summary>
public void BeforeDataWritten(
    IList<DataSetGroup> dataSetGroups,
    WsdProject project,
    GenerationInfo info,
    IProgressHandle progress)
{
    project.PluginData
        .GetData<LoggingPlugin, EventLogger>(string.Empty)
        .LogMessage("BeforeDataWritten() called.");
}
/// <summary>
/// Creates an <c>XmlParseErrorWriter</c> for <paramref name="path"/> and writes all
/// <paramref name="errors"/> through it.
/// </summary>
/// <param name="path">Path handed to the writer.</param>
/// <param name="errors">Errors to write.</param>
/// <param name="progress">Optional progress handle forwarded to the writer.</param>
public static void WriteAll(
    string path,
    XmlParseError[] errors,
    IProgressHandle progress = null)
{
    using (var errorWriter = new XmlParseErrorWriter(path))
    {
        errorWriter.WriteAll(errors, progress);
    }
}
/// <summary>
/// Creates a <c>SystemEmbeddingWriter</c> for <paramref name="path"/> and writes every
/// value of <paramref name="embeddings"/> through it.
/// </summary>
/// <param name="path">Path handed to the writer.</param>
/// <param name="embeddings">Dictionary whose values are written.</param>
/// <param name="progress">Optional progress handle forwarded to the writer.</param>
public static void WriteAll(
    string path,
    EmbeddingDictionary embeddings,
    IProgressHandle progress = null)
{
    using (var embeddingWriter = new SystemEmbeddingWriter(path))
    {
        // Materialized only after the writer exists, as before.
        var values = embeddings.Values.ToArray();
        embeddingWriter.WriteAll(values, progress);
    }
}
/// <summary>
/// Logs that this hook was called, using the <c>EventLogger</c> stored in the project's
/// plugin data for <c>LoggingPlugin</c>. The other arguments are not used.
/// </summary>
public void AfterRecordsGenerated(
    Dictionary<DataSetName, DataSetByText> dataSets,
    WsdProject project,
    GenerationInfo info,
    IProgressHandle progress)
{
    project.PluginData
        .GetData<LoggingPlugin, EventLogger>(string.Empty)
        .LogMessage("AfterRecordsGenerated() called.");
}
/// <summary>
/// Creates a <c>SystemDictionaryWriter</c> for <paramref name="path"/> and writes every
/// value of <paramref name="dictionary"/> through it.
/// </summary>
/// <param name="path">Path handed to the writer.</param>
/// <param name="dictionary">Dictionary whose values are written.</param>
/// <param name="progress">Optional progress handle forwarded to the writer.</param>
public static void WriteAll(
    string path,
    WordDictionary dictionary,
    IProgressHandle progress = null)
{
    using (var dictionaryWriter = new SystemDictionaryWriter(path))
    {
        // Materialized only after the writer exists, as before.
        var entries = dictionary.Values.ToArray();
        dictionaryWriter.WriteAll(entries, progress);
    }
}
/// <summary>
/// Splits the training set of every group into a smaller training set and a validation
/// set. The examples are grouped by (word, meaning), and the first
/// <c>ceil(count * ValidationSetPercentage / 100)</c> examples of each group go to the
/// validation set.
/// </summary>
/// <remarks>
/// Groups without a training set are skipped. After the split, a data set that ended up
/// empty is removed from the group instead of being stored.
/// </remarks>
/// <param name="dataSetGroups">Groups whose <c>DataSets</c> are modified in place.</param>
/// <param name="info">Supplies <c>ValidationSetPercentage</c>.</param>
/// <param name="progress">Progress handle; set to the index of the group being processed.</param>
public void Extract(
    IList<DataSetGroup> dataSetGroups,
    GenerationInfo info,
    IProgressHandle progress)
{
    using (var scope = progress.Scope(
        dataSetGroups.Count, MessageFormat.ExtractingValidationSet_Groups))
    {
        var counter = 0;

        foreach (var dataSetGroup in dataSetGroups)
        {
            scope.TrySet(counter++);

            var oldTrainSet = dataSetGroup.DataSets.GetByName(DataSetName.Train);
            if (oldTrainSet == null)
            {
                continue;
            }

            var validationSplit = oldTrainSet.Data
                // Composite key: concatenating Word and Meaning would merge distinct
                // pairs whose concatenations coincide (e.g. "ab"+"c" and "a"+"bc").
                .GroupBy(x => new { x.Word, x.Meaning })
                .Select(x =>
                {
                    var groupCount = x.Count();
                    var validationGroupCount = (int)Math.Ceiling(
                        groupCount * 0.01 * info.ValidationSetPercentage);

                    return new
                    {
                        TrainGroup = x.Skip(validationGroupCount).ToArray(),
                        ValidationGroup = x.Take(validationGroupCount).ToArray()
                    };
                })
                .ToArray();

            var trainExamples = validationSplit.SelectMany(x => x.TrainGroup).ToArray();
            var validationExamples = validationSplit.SelectMany(x => x.ValidationGroup).ToArray();

            if (trainExamples.Length > 0)
            {
                dataSetGroup.DataSets[DataSetName.Train] =
                    new DataSet(DataSetName.Train, trainExamples);
            }
            else
            {
                dataSetGroup.DataSets.Remove(DataSetName.Train);
            }

            if (validationExamples.Length > 0)
            {
                dataSetGroup.DataSets[DataSetName.Validation] =
                    new DataSet(DataSetName.Validation, validationExamples);
            }
            else
            {
                dataSetGroup.DataSets.Remove(DataSetName.Validation);
            }
        }
    }
}
/// <summary>
/// Writes the "generation started" banner, an empty line and the hook-called message to
/// the <c>EventLogger</c> stored in the project's plugin data for <c>LoggingPlugin</c>.
/// The other arguments are not used.
/// </summary>
public void BeforeGenerationStarted(WsdProject project, GenerationInfo info, IProgressHandle progress)
{
    var eventLog = project.PluginData.GetData<LoggingPlugin, EventLogger>(string.Empty);

    var lines = new[]
    {
        "Generation started.",
        "",
        "BeforeGenerationStarted() called."
    };

    foreach (var line in lines)
    {
        eventLog.LogMessage(line);
    }
}
/// <summary>
/// Creates an <c>InputSynsetMappingReader</c> for <paramref name="path"/>, reads all of
/// its records and builds a <c>SynsetDictionary</c> from them, keeping only the first
/// record for each <c>Key</c>.
/// </summary>
/// <param name="path">Path handed to the reader.</param>
/// <param name="progress">Optional progress handle forwarded to the reader.</param>
/// <returns>The dictionary built from the de-duplicated records.</returns>
public static SynsetDictionary ReadAll(string path, IProgressHandle progress = null)
{
    using (var mappingReader = new InputSynsetMappingReader(path))
    {
        var rawMappings = mappingReader.ReadAll(progress);

        var uniqueMappings = rawMappings
            .Select(x => x.GetValueOrDefault())
            .DistinctBy(x => x.Key);

        return new SynsetDictionary(uniqueMappings);
    }
}
/// <summary>
/// Stores the handle and maximum, optionally installs a message format on the handle, and
/// restarts the handle with <paramref name="max"/>.
/// </summary>
/// <param name="progress">Progress handle driven by this scope. It is dereferenced without a null check.</param>
/// <param name="max">Maximum value passed to <c>Restart</c>.</param>
/// <param name="messageFormat">
/// Optional message format; applied through <c>SetMessageFormat</c> only when not null.
/// </param>
public ProgressHandleScope(
    IProgressHandle progress,
    long max,
    Func<long, long, string> messageFormat = null)
{
    _max = max;
    _progress = progress;

    if (messageFormat != null)
    {
        progress.SetMessageFormat(messageFormat);
    }

    progress.Restart(max);
}
/// <summary>
/// Writes one CSV row of <c>DataSetGroupStatistics</c> per data set group to the
/// data-set-statistics file in <c>info.DestinationFolder</c>.
/// </summary>
/// <remarks>
/// Does nothing when the plugin is disabled in its <c>StatisticsConfig</c>. A group is
/// left out of the file when the configuration requires a training or test set and the
/// group has no such examples, or when its training plus validation examples are below
/// <c>MinimumTrainingValidationExamples</c>.
/// </remarks>
/// <param name="dataSetGroups">Groups to compute statistics for.</param>
/// <param name="project">Supplies the plugin configuration and word dictionary.</param>
/// <param name="info">Supplies the destination folder.</param>
/// <param name="progress">Progress handle; set to the index of the group being processed.</param>
/// <exception cref="OperationCanceledException">
/// Thrown after the file is written when
/// <c>AbortGenerationAfterStatisticsAreComputed</c> is set.
/// </exception>
public void BeforeDataWritten(
    IList<DataSetGroup> dataSetGroups,
    WsdProject project,
    GenerationInfo info,
    IProgressHandle progress)
{
    var config = project.PluginData.GetData<StatisticsPlugin, StatisticsConfig>(string.Empty);
    if (!config.PluginEnabled)
    {
        return;
    }

    var dictionary = project.PluginData.GetData<StatisticsPlugin, WordDictionary>(string.Empty);
    var outputFile = Path.Combine(
        info.DestinationFolder,
        FileName.DataSetStatistics + FileExtension.Csv);

    using (var streamWriter = new StreamWriter(outputFile))
    using (var csv = new CsvWriter(streamWriter))
    using (var scope = progress.Scope(dataSetGroups.Count, MessageFormat.ComputingStatistics_Groups))
    {
        csv.WriteLine(
            "Group",
            "Train examples",
            "Validation examples",
            "Test examples",
            "Test-only examples",
            "Majority vote",
            "Train classes",
            "Test classes",
            "Train entropy",
            "Test entropy");

        var index = 0;
        foreach (var dataSetGroup in dataSetGroups)
        {
            scope.TrySet(index++);

            var statistics = DataSetGroupStatistics.Compute(dictionary, dataSetGroup);

            var skipGroup =
                (config.RequireTrainingSet && statistics.TrainExamples == 0) ||
                (config.RequireTestSet && statistics.TestExamples == 0) ||
                (statistics.TrainExamples + statistics.ValidationExamples
                    < config.MinimumTrainingValidationExamples);

            if (!skipGroup)
            {
                csv.WriteLine(
                    dataSetGroup.GroupName,
                    statistics.TrainExamples,
                    statistics.ValidationExamples,
                    statistics.TestExamples,
                    statistics.TestOnlyExamples,
                    statistics.MajorityVote,
                    statistics.TrainClasses,
                    statistics.TestClasses,
                    statistics.TrainEntropy,
                    statistics.TestEntropy);
            }
        }
    }

    // Raised only after the writers are disposed, so the file is complete on disk.
    if (config.AbortGenerationAfterStatisticsAreComputed)
    {
        throw new OperationCanceledException();
    }
}
/// <summary>
/// Splits the test set of every group into examples whose word has at least one training
/// encounter in <c>project.DataAnalysis</c> (kept as the test set) and examples whose
/// word has none (moved to the test-only set).
/// </summary>
/// <remarks>
/// Groups without a test set are skipped. After the split, a data set that ended up empty
/// is removed from the group instead of being stored.
/// </remarks>
/// <param name="dataSetGroups">Groups whose <c>DataSets</c> are modified in place.</param>
/// <param name="project">Supplies <c>DataAnalysis</c>, indexed by word.</param>
/// <param name="info">Not used.</param>
/// <param name="progress">Progress handle; set to the index of the group being processed.</param>
public void Extract(
    IList<DataSetGroup> dataSetGroups,
    WsdProject project,
    GenerationInfo info,
    IProgressHandle progress)
{
    using (var scope = progress.Scope(dataSetGroups.Count, MessageFormat.ExtractingTestOnlySet_Groups))
    {
        var index = 0;
        foreach (var dataSetGroup in dataSetGroups)
        {
            scope.TrySet(index);
            index++;

            var currentTestSet = dataSetGroup.DataSets.GetByName(DataSetName.Test);
            if (currentTestSet != null)
            {
                var seenInTraining = currentTestSet.Data
                    .Where(x => project.DataAnalysis[x.Word].TrainEncounters.Any())
                    .ToArray();

                var unseenInTraining = currentTestSet.Data
                    .Where(x => !project.DataAnalysis[x.Word].TrainEncounters.Any())
                    .ToArray();

                if (seenInTraining.Length == 0)
                {
                    dataSetGroup.DataSets.Remove(DataSetName.Test);
                }
                else
                {
                    dataSetGroup.DataSets[DataSetName.Test] =
                        new DataSet(DataSetName.Test, seenInTraining);
                }

                if (unseenInTraining.Length == 0)
                {
                    dataSetGroup.DataSets.Remove(DataSetName.TestOnly);
                }
                else
                {
                    dataSetGroup.DataSets[DataSetName.TestOnly] =
                        new DataSet(DataSetName.TestOnly, unseenInTraining);
                }
            }
        }
    }
}
/// <summary>
/// Logs the "generation completed" messages followed by the four usage counters read from
/// the <c>UsageStatistics</c> stored in the project's plugin data for
/// <c>LoggingPlugin</c>. The <paramref name="info"/> and <paramref name="progress"/>
/// arguments are not used.
/// </summary>
public void AfterGenerationCompleted(WsdProject project, GenerationInfo info, IProgressHandle progress)
{
    var pluginData = project.PluginData;
    var eventLog = pluginData.GetData<LoggingPlugin, EventLogger>(string.Empty);
    var usage = pluginData.GetData<LoggingPlugin, UsageStatistics>(string.Empty);

    eventLog.LogMessage("AfterGenerationCompleted() called.");
    eventLog.LogMessage("");
    eventLog.LogMessage("Generation completed.");
    eventLog.LogMessage("");

    // Each counter is read at the moment its line is logged.
    eventLog.LogMessage("Usage statistics:");
    eventLog.LogMessage($" Colocation source - {usage.ColocationSourceCounter}");
    eventLog.LogMessage($" CosThetaUnitary function - {usage.CosThetaUnitaryCounter}");
    eventLog.LogMessage($" String concat - {usage.StringConcatCounter}");
    eventLog.LogMessage($" Word element - {usage.WordElementCounter}");
}
/// <summary>
/// Loads every <c>*.dll</c> found under <c>PluginDirectory</c> (recursively) and creates
/// a <c>PluginInfo</c> for each type in it that is assignable to <c>IPlugin</c>.
/// </summary>
/// <remarks>
/// When a plugin type cannot be instantiated or its <c>GetComponents()</c> call throws, a
/// <c>PluginInfo</c> with a null instance and null components is added instead.
/// </remarks>
/// <param name="progress">Progress handle; advanced by one after each assembly file.</param>
/// <returns>All plugin descriptors that were collected.</returns>
public static PluginInfo[] LoadPlugins(IProgressHandle progress)
{
    var loaded = new List<PluginInfo>();
    var assemblyFiles = Directory.GetFiles(PluginDirectory, "*.dll", SearchOption.AllDirectories);

    using (var scope = progress.Scope(assemblyFiles.Length, MessageFormat.LoadingPlugins_Files))
    {
        for (var fileIndex = 0; fileIndex < assemblyFiles.Length; fileIndex++)
        {
            var assemblyFile = assemblyFiles[fileIndex];

            try
            {
                var assembly = Assembly.LoadFile(assemblyFile);
                var candidates = assembly.GetTypes()
                    .Where(x => typeof(IPlugin).IsAssignableFrom(x))
                    .ToArray();

                foreach (var candidate in candidates)
                {
                    try
                    {
                        var instance = (IPlugin)Activator.CreateInstance(candidate);
                        var components = instance.GetComponents() ?? new IPluginComponent[0];

                        loaded.Add(new PluginInfo(
                            true, assemblyFile, assembly.GetName(), instance, components));
                    }
                    catch
                    {
                        // The type is still reported, but without an instance or components.
                        loaded.Add(new PluginInfo(
                            true, assemblyFile, assembly.GetName(), null, null));
                    }
                }
            }
            catch
            {
                // NOTE(review): a failure to load the assembly or enumerate its types is
                // ignored without any trace; the file simply contributes no entries.
            }

            scope.TrySet(fileIndex + 1);
        }
    }

    return loaded.ToArray();
}
/// <summary>
/// Calls <c>Shuffle()</c> on the data of every data set in every group.
/// </summary>
/// <param name="dataSetGroups">Groups whose data sets are shuffled.</param>
/// <param name="progress">Progress handle; set to the index of the group being processed.</param>
public void ShuffleData(IList<DataSetGroup> dataSetGroups, IProgressHandle progress)
{
    // NOTE(review): the scope reuses the ExtractingValidationSet_Groups message format;
    // confirm this is the intended message for shuffling.
    using (var scope = progress.Scope(dataSetGroups.Count, MessageFormat.ExtractingValidationSet_Groups))
    {
        var index = 0;
        foreach (var dataSetGroup in dataSetGroups)
        {
            scope.TrySet(index);
            index++;

            foreach (var dataSet in dataSetGroup.DataSets.Values)
            {
                dataSet.Data.Shuffle();
            }
        }
    }
}
/// <summary>
/// Passes every item of <paramref name="list"/> to <c>Write</c>, in order.
/// </summary>
/// <param name="list">Items to write.</param>
/// <param name="progress">
/// Optional progress handle. When supplied, a scope sized to the list count is opened and
/// advanced after each item.
/// </param>
public void WriteAll(IList<T> list, IProgressHandle progress = null)
{
    using (var scope = progress?.Scope(list.Count))
    {
        var written = 0;
        foreach (var item in list)
        {
            Write(item);
            scope?.TrySet(++written);
        }
    }
}
/// <summary>
/// Generates records for every text in <paramref name="data"/> by calling the
/// <c>GenerateRecords(data, project, info)</c> overload, and pairs each result with the
/// text's name.
/// </summary>
/// <param name="data">Texts to generate records for.</param>
/// <param name="project">Forwarded to the per-text overload.</param>
/// <param name="info">Forwarded to the per-text overload.</param>
/// <param name="progress">Progress handle; set to the index of the text being processed.</param>
/// <returns>One <c>GeneratedTextData</c> per input text, in input order.</returns>
public IList<GeneratedTextData> GenerateRecords(
    IList<TextData> data,
    WsdProject project,
    GenerationInfo info,
    IProgressHandle progress)
{
    var generated = new List<GeneratedTextData>();

    using (var scope = progress.Scope(data.Count, MessageFormat.GeneratingRecords_Texts))
    {
        var index = 0;
        foreach (var text in data)
        {
            scope.TrySet(index++);

            var textName = text.TextName;
            var records = GenerateRecords(text.Data, project, info);
            generated.Add(new GeneratedTextData(textName, records));
        }
    }

    return generated;
}
/// <summary>
/// Fills this instance with word and meaning counts derived from
/// <paramref name="dictionary"/>: total words, words with exactly one meaning, words with
/// more than one meaning, the maximum and average number of meanings per word, and the
/// number of distinct meaning keys.
/// </summary>
/// <param name="dictionary">Dictionary to summarize.</param>
/// <param name="progress">Optional progress handle; a scope of size 1 is held while computing.</param>
/// <returns>This instance.</returns>
public DictionaryStatistics Compute(WordDictionary dictionary, IProgressHandle progress = null)
{
    using (progress?.Scope(1))
    {
        WordCount = dictionary.Count;

        var words = dictionary.Values;

        MonosemanticWordCount = words.Count(x => x.Meanings.Count == 1);
        PolysemanticWordCount = words.Count(x => x.Meanings.Count > 1);
        MaxMeaningsPerWord = words.MaxOrDefault(x => x.Meanings.Count);
        AverageMeaningsPerWord = words.AverageOrDefault(x => x.Meanings.Count);
        UniqueMeaningsCount = words
            .SelectMany(x => x.Meanings.Keys)
            .Distinct()
            .Count();
    }

    return this;
}
/// <summary>
/// Fills this instance with example counts, correct-guess counts and baseline ratios
/// derived from <paramref name="dataAnalysis"/> and <paramref name="dictionary"/>.
/// </summary>
/// <remarks>
/// A word is monosemantic when the dictionary lists exactly one meaning for it and
/// polysemantic when it lists more than one. Words missing from the dictionary count as
/// having zero meanings and fall into neither set. For polysemantic words the test
/// encounters are classified as:
/// <list type="bullet">
/// <item>common: the word has at least one training encounter;</item>
/// <item>learnable: common, and the encounter's key also occurs in the word's training encounters;</item>
/// <item>non-learnable: common, but the key does not occur in the training encounters;</item>
/// <item>test-only: the word has no training encounters.</item>
/// </list>
/// The baselines are floating-point divisions with no guard against a zero denominator.
/// The part-of-speech dictionary guesses use <c>First(predicate)</c>, which throws when
/// the dictionary has no meaning with the encounter's part of speech.
/// </remarks>
/// <param name="dictionary">Word dictionary used for meaning counts and dictionary-based guesses.</param>
/// <param name="dataAnalysis">Per-word training and test encounters.</param>
/// <param name="progress">Optional progress handle; a scope of size 1 is held while computing.</param>
/// <returns>This instance.</returns>
public DataStatistics Compute(
    WordDictionary dictionary,
    WordAnalysisDictionary dataAnalysis,
    IProgressHandle progress = null)
{
    using (progress?.Scope(1))
    {
        // The two word sets are materialized once and reused by every query below.
        var monosemanticWords = dataAnalysis.Values
            .Where(x => (dictionary.GetByName(x.Word)?.Meanings.Count ?? 0) == 1)
            .ToArray();
        var polysemanticWords = dataAnalysis.Values
            .Where(x => (dictionary.GetByName(x.Word)?.Meanings.Count ?? 0) > 1)
            .ToArray();

        MonosemanticTrainExamples = monosemanticWords
            .Sum(x => x.TrainEncounters.Values.Sum(y => y.Encounters));
        PolysemanticTrainExamples = polysemanticWords
            .Sum(x => x.TrainEncounters.Values.Sum(y => y.Encounters));
        MonosemanticTestExamples = monosemanticWords
            .Sum(x => x.TestEncounters.Values.Sum(y => y.Encounters));
        PolysemanticTestExamples = polysemanticWords
            .Sum(x => x.TestEncounters.Values.Sum(y => y.Encounters));

        var trainedWords = polysemanticWords
            .Where(x => x.TrainEncounters.Count != 0)
            .ToArray();
        var untrainedWords = polysemanticWords
            .Where(x => x.TrainEncounters.Count == 0)
            .ToArray();

        var commonAnalysis = trainedWords
            .SelectMany(x => x.TestEncounters
                .Select(y => new
                {
                    x.Word,
                    y.Value.Meaning,
                    y.Value.PartOfSpeech,
                    y.Value.Encounters
                }))
            .ToArray();

        var learnableAnalysis = trainedWords
            .SelectMany(x => x.TestEncounters
                .Where(y => x.TrainEncounters.ContainsKey(y.Key))
                .Select(y => new
                {
                    x.Word,
                    y.Value.Meaning,
                    y.Value.PartOfSpeech,
                    y.Value.Encounters
                }))
            .ToArray();

        var nonLearnableAnalysis = trainedWords
            .SelectMany(x => x.TestEncounters
                .Where(y => !x.TrainEncounters.ContainsKey(y.Key))
                .Select(y => new
                {
                    x.Word,
                    y.Value.Meaning,
                    y.Value.PartOfSpeech,
                    y.Value.Encounters
                }))
            .ToArray();

        var testOnlyAnalysis = untrainedWords
            .SelectMany(x => x.TestEncounters
                .Select(y => new
                {
                    x.Word,
                    y.Value.Meaning,
                    y.Value.PartOfSpeech,
                    y.Value.Encounters
                }))
            .ToArray();

        CommonTestExamples = commonAnalysis.Sum(x => x.Encounters);
        LearnableTestExamples = learnableAnalysis.Sum(x => x.Encounters);
        NonLearnableTestExamples = nonLearnableAnalysis.Sum(x => x.Encounters);
        TestOnlyExamples = testOnlyAnalysis.Sum(x => x.Encounters);

        // Dictionary-based guess: the dictionary meaning with the most encounters.
        CorrectDictionaryBasedLearnableGuesses = learnableAnalysis
            .Where(x => dictionary
                .GetByName(x.Word)
                .Meanings.Values
                .OrderByDescending(y => y.Encounters)
                .First()
                .Meaning == x.Meaning)
            .Sum(x => x.Encounters);

        CorrectDictionaryBasedNonLearnableGuesses = nonLearnableAnalysis
            .Where(x => dictionary
                .GetByName(x.Word)
                .Meanings.Values
                .OrderByDescending(y => y.Encounters)
                .First()
                .Meaning == x.Meaning)
            .Sum(x => x.Encounters);

        CorrectDictionaryBasedTestOnlyGuesses = testOnlyAnalysis
            .Where(x => dictionary
                .GetByName(x.Word)
                .Meanings.Values
                .OrderByDescending(y => y.Encounters)
                .First()
                .Meaning == x.Meaning)
            .Sum(x => x.Encounters);

        // Same guess, restricted to meanings with the encounter's part of speech.
        CorrectPosDictionaryBasedLearnableGuesses = learnableAnalysis
            .Where(x => dictionary
                .GetByName(x.Word)
                .Meanings.Values
                .OrderByDescending(y => y.Encounters)
                .First(y => y.PartOfSpeech == x.PartOfSpeech)
                .Meaning == x.Meaning)
            .Sum(x => x.Encounters);

        CorrectPosDictionaryBasedNonLearnableGuesses = nonLearnableAnalysis
            .Where(x => dictionary
                .GetByName(x.Word)
                .Meanings.Values
                .OrderByDescending(y => y.Encounters)
                .First(y => y.PartOfSpeech == x.PartOfSpeech)
                .Meaning == x.Meaning)
            .Sum(x => x.Encounters);

        CorrectPosDictionaryBasedTestOnlyGuesses = testOnlyAnalysis
            .Where(x => dictionary
                .GetByName(x.Word)
                .Meanings.Values
                .OrderByDescending(y => y.Encounters)
                .First(y => y.PartOfSpeech == x.PartOfSpeech)
                .Meaning == x.Meaning)
            .Sum(x => x.Encounters);

        // Training-based guess: the training encounter with the most occurrences.
        CorrectTrainingBasedLearnableGuesses = learnableAnalysis
            .Where(x => dataAnalysis[x.Word]
                .TrainEncounters
                .Values
                .OrderByDescending(y => y.Encounters)
                .FirstOrDefault()
                .Meaning == x.Meaning)
            .Sum(x => x.Encounters);

        CorrectPosTrainingBasedLearnableGuesses = learnableAnalysis
            .Where(x => dataAnalysis[x.Word]
                .TrainEncounters
                .Values
                .Where(y => y.PartOfSpeech == x.PartOfSpeech)
                .OrderByDescending(y => y.Encounters)
                .FirstOrDefault()
                .Meaning == x.Meaning)
            .Sum(x => x.Encounters);

        // Denominators shared by the baselines below.
        var commonTotal = (double)CommonTestExamples;
        var allTestTotal = (double)(PolysemanticTestExamples + MonosemanticTestExamples);

        FirstSenseDictionaryBaseline =
            (CorrectDictionaryBasedLearnableGuesses +
             CorrectDictionaryBasedNonLearnableGuesses) / commonTotal;

        All_FirstSenseDictionaryBaseline =
            (CorrectDictionaryBasedLearnableGuesses +
             CorrectDictionaryBasedNonLearnableGuesses +
             CorrectDictionaryBasedTestOnlyGuesses +
             MonosemanticTestExamples) / allTestTotal;

        FirstSensePosDictionaryBaseline =
            (CorrectPosDictionaryBasedLearnableGuesses +
             CorrectPosDictionaryBasedNonLearnableGuesses) / commonTotal;

        All_FirstSensePosDictionaryBaseline =
            (CorrectPosDictionaryBasedLearnableGuesses +
             CorrectPosDictionaryBasedNonLearnableGuesses +
             CorrectPosDictionaryBasedTestOnlyGuesses +
             MonosemanticTestExamples) / allTestTotal;

        FirstSenseBaseline = CorrectTrainingBasedLearnableGuesses / commonTotal;

        All_FirstSenseBaseline =
            (CorrectTrainingBasedLearnableGuesses + MonosemanticTestExamples) / allTestTotal;

        FirstSensePosBaseline = CorrectPosTrainingBasedLearnableGuesses / commonTotal;

        All_FirstSensePosBaseline =
            (CorrectPosTrainingBasedLearnableGuesses + MonosemanticTestExamples) / allTestTotal;

        BestCaseBaseline =
            (CorrectDictionaryBasedTestOnlyGuesses + LearnableTestExamples) /
            (double)PolysemanticTestExamples;

        All_BestCaseBaseline =
            (CorrectDictionaryBasedTestOnlyGuesses +
             LearnableTestExamples +
             MonosemanticTestExamples) / allTestTotal;
    }

    return this;
}