/// <summary>
/// Collects summary statistics for a data set group: example counts for the train,
/// validation, test and test-only sets, plus class counts and entropy for the train
/// and test sets.
/// </summary>
/// <param name="dictionary">
/// Dictionary used to resolve each record's word and meaning to a class id.
/// </param>
/// <param name="group">Group whose data sets are summarised.</param>
/// <returns>
/// A new <see cref="DataSetGroupStatistics"/>. Members that belong to a data set which is
/// missing or empty are not assigned and keep whatever value the constructor gave them.
/// </returns>
public static DataSetGroupStatistics Compute(WordDictionary dictionary, DataSetGroup group)
{
    // Base-2 entropy of the class distribution: -sum(p * log2(p)), p = encounters / total.
    double Entropy(ClassStatistics[] classes, double total)
    {
        return -classes.Sum(c => c.Encounters / total * Math.Log(c.Encounters / total, 2));
    }

    var result = new DataSetGroupStatistics();

    var train = group.DataSets.GetByName(DataSetName.Train);
    var validation = group.DataSets.GetByName(DataSetName.Validation);
    var test = group.DataSets.GetByName(DataSetName.Test);
    var testOnly = group.DataSets.GetByName(DataSetName.TestOnly);

    if (train?.Data.Any() == true)
    {
        // A record whose word or meaning lookup yields null is counted under class id 0.
        var trainClasses =
            (from record in train.Data
             group record by dictionary.GetByName(record.Word)?.Meanings.GetByName(record.Meaning)?.Id ?? 0
             into sameClass
             select new ClassStatistics { Class = sameClass.Key, Encounters = sameClass.Count() })
            .ToArray();

        result.TrainExamples = train.Data.Count;
        result.TrainClasses = trainClasses.Length;
        result.TrainEntropy = Entropy(trainClasses, (double)result.TrainExamples);
    }

    if (test?.Data.Any() == true)
    {
        // Same class-id mapping as for the train set (null lookups fall into class 0).
        var testClasses =
            (from record in test.Data
             group record by dictionary.GetByName(record.Word)?.Meanings.GetByName(record.Meaning)?.Id ?? 0
             into sameClass
             select new ClassStatistics { Class = sameClass.Key, Encounters = sameClass.Count() })
            .ToArray();

        result.TestExamples = test.Data.Count;
        result.TestClasses = testClasses.Length;
        result.TestEntropy = Entropy(testClasses, (double)result.TestExamples);

        // Share of test examples whose class id is 1 (zero when no such class occurs).
        // NOTE(review): presumably id 1 marks the most frequent meaning - confirm.
        var firstClass = testClasses.SingleOrDefault(c => c.Class == 1);
        var firstClassEncounters = firstClass?.Encounters ?? 0;
        result.MajorityVote = firstClassEncounters / (double)result.TestExamples;
    }

    if (validation?.Data.Any() == true)
    {
        result.ValidationExamples = validation.Data.Count;
    }

    if (testOnly?.Data.Any() == true)
    {
        result.TestOnlyExamples = testOnly.Data.Count;
    }

    return result;
}
/// <summary>
/// Splits every data set into named groups according to <c>info.SavingStrategy</c> and
/// collects, per group name, one <see cref="DataSet"/> for each source data set.
/// </summary>
/// <param name="dataSets">Source data sets; only the dictionary values are read.</param>
/// <param name="project">
/// Project whose dictionary supplies the word id that is appended to group names for the
/// per-word strategies.
/// </param>
/// <param name="info">Generation settings; only <c>SavingStrategy</c> is read here.</param>
/// <param name="progress">
/// Progress handle; a scope sized to the number of data sets is opened and updated with
/// the index of the data set about to be processed.
/// </param>
/// <returns>The groups that received at least one data set, as an array.</returns>
/// <exception cref="NotSupportedException">
/// <c>info.SavingStrategy</c> is not one of the strategies handled below.
/// </exception>
public IList<DataSetGroup> FormGroups(
    Dictionary<DataSetName, DataSetByText> dataSets,
    WsdProject project,
    GenerationInfo info,
    IProgressHandle progress)
{
    var dataSetGroups = new Dictionary<string, DataSetGroup>();

    using (var scope = progress.Scope(dataSets.Count, MessageFormat.FormingGroups_DataSets))
    {
        var counter = 0;

        foreach (var dataSet in dataSets.Values)
        {
            scope.TrySet(counter++);

            // Deferred query over all records of all texts; it is only enumerated by the
            // strategies that regroup records (OriginalFiles keeps the per-text split).
            var records = dataSet.Texts.SelectMany(x => x.Data);

            IEnumerable<(string groupName, IEnumerable<RawRecord> data)> dataByGroup;

            switch (info.SavingStrategy)
            {
                case SavingStrategy.SingleFile:
                {
                    // A constant key puts every record into one group named "".
                    dataByGroup = records
                        .GroupBy(x => string.Empty)
                        .Select(x => (x.Key, (IEnumerable<RawRecord>)x));
                    break;
                }
                case SavingStrategy.FilePerWord:
                {
                    // NOTE(review): GetByName's result is dereferenced without a null check;
                    // presumably every record's word exists in the dictionary - confirm.
                    dataByGroup = records
                        .GroupBy(x => x.Word + "__" + project.Dictionary.GetByName(x.Word).Id)
                        .Select(x => (x.Key, (IEnumerable<RawRecord>)x));
                    break;
                }
                case SavingStrategy.FilePerPos:
                {
                    dataByGroup = records
                        .GroupBy(x => x.Pos)
                        .Select(x => (x.Key, (IEnumerable<RawRecord>)x));
                    break;
                }
                case SavingStrategy.FilePerWordAndPos:
                {
                    // Same unguarded dictionary lookup as in FilePerWord.
                    dataByGroup = records
                        .GroupBy(x => x.Word + "__" + x.Pos + "__" + project.Dictionary.GetByName(x.Word).Id)
                        .Select(x => (x.Key, (IEnumerable<RawRecord>)x));
                    break;
                }
                case SavingStrategy.OriginalFiles:
                {
                    // One group per text, named after the text.
                    // NOTE(review): if two texts share a TextName, the later one presumably
                    // replaces the earlier one in the assignment below - confirm names are unique.
                    dataByGroup = dataSet.Texts
                        .Select(x => (x.TextName, (IEnumerable<RawRecord>)x.Data));
                    break;
                }
                default:
                {
                    throw new NotSupportedException(
                        $"Saving strategy {info.SavingStrategy} is not supported.");
                }
            }

            foreach (var (groupName, data) in dataByGroup)
            {
                // Single lookup: fetch the group if it exists, otherwise create and register it.
                if (!dataSetGroups.TryGetValue(groupName, out var dataSetGroup))
                {
                    dataSetGroup = new DataSetGroup(groupName);
                    dataSetGroups[groupName] = dataSetGroup;
                }

                dataSetGroup.DataSets[dataSet.Name] = new DataSet(dataSet.Name, data.ToArray());
            }
        }

        return dataSetGroups.Values.ToArray();
    }
}