Example #1
0
        /// <summary>
        /// Computes summary statistics for the data sets contained in <paramref name="group"/>.
        /// </summary>
        /// <remarks>
        /// Records of the train and test sets are bucketed by class: the id of the dictionary
        /// meaning found by looking up the record's word and then its meaning. When either lookup
        /// yields <c>null</c>, the record falls into class <c>0</c>.
        /// A data set that is missing or has no records leaves its statistics at whatever values
        /// a freshly constructed <see cref="DataSetGroupStatistics"/> has.
        /// </remarks>
        /// <param name="dictionary">Dictionary used to resolve word/meaning pairs to class ids.</param>
        /// <param name="group">Group whose train, validation, test and test-only sets are inspected.</param>
        /// <returns>A new <see cref="DataSetGroupStatistics"/> instance.</returns>
        public static DataSetGroupStatistics Compute(WordDictionary dictionary, DataSetGroup group)
        {
            var statistics    = new DataSetGroupStatistics();
            var trainSet      = group.DataSets.GetByName(DataSetName.Train);
            var validationSet = group.DataSets.GetByName(DataSetName.Validation);
            var testSet       = group.DataSets.GetByName(DataSetName.Test);
            var testOnlySet   = group.DataSets.GetByName(DataSetName.TestOnly);

            if (trainSet != null && trainSet.Data.Any())
            {
                var trainStats = trainSet.Data
                    .GroupBy(x => dictionary.GetByName(x.Word)?.Meanings.GetByName(x.Meaning)?.Id ?? 0)
                    .Select(x => new ClassStatistics
                    {
                        Class      = x.Key,
                        Encounters = x.Count()
                    })
                    .ToArray();

                statistics.TrainExamples = trainSet.Data.Count;
                statistics.TrainClasses  = trainStats.Length;
                statistics.TrainEntropy  = ComputeEntropy(trainStats, statistics.TrainExamples);
            }

            if (testSet != null && testSet.Data.Any())
            {
                var testStats = testSet.Data
                    .GroupBy(x => dictionary.GetByName(x.Word)?.Meanings.GetByName(x.Meaning)?.Id ?? 0)
                    .Select(x => new ClassStatistics
                    {
                        Class      = x.Key,
                        Encounters = x.Count()
                    })
                    .ToArray();

                statistics.TestExamples = testSet.Data.Count;
                statistics.TestClasses  = testStats.Length;
                statistics.TestEntropy  = ComputeEntropy(testStats, statistics.TestExamples);

                // Share of test records that fall into class 1.
                // NOTE(review): presumably class 1 is the baseline "majority" meaning - confirm.
                var classOneEncounters = testStats.SingleOrDefault(x => x.Class == 1)?.Encounters ?? 0;
                statistics.MajorityVote = classOneEncounters / (double)statistics.TestExamples;
            }

            if (validationSet != null && validationSet.Data.Any())
            {
                statistics.ValidationExamples = validationSet.Data.Count;
            }

            if (testOnlySet != null && testOnlySet.Data.Any())
            {
                statistics.TestOnlyExamples = testOnlySet.Data.Count;
            }

            return statistics;
        }

        /// <summary>
        /// Computes the base-2 entropy of a class distribution given per-class encounter counts.
        /// </summary>
        /// <param name="classStats">Per-class encounter counts.</param>
        /// <param name="totalExamples">Total number of examples the counts are divided by.</param>
        /// <returns>
        /// The sum of <c>-p * log2(p)</c> over all classes, where <c>p</c> is the class's
        /// encounters divided by <paramref name="totalExamples"/>.
        /// </returns>
        private static double ComputeEntropy(ClassStatistics[] classStats, double totalExamples)
        {
            // Subtracting each term from +0.0 (instead of negating the finished sum) keeps a
            // single-class distribution at +0.0 rather than -0.0, which would be formatted as "-0".
            var entropy = 0.0;

            foreach (var classStat in classStats)
            {
                var probability = classStat.Encounters / totalExamples;
                entropy -= probability * Math.Log(probability, 2);
            }

            return entropy;
        }
Example #2
0
        /// <summary>
        /// Splits every data set into named groups according to <c>info.SavingStrategy</c> and
        /// collects, per group name, one <see cref="DataSet"/> for each source data set that
        /// contributed records to that group.
        /// </summary>
        /// <remarks>
        /// Group names per strategy: an empty string for <c>SingleFile</c>; word plus dictionary id
        /// for <c>FilePerWord</c>; part of speech for <c>FilePerPos</c>; word, part of speech and
        /// dictionary id for <c>FilePerWordAndPos</c>; the text name for <c>OriginalFiles</c>.
        /// </remarks>
        /// <param name="dataSets">Source data sets; only the dictionary's values are read.</param>
        /// <param name="project">Project whose dictionary supplies word ids for word-based group names.</param>
        /// <param name="info">Generation settings; supplies the saving strategy.</param>
        /// <param name="progress">Progress handle; one step is reported per source data set.</param>
        /// <returns>An array containing all groups that were formed.</returns>
        /// <exception cref="NotSupportedException">The saving strategy is not one of the handled values.</exception>
        public IList<DataSetGroup> FormGroups(
            Dictionary<DataSetName, DataSetByText> dataSets, WsdProject project, GenerationInfo info,
            IProgressHandle progress)
        {
            var dataSetGroups = new Dictionary<string, DataSetGroup>();

            using (var scope = progress.Scope(dataSets.Count, MessageFormat.FormingGroups_DataSets))
            {
                var counter = 0;

                foreach (var dataSet in dataSets.Values)
                {
                    scope.TrySet(counter++);

                    // Deferred query over all records of all texts; only enumerated by the
                    // strategies that actually group by record.
                    var allRecords = dataSet.Texts.SelectMany(x => x.Data);

                    IEnumerable<(string groupName, IEnumerable<RawRecord> data)> dataByGroup;

                    switch (info.SavingStrategy)
                    {
                        case SavingStrategy.SingleFile:
                        {
                            // Grouping by a constant key yields a single group, and no group at
                            // all when the data set has no records.
                            dataByGroup = allRecords
                                .GroupBy(x => string.Empty)
                                .Select(x => (x.Key, (IEnumerable<RawRecord>)x));
                            break;
                        }

                        case SavingStrategy.FilePerWord:
                        {
                            // NOTE(review): GetByName(x.Word) is dereferenced without a null
                            // check - presumably every word is present in the dictionary; confirm.
                            dataByGroup = allRecords
                                .GroupBy(x => x.Word + "__" + project.Dictionary.GetByName(x.Word).Id)
                                .Select(x => (x.Key, (IEnumerable<RawRecord>)x));
                            break;
                        }

                        case SavingStrategy.FilePerPos:
                        {
                            dataByGroup = allRecords
                                .GroupBy(x => x.Pos)
                                .Select(x => (x.Key, (IEnumerable<RawRecord>)x));
                            break;
                        }

                        case SavingStrategy.FilePerWordAndPos:
                        {
                            dataByGroup = allRecords
                                .GroupBy(x => x.Word + "__" + x.Pos + "__" + project.Dictionary.GetByName(x.Word).Id)
                                .Select(x => (x.Key, (IEnumerable<RawRecord>)x));
                            break;
                        }

                        case SavingStrategy.OriginalFiles:
                        {
                            // NOTE(review): texts sharing a name within one data set map to the
                            // same group entry, so the later one presumably replaces the earlier.
                            dataByGroup = dataSet.Texts
                                .Select(x => (x.TextName, (IEnumerable<RawRecord>)x.Data));
                            break;
                        }

                        default:
                        {
                            throw new NotSupportedException(
                                $"Saving strategy {info.SavingStrategy} is not supported.");
                        }
                    }

                    foreach (var (groupName, data) in dataByGroup)
                    {
                        // Single lookup: fetch the existing group or create and register a new one.
                        if (!dataSetGroups.TryGetValue(groupName, out var dataSetGroup))
                        {
                            dataSetGroup = new DataSetGroup(groupName);
                            dataSetGroups[groupName] = dataSetGroup;
                        }

                        dataSetGroup.DataSets[dataSet.Name] = new DataSet(dataSet.Name, data.ToArray());
                    }
                }

                return dataSetGroups.Values.ToArray();
            }
        }