public void TestEvenlyDistributed()
        {
            // For every label count from 2 to DatasetSize / 2, builds a [label, count]
            // table in which the examples are spread as evenly as possible, turns it
            // into a dataset via NewData, and then, for every fold count from 2 up to
            // the per-label base size, checks each stratified split:
            //   * trainSet followed by testSet is compared with the full dataset,
            //   * every label's share in testSet and in trainSet deviates from the
            //     uniform share 1 / labelCount by at most 1 / (subset size),
            //   * all test folds accumulated together are compared with the full dataset.
            int total = DatasetSize;

            for (int labelCount = 2; labelCount <= total / 2; labelCount++)
            {
                int baseSize  = total / labelCount;
                int remainder = total % labelCount;

                var counts = new int[labelCount, 2];
                for (int row = 0; row < labelCount; row++)
                {
                    int label = row + 1;
                    counts[row, 0] = label;
                    // The first `remainder` labels take one extra example each so that
                    // the per-label counts add up to `total`.
                    counts[row, 1] = label <= remainder ? baseSize + 1 : baseSize;
                }

                double uniformShare = 1.0 / labelCount;
                LabeledDataset<int, int> dataset = NewData(counts, true);

                for (int foldCount = 2; foldCount <= baseSize; foldCount++)
                {
                    var testUnion = new LabeledDataset<int, int>();

                    // Fold numbers passed to the splitter are 1-based.
                    for (int fold = 1; fold <= foldCount; fold++)
                    {
                        LabeledDataset<int, int> trainSet, testSet;
                        dataset.SplitForStratifiedCrossValidation(foldCount, fold, out trainSet, out testSet);
                        AssertSetEquality(trainSet.Concat(testSet), dataset);
                        testUnion.AddRange(testSet);

                        // Same distribution check for both halves of the split; the
                        // test half is verified first, then the training half.
                        foreach (LabeledDataset<int, int> part in new[] { testSet, trainSet })
                        {
                            foreach (var group in part.GroupBy(example => example.Label))
                            {
                                double share = (double)group.Count() / part.Count;
                                Assert.IsTrue(Math.Abs(uniformShare - share) <= 1.0 / part.Count);
                            }
                        }
                    }

                    AssertSetEquality(testUnion, dataset);
                }
            }
        }
 public void TestFolding()
 {
     // For every dataset size from 2 to DatasetSize, builds a dataset from a
     // single [label 1, count = size] row and, for every fold count from 2 to the
     // dataset size, checks that each split's trainSet followed by testSet is
     // accepted by AssertSetEquality against the full dataset, and that the test
     // folds accumulated over all folds are accepted as well.
     for (int total = 2; total <= DatasetSize; total++)
     {
         var counts = new int[,] { { 1, total } };
         LabeledDataset<int, int> dataset = NewData(counts, true);

         for (int foldCount = 2; foldCount <= total; foldCount++)
         {
             var testUnion = new LabeledDataset<int, int>();

             // Fold numbers passed to the splitter are 1-based.
             for (int fold = 1; fold <= foldCount; fold++)
             {
                 LabeledDataset<int, int> trainSet, testSet;
                 dataset.SplitForStratifiedCrossValidation(foldCount, fold, out trainSet, out testSet);
                 AssertSetEquality(trainSet.Concat(testSet), dataset);
                 testUnion.AddRange(testSet);
             }

             AssertSetEquality(testUnion, dataset);
         }
     }
 }
        public void TestUnevenlyDistributed()
        {
            // Builds a dataset of DatasetSize examples whose four labels follow the
            // nominal ratios 0.2 / 0.4 / 0.1 / 0.3 and, for every fold count from 2 to
            // size / (number of labels), checks each stratified split:
            //   * trainSet followed by testSet is compared with the full dataset,
            //   * every label's share in testSet and in trainSet deviates from the
            //     expected ratio by at most 1 / (subset size), plus a small epsilon
            //     for floating-point noise,
            //   * all test folds accumulated together are compared with the full dataset.
            int size = DatasetSize;

            double[] labelDistrs = { 0.2, 0.4, 0.1, 0.3 };
            int      numLabels   = labelDistrs.Length;

            // One row per label: column 0 holds the label, column 1 its example count.
            var labelCounts = new int[numLabels, 2];
            int addedCount  = 0;

            for (int label = 1; label <= numLabels; label++)
            {
                var labelCount = (int)Math.Truncate(labelDistrs[label - 1] * size);
                labelCounts[label - 1, 0] = label;
                labelCounts[label - 1, 1] = labelCount;
                addedCount += labelCount;
            }

            // Truncation may leave a few examples unassigned; hand them out round-robin.
            // The wrap-around must use the number of label rows: Array.Length on the
            // two-dimensional labelCounts is rows * columns and would index past the
            // last label. Only the labels that receive an extra example get their
            // expected ratio refreshed; the others keep their nominal ratio.
            for (int i = 0; i < size - addedCount; i++)
            {
                int idx = i % numLabels;
                labelCounts[idx, 1]++;
                labelDistrs[idx] = (double)labelCounts[idx, 1] / size;
            }

            LabeledDataset<int, int> ld = NewData(labelCounts, true);

            for (int numFolds = 2; numFolds <= size / numLabels; numFolds++)
            {
                var aggTestSet = new LabeledDataset<int, int>();
                for (int i = 0; i < numFolds; i++)
                {
                    LabeledDataset<int, int> trainSet, testSet;
                    // Fold numbers passed to the splitter are 1-based.
                    ld.SplitForStratifiedCrossValidation(numFolds, i + 1, out trainSet, out testSet);
                    AssertSetEquality(trainSet.Concat(testSet), ld);
                    aggTestSet.AddRange(testSet);

                    // Same distribution check for both halves of the split; the test
                    // half is verified first, then the training half.
                    foreach (LabeledDataset<int, int> subset in new[] { testSet, trainSet })
                    {
                        foreach (IGrouping<int, LabeledExample<int, int>> group in subset.GroupBy(le => le.Label))
                        {
                            double distr = (double)group.Count() / subset.Count;
                            int    label = group.Key;

                            // Find the row describing this label; the search is bounded
                            // so an unexpected label fails the test with a clear message
                            // instead of running off the end of the array.
                            int j = 0;
                            while (j < numLabels && labelCounts[j, 0] != label)
                            {
                                j++;
                            }
                            Assert.IsTrue(j < numLabels, "Split produced a label that is not in the dataset.");

                            Assert.IsTrue(Math.Abs(labelDistrs[j] - distr) <= 1.0 / subset.Count + 0.00001);
                        }
                    }
                }
                AssertSetEquality(aggTestSet, ld);
            }
        }