/// <summary> /// Splits this bundle to a collection of smaller folds (sub-bundles) suitable for the cross-validation. /// </summary> /// <param name="foldDataRatio">The requested ratio of the samples constituting the single fold (sub-bundle).</param> /// <param name="binBorder">When the binBorder is specified then all the output features are considered as binary features within the one-takes-all group and function then keeps balanced ratios of 0 and 1 for every output feature and the fold.</param> /// <returns>A collection of the created folds.</returns> public List <VectorBundle> Folderize(double foldDataRatio, double binBorder = double.NaN) { if (OutputVectorCollection.Count < 2) { throw new InvalidOperationException($"Insufficient number of samples ({OutputVectorCollection.Count.ToString(CultureInfo.InvariantCulture)})."); } List <VectorBundle> foldCollection = new List <VectorBundle>(); //Fold data ratio basic correction if (foldDataRatio > MaxRatioOfFoldData) { foldDataRatio = MaxRatioOfFoldData; } //Prelimitary fold size estimation int foldSize = Math.Max(1, (int)Math.Round(OutputVectorCollection.Count * foldDataRatio, 0)); //Prelimitary number of folds int numOfFolds = (int)Math.Round((double)OutputVectorCollection.Count / foldSize); //Folds creation if (double.IsNaN(binBorder)) { //No binary output -> simple split int samplesPos = 0; for (int foldIdx = 0; foldIdx < numOfFolds; foldIdx++) { VectorBundle fold = new VectorBundle(); for (int i = 0; i < foldSize && samplesPos < OutputVectorCollection.Count; i++) { fold.InputVectorCollection.Add(InputVectorCollection[samplesPos]); fold.OutputVectorCollection.Add(OutputVectorCollection[samplesPos]); ++samplesPos; } foldCollection.Add(fold); } //Remaining samples for (int i = 0; i < OutputVectorCollection.Count - samplesPos; i++) { int foldIdx = i % foldCollection.Count; foldCollection[foldIdx].InputVectorCollection.Add(InputVectorCollection[samplesPos + i]); foldCollection[foldIdx].OutputVectorCollection.Add(OutputVectorCollection[samplesPos + i]); } }//Indifferent output else { //Binary outputs -> keep balanced ratios of outputs int numOfOutputs = OutputVectorCollection[0].Length; if (numOfOutputs == 1) { //Special case there is only one binary output //Investigation of the output data metrics BinDistribution refBinDistr = new BinDistribution(binBorder); refBinDistr.Update(OutputVectorCollection, 0); int min01 = Math.Min(refBinDistr.NumOf[0], refBinDistr.NumOf[1]); if (min01 < 2) { throw new InvalidOperationException($"Insufficient bin 0 or 1 samples (less than 2)."); } if (numOfFolds > min01) { numOfFolds = min01; } //Scan data int[] bin0SampleIdxs = new int[refBinDistr.NumOf[0]]; int bin0SamplesPos = 0; int[] bin1SampleIdxs = new int[refBinDistr.NumOf[1]]; int bin1SamplesPos = 0; for (int i = 0; i < OutputVectorCollection.Count; i++) { if (OutputVectorCollection[i][0] >= refBinDistr.BinBorder) { bin1SampleIdxs[bin1SamplesPos++] = i; } else { bin0SampleIdxs[bin0SamplesPos++] = i; } } //Determine distributions of 0 and 1 for one fold int bundleBin0Count = Math.Max(1, refBinDistr.NumOf[0] / numOfFolds); int bundleBin1Count = Math.Max(1, refBinDistr.NumOf[1] / numOfFolds); //Bundles creation bin0SamplesPos = 0; bin1SamplesPos = 0; for (int foldIdx = 0; foldIdx < numOfFolds; foldIdx++) { VectorBundle fold = new VectorBundle(); //Bin 0 for (int i = 0; i < bundleBin0Count; i++) { fold.InputVectorCollection.Add(InputVectorCollection[bin0SampleIdxs[bin0SamplesPos]]); fold.OutputVectorCollection.Add(OutputVectorCollection[bin0SampleIdxs[bin0SamplesPos]]); ++bin0SamplesPos; } //Bin 1 for (int i = 0; i < bundleBin1Count; i++) { fold.InputVectorCollection.Add(InputVectorCollection[bin1SampleIdxs[bin1SamplesPos]]); fold.OutputVectorCollection.Add(OutputVectorCollection[bin1SampleIdxs[bin1SamplesPos]]); ++bin1SamplesPos; } foldCollection.Add(fold); } //Remaining samples for (int i = 0; i < bin0SampleIdxs.Length - bin0SamplesPos; i++) { int foldIdx = i % foldCollection.Count; foldCollection[foldIdx].InputVectorCollection.Add(InputVectorCollection[bin0SampleIdxs[bin0SamplesPos + i]]); foldCollection[foldIdx].OutputVectorCollection.Add(OutputVectorCollection[bin0SampleIdxs[bin0SamplesPos + i]]); } for (int i = 0; i < bin1SampleIdxs.Length - bin1SamplesPos; i++) { int foldIdx = i % foldCollection.Count; foldCollection[foldIdx].InputVectorCollection.Add(InputVectorCollection[bin1SampleIdxs[bin1SamplesPos + i]]); foldCollection[foldIdx].OutputVectorCollection.Add(OutputVectorCollection[bin1SampleIdxs[bin1SamplesPos + i]]); } }//Only 1 binary output else { //There is more than 1 binary output - "one takes all approach" //Investigation of the output data metrics //Collect bin 1 sample indexes and check "one takes all" consistency for every output feature List <int>[] outBin1SampleIdxs = new List <int> [numOfOutputs]; for (int i = 0; i < numOfOutputs; i++) { outBin1SampleIdxs[i] = new List <int>(); } for (int sampleIdx = 0; sampleIdx < OutputVectorCollection.Count; sampleIdx++) { int numOf1 = 0; for (int outFeatureIdx = 0; outFeatureIdx < numOfOutputs; outFeatureIdx++) { if (OutputVectorCollection[sampleIdx][outFeatureIdx] >= binBorder) { outBin1SampleIdxs[outFeatureIdx].Add(sampleIdx); ++numOf1; } } if (numOf1 != 1) { throw new ArgumentException($"Data are inconsistent on data index {sampleIdx.ToString(CultureInfo.InvariantCulture)}. Output vector has {numOf1.ToString(CultureInfo.InvariantCulture)} feature(s) having bin value 1.", "binBorder"); } } //Determine max possible number of folds int maxNumOfFolds = OutputVectorCollection.Count; for (int outFeatureIdx = 0; outFeatureIdx < numOfOutputs; outFeatureIdx++) { int outFeatureMaxFolds = Math.Min(outBin1SampleIdxs[outFeatureIdx].Count, OutputVectorCollection.Count - outBin1SampleIdxs[outFeatureIdx].Count); maxNumOfFolds = Math.Min(outFeatureMaxFolds, maxNumOfFolds); } //Correct the number of folds to be created if (numOfFolds > maxNumOfFolds) { numOfFolds = maxNumOfFolds; } //Create the folds for (int foldIdx = 0; foldIdx < numOfFolds; foldIdx++) { foldCollection.Add(new VectorBundle()); } //Samples distribution for (int outFeatureIdx = 0; outFeatureIdx < numOfOutputs; outFeatureIdx++) { for (int bin1SampleRefIdx = 0; bin1SampleRefIdx < outBin1SampleIdxs[outFeatureIdx].Count; bin1SampleRefIdx++) { int foldIdx = bin1SampleRefIdx % foldCollection.Count; int dataIdx = outBin1SampleIdxs[outFeatureIdx][bin1SampleRefIdx]; foldCollection[foldIdx].AddPair(InputVectorCollection[dataIdx], OutputVectorCollection[dataIdx]); } } } //More binary outputs } //Binary output return(foldCollection); }
//Methods /// <summary> /// Splits this bundle to a collection of smaller bundles. /// Method expects length of the output vectors = 1. /// </summary> /// <param name="subBundleSize">Sub-bundle size</param> /// <param name="binBorder">If specified and there is only one output value, method will keep balanced number of output values GE to binBorder in the each sub-bundle</param> /// <returns>Collection of extracted sub-bundles</returns> public List <VectorBundle> Split(int subBundleSize, double binBorder = double.NaN) { int numOfBundles = OutputVectorCollection.Count / subBundleSize; List <VectorBundle> bundleCollection = new List <VectorBundle>(numOfBundles); if (!double.IsNaN(binBorder) && OutputVectorCollection[0].Length == 1) { BinDistribution refBinDistr = new BinDistribution(binBorder); refBinDistr.Update(OutputVectorCollection, 0); //Scan int[] bin0SampleIdxs = new int[refBinDistr.NumOf[0]]; int bin0SamplesPos = 0; int[] bin1SampleIdxs = new int[refBinDistr.NumOf[1]]; int bin1SamplesPos = 0; for (int i = 0; i < OutputVectorCollection.Count; i++) { if (OutputVectorCollection[i][0] >= refBinDistr.BinBorder) { bin1SampleIdxs[bin1SamplesPos++] = i; } else { bin0SampleIdxs[bin0SamplesPos++] = i; } } //Division int bundleBin0Count = Math.Max(1, refBinDistr.NumOf[0] / numOfBundles); int bundleBin1Count = Math.Max(1, refBinDistr.NumOf[1] / numOfBundles); if (bundleBin0Count * numOfBundles > bin0SampleIdxs.Length) { throw new InvalidOperationException($"Insufficient bin 0 samples"); } if (bundleBin1Count * numOfBundles > bin1SampleIdxs.Length) { throw new InvalidOperationException($"Insufficient bin 1 samples"); } //Bundles creation bin0SamplesPos = 0; bin1SamplesPos = 0; for (int bundleNum = 0; bundleNum < numOfBundles; bundleNum++) { VectorBundle bundle = new VectorBundle(); //Bin 0 for (int i = 0; i < bundleBin0Count; i++) { bundle.InputVectorCollection.Add(InputVectorCollection[bin0SampleIdxs[bin0SamplesPos]]); bundle.OutputVectorCollection.Add(OutputVectorCollection[bin0SampleIdxs[bin0SamplesPos]]); ++bin0SamplesPos; } //Bin 1 for (int i = 0; i < bundleBin1Count; i++) { bundle.InputVectorCollection.Add(InputVectorCollection[bin1SampleIdxs[bin1SamplesPos]]); bundle.OutputVectorCollection.Add(OutputVectorCollection[bin1SampleIdxs[bin1SamplesPos]]); ++bin1SamplesPos; } bundleCollection.Add(bundle); } //Remaining samples for (int i = 0; i < bin0SampleIdxs.Length - bin0SamplesPos; i++) { int bundleIdx = i % bundleCollection.Count; bundleCollection[bundleIdx].InputVectorCollection.Add(InputVectorCollection[bin0SampleIdxs[bin0SamplesPos + i]]); bundleCollection[bundleIdx].OutputVectorCollection.Add(OutputVectorCollection[bin0SampleIdxs[bin0SamplesPos + i]]); } for (int i = 0; i < bin1SampleIdxs.Length - bin1SamplesPos; i++) { int bundleIdx = i % bundleCollection.Count; bundleCollection[bundleIdx].InputVectorCollection.Add(InputVectorCollection[bin1SampleIdxs[bin1SamplesPos + i]]); bundleCollection[bundleIdx].OutputVectorCollection.Add(OutputVectorCollection[bin1SampleIdxs[bin1SamplesPos + i]]); } } else { //Bundles creation int samplesPos = 0; for (int bundleNum = 0; bundleNum < numOfBundles; bundleNum++) { VectorBundle bundle = new VectorBundle(); for (int i = 0; i < subBundleSize && samplesPos < OutputVectorCollection.Count; i++) { bundle.InputVectorCollection.Add(InputVectorCollection[samplesPos]); bundle.OutputVectorCollection.Add(OutputVectorCollection[samplesPos]); ++samplesPos; } bundleCollection.Add(bundle); } //Remaining samples for (int i = 0; i < OutputVectorCollection.Count - samplesPos; i++) { int bundleIdx = i % bundleCollection.Count; bundleCollection[bundleIdx].InputVectorCollection.Add(InputVectorCollection[samplesPos + i]); bundleCollection[bundleIdx].OutputVectorCollection.Add(OutputVectorCollection[samplesPos + i]); } } return(bundleCollection); }