/// <summary>
/// Splits this bundle to a collection of smaller folds (sub-bundles) suitable for the cross-validation.
/// </summary>
/// <param name="foldDataRatio">The requested ratio of the samples constituting the single fold (sub-bundle). Silently capped at MaxRatioOfFoldData.</param>
/// <param name="binBorder">When the binBorder is specified then all the output features are considered as binary features within the one-takes-all group and function then keeps balanced ratios of 0 and 1 for every output feature and the fold.</param>
/// <returns>A collection of the created folds.</returns>
/// <exception cref="InvalidOperationException">When there are less than 2 samples, or less than 2 samples of bin 0 or bin 1.</exception>
/// <exception cref="ArgumentException">When binBorder is specified and an output vector violates the one-takes-all consistency.</exception>
public List <VectorBundle> Folderize(double foldDataRatio, double binBorder = double.NaN)
{
    if (OutputVectorCollection.Count < 2)
    {
        throw new InvalidOperationException($"Insufficient number of samples ({OutputVectorCollection.Count.ToString(CultureInfo.InvariantCulture)}).");
    }
    List <VectorBundle> foldCollection = new List <VectorBundle>();
    //Fold data ratio basic correction (cap only - a too small/negative ratio still yields foldSize >= 1 below)
    if (foldDataRatio > MaxRatioOfFoldData)
    {
        foldDataRatio = MaxRatioOfFoldData;
    }
    //Preliminary fold size estimation
    int foldSize = Math.Max(1, (int)Math.Round(OutputVectorCollection.Count * foldDataRatio, 0));
    //Preliminary number of folds
    int numOfFolds = (int)Math.Round((double)OutputVectorCollection.Count / foldSize);
    //Folds creation
    if (double.IsNaN(binBorder))
    {
        //No binary output -> simple sequential split
        int samplesPos = 0;
        for (int foldIdx = 0; foldIdx < numOfFolds; foldIdx++)
        {
            VectorBundle fold = new VectorBundle();
            for (int i = 0; i < foldSize && samplesPos < OutputVectorCollection.Count; i++)
            {
                fold.InputVectorCollection.Add(InputVectorCollection[samplesPos]);
                fold.OutputVectorCollection.Add(OutputVectorCollection[samplesPos]);
                ++samplesPos;
            }
            foldCollection.Add(fold);
        }
        //Remaining samples are distributed round-robin over the already created folds
        for (int i = 0; i < OutputVectorCollection.Count - samplesPos; i++)
        {
            int foldIdx = i % foldCollection.Count;
            foldCollection[foldIdx].InputVectorCollection.Add(InputVectorCollection[samplesPos + i]);
            foldCollection[foldIdx].OutputVectorCollection.Add(OutputVectorCollection[samplesPos + i]);
        }
    }//Indifferent output
    else
    {
        //Binary outputs -> keep balanced ratios of outputs
        int numOfOutputs = OutputVectorCollection[0].Length;
        if (numOfOutputs == 1)
        {
            //Special case: there is only one binary output
            //Investigation of the output data metrics
            BinDistribution refBinDistr = new BinDistribution(binBorder);
            refBinDistr.Update(OutputVectorCollection, 0);
            int min01 = Math.Min(refBinDistr.NumOf[0], refBinDistr.NumOf[1]);
            if (min01 < 2)
            {
                throw new InvalidOperationException($"Insufficient bin 0 or 1 samples (less than 2).");
            }
            //Each fold must receive at least one sample of each bin
            if (numOfFolds > min01)
            {
                numOfFolds = min01;
            }
            //Scan data - collect sample indexes belonging to bin 0 and bin 1
            int[] bin0SampleIdxs = new int[refBinDistr.NumOf[0]];
            int bin0SamplesPos = 0;
            int[] bin1SampleIdxs = new int[refBinDistr.NumOf[1]];
            int bin1SamplesPos = 0;
            for (int i = 0; i < OutputVectorCollection.Count; i++)
            {
                if (OutputVectorCollection[i][0] >= refBinDistr.BinBorder)
                {
                    bin1SampleIdxs[bin1SamplesPos++] = i;
                }
                else
                {
                    bin0SampleIdxs[bin0SamplesPos++] = i;
                }
            }
            //Determine distributions of 0 and 1 for one fold
            int bundleBin0Count = Math.Max(1, refBinDistr.NumOf[0] / numOfFolds);
            int bundleBin1Count = Math.Max(1, refBinDistr.NumOf[1] / numOfFolds);
            //Bundles creation - positions are reused as read cursors into the index arrays
            bin0SamplesPos = 0;
            bin1SamplesPos = 0;
            for (int foldIdx = 0; foldIdx < numOfFolds; foldIdx++)
            {
                VectorBundle fold = new VectorBundle();
                //Bin 0
                for (int i = 0; i < bundleBin0Count; i++)
                {
                    fold.InputVectorCollection.Add(InputVectorCollection[bin0SampleIdxs[bin0SamplesPos]]);
                    fold.OutputVectorCollection.Add(OutputVectorCollection[bin0SampleIdxs[bin0SamplesPos]]);
                    ++bin0SamplesPos;
                }
                //Bin 1
                for (int i = 0; i < bundleBin1Count; i++)
                {
                    fold.InputVectorCollection.Add(InputVectorCollection[bin1SampleIdxs[bin1SamplesPos]]);
                    fold.OutputVectorCollection.Add(OutputVectorCollection[bin1SampleIdxs[bin1SamplesPos]]);
                    ++bin1SamplesPos;
                }
                foldCollection.Add(fold);
            }
            //Remaining samples of both bins are distributed round-robin over the created folds
            for (int i = 0; i < bin0SampleIdxs.Length - bin0SamplesPos; i++)
            {
                int foldIdx = i % foldCollection.Count;
                foldCollection[foldIdx].InputVectorCollection.Add(InputVectorCollection[bin0SampleIdxs[bin0SamplesPos + i]]);
                foldCollection[foldIdx].OutputVectorCollection.Add(OutputVectorCollection[bin0SampleIdxs[bin0SamplesPos + i]]);
            }
            for (int i = 0; i < bin1SampleIdxs.Length - bin1SamplesPos; i++)
            {
                int foldIdx = i % foldCollection.Count;
                foldCollection[foldIdx].InputVectorCollection.Add(InputVectorCollection[bin1SampleIdxs[bin1SamplesPos + i]]);
                foldCollection[foldIdx].OutputVectorCollection.Add(OutputVectorCollection[bin1SampleIdxs[bin1SamplesPos + i]]);
            }
        }//Only 1 binary output
        else
        {
            //There is more than 1 binary output - "one takes all" approach
            //Investigation of the output data metrics
            //Collect bin 1 sample indexes and check "one takes all" consistency for every output feature
            List <int>[] outBin1SampleIdxs = new List <int> [numOfOutputs];
            for (int i = 0; i < numOfOutputs; i++)
            {
                outBin1SampleIdxs[i] = new List <int>();
            }
            for (int sampleIdx = 0; sampleIdx < OutputVectorCollection.Count; sampleIdx++)
            {
                int numOf1 = 0;
                for (int outFeatureIdx = 0; outFeatureIdx < numOfOutputs; outFeatureIdx++)
                {
                    if (OutputVectorCollection[sampleIdx][outFeatureIdx] >= binBorder)
                    {
                        outBin1SampleIdxs[outFeatureIdx].Add(sampleIdx);
                        ++numOf1;
                    }
                }
                //Exactly one feature per sample may be "1" in the one-takes-all group
                if (numOf1 != 1)
                {
                    throw new ArgumentException($"Data are inconsistent on data index {sampleIdx.ToString(CultureInfo.InvariantCulture)}. Output vector has {numOf1.ToString(CultureInfo.InvariantCulture)} feature(s) having bin value 1.", "binBorder");
                }
            }
            //Determine max possible number of folds
            int maxNumOfFolds = OutputVectorCollection.Count;
            for (int outFeatureIdx = 0; outFeatureIdx < numOfOutputs; outFeatureIdx++)
            {
                int outFeatureMaxFolds = Math.Min(outBin1SampleIdxs[outFeatureIdx].Count, OutputVectorCollection.Count - outBin1SampleIdxs[outFeatureIdx].Count);
                maxNumOfFolds = Math.Min(outFeatureMaxFolds, maxNumOfFolds);
            }
            //Correct the number of folds to be created
            if (numOfFolds > maxNumOfFolds)
            {
                numOfFolds = maxNumOfFolds;
            }
            //Create the folds
            for (int foldIdx = 0; foldIdx < numOfFolds; foldIdx++)
            {
                foldCollection.Add(new VectorBundle());
            }
            //Samples distribution: every feature's bin-1 samples are dealt round-robin, balancing all features per fold
            for (int outFeatureIdx = 0; outFeatureIdx < numOfOutputs; outFeatureIdx++)
            {
                for (int bin1SampleRefIdx = 0; bin1SampleRefIdx < outBin1SampleIdxs[outFeatureIdx].Count; bin1SampleRefIdx++)
                {
                    int foldIdx = bin1SampleRefIdx % foldCollection.Count;
                    int dataIdx = outBin1SampleIdxs[outFeatureIdx][bin1SampleRefIdx];
                    foldCollection[foldIdx].AddPair(InputVectorCollection[dataIdx], OutputVectorCollection[dataIdx]);
                }
            }
        } //More binary outputs
    } //Binary output
    return(foldCollection);
}
/// <summary>
/// Prepares trained readout unit for specified output field and task.
/// Runs the configured number of regression attempts; within each attempt iterates training epochs,
/// keeping a deep clone of the best unit found so far.
/// </summary>
/// <param name="taskType">Type of the task</param>
/// <param name="readoutUnitIdx">Index of the readout unit (informative only)</param>
/// <param name="foldNum">Current fold number</param>
/// <param name="numOfFolds">Total number of the folds</param>
/// <param name="refBinDistr">Reference bin distribution (if task type is Classification)</param>
/// <param name="trainingPredictorsCollection">Collection of the predictors for training</param>
/// <param name="trainingIdealOutputsCollection">Collection of ideal outputs for training. Note that the double array always has only one member.</param>
/// <param name="testingPredictorsCollection">Collection of the predictors for testing</param>
/// <param name="testingIdealOutputsCollection">Collection of ideal outputs for testing. Note that the double array always has only one member.</param>
/// <param name="rand">Random object to be used</param>
/// <param name="readoutUnitSettings">Readout unit configuration parameters</param>
/// <param name="controller">Regression controller. When null, the default IsBetter comparison decides the best unit.</param>
/// <param name="controllerUserObject">A user object to be passed to controller</param>
/// <returns>Prepared readout unit</returns>
public static ReadoutUnit CreateTrained(CommonEnums.TaskType taskType,
                                        int readoutUnitIdx,
                                        int foldNum,
                                        int numOfFolds,
                                        BinDistribution refBinDistr,
                                        List <double[]> trainingPredictorsCollection,
                                        List <double[]> trainingIdealOutputsCollection,
                                        List <double[]> testingPredictorsCollection,
                                        List <double[]> testingIdealOutputsCollection,
                                        Random rand,
                                        ReadoutLayerSettings.ReadoutUnitSettings readoutUnitSettings,
                                        RegressionCallbackDelegate controller = null,
                                        Object controllerUserObject = null
                                        )
{
    ReadoutUnit bestReadoutUnit = null;
    //Regression attempts
    bool stopRegression = false;
    for (int regrAttemptNumber = 1; regrAttemptNumber <= readoutUnitSettings.RegressionAttempts; regrAttemptNumber++)
    {
        //Create network and trainer (NOTE(review): helper name contains a typo "Treainer" - defined elsewhere, kept as-is)
        CreateNetAndTreainer(readoutUnitSettings,
                             trainingPredictorsCollection,
                             trainingIdealOutputsCollection,
                             rand,
                             out INonRecurrentNetwork net,
                             out INonRecurrentNetworkTrainer trainer
                             );
        //Iterate training cycles
        for (int epoch = 1; epoch <= readoutUnitSettings.RegressionAttemptEpochs; epoch++)
        {
            trainer.Iteration();
            List <double[]> testingComputedOutputsCollection = null;
            //Compute current error statistics after training iteration
            ReadoutUnit currReadoutUnit = new ReadoutUnit();
            currReadoutUnit.Network = net;
            currReadoutUnit.TrainingErrorStat = net.ComputeBatchErrorStat(trainingPredictorsCollection, trainingIdealOutputsCollection, out List <double[]> trainingComputedOutputsCollection);
            if (taskType == CommonEnums.TaskType.Classification)
            {
                currReadoutUnit.TrainingBinErrorStat = new BinErrStat(refBinDistr, trainingComputedOutputsCollection, trainingIdealOutputsCollection);
                currReadoutUnit.CombinedBinaryError = currReadoutUnit.TrainingBinErrorStat.TotalErrStat.Sum;
                //currReadoutUnit.CombinedBinaryError = currReadoutUnit.TrainingBinErrorStat.ProportionalErr;
            }
            currReadoutUnit.CombinedPrecisionError = currReadoutUnit.TrainingErrorStat.ArithAvg;
            if (testingPredictorsCollection != null && testingPredictorsCollection.Count > 0)
            {
                //Combined errors are the worse (max) of the training and testing errors
                currReadoutUnit.TestingErrorStat = net.ComputeBatchErrorStat(testingPredictorsCollection, testingIdealOutputsCollection, out testingComputedOutputsCollection);
                currReadoutUnit.CombinedPrecisionError = Math.Max(currReadoutUnit.CombinedPrecisionError, currReadoutUnit.TestingErrorStat.ArithAvg);
                if (taskType == CommonEnums.TaskType.Classification)
                {
                    currReadoutUnit.TestingBinErrorStat = new BinErrStat(refBinDistr, testingComputedOutputsCollection, testingIdealOutputsCollection);
                    currReadoutUnit.CombinedBinaryError = Math.Max(currReadoutUnit.CombinedBinaryError, currReadoutUnit.TestingBinErrorStat.TotalErrStat.Sum);
                    //currReadoutUnit.CombinedBinaryError = Math.Max(currReadoutUnit.CombinedBinaryError, currReadoutUnit.TestingBinErrorStat.ProportionalErr);
                }
            }
            //Current results processing
            bool better = false, stopTrainingCycle = false;
            //Result first initialization
            if (bestReadoutUnit == null)
            {
                //Adopt current regression results
                bestReadoutUnit = currReadoutUnit.DeepClone();
            }
            //Perform call back if it is defined
            if (controller != null)
            {
                //Evaluation of the improvement is driven externally
                RegressionControlInArgs cbIn = new RegressionControlInArgs
                {
                    TaskType = taskType,
                    ReadoutUnitIdx = readoutUnitIdx,
                    OutputFieldName = readoutUnitSettings.Name,
                    FoldNum = foldNum,
                    NumOfFolds = numOfFolds,
                    RegrAttemptNumber = regrAttemptNumber,
                    RegrMaxAttempts = readoutUnitSettings.RegressionAttempts,
                    Epoch = epoch,
                    MaxEpochs = readoutUnitSettings.RegressionAttemptEpochs,
                    TrainingPredictorsCollection = trainingPredictorsCollection,
                    TrainingIdealOutputsCollection = trainingIdealOutputsCollection,
                    TrainingComputedOutputsCollection = trainingComputedOutputsCollection,
                    TestingPredictorsCollection = testingPredictorsCollection,
                    TestingIdealOutputsCollection = testingIdealOutputsCollection,
                    TestingComputedOutputsCollection = testingComputedOutputsCollection,
                    CurrReadoutUnit = currReadoutUnit,
                    BestReadoutUnit = bestReadoutUnit,
                    UserObject = controllerUserObject
                };
                //Call external controller
                RegressionControlOutArgs cbOut = controller(cbIn);
                //Pick up results
                better = cbOut.CurrentIsBetter;
                stopTrainingCycle = cbOut.StopCurrentAttempt;
                stopRegression = cbOut.StopRegression;
            }
            else
            {
                //Default implementation
                better = IsBetter(taskType, currReadoutUnit, bestReadoutUnit);
            }
            //Best?
            if (better)
            {
                //Adopt current regression results
                bestReadoutUnit = currReadoutUnit.DeepClone();
            }
            //Training stop conditions
            if (stopTrainingCycle || stopRegression)
            {
                break;
            }
        } //epoch
        //Regression stop conditions
        if (stopRegression)
        {
            break;
        }
    } //regrAttemptNumber
    //Create statistics of the best network weights
    bestReadoutUnit.OutputWeightsStat = bestReadoutUnit.Network.ComputeWeightsStat();
    return(bestReadoutUnit);
}
/// <summary>
/// Builds readout layer.
/// Prepares prediction clusters containing trained readout units.
/// </summary>
/// <param name="predictorsCollection">Collection of predictors</param>
/// <param name="idealOutputsCollection">Collection of desired outputs related to predictors</param>
/// <param name="regressionController">Regression controller delegate</param>
/// <param name="regressionControllerData">A user object passed to the regression controller</param>
/// <returns>Returned ValidationBundle is something like a protocol.
/// There is recorded fold by fold (unit by unit) predicted and corresponding ideal values.
/// This is the pessimistic approach. Real results on unseen data could be better due to the clustering synergy.
/// </returns>
/// <exception cref="ArgumentException">When the configured test dataset size is out of the allowed bounds.</exception>
public ValidationBundle Build(List <double[]> predictorsCollection,
                              List <double[]> idealOutputsCollection,
                              ReadoutUnit.RegressionCallbackDelegate regressionController,
                              Object regressionControllerData
                              )
{
    //Random object with a fixed seed -> deterministic, reproducible builds
    Random rand = new Random(0);
    //Allocation of computed and ideal vectors for validation bundle
    List <double[]> validationComputedVectorCollection = new List <double[]>(idealOutputsCollection.Count);
    List <double[]> validationIdealVectorCollection = new List <double[]>(idealOutputsCollection.Count);
    for (int i = 0; i < idealOutputsCollection.Count; i++)
    {
        validationComputedVectorCollection.Add(new double[idealOutputsCollection[0].Length]);
        validationIdealVectorCollection.Add(new double[idealOutputsCollection[0].Length]);
    }
    //Test dataset size
    if (_settings.TestDataRatio > MaxRatioOfTestData)
    {
        throw new ArgumentException($"Test dataset size is greater than {MaxRatioOfTestData.ToString(CultureInfo.InvariantCulture)}", "TestDataSetSize");
    }
    int testDataSetLength = (int)Math.Round(idealOutputsCollection.Count * _settings.TestDataRatio, 0);
    if (testDataSetLength < MinLengthOfTestDataset)
    {
        throw new ArgumentException($"Num of test samples is less than {MinLengthOfTestDataset.ToString(CultureInfo.InvariantCulture)}", "TestDataSetSize");
    }
    //Number of folds
    int numOfFolds = _settings.NumOfFolds;
    if (numOfFolds <= 0)
    {
        //Auto setup (testDataSetLength >= MinLengthOfTestDataset > 0 is guaranteed by the check above)
        numOfFolds = idealOutputsCollection.Count / testDataSetLength;
        if (numOfFolds > MaxNumOfFolds)
        {
            numOfFolds = MaxNumOfFolds;
        }
    }
    //Create shuffled copy of the data
    TimeSeriesBundle shuffledData = new TimeSeriesBundle(predictorsCollection, idealOutputsCollection);
    shuffledData.Shuffle(rand);
    //Data inspection, preparation of datasets and training of ReadoutUnits
    //Clusters of readout units (one cluster for each output field)
    for (int clusterIdx = 0; clusterIdx < _settings.ReadoutUnitCfgCollection.Count; clusterIdx++)
    {
        _clusterCollection[clusterIdx] = new ReadoutUnit[numOfFolds];
        List <double[]> idealValueCollection = new List <double[]>(idealOutputsCollection.Count);
        BinDistribution refBinDistr = null;
        if (_settings.ReadoutUnitCfgCollection[clusterIdx].TaskType == CommonEnums.TaskType.Classification)
        {
            //Reference binary distribution is relevant only for classification task
            refBinDistr = new BinDistribution(_dataRange.Mid);
        }
        //Transformation to a single value vectors and data analysis
        foreach (double[] idealVector in shuffledData.OutputVectorCollection)
        {
            double[] value = new double[1];
            value[0] = idealVector[clusterIdx];
            idealValueCollection.Add(value);
            if (_settings.ReadoutUnitCfgCollection[clusterIdx].TaskType == CommonEnums.TaskType.Classification)
            {
                //Reference binary distribution is relevant only for classification task
                refBinDistr.Update(value);
            }
        }
        List <TimeSeriesBundle> subBundleCollection = null;
        //Datasets preparation is depending on the task type
        if (_settings.ReadoutUnitCfgCollection[clusterIdx].TaskType == CommonEnums.TaskType.Classification)
        {
            //Classification task -> keep balanced bin 0/1 counts in every sub-bundle
            subBundleCollection = DivideSamplesForClassificationTask(shuffledData.InputVectorCollection,
                                                                     idealValueCollection,
                                                                     refBinDistr,
                                                                     testDataSetLength
                                                                     );
        }
        else
        {
            //Forecast task
            subBundleCollection = DivideSamplesForForecastTask(shuffledData.InputVectorCollection,
                                                               idealValueCollection,
                                                               testDataSetLength
                                                               );
        }
        //Best predicting unit per each fold in the cluster.
        ClusterErrStatistics ces = new ClusterErrStatistics(_settings.ReadoutUnitCfgCollection[clusterIdx].TaskType, numOfFolds, refBinDistr);
        int arrayPos = 0;
        for (int foldIdx = 0; foldIdx < numOfFolds; foldIdx++)
        {
            //Build training samples = all sub-bundles except the current fold (the fold itself is the test set)
            List <double[]> trainingPredictorsCollection = new List <double[]>();
            List <double[]> trainingIdealValueCollection = new List <double[]>();
            for (int bundleIdx = 0; bundleIdx < subBundleCollection.Count; bundleIdx++)
            {
                if (bundleIdx != foldIdx)
                {
                    trainingPredictorsCollection.AddRange(subBundleCollection[bundleIdx].InputVectorCollection);
                    trainingIdealValueCollection.AddRange(subBundleCollection[bundleIdx].OutputVectorCollection);
                }
            }
            //Call training regression to get the best fold's readout unit.
            //The best unit becomes the predicting cluster member.
            _clusterCollection[clusterIdx][foldIdx] = ReadoutUnit.CreateTrained(_settings.ReadoutUnitCfgCollection[clusterIdx].TaskType,
                                                                               clusterIdx,
                                                                               foldIdx + 1,
                                                                               numOfFolds,
                                                                               refBinDistr,
                                                                               trainingPredictorsCollection,
                                                                               trainingIdealValueCollection,
                                                                               subBundleCollection[foldIdx].InputVectorCollection,
                                                                               subBundleCollection[foldIdx].OutputVectorCollection,
                                                                               rand,
                                                                               _settings.ReadoutUnitCfgCollection[clusterIdx],
                                                                               regressionController,
                                                                               regressionControllerData
                                                                               );
            //Cluster error statistics & data for validation bundle (pessimistic approach)
            for (int sampleIdx = 0; sampleIdx < subBundleCollection[foldIdx].OutputVectorCollection.Count; sampleIdx++)
            {
                double value = _clusterCollection[clusterIdx][foldIdx].Network.Compute(subBundleCollection[foldIdx].InputVectorCollection[sampleIdx])[0];
                ces.Update(value, subBundleCollection[foldIdx].OutputVectorCollection[sampleIdx][0]);
                validationIdealVectorCollection[arrayPos][clusterIdx] = subBundleCollection[foldIdx].OutputVectorCollection[sampleIdx][0];
                validationComputedVectorCollection[arrayPos][clusterIdx] = value;
                ++arrayPos;
            }
        } //foldIdx
        _clusterErrStatisticsCollection.Add(ces);
    } //clusterIdx
    //Validation bundle is returned.
    return(new ValidationBundle(validationComputedVectorCollection, validationIdealVectorCollection));
}
/// <summary>
/// Constructs an instance prepared for initialization (updates).
/// </summary>
/// <param name="taskType">Type of the task the cluster solves</param>
/// <param name="numOfReadoutUnits">Number of readout units within the cluster</param>
/// <param name="refBinDistr">Reference bin distribution (used only when the task type is Classification)</param>
public ClusterErrStatistics(CommonEnums.TaskType taskType, int numOfReadoutUnits, BinDistribution refBinDistr)
{
    TaskType = taskType;
    NumOfReadoutUnits = numOfReadoutUnits;
    PrecissionErrStat = new BasicStat();
    //Binary error statistics make sense only for the classification task
    BinaryErrStat = (taskType == CommonEnums.TaskType.Classification) ? new BinErrStat(refBinDistr) : null;
}
/// <summary>
/// Divides the samples into bundles of the requested size, keeping balanced counts
/// of bin 0 and bin 1 samples (according to refBinDistr) in every bundle.
/// </summary>
/// <param name="predictorsCollection">Collection of predictors</param>
/// <param name="idealValueCollection">Collection of single-value ideal outputs related to predictors</param>
/// <param name="refBinDistr">Reference bin distribution of the ideal values</param>
/// <param name="bundleSize">Requested number of samples per bundle</param>
/// <returns>Collection of the created bundles</returns>
/// <exception cref="ArgumentException">When bundleSize is less than 1.</exception>
/// <exception cref="InvalidOperationException">When there are not enough samples to create the bundles.</exception>
private List<TimeSeriesBundle> DivideSamplesForClassificationTask(List<double[]> predictorsCollection,
                                                                  List<double[]> idealValueCollection,
                                                                  BinDistribution refBinDistr,
                                                                  int bundleSize
                                                                  )
{
    //Guards: without them a bundleSize (or numOfBundles) of zero would surface as a raw DivideByZeroException below
    if (bundleSize < 1)
    {
        throw new ArgumentException("Bundle size must be GT 0.", "bundleSize");
    }
    int numOfBundles = idealValueCollection.Count / bundleSize;
    if (numOfBundles < 1)
    {
        throw new InvalidOperationException("Insufficient number of samples to create at least one bundle.");
    }
    List<TimeSeriesBundle> bundleCollection = new List<TimeSeriesBundle>(numOfBundles);
    //Scan - collect sample indexes belonging to bin 0 and bin 1
    int[] bin0SampleIdxs = new int[refBinDistr.NumOf[0]];
    int bin0SamplesPos = 0;
    int[] bin1SampleIdxs = new int[refBinDistr.NumOf[1]];
    int bin1SamplesPos = 0;
    for (int i = 0; i < idealValueCollection.Count; i++)
    {
        if (idealValueCollection[i][0] >= refBinDistr.BinBorder)
        {
            bin1SampleIdxs[bin1SamplesPos++] = i;
        }
        else
        {
            bin0SampleIdxs[bin0SamplesPos++] = i;
        }
    }
    //Division - per-bundle counts of bin 0 and bin 1 samples
    int bundleBin0Count = Math.Max(1, refBinDistr.NumOf[0] / numOfBundles);
    int bundleBin1Count = Math.Max(1, refBinDistr.NumOf[1] / numOfBundles);
    //Specific exceptions instead of the original bare Exception (consistent with VectorBundle.Split)
    if (bundleBin0Count * numOfBundles > bin0SampleIdxs.Length)
    {
        throw new InvalidOperationException("Insufficient bin 0 samples");
    }
    if (bundleBin1Count * numOfBundles > bin1SampleIdxs.Length)
    {
        throw new InvalidOperationException("Insufficient bin 1 samples");
    }
    //Bundles creation - positions are reused as read cursors into the index arrays
    bin0SamplesPos = 0;
    bin1SamplesPos = 0;
    for (int bundleNum = 0; bundleNum < numOfBundles; bundleNum++)
    {
        TimeSeriesBundle bundle = new TimeSeriesBundle();
        //Bin 0
        for (int i = 0; i < bundleBin0Count; i++)
        {
            bundle.InputVectorCollection.Add(predictorsCollection[bin0SampleIdxs[bin0SamplesPos]]);
            bundle.OutputVectorCollection.Add(idealValueCollection[bin0SampleIdxs[bin0SamplesPos]]);
            ++bin0SamplesPos;
        }
        //Bin 1
        for (int i = 0; i < bundleBin1Count; i++)
        {
            bundle.InputVectorCollection.Add(predictorsCollection[bin1SampleIdxs[bin1SamplesPos]]);
            bundle.OutputVectorCollection.Add(idealValueCollection[bin1SampleIdxs[bin1SamplesPos]]);
            ++bin1SamplesPos;
        }
        bundleCollection.Add(bundle);
    }
    //Remaining samples of both bins are distributed round-robin over the created bundles
    for (int i = 0; i < bin0SampleIdxs.Length - bin0SamplesPos; i++)
    {
        int bundleIdx = i % bundleCollection.Count;
        bundleCollection[bundleIdx].InputVectorCollection.Add(predictorsCollection[bin0SampleIdxs[bin0SamplesPos + i]]);
        bundleCollection[bundleIdx].OutputVectorCollection.Add(idealValueCollection[bin0SampleIdxs[bin0SamplesPos + i]]);
    }
    for (int i = 0; i < bin1SampleIdxs.Length - bin1SamplesPos; i++)
    {
        int bundleIdx = i % bundleCollection.Count;
        bundleCollection[bundleIdx].InputVectorCollection.Add(predictorsCollection[bin1SampleIdxs[bin1SamplesPos + i]]);
        bundleCollection[bundleIdx].OutputVectorCollection.Add(idealValueCollection[bin1SampleIdxs[bin1SamplesPos + i]]);
    }
    return(bundleCollection);
}
//Methods
/// <summary>
/// Splits this bundle to a collection of smaller bundles.
/// Method expects length of the output vectors = 1.
/// </summary>
/// <param name="subBundleSize">Sub-bundle size (must be GT 0 and LE to the number of samples)</param>
/// <param name="binBorder">If specified and there is only one output value, method will keep balanced number of output values GE to binBorder in the each sub-bundle</param>
/// <returns>Collection of extracted sub-bundles</returns>
/// <exception cref="ArgumentException">When subBundleSize is less than 1.</exception>
/// <exception cref="InvalidOperationException">When there are not enough samples to create the sub-bundles.</exception>
public List <VectorBundle> Split(int subBundleSize, double binBorder = double.NaN)
{
    //Guards: without them a subBundleSize (or numOfBundles) of zero would surface as a raw DivideByZeroException below
    if (subBundleSize < 1)
    {
        throw new ArgumentException("Sub-bundle size must be GT 0.", "subBundleSize");
    }
    int numOfBundles = OutputVectorCollection.Count / subBundleSize;
    if (numOfBundles < 1)
    {
        throw new InvalidOperationException("Insufficient number of samples to create at least one sub-bundle.");
    }
    List <VectorBundle> bundleCollection = new List <VectorBundle>(numOfBundles);
    if (!double.IsNaN(binBorder) && OutputVectorCollection[0].Length == 1)
    {
        //Balanced split of the single binary output
        BinDistribution refBinDistr = new BinDistribution(binBorder);
        refBinDistr.Update(OutputVectorCollection, 0);
        //Scan - collect sample indexes belonging to bin 0 and bin 1
        int[] bin0SampleIdxs = new int[refBinDistr.NumOf[0]];
        int bin0SamplesPos = 0;
        int[] bin1SampleIdxs = new int[refBinDistr.NumOf[1]];
        int bin1SamplesPos = 0;
        for (int i = 0; i < OutputVectorCollection.Count; i++)
        {
            if (OutputVectorCollection[i][0] >= refBinDistr.BinBorder)
            {
                bin1SampleIdxs[bin1SamplesPos++] = i;
            }
            else
            {
                bin0SampleIdxs[bin0SamplesPos++] = i;
            }
        }
        //Division - per-bundle counts of bin 0 and bin 1 samples
        int bundleBin0Count = Math.Max(1, refBinDistr.NumOf[0] / numOfBundles);
        int bundleBin1Count = Math.Max(1, refBinDistr.NumOf[1] / numOfBundles);
        if (bundleBin0Count * numOfBundles > bin0SampleIdxs.Length)
        {
            throw new InvalidOperationException("Insufficient bin 0 samples");
        }
        if (bundleBin1Count * numOfBundles > bin1SampleIdxs.Length)
        {
            throw new InvalidOperationException("Insufficient bin 1 samples");
        }
        //Bundles creation - positions are reused as read cursors into the index arrays
        bin0SamplesPos = 0;
        bin1SamplesPos = 0;
        for (int bundleNum = 0; bundleNum < numOfBundles; bundleNum++)
        {
            VectorBundle bundle = new VectorBundle();
            //Bin 0
            for (int i = 0; i < bundleBin0Count; i++)
            {
                bundle.InputVectorCollection.Add(InputVectorCollection[bin0SampleIdxs[bin0SamplesPos]]);
                bundle.OutputVectorCollection.Add(OutputVectorCollection[bin0SampleIdxs[bin0SamplesPos]]);
                ++bin0SamplesPos;
            }
            //Bin 1
            for (int i = 0; i < bundleBin1Count; i++)
            {
                bundle.InputVectorCollection.Add(InputVectorCollection[bin1SampleIdxs[bin1SamplesPos]]);
                bundle.OutputVectorCollection.Add(OutputVectorCollection[bin1SampleIdxs[bin1SamplesPos]]);
                ++bin1SamplesPos;
            }
            bundleCollection.Add(bundle);
        }
        //Remaining samples of both bins are distributed round-robin over the created bundles
        for (int i = 0; i < bin0SampleIdxs.Length - bin0SamplesPos; i++)
        {
            int bundleIdx = i % bundleCollection.Count;
            bundleCollection[bundleIdx].InputVectorCollection.Add(InputVectorCollection[bin0SampleIdxs[bin0SamplesPos + i]]);
            bundleCollection[bundleIdx].OutputVectorCollection.Add(OutputVectorCollection[bin0SampleIdxs[bin0SamplesPos + i]]);
        }
        for (int i = 0; i < bin1SampleIdxs.Length - bin1SamplesPos; i++)
        {
            int bundleIdx = i % bundleCollection.Count;
            bundleCollection[bundleIdx].InputVectorCollection.Add(InputVectorCollection[bin1SampleIdxs[bin1SamplesPos + i]]);
            bundleCollection[bundleIdx].OutputVectorCollection.Add(OutputVectorCollection[bin1SampleIdxs[bin1SamplesPos + i]]);
        }
    }
    else
    {
        //Simple sequential split
        //Bundles creation
        int samplesPos = 0;
        for (int bundleNum = 0; bundleNum < numOfBundles; bundleNum++)
        {
            VectorBundle bundle = new VectorBundle();
            for (int i = 0; i < subBundleSize && samplesPos < OutputVectorCollection.Count; i++)
            {
                bundle.InputVectorCollection.Add(InputVectorCollection[samplesPos]);
                bundle.OutputVectorCollection.Add(OutputVectorCollection[samplesPos]);
                ++samplesPos;
            }
            bundleCollection.Add(bundle);
        }
        //Remaining samples are distributed round-robin over the created bundles
        for (int i = 0; i < OutputVectorCollection.Count - samplesPos; i++)
        {
            int bundleIdx = i % bundleCollection.Count;
            bundleCollection[bundleIdx].InputVectorCollection.Add(InputVectorCollection[samplesPos + i]);
            bundleCollection[bundleIdx].OutputVectorCollection.Add(OutputVectorCollection[samplesPos + i]);
        }
    }
    return(bundleCollection);
}
/// <summary>
/// Builds readout layer.
/// Prepares prediction clusters containing trained readout units.
/// </summary>
/// <param name="dataBundle">Collection of input predictors and associated desired output values</param>
/// <param name="regressionController">Regression controller delegate</param>
/// <param name="regressionControllerData">A user object passed to the regression controller</param>
/// <param name="predictorsMapper">Optional specific mapping of predictors to readout units</param>
/// <returns>Returned ResultComparativeBundle is something like a protocol.
/// There is recorded fold by fold (unit by unit) predicted and corresponding ideal values.
/// This is the pessimistic approach. Real results on unseen data could be better due to the clustering synergy.
/// </returns>
/// <exception cref="ArgumentException">When the data bundle is inconsistent or the test dataset size is out of the allowed bounds.</exception>
public ResultComparativeBundle Build(VectorBundle dataBundle,
                                     ReadoutUnit.RegressionCallbackDelegate regressionController,
                                     Object regressionControllerData,
                                     PredictorsMapper predictorsMapper = null
                                     )
{
    //Basic checks (specific ArgumentException instead of bare Exception; message typo "tham" fixed)
    int numOfPredictors = dataBundle.InputVectorCollection[0].Length;
    int numOfOutputs = dataBundle.OutputVectorCollection[0].Length;
    if (numOfPredictors == 0)
    {
        throw new ArgumentException("Number of predictors must be greater than 0.", "dataBundle");
    }
    if (numOfOutputs != _settings.ReadoutUnitCfgCollection.Count)
    {
        throw new ArgumentException("Incorrect number of ideal output values in the vector.", "dataBundle");
    }
    //Normalization of predictors and output data collections
    //Allocation of normalizers
    _predictorNormalizerCollection = new Normalizer[numOfPredictors];
    for (int i = 0; i < numOfPredictors; i++)
    {
        _predictorNormalizerCollection[i] = new Normalizer(DataRange, NormalizerDefaultReserve, true, false);
    }
    _outputNormalizerCollection = new Normalizer[numOfOutputs];
    for (int i = 0; i < numOfOutputs; i++)
    {
        bool classificationTask = (_settings.ReadoutUnitCfgCollection[i].TaskType == CommonEnums.TaskType.Classification);
        //Classification outputs are normalized without the reserve and without the standardization
        _outputNormalizerCollection[i] = new Normalizer(DataRange,
                                                        classificationTask ? 0 : NormalizerDefaultReserve,
                                                        classificationTask ? false : true,
                                                        false
                                                        );
    }
    //Normalizers adjustment
    for (int pairIdx = 0; pairIdx < dataBundle.InputVectorCollection.Count; pairIdx++)
    {
        //Checks
        if (dataBundle.InputVectorCollection[pairIdx].Length != numOfPredictors)
        {
            throw new ArgumentException("Inconsistent number of predictors in the predictors collection.", "dataBundle");
        }
        if (dataBundle.OutputVectorCollection[pairIdx].Length != numOfOutputs)
        {
            throw new ArgumentException("Inconsistent number of values in the ideal values collection.", "dataBundle");
        }
        //Adjust predictors normalizers
        for (int i = 0; i < numOfPredictors; i++)
        {
            _predictorNormalizerCollection[i].Adjust(dataBundle.InputVectorCollection[pairIdx][i]);
        }
        //Adjust outputs normalizers
        for (int i = 0; i < numOfOutputs; i++)
        {
            _outputNormalizerCollection[i].Adjust(dataBundle.OutputVectorCollection[pairIdx][i]);
        }
    }
    //Data normalization
    //Allocation
    List<double[]> predictorsCollection = new List<double[]>(dataBundle.InputVectorCollection.Count);
    List<double[]> idealOutputsCollection = new List<double[]>(dataBundle.OutputVectorCollection.Count);
    //Normalization
    for (int pairIdx = 0; pairIdx < dataBundle.InputVectorCollection.Count; pairIdx++)
    {
        //Predictors
        double[] predictors = new double[numOfPredictors];
        for (int i = 0; i < numOfPredictors; i++)
        {
            predictors[i] = _predictorNormalizerCollection[i].Normalize(dataBundle.InputVectorCollection[pairIdx][i]);
        }
        predictorsCollection.Add(predictors);
        //Outputs
        double[] outputs = new double[numOfOutputs];
        for (int i = 0; i < numOfOutputs; i++)
        {
            outputs[i] = _outputNormalizerCollection[i].Normalize(dataBundle.OutputVectorCollection[pairIdx][i]);
        }
        idealOutputsCollection.Add(outputs);
    }
    //Data processing
    //Random object with a fixed seed -> deterministic, reproducible builds
    Random rand = new Random(0);
    //Predictors mapper (specified or default)
    _predictorsMapper = predictorsMapper ?? new PredictorsMapper(numOfPredictors);
    //Allocation of computed and ideal vectors for result comparative bundle
    List<double[]> validationComputedVectorCollection = new List<double[]>(idealOutputsCollection.Count);
    List<double[]> validationIdealVectorCollection = new List<double[]>(idealOutputsCollection.Count);
    for (int i = 0; i < idealOutputsCollection.Count; i++)
    {
        validationComputedVectorCollection.Add(new double[numOfOutputs]);
        validationIdealVectorCollection.Add(new double[numOfOutputs]);
    }
    //Test dataset size
    if (_settings.TestDataRatio > MaxRatioOfTestData)
    {
        throw new ArgumentException($"Test dataset size is greater than {MaxRatioOfTestData.ToString(CultureInfo.InvariantCulture)}", "TestDataSetSize");
    }
    int testDataSetLength = (int)Math.Round(idealOutputsCollection.Count * _settings.TestDataRatio, 0);
    if (testDataSetLength < MinLengthOfTestDataset)
    {
        throw new ArgumentException($"Num of test samples is less than {MinLengthOfTestDataset.ToString(CultureInfo.InvariantCulture)}", "TestDataSetSize");
    }
    //Number of folds
    int numOfFolds = _settings.NumOfFolds;
    if (numOfFolds <= 0)
    {
        //Auto setup (testDataSetLength >= MinLengthOfTestDataset > 0 is guaranteed by the check above)
        numOfFolds = idealOutputsCollection.Count / testDataSetLength;
        if (numOfFolds > MaxNumOfFolds)
        {
            numOfFolds = MaxNumOfFolds;
        }
    }
    //Create shuffled copy of the data
    VectorBundle shuffledData = new VectorBundle(predictorsCollection, idealOutputsCollection);
    shuffledData.Shuffle(rand);
    //Data inspection, preparation of datasets and training of ReadoutUnits
    //Clusters of readout units (one cluster for each output field)
    for (int clusterIdx = 0; clusterIdx < _settings.ReadoutUnitCfgCollection.Count; clusterIdx++)
    {
        _clusterCollection[clusterIdx] = new ReadoutUnit[numOfFolds];
        List<double[]> idealValueCollection = new List<double[]>(idealOutputsCollection.Count);
        BinDistribution refBinDistr = null;
        if (_settings.ReadoutUnitCfgCollection[clusterIdx].TaskType == CommonEnums.TaskType.Classification)
        {
            //Reference binary distribution is relevant only for classification task
            refBinDistr = new BinDistribution(DataRange.Mid);
        }
        //Transformation to a single value vectors and data analysis
        foreach (double[] idealVector in shuffledData.OutputVectorCollection)
        {
            double[] value = new double[1];
            value[0] = idealVector[clusterIdx];
            idealValueCollection.Add(value);
            if (_settings.ReadoutUnitCfgCollection[clusterIdx].TaskType == CommonEnums.TaskType.Classification)
            {
                //Reference binary distribution is relevant only for classification task
                refBinDistr.Update(value);
            }
        }
        List<VectorBundle> subBundleCollection = null;
        List<double[]> readoutUnitInputVectorCollection = _predictorsMapper.CreateVectorCollection(_settings.ReadoutUnitCfgCollection[clusterIdx].Name, shuffledData.InputVectorCollection);
        //Datasets preparation is depending on the task type
        if (_settings.ReadoutUnitCfgCollection[clusterIdx].TaskType == CommonEnums.TaskType.Classification)
        {
            //Classification task -> keep balanced bin 0/1 counts in every sub-bundle
            subBundleCollection = DivideSamplesForClassificationTask(readoutUnitInputVectorCollection,
                                                                     idealValueCollection,
                                                                     refBinDistr,
                                                                     testDataSetLength
                                                                     );
        }
        else
        {
            //Forecast task
            subBundleCollection = DivideSamplesForForecastTask(readoutUnitInputVectorCollection,
                                                               idealValueCollection,
                                                               testDataSetLength
                                                               );
        }
        //Find best unit per each fold in the cluster.
        ClusterErrStatistics ces = new ClusterErrStatistics(_settings.ReadoutUnitCfgCollection[clusterIdx].TaskType, numOfFolds, refBinDistr);
        int arrayPos = 0;
        for (int foldIdx = 0; foldIdx < numOfFolds; foldIdx++)
        {
            //Build training samples = all sub-bundles except the current fold (the fold itself is the test set)
            List<double[]> trainingPredictorsCollection = new List<double[]>();
            List<double[]> trainingIdealValueCollection = new List<double[]>();
            for (int bundleIdx = 0; bundleIdx < subBundleCollection.Count; bundleIdx++)
            {
                if (bundleIdx != foldIdx)
                {
                    trainingPredictorsCollection.AddRange(subBundleCollection[bundleIdx].InputVectorCollection);
                    trainingIdealValueCollection.AddRange(subBundleCollection[bundleIdx].OutputVectorCollection);
                }
            }
            //Call training regression to get the best fold's readout unit.
            //The best unit becomes the predicting cluster member.
            _clusterCollection[clusterIdx][foldIdx] = ReadoutUnit.CreateTrained(_settings.ReadoutUnitCfgCollection[clusterIdx].TaskType,
                                                                               clusterIdx,
                                                                               foldIdx + 1,
                                                                               numOfFolds,
                                                                               refBinDistr,
                                                                               trainingPredictorsCollection,
                                                                               trainingIdealValueCollection,
                                                                               subBundleCollection[foldIdx].InputVectorCollection,
                                                                               subBundleCollection[foldIdx].OutputVectorCollection,
                                                                               rand,
                                                                               _settings.ReadoutUnitCfgCollection[clusterIdx],
                                                                               regressionController,
                                                                               regressionControllerData
                                                                               );
            //Cluster error statistics & data for validation bundle (pessimistic approach)
            for (int sampleIdx = 0; sampleIdx < subBundleCollection[foldIdx].OutputVectorCollection.Count; sampleIdx++)
            {
                double nrmComputedValue = _clusterCollection[clusterIdx][foldIdx].Network.Compute(subBundleCollection[foldIdx].InputVectorCollection[sampleIdx])[0];
                double natComputedValue = _outputNormalizerCollection[clusterIdx].Naturalize(nrmComputedValue);
                double natIdealValue = _outputNormalizerCollection[clusterIdx].Naturalize(subBundleCollection[foldIdx].OutputVectorCollection[sampleIdx][0]);
                ces.Update(nrmComputedValue, subBundleCollection[foldIdx].OutputVectorCollection[sampleIdx][0], natComputedValue, natIdealValue);
                validationIdealVectorCollection[arrayPos][clusterIdx] = natIdealValue;
                validationComputedVectorCollection[arrayPos][clusterIdx] = natComputedValue;
                ++arrayPos;
            }
        } //foldIdx
        _clusterErrStatisticsCollection.Add(ces);
    } //clusterIdx
    //Validation bundle is returned.
    return(new ResultComparativeBundle(validationComputedVectorCollection, validationIdealVectorCollection));
}