Пример #1
0
        /// <summary>
        /// Builds readout layer.
        /// Prepares prediction clusters containing trained readout units.
        /// </summary>
        /// <param name="predictorsCollection">Collection of predictors</param>
        /// <param name="idealOutputsCollection">Collection of desired outputs related to predictors</param>
        /// <param name="regressionController">Regression controller delegate</param>
        /// <param name="regressionControllerData">An user object</param>
        /// <returns>Returned ValidationBundle is something like a protocol.
        /// There is recorded fold by fold (unit by unit) predicted and corresponding ideal values.
        /// This is the pesimistic approach. Real results on unseen data could be better due to the clustering synergy.
        /// </returns>
        public ValidationBundle Build(List <double[]> predictorsCollection,
                                      List <double[]> idealOutputsCollection,
                                      ReadoutUnit.RegressionCallbackDelegate regressionController,
                                      Object regressionControllerData
                                      )
        {
            //Random object
            Random rand = new Random(0);
            //Allocation of computed and ideal vectors for validation bundle
            List <double[]> validationComputedVectorCollection = new List <double[]>(idealOutputsCollection.Count);
            List <double[]> validationIdealVectorCollection    = new List <double[]>(idealOutputsCollection.Count);

            for (int i = 0; i < idealOutputsCollection.Count; i++)
            {
                validationComputedVectorCollection.Add(new double[idealOutputsCollection[0].Length]);
                validationIdealVectorCollection.Add(new double[idealOutputsCollection[0].Length]);
            }
            //Test dataset size
            if (_settings.TestDataRatio > MaxRatioOfTestData)
            {
                throw new ArgumentException($"Test dataset size is greater than {MaxRatioOfTestData.ToString(CultureInfo.InvariantCulture)}", "TestDataSetSize");
            }
            int testDataSetLength = (int)Math.Round(idealOutputsCollection.Count * _settings.TestDataRatio, 0);

            if (testDataSetLength < MinLengthOfTestDataset)
            {
                throw new ArgumentException($"Num of test samples is less than {MinLengthOfTestDataset.ToString(CultureInfo.InvariantCulture)}", "TestDataSetSize");
            }
            //Number of folds
            int numOfFolds = _settings.NumOfFolds;

            if (numOfFolds <= 0)
            {
                //Auto setup
                numOfFolds = idealOutputsCollection.Count / testDataSetLength;
                if (numOfFolds > MaxNumOfFolds)
                {
                    numOfFolds = MaxNumOfFolds;
                }
            }
            //Create shuffled copy of the data
            TimeSeriesBundle shuffledData = new TimeSeriesBundle(predictorsCollection, idealOutputsCollection);

            shuffledData.Shuffle(rand);
            //Data inspection, preparation of datasets and training of ReadoutUnits
            //Clusters of readout units (one cluster for each output field)
            for (int clusterIdx = 0; clusterIdx < _settings.ReadoutUnitCfgCollection.Count; clusterIdx++)
            {
                _clusterCollection[clusterIdx] = new ReadoutUnit[numOfFolds];
                List <double[]> idealValueCollection = new List <double[]>(idealOutputsCollection.Count);
                BinDistribution refBinDistr          = null;
                if (_settings.ReadoutUnitCfgCollection[clusterIdx].TaskType == CommonEnums.TaskType.Classification)
                {
                    //Reference binary distribution is relevant only for classification task
                    refBinDistr = new BinDistribution(_dataRange.Mid);
                }
                //Transformation to a single value vectors and data analysis
                foreach (double[] idealVector in shuffledData.OutputVectorCollection)
                {
                    double[] value = new double[1];
                    value[0] = idealVector[clusterIdx];
                    idealValueCollection.Add(value);
                    if (_settings.ReadoutUnitCfgCollection[clusterIdx].TaskType == CommonEnums.TaskType.Classification)
                    {
                        //Reference binary distribution is relevant only for classification task
                        refBinDistr.Update(value);
                    }
                }
                List <TimeSeriesBundle> subBundleCollection = null;
                //Datasets preparation is depending on the task type
                if (_settings.ReadoutUnitCfgCollection[clusterIdx].TaskType == CommonEnums.TaskType.Classification)
                {
                    //Classification task
                    subBundleCollection = DivideSamplesForClassificationTask(shuffledData.InputVectorCollection,
                                                                             idealValueCollection,
                                                                             refBinDistr,
                                                                             testDataSetLength
                                                                             );
                }
                else
                {
                    //Forecast task
                    subBundleCollection = DivideSamplesForForecastTask(shuffledData.InputVectorCollection,
                                                                       idealValueCollection,
                                                                       testDataSetLength
                                                                       );
                }
                //Best predicting unit per each fold in the cluster.
                ClusterErrStatistics ces = new ClusterErrStatistics(_settings.ReadoutUnitCfgCollection[clusterIdx].TaskType, numOfFolds, refBinDistr);
                int arrayPos             = 0;
                for (int foldIdx = 0; foldIdx < numOfFolds; foldIdx++)
                {
                    //Build training samples
                    List <double[]> trainingPredictorsCollection = new List <double[]>();
                    List <double[]> trainingIdealValueCollection = new List <double[]>();
                    for (int bundleIdx = 0; bundleIdx < subBundleCollection.Count; bundleIdx++)
                    {
                        if (bundleIdx != foldIdx)
                        {
                            trainingPredictorsCollection.AddRange(subBundleCollection[bundleIdx].InputVectorCollection);
                            trainingIdealValueCollection.AddRange(subBundleCollection[bundleIdx].OutputVectorCollection);
                        }
                    }
                    //Call training regression to get the best fold's readout unit.
                    //The best unit becomes to be the predicting cluster member.
                    _clusterCollection[clusterIdx][foldIdx] = ReadoutUnit.CreateTrained(_settings.ReadoutUnitCfgCollection[clusterIdx].TaskType,
                                                                                        clusterIdx,
                                                                                        foldIdx + 1,
                                                                                        numOfFolds,
                                                                                        refBinDistr,
                                                                                        trainingPredictorsCollection,
                                                                                        trainingIdealValueCollection,
                                                                                        subBundleCollection[foldIdx].InputVectorCollection,
                                                                                        subBundleCollection[foldIdx].OutputVectorCollection,
                                                                                        rand,
                                                                                        _settings.ReadoutUnitCfgCollection[clusterIdx],
                                                                                        regressionController,
                                                                                        regressionControllerData
                                                                                        );
                    //Cluster error statistics & data for validation bundle (pesimistic approach)
                    for (int sampleIdx = 0; sampleIdx < subBundleCollection[foldIdx].OutputVectorCollection.Count; sampleIdx++)
                    {
                        double value = _clusterCollection[clusterIdx][foldIdx].Network.Compute(subBundleCollection[foldIdx].InputVectorCollection[sampleIdx])[0];
                        ces.Update(value, subBundleCollection[foldIdx].OutputVectorCollection[sampleIdx][0]);
                        validationIdealVectorCollection[arrayPos][clusterIdx]    = subBundleCollection[foldIdx].OutputVectorCollection[sampleIdx][0];
                        validationComputedVectorCollection[arrayPos][clusterIdx] = value;
                        ++arrayPos;
                    }
                } //foldIdx
                _clusterErrStatisticsCollection.Add(ces);
            }     //clusterIdx
            //Validation bundle is returned.
            return(new ValidationBundle(validationComputedVectorCollection, validationIdealVectorCollection));
        }
Пример #2
0
        /// <summary>
        /// Builds readout layer.
        /// Prepares prediction clusters containing trained readout units.
        /// </summary>
        /// <param name="dataBundle">Collection of input predictors and associated desired output values</param>
        /// <param name="regressionController">Regression controller delegate</param>
        /// <param name="regressionControllerData">An user object</param>
        /// <param name="predictorsMapper">Optional specific mapping of predictors to readout units</param>
        /// <returns>Returned ResultComparativeBundle is something like a protocol.
        /// There is recorded fold by fold (unit by unit) predicted and corresponding ideal values.
        /// This is the pesimistic approach. Real results on unseen data could be better due to the clustering synergy.
        /// </returns>
        public ResultComparativeBundle Build(VectorBundle dataBundle,
                                             ReadoutUnit.RegressionCallbackDelegate regressionController,
                                             Object regressionControllerData,
                                             PredictorsMapper predictorsMapper = null
                                             )
        {
            //Basic checks
            int numOfPredictors = dataBundle.InputVectorCollection[0].Length;
            int numOfOutputs    = dataBundle.OutputVectorCollection[0].Length;

            if (numOfPredictors == 0)
            {
                throw new Exception("Number of predictors must be greater tham 0.");
            }
            if (numOfOutputs != _settings.ReadoutUnitCfgCollection.Count)
            {
                throw new Exception("Incorrect number of ideal output values in the vector.");
            }

            //Normalization of predictors and output data collections
            //Allocation of normalizers
            _predictorNormalizerCollection = new Normalizer[numOfPredictors];
            for (int i = 0; i < numOfPredictors; i++)
            {
                _predictorNormalizerCollection[i] = new Normalizer(DataRange, NormalizerDefaultReserve, true, false);
            }
            _outputNormalizerCollection = new Normalizer[numOfOutputs];
            for (int i = 0; i < numOfOutputs; i++)
            {
                bool classificationTask = (_settings.ReadoutUnitCfgCollection[i].TaskType == CommonEnums.TaskType.Classification);
                _outputNormalizerCollection[i] = new Normalizer(DataRange,
                                                                classificationTask ? 0 : NormalizerDefaultReserve,
                                                                classificationTask ? false : true,
                                                                false
                                                                );
            }
            //Normalizers adjustment
            for (int pairIdx = 0; pairIdx < dataBundle.InputVectorCollection.Count; pairIdx++)
            {
                //Checks
                if (dataBundle.InputVectorCollection[pairIdx].Length != numOfPredictors)
                {
                    throw new Exception("Inconsistent number of predictors in the predictors collection.");
                }
                if (dataBundle.OutputVectorCollection[pairIdx].Length != numOfOutputs)
                {
                    throw new Exception("Inconsistent number of values in the ideal values collection.");
                }
                //Adjust predictors normalizers
                for (int i = 0; i < numOfPredictors; i++)
                {
                    _predictorNormalizerCollection[i].Adjust(dataBundle.InputVectorCollection[pairIdx][i]);
                }
                //Adjust outputs normalizers
                for (int i = 0; i < numOfOutputs; i++)
                {
                    _outputNormalizerCollection[i].Adjust(dataBundle.OutputVectorCollection[pairIdx][i]);
                }
            }
            //Data normalization
            //Allocation
            List <double[]> predictorsCollection   = new List <double[]>(dataBundle.InputVectorCollection.Count);
            List <double[]> idealOutputsCollection = new List <double[]>(dataBundle.OutputVectorCollection.Count);

            //Normalization
            for (int pairIdx = 0; pairIdx < dataBundle.InputVectorCollection.Count; pairIdx++)
            {
                //Predictors
                double[] predictors = new double[numOfPredictors];
                for (int i = 0; i < numOfPredictors; i++)
                {
                    predictors[i] = _predictorNormalizerCollection[i].Normalize(dataBundle.InputVectorCollection[pairIdx][i]);
                }
                predictorsCollection.Add(predictors);
                //Outputs
                double[] outputs = new double[numOfOutputs];
                for (int i = 0; i < numOfOutputs; i++)
                {
                    outputs[i] = _outputNormalizerCollection[i].Normalize(dataBundle.OutputVectorCollection[pairIdx][i]);
                }
                idealOutputsCollection.Add(outputs);
            }
            //Data processing
            //Random object initialization
            Random rand = new Random(0);

            //Predictors mapper (specified or default)
            _predictorsMapper = predictorsMapper ?? new PredictorsMapper(numOfPredictors);
            //Allocation of computed and ideal vectors for result comparative bundle
            List <double[]> validationComputedVectorCollection = new List <double[]>(idealOutputsCollection.Count);
            List <double[]> validationIdealVectorCollection    = new List <double[]>(idealOutputsCollection.Count);

            for (int i = 0; i < idealOutputsCollection.Count; i++)
            {
                validationComputedVectorCollection.Add(new double[numOfOutputs]);
                validationIdealVectorCollection.Add(new double[numOfOutputs]);
            }
            //Test dataset size
            if (_settings.TestDataRatio > MaxRatioOfTestData)
            {
                throw new ArgumentException($"Test dataset size is greater than {MaxRatioOfTestData.ToString(CultureInfo.InvariantCulture)}", "TestDataSetSize");
            }
            int testDataSetLength = (int)Math.Round(idealOutputsCollection.Count * _settings.TestDataRatio, 0);

            if (testDataSetLength < MinLengthOfTestDataset)
            {
                throw new ArgumentException($"Num of test samples is less than {MinLengthOfTestDataset.ToString(CultureInfo.InvariantCulture)}", "TestDataSetSize");
            }
            //Number of folds
            int numOfFolds = _settings.NumOfFolds;

            if (numOfFolds <= 0)
            {
                //Auto setup
                numOfFolds = idealOutputsCollection.Count / testDataSetLength;
                if (numOfFolds > MaxNumOfFolds)
                {
                    numOfFolds = MaxNumOfFolds;
                }
            }
            //Create shuffled copy of the data
            VectorBundle shuffledData = new VectorBundle(predictorsCollection, idealOutputsCollection);

            shuffledData.Shuffle(rand);
            //Data inspection, preparation of datasets and training of ReadoutUnits
            //Clusters of readout units (one cluster for each output field)
            for (int clusterIdx = 0; clusterIdx < _settings.ReadoutUnitCfgCollection.Count; clusterIdx++)
            {
                _clusterCollection[clusterIdx] = new ReadoutUnit[numOfFolds];
                List <double[]> idealValueCollection = new List <double[]>(idealOutputsCollection.Count);
                BinDistribution refBinDistr          = null;
                if (_settings.ReadoutUnitCfgCollection[clusterIdx].TaskType == CommonEnums.TaskType.Classification)
                {
                    //Reference binary distribution is relevant only for classification task
                    refBinDistr = new BinDistribution(DataRange.Mid);
                }
                //Transformation to a single value vectors and data analysis
                foreach (double[] idealVector in shuffledData.OutputVectorCollection)
                {
                    double[] value = new double[1];
                    value[0] = idealVector[clusterIdx];
                    idealValueCollection.Add(value);
                    if (_settings.ReadoutUnitCfgCollection[clusterIdx].TaskType == CommonEnums.TaskType.Classification)
                    {
                        //Reference binary distribution is relevant only for classification task
                        refBinDistr.Update(value);
                    }
                }
                List <VectorBundle> subBundleCollection = null;
                List <double[]>     readoutUnitInputVectorCollection = _predictorsMapper.CreateVectorCollection(_settings.ReadoutUnitCfgCollection[clusterIdx].Name, shuffledData.InputVectorCollection);
                //Datasets preparation is depending on the task type
                if (_settings.ReadoutUnitCfgCollection[clusterIdx].TaskType == CommonEnums.TaskType.Classification)
                {
                    //Classification task
                    subBundleCollection = DivideSamplesForClassificationTask(readoutUnitInputVectorCollection,
                                                                             idealValueCollection,
                                                                             refBinDistr,
                                                                             testDataSetLength
                                                                             );
                }
                else
                {
                    //Forecast task
                    subBundleCollection = DivideSamplesForForecastTask(readoutUnitInputVectorCollection,
                                                                       idealValueCollection,
                                                                       testDataSetLength
                                                                       );
                }
                //Find best unit per each fold in the cluster.
                ClusterErrStatistics ces = new ClusterErrStatistics(_settings.ReadoutUnitCfgCollection[clusterIdx].TaskType, numOfFolds, refBinDistr);
                int arrayPos             = 0;
                for (int foldIdx = 0; foldIdx < numOfFolds; foldIdx++)
                {
                    //Build training samples
                    List <double[]> trainingPredictorsCollection = new List <double[]>();
                    List <double[]> trainingIdealValueCollection = new List <double[]>();
                    for (int bundleIdx = 0; bundleIdx < subBundleCollection.Count; bundleIdx++)
                    {
                        if (bundleIdx != foldIdx)
                        {
                            trainingPredictorsCollection.AddRange(subBundleCollection[bundleIdx].InputVectorCollection);
                            trainingIdealValueCollection.AddRange(subBundleCollection[bundleIdx].OutputVectorCollection);
                        }
                    }
                    //Call training regression to get the best fold's readout unit.
                    //The best unit becomes to be the predicting cluster member.
                    _clusterCollection[clusterIdx][foldIdx] = ReadoutUnit.CreateTrained(_settings.ReadoutUnitCfgCollection[clusterIdx].TaskType,
                                                                                        clusterIdx,
                                                                                        foldIdx + 1,
                                                                                        numOfFolds,
                                                                                        refBinDistr,
                                                                                        trainingPredictorsCollection,
                                                                                        trainingIdealValueCollection,
                                                                                        subBundleCollection[foldIdx].InputVectorCollection,
                                                                                        subBundleCollection[foldIdx].OutputVectorCollection,
                                                                                        rand,
                                                                                        _settings.ReadoutUnitCfgCollection[clusterIdx],
                                                                                        regressionController,
                                                                                        regressionControllerData
                                                                                        );
                    //Cluster error statistics & data for validation bundle (pesimistic approach)
                    for (int sampleIdx = 0; sampleIdx < subBundleCollection[foldIdx].OutputVectorCollection.Count; sampleIdx++)
                    {
                        double nrmComputedValue = _clusterCollection[clusterIdx][foldIdx].Network.Compute(subBundleCollection[foldIdx].InputVectorCollection[sampleIdx])[0];
                        double natComputedValue = _outputNormalizerCollection[clusterIdx].Naturalize(nrmComputedValue);
                        double natIdealValue    = _outputNormalizerCollection[clusterIdx].Naturalize(subBundleCollection[foldIdx].OutputVectorCollection[sampleIdx][0]);
                        ces.Update(nrmComputedValue,
                                   subBundleCollection[foldIdx].OutputVectorCollection[sampleIdx][0],
                                   natComputedValue,
                                   natIdealValue);
                        validationIdealVectorCollection[arrayPos][clusterIdx]    = natIdealValue;
                        validationComputedVectorCollection[arrayPos][clusterIdx] = natComputedValue;
                        ++arrayPos;
                    }
                } //foldIdx
                _clusterErrStatisticsCollection.Add(ces);
            }     //clusterIdx
            //Validation bundle is returned.
            return(new ResultComparativeBundle(validationComputedVectorCollection, validationIdealVectorCollection));
        }