Example #1
0
 /// <summary>
 /// Normalizes all values in the sample data bundle
 /// </summary>
 /// <param name="bundle">Sample data bundle</param>
 public void Normalize(VectorBundle bundle)
 {
     //First prepare the normalizers according to the bundle data
     AdjustNormalizers(bundle);
     //Then apply them on both the input and output vectors
     NormalizeInputVectorCollection(bundle.InputVectorCollection);
     NormalizeOutputVectorCollection(bundle.OutputVectorCollection);
 }
Example #2
0
        /// <summary>
        /// Loads the vector bundle from the csv data (patterned input feeding).
        /// </summary>
        /// <param name="csvData">The csv data.</param>
        /// <param name="numOfOutputFields">The number of output fields (the trailing fields of each data row).</param>
        /// <returns>The loaded vector bundle.</returns>
        /// <exception cref="ArgumentException">Thrown when a data row does not contain more fields than numOfOutputFields.</exception>
        public static VectorBundle Load(CsvDataHolder csvData, int numOfOutputFields)
        {
            VectorBundle bundle = new VectorBundle();

            foreach (DelimitedStringValues dataRow in csvData.DataRowCollection)
            {
                //The leading fields of the row are inputs, the trailing numOfOutputFields fields are outputs
                int numOfInputValues = dataRow.NumOfStringValues - numOfOutputFields;
                //Check data length
                if (numOfInputValues <= 0)
                {
                    //Fix: use nameof instead of the refactor-unsafe literal "csvData"
                    throw new ArgumentException("Incorrect length of data row.", nameof(csvData));
                }
                //Input data
                double[] inputData = new double[numOfInputValues];
                for (int i = 0; i < numOfInputValues; i++)
                {
                    inputData[i] = dataRow.GetValueAt(i).ParseDouble(true, $"Can't parse double data value {dataRow.GetValueAt(i)}.");
                }
                //Output data
                double[] outputData = new double[numOfOutputFields];
                for (int i = 0; i < numOfOutputFields; i++)
                {
                    outputData[i] = dataRow.GetValueAt(numOfInputValues + i).ParseDouble(true, $"Can't parse double data value {dataRow.GetValueAt(numOfInputValues + i)}.");
                }
                bundle.AddPair(inputData, outputData);
            }
            return bundle;
        }
Example #3
0
 /// <summary>
 /// Adjusts internal normalizers
 /// </summary>
 /// <param name="bundle">Sample data bundle</param>
 public void AdjustNormalizers(VectorBundle bundle)
 {
     //Start from a clean state
     ResetNormalizers();
     //Let every input vector contribute to the input normalizers
     for (int i = 0; i < bundle.InputVectorCollection.Count; i++)
     {
         AdjustInputNormalizers(bundle.InputVectorCollection[i]);
     }
     //Let every output vector contribute to the output normalizers
     for (int i = 0; i < bundle.OutputVectorCollection.Count; i++)
     {
         AdjustOutputNormalizers(bundle.OutputVectorCollection[i]);
     }
 }
Example #4
0
        //Static methods
        /// <summary>
        /// Loads the data and prepares TimeSeriesBundle.
        /// The first line of the csv file must contain field names. These field names must
        /// match the names of the input and output fields.
        /// </summary>
        /// <param name="fileName"> Data file name </param>
        /// <param name="inputFieldNameCollection"> Input fields to be extracted from a file</param>
        /// <param name="outputFieldNameCollection"> Output fields to be extracted from a file</param>
        /// <param name="remainingInputVector"> Returned the last input vector unused in the bundle </param>
        /// <returns>The prepared vector bundle.</returns>
        /// <exception cref="FormatException">Thrown when the column names can not be parsed or a requested field name is not among the column names.</exception>
        public static VectorBundle LoadFromCsv(string fileName,
                                               List <string> inputFieldNameCollection,
                                               List <string> outputFieldNameCollection,
                                               out double[] remainingInputVector
                                               )
        {
            VectorBundle bundle = null;

            remainingInputVector = null;
            using (StreamReader streamReader = new StreamReader(new FileStream(fileName, FileMode.Open)))
            {
                List <int> inputFieldIndexes  = new List <int>();
                List <int> outputFieldIndexes = new List <int>();
                //First row contains column names (data fields)
                string delimitedColumnNames = streamReader.ReadLine();
                //What data delimiter is used?
                char csvDelimiter = DelimitedStringValues.RecognizeDelimiter(delimitedColumnNames);
                //Split column names
                DelimitedStringValues columnNames = new DelimitedStringValues(csvDelimiter);
                columnNames.LoadFromString(delimitedColumnNames);
                //Check if the recognized data delimiter works properly
                if (columnNames.NumOfStringValues < inputFieldNameCollection.Count)
                {
                    throw new FormatException("1st row of the file doesn't contain delimited column names or the value delimiter was not properly recognized.");
                }
                //Collect indexes of the requested fields.
                //Fix: originally an unresolved field name silently produced index -1 and failed later with a confusing error.
                foreach (string name in inputFieldNameCollection)
                {
                    int fieldIdx = columnNames.IndexOf(name);
                    if (fieldIdx < 0)
                    {
                        throw new FormatException($"Input field name {name} was not found among the column names.");
                    }
                    inputFieldIndexes.Add(fieldIdx);
                }
                foreach (string name in outputFieldNameCollection)
                {
                    int fieldIdx = columnNames.IndexOf(name);
                    if (fieldIdx < 0)
                    {
                        throw new FormatException($"Output field name {name} was not found among the column names.");
                    }
                    outputFieldIndexes.Add(fieldIdx);
                }
                //Load full data in string form
                List <DelimitedStringValues> fullData = new List <DelimitedStringValues>();
                while (!streamReader.EndOfStream)
                {
                    DelimitedStringValues row = new DelimitedStringValues(csvDelimiter);
                    row.LoadFromString(streamReader.ReadLine());
                    fullData.Add(row);
                }
                //Prepare input and output vectors
                //Time-shifted pairing: inputs of row i are paired with outputs of row i+1
                List <double[]> inputVectorCollection  = new List <double[]>(fullData.Count);
                List <double[]> outputVectorCollection = new List <double[]>(fullData.Count);
                for (int i = 0; i < fullData.Count; i++)
                {
                    //Input vector
                    double[] inputVector = new double[inputFieldIndexes.Count];
                    for (int j = 0; j < inputFieldIndexes.Count; j++)
                    {
                        inputVector[j] = fullData[i].GetValue(inputFieldIndexes[j]).ParseDouble(true, $"Can't parse double value {fullData[i].GetValue(inputFieldIndexes[j])}.");
                    }
                    if (i < fullData.Count - 1)
                    {
                        //Within the bundle
                        inputVectorCollection.Add(inputVector);
                    }
                    else
                    {
                        //The last row's input has no following output -> returned out of the bundle
                        remainingInputVector = inputVector;
                    }
                    if (i > 0)
                    {
                        //Output vector (the first row has no preceding input, so it yields no output)
                        double[] outputVector = new double[outputFieldIndexes.Count];
                        for (int j = 0; j < outputFieldIndexes.Count; j++)
                        {
                            outputVector[j] = fullData[i].GetValue(outputFieldIndexes[j]).ParseDouble(true, $"Can't parse double value {fullData[i].GetValue(outputFieldIndexes[j])}.");
                        }
                        outputVectorCollection.Add(outputVector);
                    }
                }
                //Create bundle
                bundle = new VectorBundle(inputVectorCollection, outputVectorCollection);
            }
            return bundle;
        } //LoadFromCsv
Example #5
0
 /// <summary>
 /// Naturalizes all values in the sample data bundle
 /// </summary>
 /// <param name="bundle">Sample data bundle</param>
 public void Naturalize(VectorBundle bundle)
 {
     //Apply naturalization to the inputs first, then to the outputs
     NaturalizeInputVectorCollection(bundle.InputVectorCollection);
     NaturalizeOutputVectorCollection(bundle.OutputVectorCollection);
 }
Example #6
0
        /// <summary>
        /// Splits this bundle to a collection of smaller folds (sub-bundles) suitable for the cross-validation.
        /// </summary>
        /// <param name="foldDataRatio">The requested ratio of the samples constituting the single fold (sub-bundle).</param>
        /// <param name="binBorder">When the binBorder is specified then all the output features are considered as binary features within the one-takes-all group and function then keeps balanced ratios of 0 and 1 for every output feature and the fold.</param>
        /// <returns>A collection of the created folds.</returns>
        /// <exception cref="InvalidOperationException">Thrown when there are less than 2 samples, or when a binary output has less than 2 samples of bin 0 or bin 1.</exception>
        /// <exception cref="ArgumentException">Thrown when binBorder is specified, there is more than one output feature and a sample does not have exactly one feature at bin value 1.</exception>
        public List <VectorBundle> Folderize(double foldDataRatio, double binBorder = double.NaN)
        {
            //At least two samples are required to form more than a trivial fold
            if (OutputVectorCollection.Count < 2)
            {
                throw new InvalidOperationException($"Insufficient number of samples ({OutputVectorCollection.Count.ToString(CultureInfo.InvariantCulture)}).");
            }
            List <VectorBundle> foldCollection = new List <VectorBundle>();

            //Fold data ratio basic correction
            //NOTE(review): only the upper bound is clamped; a zero or negative foldDataRatio is not rejected here - confirm callers guarantee a positive ratio.
            if (foldDataRatio > MaxRatioOfFoldData)
            {
                foldDataRatio = MaxRatioOfFoldData;
            }
            //Prelimitary fold size estimation
            int foldSize = Math.Max(1, (int)Math.Round(OutputVectorCollection.Count * foldDataRatio, 0));
            //Prelimitary number of folds
            int numOfFolds = (int)Math.Round((double)OutputVectorCollection.Count / foldSize);

            //Folds creation
            if (double.IsNaN(binBorder))
            {
                //No binary output -> simple split
                //Samples are taken sequentially, foldSize per fold
                int samplesPos = 0;
                for (int foldIdx = 0; foldIdx < numOfFolds; foldIdx++)
                {
                    VectorBundle fold = new VectorBundle();
                    for (int i = 0; i < foldSize && samplesPos < OutputVectorCollection.Count; i++)
                    {
                        fold.InputVectorCollection.Add(InputVectorCollection[samplesPos]);
                        fold.OutputVectorCollection.Add(OutputVectorCollection[samplesPos]);
                        ++samplesPos;
                    }
                    foldCollection.Add(fold);
                }
                //Remaining samples
                //Leftovers (when Count is not divisible by foldSize) are distributed round-robin over the existing folds
                for (int i = 0; i < OutputVectorCollection.Count - samplesPos; i++)
                {
                    int foldIdx = i % foldCollection.Count;
                    foldCollection[foldIdx].InputVectorCollection.Add(InputVectorCollection[samplesPos + i]);
                    foldCollection[foldIdx].OutputVectorCollection.Add(OutputVectorCollection[samplesPos + i]);
                }
            }//Indifferent output
            else
            {
                //Binary outputs -> keep balanced ratios of outputs
                int numOfOutputs = OutputVectorCollection[0].Length;
                if (numOfOutputs == 1)
                {
                    //Special case there is only one binary output
                    //Investigation of the output data metrics
                    BinDistribution refBinDistr = new BinDistribution(binBorder);
                    refBinDistr.Update(OutputVectorCollection, 0);
                    //Each fold needs at least one sample of each bin, so the scarcer bin caps the number of folds
                    int min01 = Math.Min(refBinDistr.NumOf[0], refBinDistr.NumOf[1]);
                    if (min01 < 2)
                    {
                        throw new InvalidOperationException($"Insufficient bin 0 or 1 samples (less than 2).");
                    }
                    if (numOfFolds > min01)
                    {
                        numOfFolds = min01;
                    }
                    //Scan data
                    //Collect the sample indexes of each bin separately
                    int[] bin0SampleIdxs = new int[refBinDistr.NumOf[0]];
                    int   bin0SamplesPos = 0;
                    int[] bin1SampleIdxs = new int[refBinDistr.NumOf[1]];
                    int   bin1SamplesPos = 0;
                    for (int i = 0; i < OutputVectorCollection.Count; i++)
                    {
                        if (OutputVectorCollection[i][0] >= refBinDistr.BinBorder)
                        {
                            bin1SampleIdxs[bin1SamplesPos++] = i;
                        }
                        else
                        {
                            bin0SampleIdxs[bin0SamplesPos++] = i;
                        }
                    }
                    //Determine distributions of 0 and 1 for one fold
                    int bundleBin0Count = Math.Max(1, refBinDistr.NumOf[0] / numOfFolds);
                    int bundleBin1Count = Math.Max(1, refBinDistr.NumOf[1] / numOfFolds);
                    //Bundles creation
                    //Positions are reused as consumption cursors into the bin index arrays
                    bin0SamplesPos = 0;
                    bin1SamplesPos = 0;
                    for (int foldIdx = 0; foldIdx < numOfFolds; foldIdx++)
                    {
                        VectorBundle fold = new VectorBundle();
                        //Bin 0
                        for (int i = 0; i < bundleBin0Count; i++)
                        {
                            fold.InputVectorCollection.Add(InputVectorCollection[bin0SampleIdxs[bin0SamplesPos]]);
                            fold.OutputVectorCollection.Add(OutputVectorCollection[bin0SampleIdxs[bin0SamplesPos]]);
                            ++bin0SamplesPos;
                        }
                        //Bin 1
                        for (int i = 0; i < bundleBin1Count; i++)
                        {
                            fold.InputVectorCollection.Add(InputVectorCollection[bin1SampleIdxs[bin1SamplesPos]]);
                            fold.OutputVectorCollection.Add(OutputVectorCollection[bin1SampleIdxs[bin1SamplesPos]]);
                            ++bin1SamplesPos;
                        }
                        foldCollection.Add(fold);
                    }
                    //Remaining samples
                    //Unconsumed bin 0 and bin 1 samples are spread round-robin over the created folds
                    for (int i = 0; i < bin0SampleIdxs.Length - bin0SamplesPos; i++)
                    {
                        int foldIdx = i % foldCollection.Count;
                        foldCollection[foldIdx].InputVectorCollection.Add(InputVectorCollection[bin0SampleIdxs[bin0SamplesPos + i]]);
                        foldCollection[foldIdx].OutputVectorCollection.Add(OutputVectorCollection[bin0SampleIdxs[bin0SamplesPos + i]]);
                    }
                    for (int i = 0; i < bin1SampleIdxs.Length - bin1SamplesPos; i++)
                    {
                        int foldIdx = i % foldCollection.Count;
                        foldCollection[foldIdx].InputVectorCollection.Add(InputVectorCollection[bin1SampleIdxs[bin1SamplesPos + i]]);
                        foldCollection[foldIdx].OutputVectorCollection.Add(OutputVectorCollection[bin1SampleIdxs[bin1SamplesPos + i]]);
                    }
                }//Only 1 binary output
                else
                {
                    //There is more than 1 binary output - "one takes all approach"
                    //Investigation of the output data metrics
                    //Collect bin 1 sample indexes and check "one takes all" consistency for every output feature
                    List <int>[] outBin1SampleIdxs = new List <int> [numOfOutputs];
                    for (int i = 0; i < numOfOutputs; i++)
                    {
                        outBin1SampleIdxs[i] = new List <int>();
                    }
                    for (int sampleIdx = 0; sampleIdx < OutputVectorCollection.Count; sampleIdx++)
                    {
                        int numOf1 = 0;
                        for (int outFeatureIdx = 0; outFeatureIdx < numOfOutputs; outFeatureIdx++)
                        {
                            if (OutputVectorCollection[sampleIdx][outFeatureIdx] >= binBorder)
                            {
                                outBin1SampleIdxs[outFeatureIdx].Add(sampleIdx);
                                ++numOf1;
                            }
                        }
                        //Exactly one feature per sample must be "on" (one-takes-all consistency)
                        if (numOf1 != 1)
                        {
                            throw new ArgumentException($"Data are inconsistent on data index {sampleIdx.ToString(CultureInfo.InvariantCulture)}. Output vector has {numOf1.ToString(CultureInfo.InvariantCulture)} feature(s) having bin value 1.", "binBorder");
                        }
                    }
                    //Determine max possible number of folds
                    //Limited by the scarcest side (bin 1 count vs bin 0 count) over all features
                    int maxNumOfFolds = OutputVectorCollection.Count;
                    for (int outFeatureIdx = 0; outFeatureIdx < numOfOutputs; outFeatureIdx++)
                    {
                        int outFeatureMaxFolds = Math.Min(outBin1SampleIdxs[outFeatureIdx].Count, OutputVectorCollection.Count - outBin1SampleIdxs[outFeatureIdx].Count);
                        maxNumOfFolds = Math.Min(outFeatureMaxFolds, maxNumOfFolds);
                    }
                    //Correct the number of folds to be created
                    if (numOfFolds > maxNumOfFolds)
                    {
                        numOfFolds = maxNumOfFolds;
                    }
                    //Create the folds
                    for (int foldIdx = 0; foldIdx < numOfFolds; foldIdx++)
                    {
                        foldCollection.Add(new VectorBundle());
                    }
                    //Samples distribution
                    //Every feature's bin-1 samples are dealt round-robin so each fold receives a balanced share of every class
                    for (int outFeatureIdx = 0; outFeatureIdx < numOfOutputs; outFeatureIdx++)
                    {
                        for (int bin1SampleRefIdx = 0; bin1SampleRefIdx < outBin1SampleIdxs[outFeatureIdx].Count; bin1SampleRefIdx++)
                        {
                            int foldIdx = bin1SampleRefIdx % foldCollection.Count;
                            int dataIdx = outBin1SampleIdxs[outFeatureIdx][bin1SampleRefIdx];
                            foldCollection[foldIdx].AddPair(InputVectorCollection[dataIdx], OutputVectorCollection[dataIdx]);
                        }
                    }
                } //More binary outputs
            }     //Binary output

            return(foldCollection);
        }
Example #7
0
 /// <summary>
 /// Adds all the vector pairs from another vector bundle.
 /// </summary>
 /// <param name="data">Another vector bundle.</param>
 public void Add(VectorBundle data)
 {
     //Append the other bundle's inputs and outputs to this bundle's collections
     InputVectorCollection.AddRange(data.InputVectorCollection);
     OutputVectorCollection.AddRange(data.OutputVectorCollection);
 }
Example #8
0
        //Methods
        /// <summary>
        /// Splits this bundle to a collection of smaller bundles.
        /// Method expects length of the output vectors = 1.
        /// </summary>
        /// <param name="subBundleSize">Sub-bundle size</param>
        /// <param name="binBorder">If specified and there is only one output value, method will keep balanced number of output values GE to binBorder in the each sub-bundle</param>
        /// <returns>Collection of extracted sub-bundles</returns>
        /// <exception cref="ArgumentException">Thrown when subBundleSize is less than 1 or greater than the number of samples.</exception>
        /// <exception cref="InvalidOperationException">Thrown when there are not enough bin 0 or bin 1 samples to fill the sub-bundles.</exception>
        public List <VectorBundle> Split(int subBundleSize, double binBorder = double.NaN)
        {
            //Fix: guard against a non-positive or too large sub-bundle size.
            //Originally such values crashed later with DivideByZeroException (at the division below or at the round-robin modulo).
            if (subBundleSize < 1 || subBundleSize > OutputVectorCollection.Count)
            {
                throw new ArgumentException("Sub-bundle size must be GE 1 and LE the number of samples.", nameof(subBundleSize));
            }
            int numOfBundles = OutputVectorCollection.Count / subBundleSize;
            List <VectorBundle> bundleCollection = new List <VectorBundle>(numOfBundles);

            if (!double.IsNaN(binBorder) && OutputVectorCollection[0].Length == 1)
            {
                //Single binary output -> keep balanced 0/1 counts in each sub-bundle
                BinDistribution refBinDistr = new BinDistribution(binBorder);
                refBinDistr.Update(OutputVectorCollection, 0);
                //Scan: collect the sample indexes of each bin separately
                int[] bin0SampleIdxs = new int[refBinDistr.NumOf[0]];
                int   bin0SamplesPos = 0;
                int[] bin1SampleIdxs = new int[refBinDistr.NumOf[1]];
                int   bin1SamplesPos = 0;
                for (int i = 0; i < OutputVectorCollection.Count; i++)
                {
                    if (OutputVectorCollection[i][0] >= refBinDistr.BinBorder)
                    {
                        bin1SampleIdxs[bin1SamplesPos++] = i;
                    }
                    else
                    {
                        bin0SampleIdxs[bin0SamplesPos++] = i;
                    }
                }
                //Division: per-bundle counts of bin 0 and bin 1 samples
                int bundleBin0Count = Math.Max(1, refBinDistr.NumOf[0] / numOfBundles);
                int bundleBin1Count = Math.Max(1, refBinDistr.NumOf[1] / numOfBundles);
                if (bundleBin0Count * numOfBundles > bin0SampleIdxs.Length)
                {
                    throw new InvalidOperationException("Insufficient bin 0 samples");
                }
                if (bundleBin1Count * numOfBundles > bin1SampleIdxs.Length)
                {
                    throw new InvalidOperationException("Insufficient bin 1 samples");
                }
                //Bundles creation (positions are reused as consumption cursors into the bin index arrays)
                bin0SamplesPos = 0;
                bin1SamplesPos = 0;
                for (int bundleNum = 0; bundleNum < numOfBundles; bundleNum++)
                {
                    VectorBundle bundle = new VectorBundle();
                    //Bin 0
                    for (int i = 0; i < bundleBin0Count; i++)
                    {
                        bundle.InputVectorCollection.Add(InputVectorCollection[bin0SampleIdxs[bin0SamplesPos]]);
                        bundle.OutputVectorCollection.Add(OutputVectorCollection[bin0SampleIdxs[bin0SamplesPos]]);
                        ++bin0SamplesPos;
                    }
                    //Bin 1
                    for (int i = 0; i < bundleBin1Count; i++)
                    {
                        bundle.InputVectorCollection.Add(InputVectorCollection[bin1SampleIdxs[bin1SamplesPos]]);
                        bundle.OutputVectorCollection.Add(OutputVectorCollection[bin1SampleIdxs[bin1SamplesPos]]);
                        ++bin1SamplesPos;
                    }
                    bundleCollection.Add(bundle);
                }
                //Remaining samples are distributed round-robin over the created bundles
                for (int i = 0; i < bin0SampleIdxs.Length - bin0SamplesPos; i++)
                {
                    int bundleIdx = i % bundleCollection.Count;
                    bundleCollection[bundleIdx].InputVectorCollection.Add(InputVectorCollection[bin0SampleIdxs[bin0SamplesPos + i]]);
                    bundleCollection[bundleIdx].OutputVectorCollection.Add(OutputVectorCollection[bin0SampleIdxs[bin0SamplesPos + i]]);
                }
                for (int i = 0; i < bin1SampleIdxs.Length - bin1SamplesPos; i++)
                {
                    int bundleIdx = i % bundleCollection.Count;
                    bundleCollection[bundleIdx].InputVectorCollection.Add(InputVectorCollection[bin1SampleIdxs[bin1SamplesPos + i]]);
                    bundleCollection[bundleIdx].OutputVectorCollection.Add(OutputVectorCollection[bin1SampleIdxs[bin1SamplesPos + i]]);
                }
            }
            else
            {
                //Bundles creation: simple sequential split, subBundleSize samples per bundle
                int samplesPos = 0;
                for (int bundleNum = 0; bundleNum < numOfBundles; bundleNum++)
                {
                    VectorBundle bundle = new VectorBundle();
                    for (int i = 0; i < subBundleSize && samplesPos < OutputVectorCollection.Count; i++)
                    {
                        bundle.InputVectorCollection.Add(InputVectorCollection[samplesPos]);
                        bundle.OutputVectorCollection.Add(OutputVectorCollection[samplesPos]);
                        ++samplesPos;
                    }
                    bundleCollection.Add(bundle);
                }
                //Remaining samples are distributed round-robin over the created bundles
                for (int i = 0; i < OutputVectorCollection.Count - samplesPos; i++)
                {
                    int bundleIdx = i % bundleCollection.Count;
                    bundleCollection[bundleIdx].InputVectorCollection.Add(InputVectorCollection[samplesPos + i]);
                    bundleCollection[bundleIdx].OutputVectorCollection.Add(OutputVectorCollection[samplesPos + i]);
                }
            }
            return bundleCollection;
        }