/// <summary>
/// Normalizes all values in the sample data bundle.
/// </summary>
/// <param name="bundle">Sample data bundle.</param>
public void Normalize(VectorBundle bundle)
{
    //Fit the internal normalizers to the bundle data first, then apply them
    //on both the input and the output vector collections
    AdjustNormalizers(bundle);
    NormalizeInputVectorCollection(bundle.InputVectorCollection);
    NormalizeOutputVectorCollection(bundle.OutputVectorCollection);
}
/// <summary>
/// Loads the vector bundle from the csv data (patterned input feeding).
/// Each data row consists of the input values followed by numOfOutputFields output values.
/// </summary>
/// <param name="csvData">The csv data.</param>
/// <param name="numOfOutputFields">The number of output fields (the trailing columns of every data row).</param>
/// <returns>The loaded vector bundle.</returns>
/// <exception cref="ArgumentNullException">When csvData is null.</exception>
/// <exception cref="ArgumentException">When numOfOutputFields is negative or a data row is too short.</exception>
public static VectorBundle Load(CsvDataHolder csvData, int numOfOutputFields)
{
    //Validate arguments up front for clear diagnostics
    if (csvData == null)
    {
        throw new ArgumentNullException(nameof(csvData));
    }
    if (numOfOutputFields < 0)
    {
        throw new ArgumentException("Number of output fields can't be negative.", nameof(numOfOutputFields));
    }
    VectorBundle bundle = new VectorBundle();
    foreach (DelimitedStringValues dataRow in csvData.DataRowCollection)
    {
        //Everything before the trailing output columns is input
        int numOfInputValues = dataRow.NumOfStringValues - numOfOutputFields;
        //Check data length
        if (numOfInputValues <= 0)
        {
            throw new ArgumentException("Incorrect length of data row.", nameof(csvData));
        }
        //Input data (leading columns)
        double[] inputData = new double[numOfInputValues];
        for (int i = 0; i < numOfInputValues; i++)
        {
            inputData[i] = dataRow.GetValueAt(i).ParseDouble(true, $"Can't parse double data value {dataRow.GetValueAt(i)}.");
        }
        //Output data (trailing columns)
        double[] outputData = new double[numOfOutputFields];
        for (int i = 0; i < numOfOutputFields; i++)
        {
            outputData[i] = dataRow.GetValueAt(numOfInputValues + i).ParseDouble(true, $"Can't parse double data value {dataRow.GetValueAt(numOfInputValues + i)}.");
        }
        bundle.AddPair(inputData, outputData);
    }
    return bundle;
}
/// <summary>
/// Adjusts the internal normalizers according to the data in the given bundle.
/// </summary>
/// <param name="bundle">Sample data bundle.</param>
public void AdjustNormalizers(VectorBundle bundle)
{
    //Start from a clean state
    ResetNormalizers();
    //Sample all input vectors
    for (int i = 0; i < bundle.InputVectorCollection.Count; i++)
    {
        AdjustInputNormalizers(bundle.InputVectorCollection[i]);
    }
    //Sample all output vectors
    for (int i = 0; i < bundle.OutputVectorCollection.Count; i++)
    {
        AdjustOutputNormalizers(bundle.OutputVectorCollection[i]);
    }
}
//Static methods
/// <summary>
/// Loads the data and prepares TimeSeriesBundle.
/// The first line of the csv file must contain field names. These field names must
/// match the names of the input and output fields.
/// </summary>
/// <param name="fileName">Data file name.</param>
/// <param name="inputFieldNameCollection">Input fields to be extracted from a file.</param>
/// <param name="outputFieldNameCollection">Output fields to be extracted from a file.</param>
/// <param name="remainingInputVector">Returned the last input vector unused in the bundle.</param>
/// <returns>The prepared vector bundle.</returns>
/// <exception cref="FormatException">When the header row is malformed or a requested field name is missing.</exception>
public static VectorBundle LoadFromCsv(string fileName,
                                       List<string> inputFieldNameCollection,
                                       List<string> outputFieldNameCollection,
                                       out double[] remainingInputVector
                                       )
{
    VectorBundle bundle = null;
    remainingInputVector = null;
    using (StreamReader streamReader = new StreamReader(new FileStream(fileName, FileMode.Open)))
    {
        List<int> inputFieldIndexes = new List<int>();
        List<int> outputFieldIndexes = new List<int>();
        //First row contains column names (data fields)
        string delimitedColumnNames = streamReader.ReadLine();
        //What data delimiter is used?
        char csvDelimiter = DelimitedStringValues.RecognizeDelimiter(delimitedColumnNames);
        //Split column names
        DelimitedStringValues columnNames = new DelimitedStringValues(csvDelimiter);
        columnNames.LoadFromString(delimitedColumnNames);
        //Check if the recognized data delimiter works properly
        if (columnNames.NumOfStringValues < inputFieldNameCollection.Count)
        {
            throw new FormatException("1st row of the file doesn't contain delimited column names or the value delimiter was not properly recognized.");
        }
        //Collect indexes of the requested input fields; fail fast on a missing column
        //instead of propagating -1 into the data parsing below
        foreach (string name in inputFieldNameCollection)
        {
            int colIndex = columnNames.IndexOf(name);
            if (colIndex < 0)
            {
                throw new FormatException($"Input field name {name} was not found among the column names.");
            }
            inputFieldIndexes.Add(colIndex);
        }
        //Collect indexes of the requested output fields; same fail-fast validation
        foreach (string name in outputFieldNameCollection)
        {
            int colIndex = columnNames.IndexOf(name);
            if (colIndex < 0)
            {
                throw new FormatException($"Output field name {name} was not found among the column names.");
            }
            outputFieldIndexes.Add(colIndex);
        }
        //Load full data in string form
        List<DelimitedStringValues> fullData = new List<DelimitedStringValues>();
        while (!streamReader.EndOfStream)
        {
            DelimitedStringValues row = new DelimitedStringValues(csvDelimiter);
            row.LoadFromString(streamReader.ReadLine());
            fullData.Add(row);
        }
        //Prepare input and output vectors
        List<double[]> inputVectorCollection = new List<double[]>(fullData.Count);
        List<double[]> outputVectorCollection = new List<double[]>(fullData.Count);
        for (int i = 0; i < fullData.Count; i++)
        {
            //Input vector
            double[] inputVector = new double[inputFieldIndexes.Count];
            for (int j = 0; j < inputFieldIndexes.Count; j++)
            {
                inputVector[j] = fullData[i].GetValue(inputFieldIndexes[j]).ParseDouble(true, $"Can't parse double value {fullData[i].GetValue(inputFieldIndexes[j])}.");
            }
            if (i < fullData.Count - 1)
            {
                //Within the bundle
                inputVectorCollection.Add(inputVector);
            }
            else
            {
                //Remaining input vector out of the bundle: the very last input row
                //has no subsequent row to pair with, so it is returned separately
                remainingInputVector = inputVector;
            }
            if (i > 0)
            {
                //Output vector: row i provides the outputs paired with the inputs
                //of row i-1 (one-step-ahead patterned feeding)
                double[] outputVector = new double[outputFieldIndexes.Count];
                for (int j = 0; j < outputFieldIndexes.Count; j++)
                {
                    outputVector[j] = fullData[i].GetValue(outputFieldIndexes[j]).ParseDouble(true, $"Can't parse double value {fullData[i].GetValue(outputFieldIndexes[j])}.");
                }
                outputVectorCollection.Add(outputVector);
            }
        }
        //Create bundle
        bundle = new VectorBundle(inputVectorCollection, outputVectorCollection);
    }
    return bundle;
}//LoadFromCsv
/// <summary>
/// Naturalizes (de-normalizes) all values in the sample data bundle.
/// </summary>
/// <param name="bundle">Sample data bundle.</param>
public void Naturalize(VectorBundle bundle)
{
    //Convert both vector collections back to their natural (raw) ranges
    NaturalizeInputVectorCollection(bundle.InputVectorCollection);
    NaturalizeOutputVectorCollection(bundle.OutputVectorCollection);
}
/// <summary>
/// Splits this bundle to a collection of smaller folds (sub-bundles) suitable for the cross-validation.
/// </summary>
/// <param name="foldDataRatio">The requested ratio of the samples constituting the single fold (sub-bundle).</param>
/// <param name="binBorder">When the binBorder is specified then all the output features are considered as binary features within the one-takes-all group and function then keeps balanced ratios of 0 and 1 for every output feature and the fold.</param>
/// <returns>A collection of the created folds.</returns>
/// <exception cref="InvalidOperationException">When there are less than 2 samples, or (single binary output) less than 2 samples of bin 0 or bin 1.</exception>
/// <exception cref="ArgumentException">When outputs are treated as a one-takes-all group and a sample does not have exactly one feature GE binBorder.</exception>
public List<VectorBundle> Folderize(double foldDataRatio, double binBorder = double.NaN)
{
    //At least two samples are required to create any fold
    if (OutputVectorCollection.Count < 2)
    {
        throw new InvalidOperationException($"Insufficient number of samples ({OutputVectorCollection.Count.ToString(CultureInfo.InvariantCulture)}).");
    }
    List<VectorBundle> foldCollection = new List<VectorBundle>();
    //Fold data ratio basic correction (cap at the allowed maximum)
    if (foldDataRatio > MaxRatioOfFoldData)
    {
        foldDataRatio = MaxRatioOfFoldData;
    }
    //Preliminary fold size estimation (at least 1 sample per fold)
    int foldSize = Math.Max(1, (int)Math.Round(OutputVectorCollection.Count * foldDataRatio, 0));
    //Preliminary number of folds
    int numOfFolds = (int)Math.Round((double)OutputVectorCollection.Count / foldSize);
    //Folds creation
    if (double.IsNaN(binBorder))
    {
        //No binary output -> simple sequential split into numOfFolds folds
        int samplesPos = 0;
        for (int foldIdx = 0; foldIdx < numOfFolds; foldIdx++)
        {
            VectorBundle fold = new VectorBundle();
            for (int i = 0; i < foldSize && samplesPos < OutputVectorCollection.Count; i++)
            {
                fold.InputVectorCollection.Add(InputVectorCollection[samplesPos]);
                fold.OutputVectorCollection.Add(OutputVectorCollection[samplesPos]);
                ++samplesPos;
            }
            foldCollection.Add(fold);
        }
        //Remaining samples are distributed round-robin over the created folds
        for (int i = 0; i < OutputVectorCollection.Count - samplesPos; i++)
        {
            int foldIdx = i % foldCollection.Count;
            foldCollection[foldIdx].InputVectorCollection.Add(InputVectorCollection[samplesPos + i]);
            foldCollection[foldIdx].OutputVectorCollection.Add(OutputVectorCollection[samplesPos + i]);
        }
    }//Indifferent output
    else
    {
        //Binary outputs -> keep balanced ratios of outputs
        int numOfOutputs = OutputVectorCollection[0].Length;
        if (numOfOutputs == 1)
        {
            //Special case there is only one binary output
            //Investigation of the output data metrics
            BinDistribution refBinDistr = new BinDistribution(binBorder);
            refBinDistr.Update(OutputVectorCollection, 0);
            int min01 = Math.Min(refBinDistr.NumOf[0], refBinDistr.NumOf[1]);
            if (min01 < 2)
            {
                throw new InvalidOperationException($"Insufficient bin 0 or 1 samples (less than 2).");
            }
            //Can't create more folds than the size of the smaller bin
            if (numOfFolds > min01)
            {
                numOfFolds = min01;
            }
            //Scan data - collect sample indexes belonging to bin 0 and bin 1
            int[] bin0SampleIdxs = new int[refBinDistr.NumOf[0]];
            int bin0SamplesPos = 0;
            int[] bin1SampleIdxs = new int[refBinDistr.NumOf[1]];
            int bin1SamplesPos = 0;
            for (int i = 0; i < OutputVectorCollection.Count; i++)
            {
                if (OutputVectorCollection[i][0] >= refBinDistr.BinBorder)
                {
                    bin1SampleIdxs[bin1SamplesPos++] = i;
                }
                else
                {
                    bin0SampleIdxs[bin0SamplesPos++] = i;
                }
            }
            //Determine distributions of 0 and 1 for one fold
            int bundleBin0Count = Math.Max(1, refBinDistr.NumOf[0] / numOfFolds);
            int bundleBin1Count = Math.Max(1, refBinDistr.NumOf[1] / numOfFolds);
            //Bundles creation - reuse the positions as read cursors into the index arrays
            bin0SamplesPos = 0;
            bin1SamplesPos = 0;
            for (int foldIdx = 0; foldIdx < numOfFolds; foldIdx++)
            {
                VectorBundle fold = new VectorBundle();
                //Bin 0
                for (int i = 0; i < bundleBin0Count; i++)
                {
                    fold.InputVectorCollection.Add(InputVectorCollection[bin0SampleIdxs[bin0SamplesPos]]);
                    fold.OutputVectorCollection.Add(OutputVectorCollection[bin0SampleIdxs[bin0SamplesPos]]);
                    ++bin0SamplesPos;
                }
                //Bin 1
                for (int i = 0; i < bundleBin1Count; i++)
                {
                    fold.InputVectorCollection.Add(InputVectorCollection[bin1SampleIdxs[bin1SamplesPos]]);
                    fold.OutputVectorCollection.Add(OutputVectorCollection[bin1SampleIdxs[bin1SamplesPos]]);
                    ++bin1SamplesPos;
                }
                foldCollection.Add(fold);
            }
            //Remaining samples of each bin are distributed round-robin over the folds
            for (int i = 0; i < bin0SampleIdxs.Length - bin0SamplesPos; i++)
            {
                int foldIdx = i % foldCollection.Count;
                foldCollection[foldIdx].InputVectorCollection.Add(InputVectorCollection[bin0SampleIdxs[bin0SamplesPos + i]]);
                foldCollection[foldIdx].OutputVectorCollection.Add(OutputVectorCollection[bin0SampleIdxs[bin0SamplesPos + i]]);
            }
            for (int i = 0; i < bin1SampleIdxs.Length - bin1SamplesPos; i++)
            {
                int foldIdx = i % foldCollection.Count;
                foldCollection[foldIdx].InputVectorCollection.Add(InputVectorCollection[bin1SampleIdxs[bin1SamplesPos + i]]);
                foldCollection[foldIdx].OutputVectorCollection.Add(OutputVectorCollection[bin1SampleIdxs[bin1SamplesPos + i]]);
            }
        }//Only 1 binary output
        else
        {
            //There is more than 1 binary output - "one takes all approach"
            //Investigation of the output data metrics
            //Collect bin 1 sample indexes and check "one takes all" consistency for every output feature
            List<int>[] outBin1SampleIdxs = new List<int>[numOfOutputs];
            for (int i = 0; i < numOfOutputs; i++)
            {
                outBin1SampleIdxs[i] = new List<int>();
            }
            for (int sampleIdx = 0; sampleIdx < OutputVectorCollection.Count; sampleIdx++)
            {
                int numOf1 = 0;
                for (int outFeatureIdx = 0; outFeatureIdx < numOfOutputs; outFeatureIdx++)
                {
                    if (OutputVectorCollection[sampleIdx][outFeatureIdx] >= binBorder)
                    {
                        outBin1SampleIdxs[outFeatureIdx].Add(sampleIdx);
                        ++numOf1;
                    }
                }
                //Exactly one feature per sample must be "hot" in a one-takes-all group
                if (numOf1 != 1)
                {
                    throw new ArgumentException($"Data are inconsistent on data index {sampleIdx.ToString(CultureInfo.InvariantCulture)}. Output vector has {numOf1.ToString(CultureInfo.InvariantCulture)} feature(s) having bin value 1.", "binBorder");
                }
            }
            //Determine max possible number of folds
            int maxNumOfFolds = OutputVectorCollection.Count;
            for (int outFeatureIdx = 0; outFeatureIdx < numOfOutputs; outFeatureIdx++)
            {
                int outFeatureMaxFolds = Math.Min(outBin1SampleIdxs[outFeatureIdx].Count, OutputVectorCollection.Count - outBin1SampleIdxs[outFeatureIdx].Count);
                maxNumOfFolds = Math.Min(outFeatureMaxFolds, maxNumOfFolds);
            }
            //Correct the number of folds to be created
            if (numOfFolds > maxNumOfFolds)
            {
                numOfFolds = maxNumOfFolds;
            }
            //Create the folds
            for (int foldIdx = 0; foldIdx < numOfFolds; foldIdx++)
            {
                foldCollection.Add(new VectorBundle());
            }
            //Samples distribution - round-robin per output feature keeps the class ratios balanced
            for (int outFeatureIdx = 0; outFeatureIdx < numOfOutputs; outFeatureIdx++)
            {
                for (int bin1SampleRefIdx = 0; bin1SampleRefIdx < outBin1SampleIdxs[outFeatureIdx].Count; bin1SampleRefIdx++)
                {
                    int foldIdx = bin1SampleRefIdx % foldCollection.Count;
                    int dataIdx = outBin1SampleIdxs[outFeatureIdx][bin1SampleRefIdx];
                    foldCollection[foldIdx].AddPair(InputVectorCollection[dataIdx], OutputVectorCollection[dataIdx]);
                }
            }
        }//More binary outputs
    }//Binary output
    return(foldCollection);
}
/// <summary>
/// Appends all the input/output vector pairs from another vector bundle into this one.
/// </summary>
/// <param name="data">Another vector bundle.</param>
public void Add(VectorBundle data)
{
    //Bulk-append the inputs together with their corresponding outputs
    InputVectorCollection.AddRange(data.InputVectorCollection);
    OutputVectorCollection.AddRange(data.OutputVectorCollection);
}
//Methods
/// <summary>
/// Splits this bundle to a collection of smaller bundles.
/// Method expects length of the output vectors = 1.
/// </summary>
/// <param name="subBundleSize">Sub-bundle size.</param>
/// <param name="binBorder">If specified and there is only one output value, method will keep balanced number of output values GE to binBorder in the each sub-bundle.</param>
/// <returns>Collection of extracted sub-bundles.</returns>
/// <exception cref="ArgumentException">When subBundleSize is not GT 0.</exception>
/// <exception cref="InvalidOperationException">When there are not enough bin 0 or bin 1 samples for the balanced split.</exception>
public List<VectorBundle> Split(int subBundleSize, double binBorder = double.NaN)
{
    //Guard: a non-positive size previously caused a DivideByZeroException below
    if (subBundleSize <= 0)
    {
        throw new ArgumentException("Sub-bundle size must be GT 0.", nameof(subBundleSize));
    }
    //At least one bundle is always created, even when subBundleSize exceeds the number
    //of samples (previously numOfBundles could be 0, leading to a modulo-by-zero crash)
    int numOfBundles = Math.Max(1, OutputVectorCollection.Count / subBundleSize);
    List<VectorBundle> bundleCollection = new List<VectorBundle>(numOfBundles);
    if (!double.IsNaN(binBorder) && OutputVectorCollection[0].Length == 1)
    {
        //Balanced split - keep the ratio of bin 0 / bin 1 output values in every sub-bundle
        BinDistribution refBinDistr = new BinDistribution(binBorder);
        refBinDistr.Update(OutputVectorCollection, 0);
        //Scan - collect sample indexes belonging to bin 0 and bin 1
        int[] bin0SampleIdxs = new int[refBinDistr.NumOf[0]];
        int bin0SamplesPos = 0;
        int[] bin1SampleIdxs = new int[refBinDistr.NumOf[1]];
        int bin1SamplesPos = 0;
        for (int i = 0; i < OutputVectorCollection.Count; i++)
        {
            if (OutputVectorCollection[i][0] >= refBinDistr.BinBorder)
            {
                bin1SampleIdxs[bin1SamplesPos++] = i;
            }
            else
            {
                bin0SampleIdxs[bin0SamplesPos++] = i;
            }
        }
        //Division - number of bin 0 and bin 1 samples per sub-bundle
        int bundleBin0Count = Math.Max(1, refBinDistr.NumOf[0] / numOfBundles);
        int bundleBin1Count = Math.Max(1, refBinDistr.NumOf[1] / numOfBundles);
        if (bundleBin0Count * numOfBundles > bin0SampleIdxs.Length)
        {
            throw new InvalidOperationException("Insufficient bin 0 samples");
        }
        if (bundleBin1Count * numOfBundles > bin1SampleIdxs.Length)
        {
            throw new InvalidOperationException("Insufficient bin 1 samples");
        }
        //Bundles creation - reuse the positions as read cursors into the index arrays
        bin0SamplesPos = 0;
        bin1SamplesPos = 0;
        for (int bundleNum = 0; bundleNum < numOfBundles; bundleNum++)
        {
            VectorBundle bundle = new VectorBundle();
            //Bin 0
            for (int i = 0; i < bundleBin0Count; i++)
            {
                bundle.InputVectorCollection.Add(InputVectorCollection[bin0SampleIdxs[bin0SamplesPos]]);
                bundle.OutputVectorCollection.Add(OutputVectorCollection[bin0SampleIdxs[bin0SamplesPos]]);
                ++bin0SamplesPos;
            }
            //Bin 1
            for (int i = 0; i < bundleBin1Count; i++)
            {
                bundle.InputVectorCollection.Add(InputVectorCollection[bin1SampleIdxs[bin1SamplesPos]]);
                bundle.OutputVectorCollection.Add(OutputVectorCollection[bin1SampleIdxs[bin1SamplesPos]]);
                ++bin1SamplesPos;
            }
            bundleCollection.Add(bundle);
        }
        //Remaining samples of each bin are distributed round-robin over the bundles
        for (int i = 0; i < bin0SampleIdxs.Length - bin0SamplesPos; i++)
        {
            int bundleIdx = i % bundleCollection.Count;
            bundleCollection[bundleIdx].InputVectorCollection.Add(InputVectorCollection[bin0SampleIdxs[bin0SamplesPos + i]]);
            bundleCollection[bundleIdx].OutputVectorCollection.Add(OutputVectorCollection[bin0SampleIdxs[bin0SamplesPos + i]]);
        }
        for (int i = 0; i < bin1SampleIdxs.Length - bin1SamplesPos; i++)
        {
            int bundleIdx = i % bundleCollection.Count;
            bundleCollection[bundleIdx].InputVectorCollection.Add(InputVectorCollection[bin1SampleIdxs[bin1SamplesPos + i]]);
            bundleCollection[bundleIdx].OutputVectorCollection.Add(OutputVectorCollection[bin1SampleIdxs[bin1SamplesPos + i]]);
        }
    }
    else
    {
        //Simple sequential split - bundles creation
        int samplesPos = 0;
        for (int bundleNum = 0; bundleNum < numOfBundles; bundleNum++)
        {
            VectorBundle bundle = new VectorBundle();
            for (int i = 0; i < subBundleSize && samplesPos < OutputVectorCollection.Count; i++)
            {
                bundle.InputVectorCollection.Add(InputVectorCollection[samplesPos]);
                bundle.OutputVectorCollection.Add(OutputVectorCollection[samplesPos]);
                ++samplesPos;
            }
            bundleCollection.Add(bundle);
        }
        //Remaining samples are distributed round-robin over the created bundles
        for (int i = 0; i < OutputVectorCollection.Count - samplesPos; i++)
        {
            int bundleIdx = i % bundleCollection.Count;
            bundleCollection[bundleIdx].InputVectorCollection.Add(InputVectorCollection[samplesPos + i]);
            bundleCollection[bundleIdx].OutputVectorCollection.Add(OutputVectorCollection[samplesPos + i]);
        }
    }
    return bundleCollection;
}