/// <summary>
/// Normalizes every input and output vector held by the given sample data bundle.
/// Normalizers are first (re)adjusted against the bundle's data.
/// </summary>
/// <param name="bundle">Sample data bundle to normalize in place</param>
public void Normalize(TimeSeriesBundle bundle)
{
    //Adjust normalizers to the bundle's data ranges before applying them
    AdjustNormalizers(bundle);
    //Apply normalization to both vector collections
    NormalizeInputVectorCollection(bundle.InputVectorCollection);
    NormalizeOutputVectorCollection(bundle.OutputVectorCollection);
}
/// <summary>
/// Creates a TimeSeriesBundle from the vector collection.
/// Rows 0..N-2 become input vectors and rows 1..N-1 become output vectors, so each
/// input is paired with the following row's output (one-step time-series shift).
/// </summary>
/// <param name="vectorCollection">Collection of data vectors, one per time step</param>
/// <param name="normalize">Specifies whether to normalize data in the created bundle</param>
/// <param name="bundle">Created bundle</param>
/// <returns>The last row's input part, which has no following output row and is therefore not stored in the bundle</returns>
/// <exception cref="ArgumentException">Thrown when vectorCollection is null/empty or its field count does not match the defined fields</exception>
public double[] CreateBundleFromVectorCollection(List<double[]> vectorCollection,
                                                 bool normalize,
                                                 out TimeSeriesBundle bundle
                                                 )
{
    CheckStructure();
    //Guard: the original code indexed vectorCollection[0] unchecked, so a null or empty
    //collection surfaced as a NullReference/IndexOutOfRange instead of a clear error.
    if (vectorCollection == null || vectorCollection.Count == 0)
    {
        throw new ArgumentException("Vector collection is null or empty.", nameof(vectorCollection));
    }
    if (vectorCollection[0].Length != _fieldNameTypeCollection.Count)
    {
        throw new ArgumentException($"Inconsistent number of fields ({vectorCollection[0].Length}) in vectorCollection and number of defined fields ({_fieldNameTypeCollection.Count}).", nameof(vectorCollection));
    }
    //Resolve positions of the input fields within a data row
    int[] inputFieldIdxs = new int[_inputFieldNameCollection.Count];
    for (int i = 0; i < _inputFieldNameCollection.Count; i++)
    {
        inputFieldIdxs[i] = _fieldNameCollection.IndexOf(_inputFieldNameCollection[i]);
    }
    //Resolve positions of the output fields within a data row
    int[] outputFieldIdxs = new int[_outputFieldNameCollection.Count];
    for (int i = 0; i < _outputFieldNameCollection.Count; i++)
    {
        outputFieldIdxs[i] = _fieldNameCollection.IndexOf(_outputFieldNameCollection[i]);
    }
    double[] remainingInputVector = null;
    bundle = new TimeSeriesBundle();
    for (int row = 0; row < vectorCollection.Count; row++)
    {
        //Input vector
        double[] inputVector = new double[inputFieldIdxs.Length];
        for (int i = 0; i < inputFieldIdxs.Length; i++)
        {
            inputVector[i] = vectorCollection[row][inputFieldIdxs[i]];
        }
        if (row < vectorCollection.Count - 1)
        {
            bundle.InputVectorCollection.Add(inputVector);
        }
        else
        {
            //The last row's inputs have no paired output -> returned to the caller instead
            remainingInputVector = inputVector;
        }
        //Output vector (the first row has no preceding input, so it is skipped)
        if (row > 0)
        {
            double[] outputVector = new double[outputFieldIdxs.Length];
            for (int i = 0; i < outputFieldIdxs.Length; i++)
            {
                outputVector[i] = vectorCollection[row][outputFieldIdxs[i]];
            }
            bundle.OutputVectorCollection.Add(outputVector);
        }
    }
    //Normalization ?
    if (normalize)
    {
        Normalize(bundle);
        NormalizeInputVector(remainingInputVector);
    }
    return remainingInputVector;
}
/// <summary>
/// Resets the internal normalizers and re-adjusts them against every
/// input and output vector contained in the given sample data bundle.
/// </summary>
/// <param name="bundle">Sample data bundle supplying the adjustment data</param>
public void AdjustNormalizers(TimeSeriesBundle bundle)
{
    //Start from a clean state
    ResetNormalizers();
    //Feed every input vector to the input normalizers
    for (int i = 0; i < bundle.InputVectorCollection.Count; i++)
    {
        AdjustInputNormalizers(bundle.InputVectorCollection[i]);
    }
    //Feed every output vector to the output normalizers
    for (int i = 0; i < bundle.OutputVectorCollection.Count; i++)
    {
        AdjustOutputNormalizers(bundle.OutputVectorCollection[i]);
    }
}
/// <summary>
/// Converts all values in the sample data bundle back to their natural (denormalized) form.
/// </summary>
/// <param name="bundle">Sample data bundle to naturalize in place</param>
public void Naturalize(TimeSeriesBundle bundle)
{
    //Denormalize inputs first, then outputs, mirroring Normalize
    NaturalizeInputVectorCollection(bundle.InputVectorCollection);
    NaturalizeOutputVectorCollection(bundle.OutputVectorCollection);
}
/// <summary>
/// Loads the data and prepares a TimeSeriesBundle.
/// The first line of the csv file must be field names. These field names must
/// match the names of the input and output fields.
/// </summary>
/// <param name="fileName">Data file name</param>
/// <param name="inputFieldNameCollection">Input fields</param>
/// <param name="outputFieldNameCollection">Output fields</param>
/// <param name="normRange">Range of normalized values</param>
/// <param name="normReserveRatio">Reserve held by a normalizer to cover cases where future data exceeds a known range of sample data.</param>
/// <param name="dataStandardization">Specifies whether to apply data standardization</param>
/// <param name="singleNormalizer">Use true if all input and output fields are about the same range of values.</param>
/// <param name="bundleNormalizer">Returned initialized instance of BundleNormalizer.</param>
/// <param name="remainingInputVector">Returned the last input vector unused in the bundle.</param>
/// <returns>The prepared and normalized TimeSeriesBundle</returns>
public static TimeSeriesBundle Load(string fileName,
                                    List<string> inputFieldNameCollection,
                                    List<string> outputFieldNameCollection,
                                    Interval normRange,
                                    double normReserveRatio,
                                    bool dataStandardization,
                                    bool singleNormalizer,
                                    out BundleNormalizer bundleNormalizer,
                                    out double[] remainingInputVector
                                    )
{
    TimeSeriesBundle bundle = null;
    //NOTE(review): reserve ratio and standardization are passed twice - presumably once for
    //input fields and once for output fields; confirm against BundleNormalizer's constructor.
    bundleNormalizer = new BundleNormalizer(normRange, normReserveRatio, dataStandardization, normReserveRatio, dataStandardization);
    //FileAccess.Read allows opening read-only files (FileMode.Open alone requests ReadWrite access)
    using (StreamReader streamReader = new StreamReader(new FileStream(fileName, FileMode.Open, FileAccess.Read)))
    {
        List<int> fieldIndexes = new List<int>();
        List<double[]> allData = new List<double[]>();
        //First row contains column names (data fields)
        string delimitedColumnNames = streamReader.ReadLine();
        //What data delimiter is used?
        char csvDelimiter = DelimitedStringValues.RecognizeDelimiter(delimitedColumnNames);
        //Split column names
        DelimitedStringValues columnNames = new DelimitedStringValues(csvDelimiter);
        columnNames.LoadFromString(delimitedColumnNames);
        //Check if the recognized data delimiter works properly
        if (columnNames.NumOfStringValues < inputFieldNameCollection.Count)
        {
            throw new FormatException("1st row of the file doesn't contain delimited column names or the value delimiter was not properly recognized.");
        }
        //Define input fields (a field shared with outputs is only defined once)
        foreach (string name in inputFieldNameCollection)
        {
            if (!bundleNormalizer.IsFieldDefined(name))
            {
                bundleNormalizer.DefineField(name, singleNormalizer ? "COMMON" : name);
                fieldIndexes.Add(columnNames.IndexOf(name));
            }
            bundleNormalizer.DefineInputField(name);
        }
        //Define output fields
        foreach (string name in outputFieldNameCollection)
        {
            if (!bundleNormalizer.IsFieldDefined(name))
            {
                bundleNormalizer.DefineField(name, singleNormalizer ? "COMMON" : name);
                fieldIndexes.Add(columnNames.IndexOf(name));
            }
            bundleNormalizer.DefineOutputField(name);
        }
        //Finalize structure
        bundleNormalizer.FinalizeStructure();
        //Load all relevant data
        DelimitedStringValues dataRow = new DelimitedStringValues(csvDelimiter);
        while (!streamReader.EndOfStream)
        {
            string line = streamReader.ReadLine();
            //Skip blank lines (e.g. a trailing newline at the end of the file),
            //which would otherwise fail during double parsing
            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }
            dataRow.LoadFromString(line);
            double[] vector = new double[fieldIndexes.Count];
            for (int i = 0; i < fieldIndexes.Count; i++)
            {
                vector[i] = dataRow.GetValue(fieldIndexes[i]).ParseDouble(true, $"Can't parse double value {dataRow.GetValue(fieldIndexes[i])}.");
            }
            allData.Add(vector);
        }
        //Create data bundle (normalization included)
        remainingInputVector = bundleNormalizer.CreateBundleFromVectorCollection(allData, true, out bundle);
    }
    return bundle;
} //Load
//Static methods
/// <summary>
/// Loads the data and prepares a TimeSeriesBundle.
/// The first line of the csv file must be field names. These field names must
/// match the names of the input and output fields.
/// </summary>
/// <param name="fileName">Data file name</param>
/// <param name="inputFieldNameCollection">Input field names</param>
/// <param name="outputFieldNameCollection">Output field names</param>
/// <param name="outputFieldTaskCollection">
/// Neural task related to output field.
/// Classification task means the output field contains binary value so data
/// standardization and normalizer reserve are suppressed.
/// </param>
/// <param name="normRange">Range of normalized values</param>
/// <param name="normReserveRatio">Reserve held by a normalizer to cover cases where future data exceeds a known range of sample data.</param>
/// <param name="dataStandardization">Specifies whether to apply data standardization</param>
/// <param name="bundleNormalizer">Returned initialized instance of BundleNormalizer</param>
/// <param name="remainingInputVector">Returned the last input vector unused in the bundle</param>
/// <returns>The prepared and normalized TimeSeriesBundle</returns>
/// <exception cref="ArgumentException">Thrown when the task collection size does not match the output field collection size</exception>
public static TimeSeriesBundle LoadFromCsv(string fileName,
                                           List<string> inputFieldNameCollection,
                                           List<string> outputFieldNameCollection,
                                           List<CommonEnums.TaskType> outputFieldTaskCollection,
                                           Interval normRange,
                                           double normReserveRatio,
                                           bool dataStandardization,
                                           out BundleNormalizer bundleNormalizer,
                                           out double[] remainingInputVector
                                           )
{
    //Guard: the original indexed outputFieldTaskCollection[i] unchecked, surfacing a size
    //mismatch as an out-of-range exception instead of a clear argument error.
    if (outputFieldTaskCollection == null || outputFieldTaskCollection.Count != outputFieldNameCollection.Count)
    {
        throw new ArgumentException("Number of output field tasks does not match the number of output fields.", nameof(outputFieldTaskCollection));
    }
    TimeSeriesBundle bundle = null;
    remainingInputVector = null;
    bundleNormalizer = new BundleNormalizer(normRange);
    //FileAccess.Read allows opening read-only files (FileMode.Open alone requests ReadWrite access)
    using (StreamReader streamReader = new StreamReader(new FileStream(fileName, FileMode.Open, FileAccess.Read)))
    {
        List<int> inputFieldIndexes = new List<int>();
        List<int> outputFieldIndexes = new List<int>();
        //First row contains column names (data fields)
        string delimitedColumnNames = streamReader.ReadLine();
        //What data delimiter is used?
        char csvDelimiter = DelimitedStringValues.RecognizeDelimiter(delimitedColumnNames);
        //Split column names
        DelimitedStringValues columnNames = new DelimitedStringValues(csvDelimiter);
        columnNames.LoadFromString(delimitedColumnNames);
        //Check if the recognized data delimiter works properly
        if (columnNames.NumOfStringValues < inputFieldNameCollection.Count)
        {
            throw new FormatException("1st row of the file doesn't contain delimited column names or the value delimiter was not properly recognized.");
        }
        //Define input fields
        foreach (string name in inputFieldNameCollection)
        {
            if (!bundleNormalizer.IsFieldDefined(name))
            {
                bundleNormalizer.DefineField(name, name, normReserveRatio, dataStandardization);
                inputFieldIndexes.Add(columnNames.IndexOf(name));
            }
            bundleNormalizer.DefineInputField(name);
        }
        //Define output fields; classification outputs are binary so reserve and
        //standardization are suppressed for them
        for (int i = 0; i < outputFieldNameCollection.Count; i++)
        {
            if (!bundleNormalizer.IsFieldDefined(outputFieldNameCollection[i]))
            {
                bool isClassification = outputFieldTaskCollection[i] == CommonEnums.TaskType.Classification;
                bundleNormalizer.DefineField(outputFieldNameCollection[i],
                                             outputFieldNameCollection[i],
                                             isClassification ? 0 : normReserveRatio,
                                             isClassification ? false : dataStandardization
                                             );
            }
            outputFieldIndexes.Add(columnNames.IndexOf(outputFieldNameCollection[i]));
            bundleNormalizer.DefineOutputField(outputFieldNameCollection[i]);
        }
        //Finalize structure
        bundleNormalizer.FinalizeStructure();
        //Load full data in string form
        List<DelimitedStringValues> fullData = new List<DelimitedStringValues>();
        while (!streamReader.EndOfStream)
        {
            string line = streamReader.ReadLine();
            //Skip blank lines (e.g. a trailing newline at the end of the file),
            //which would otherwise fail during double parsing
            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }
            DelimitedStringValues row = new DelimitedStringValues(csvDelimiter);
            row.LoadFromString(line);
            fullData.Add(row);
        }
        //Prepare input and output vectors
        List<double[]> inputVectorCollection = new List<double[]>(fullData.Count);
        List<double[]> outputVectorCollection = new List<double[]>(fullData.Count);
        for (int i = 0; i < fullData.Count; i++)
        {
            //Input vector
            double[] inputVector = new double[inputFieldIndexes.Count];
            for (int j = 0; j < inputFieldIndexes.Count; j++)
            {
                inputVector[j] = fullData[i].GetValue(inputFieldIndexes[j]).ParseDouble(true, $"Can't parse double value {fullData[i].GetValue(inputFieldIndexes[j])}.");
            }
            if (i < fullData.Count - 1)
            {
                //Within the bundle
                inputVectorCollection.Add(inputVector);
            }
            else
            {
                //Remaining input vector out of the bundle
                remainingInputVector = inputVector;
            }
            if (i > 0)
            {
                //Output vector (the first row has no preceding input, so it is skipped)
                double[] outputVector = new double[outputFieldIndexes.Count];
                for (int j = 0; j < outputFieldIndexes.Count; j++)
                {
                    outputVector[j] = fullData[i].GetValue(outputFieldIndexes[j]).ParseDouble(true, $"Can't parse double value {fullData[i].GetValue(outputFieldIndexes[j])}.");
                }
                outputVectorCollection.Add(outputVector);
            }
        }
        //Create bundle
        bundle = new TimeSeriesBundle(inputVectorCollection, outputVectorCollection);
        //Normalize bundle and remaining input vector
        bundleNormalizer.Normalize(bundle);
        //Guard: remainingInputVector stays null when the file has no data rows
        if (remainingInputVector != null)
        {
            bundleNormalizer.NormalizeInputVector(remainingInputVector);
        }
    }
    return bundle;
} //LoadFromCsv