//Static methods
/// <summary>
/// Loads the data and prepares a VectorBundle.
/// The first line of the csv file must contain the field names. These field names must
/// match the names of the input and output fields.
/// </summary>
/// <param name="fileName"> Data file name </param>
/// <param name="inputFieldNameCollection"> Input fields to be extracted from the file </param>
/// <param name="outputFieldNameCollection"> Output fields to be extracted from the file </param>
/// <param name="remainingInputVector"> Returns the last input vector, which is not included in the bundle </param>
public static VectorBundle LoadFromCsv(string fileName,
                                       List<string> inputFieldNameCollection,
                                       List<string> outputFieldNameCollection,
                                       out double[] remainingInputVector
                                       )
{
    VectorBundle bundle = null;
    remainingInputVector = null;
    using (StreamReader streamReader = new StreamReader(new FileStream(fileName, FileMode.Open)))
    {
        List<int> inputFieldIndexes = new List<int>();
        List<int> outputFieldIndexes = new List<int>();
        //First row contains column names (data fields)
        string delimitedColumnNames = streamReader.ReadLine();
        //What data delimiter is used?
        char csvDelimiter = DelimitedStringValues.RecognizeDelimiter(delimitedColumnNames);
        //Split column names
        DelimitedStringValues columnNames = new DelimitedStringValues(csvDelimiter);
        columnNames.LoadFromString(delimitedColumnNames);
        //Check if the recognized data delimiter works properly
        if (columnNames.NumOfStringValues < inputFieldNameCollection.Count)
        {
            throw new FormatException("1st row of the file doesn't contain delimited column names or the value delimiter was not properly recognized.");
        }
        //Collect indexes of allowed fields
        foreach (string name in inputFieldNameCollection)
        {
            inputFieldIndexes.Add(columnNames.IndexOf(name));
        }
        for (int i = 0; i < outputFieldNameCollection.Count; i++)
        {
            outputFieldIndexes.Add(columnNames.IndexOf(outputFieldNameCollection[i]));
        }
        //Load full data in string form
        List<DelimitedStringValues> fullData = new List<DelimitedStringValues>();
        while (!streamReader.EndOfStream)
        {
            DelimitedStringValues row = new DelimitedStringValues(csvDelimiter);
            row.LoadFromString(streamReader.ReadLine());
            fullData.Add(row);
        }
        //Prepare input and output vectors
        List<double[]> inputVectorCollection = new List<double[]>(fullData.Count);
        List<double[]> outputVectorCollection = new List<double[]>(fullData.Count);
        for (int i = 0; i < fullData.Count; i++)
        {
            //Input vector
            double[] inputVector = new double[inputFieldIndexes.Count];
            for (int j = 0; j < inputFieldIndexes.Count; j++)
            {
                inputVector[j] = fullData[i].GetValue(inputFieldIndexes[j]).ParseDouble(true, $"Can't parse double value {fullData[i].GetValue(inputFieldIndexes[j])}.");
            }
            if (i < fullData.Count - 1)
            {
                //Within the bundle
                inputVectorCollection.Add(inputVector);
            }
            else
            {
                //Remaining input vector out of the bundle
                remainingInputVector = inputVector;
            }
            if (i > 0)
            {
                //Output vector
                double[] outputVector = new double[outputFieldIndexes.Count];
                for (int j = 0; j < outputFieldIndexes.Count; j++)
                {
                    outputVector[j] = fullData[i].GetValue(outputFieldIndexes[j]).ParseDouble(true, $"Can't parse double value {fullData[i].GetValue(outputFieldIndexes[j])}.");
                }
                outputVectorCollection.Add(outputVector);
            }
        }
        //Create bundle
        bundle = new VectorBundle(inputVectorCollection, outputVectorCollection);
    }
    return bundle;
} //LoadFromCsv
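//Illustrative usage sketch (not part of the original source). The file name, the column
//names and the VectorBundle class qualifier below are assumptions for demonstration only;
//the csv header row is expected to contain at least the listed input and output columns.
//
//    double[] pendingInput;
//    VectorBundle trainingData = VectorBundle.LoadFromCsv("data.csv",
//                                                         new List<string> { "High", "Low", "Adj Close" },
//                                                         new List<string> { "Adj Close" },
//                                                         out pendingInput
//                                                         );
//    //Inputs from rows 1..N-1 are paired with outputs from rows 2..N;
//    //pendingInput holds the input values of the last csv row, ready for the next prediction.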
/// <summary>
/// Loads the data and prepares a PatternBundle.
/// The 1st row of the file must start with the #RepetitiveGroupOfAttributes keyword followed by
/// the attribute names.
/// The 2nd row of the file must start with the #Outputs keyword followed by
/// the output field names.
/// The 3rd+ rows are the data rows.
/// A data row must begin with at least one complete set of values for the defined repetitive attributes.
/// A data row must end with a value for each defined output.
/// </summary>
/// <param name="classification">
/// In case of classification, the standardization and the reserve ratio are not applied to the output fields.
/// </param>
/// <param name="fileName"> Data file name </param>
/// <param name="inputFieldNameCollection"> Input fields </param>
/// <param name="outputFieldNameCollection"> Output fields </param>
/// <param name="normRange"> Range of normalized values </param>
/// <param name="normReserveRatio">
/// Reserve held by a normalizer to cover cases where future data exceeds the known range of the sample data.
/// </param>
/// <param name="dataStandardization">
/// Specifies whether to apply data standardization to the input data and, unless classification is specified,
/// also to the output data.
/// </param>
/// <param name="bundleNormalizer"> Returns an initialized instance of BundleNormalizer </param>
public static PatternBundle Load(bool classification,
                                 string fileName,
                                 List<string> inputFieldNameCollection,
                                 List<string> outputFieldNameCollection,
                                 Interval normRange,
                                 double normReserveRatio,
                                 bool dataStandardization,
                                 out BundleNormalizer bundleNormalizer
                                 )
{
    PatternBundle bundle = new PatternBundle();
    bundleNormalizer = new BundleNormalizer(normRange,
                                            normReserveRatio,
                                            dataStandardization,
                                            classification ? 0 : normReserveRatio,
                                            classification ? false : dataStandardization
                                            );
    using (StreamReader streamReader = new StreamReader(new FileStream(fileName, FileMode.Open)))
    {
        //The first row contains the "#RepetitiveGroupOfAttributes" keyword followed by name(s) of attribute(s)
        string delimitedRepetitiveGroupOfAttributes = streamReader.ReadLine();
        if (!delimitedRepetitiveGroupOfAttributes.StartsWith("#RepetitiveGroupOfAttributes"))
        {
            throw new FormatException("1st row of the file doesn't start with the #RepetitiveGroupOfAttributes keyword.");
        }
        //What data delimiter is used?
        char csvDelimiter = DelimitedStringValues.RecognizeDelimiter(delimitedRepetitiveGroupOfAttributes);
        //Split column names
        DelimitedStringValues repetitiveGroupOfAttributes = new DelimitedStringValues(csvDelimiter);
        repetitiveGroupOfAttributes.LoadFromString(delimitedRepetitiveGroupOfAttributes);
        repetitiveGroupOfAttributes.RemoveTrailingWhites();
        //Check if the recognized data delimiter works properly
        if (repetitiveGroupOfAttributes.NumOfStringValues < 2)
        {
            throw new FormatException("The value delimiter was not recognized or the repetitive attribute name(s) are missing.");
        }
        //Remove the #RepetitiveGroupOfAttributes keyword from the collection
        repetitiveGroupOfAttributes.RemoveAt(0);
        //Check if the attribute names match the input fields collection
        if (repetitiveGroupOfAttributes.NumOfStringValues != inputFieldNameCollection.Count)
        {
            throw new FormatException("The number of attributes in the file differs from the number of specified input fields.");
        }
        foreach (string inputFieldName in inputFieldNameCollection)
        {
            if (repetitiveGroupOfAttributes.IndexOf(inputFieldName) < 0)
            {
                throw new FormatException($"Input field name {inputFieldName} was not found among the repetitive attributes specified in the file.");
            }
        }
        //The second row contains the "#Outputs" keyword followed by name(s) of output class(es) or value(s)
        string delimitedOutputNames = streamReader.ReadLine();
        if (!delimitedOutputNames.StartsWith("#Outputs"))
        {
            throw new FormatException("2nd row of the file doesn't start with the #Outputs keyword.");
        }
        DelimitedStringValues outputNames = new DelimitedStringValues(csvDelimiter);
        outputNames.LoadFromString(delimitedOutputNames);
        outputNames.RemoveTrailingWhites();
        //Check if there is at least one output name
        if (outputNames.NumOfStringValues < 2)
        {
            throw new FormatException("Missing output name(s).");
        }
        //Remove the #Outputs keyword from the collection
        outputNames.RemoveAt(0);
        //Check if the output names match the output fields collection
        if (outputNames.NumOfStringValues != outputFieldNameCollection.Count)
        {
            throw new FormatException("The number of outputs in the file differs from the number of specified output fields.");
        }
        foreach (string outputFieldName in outputFieldNameCollection)
        {
            if (outputNames.IndexOf(outputFieldName) < 0)
            {
                throw new FormatException($"Output field name {outputFieldName} was not found among the outputs specified in the file.");
            }
        }
        //Bundle handler setup
        foreach (string attrName in repetitiveGroupOfAttributes.StringValueCollection)
        {
            bundleNormalizer.DefineField(attrName, attrName);
            bundleNormalizer.DefineInputField(attrName);
        }
        foreach (string outputName in outputNames.StringValueCollection)
        {
            bundleNormalizer.DefineField(outputName, outputName);
            bundleNormalizer.DefineOutputField(outputName);
        }
        bundleNormalizer.FinalizeStructure();
        //Load data
        DelimitedStringValues dataRow = new DelimitedStringValues(csvDelimiter);
        while (!streamReader.EndOfStream)
        {
            dataRow.LoadFromString(streamReader.ReadLine());
            dataRow.RemoveTrailingWhites();
            //Check data length
            if (dataRow.NumOfStringValues < repetitiveGroupOfAttributes.NumOfStringValues + outputNames.NumOfStringValues ||
                ((dataRow.NumOfStringValues - outputNames.NumOfStringValues) % repetitiveGroupOfAttributes.NumOfStringValues) != 0)
            {
                throw new FormatException("Incorrect length of data row.");
            }
            //Pattern data
            List<double[]> patternData = new List<double[]>();
            for (int grpIdx = 0; grpIdx < (dataRow.NumOfStringValues - outputNames.NumOfStringValues) / repetitiveGroupOfAttributes.NumOfStringValues; grpIdx++)
            {
                double[] inputVector = new double[repetitiveGroupOfAttributes.NumOfStringValues];
                for (int attrIdx = 0; attrIdx < repetitiveGroupOfAttributes.NumOfStringValues; attrIdx++)
                {
                    inputVector[attrIdx] = dataRow.GetValue(grpIdx * repetitiveGroupOfAttributes.NumOfStringValues + attrIdx).ParseDouble(true, "Can't parse double data value.");
                } //attrIdx
                patternData.Add(inputVector);
            } //grpIdx
            //Output data
            double[] outputVector = new double[outputNames.NumOfStringValues];
            for (int outputIdx = (dataRow.NumOfStringValues - outputNames.NumOfStringValues), i = 0; outputIdx < dataRow.NumOfStringValues; outputIdx++, i++)
            {
                outputVector[i] = dataRow.GetValue(outputIdx).ParseDouble(true, $"Can't parse double value {dataRow.GetValue(outputIdx)}.");
            } //outputIdx
            bundle.AddPair(patternData, outputVector);
        } //while !EOF
    } //using streamReader
    //Data normalization
    bundleNormalizer.Normalize(bundle);
    return bundle;
} //Load
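//Illustrative usage sketch (not part of the original source). The file name, the field names,
//the Interval constructor arguments and the unqualified call (assumed to be made from within
//the declaring class) are assumptions for demonstration only.
//
//    BundleNormalizer normalizer;
//    PatternBundle patterns = Load(true,                                     //classification
//                                  "patterns_train.csv",                     //fileName
//                                  new List<string> { "Attr1", "Attr2" },    //inputFieldNameCollection
//                                  new List<string> { "ClassA", "ClassB" },  //outputFieldNameCollection
//                                  new Interval(-1, 1),                      //normRange
//                                  0.1,                                      //normReserveRatio
//                                  true,                                     //dataStandardization (inputs only here, because classification = true)
//                                  out normalizer
//                                  );
//    //The returned bundle is already normalized by the returned normalizer instance.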
/// <summary>
/// Loads the data and prepares a TimeSeriesBundle.
/// The first line of the csv file must contain the field names. These field names must
/// match the names of the input and output fields.
/// </summary>
/// <param name="fileName"> Data file name </param>
/// <param name="inputFieldNameCollection"> Input fields </param>
/// <param name="outputFieldNameCollection"> Output fields </param>
/// <param name="normRange"> Range of normalized values </param>
/// <param name="normReserveRatio">
/// Reserve held by a normalizer to cover cases where future data exceeds the known range of the sample data.
/// </param>
/// <param name="dataStandardization"> Specifies whether to apply data standardization </param>
/// <param name="singleNormalizer">
/// Use true if all input and output fields are within about the same range of values.
/// </param>
/// <param name="bundleNormalizer"> Returns an initialized instance of BundleNormalizer </param>
/// <param name="remainingInputVector"> Returns the last input vector, which is not included in the bundle </param>
public static TimeSeriesBundle Load(string fileName,
                                    List<string> inputFieldNameCollection,
                                    List<string> outputFieldNameCollection,
                                    Interval normRange,
                                    double normReserveRatio,
                                    bool dataStandardization,
                                    bool singleNormalizer,
                                    out BundleNormalizer bundleNormalizer,
                                    out double[] remainingInputVector
                                    )
{
    TimeSeriesBundle bundle = null;
    bundleNormalizer = new BundleNormalizer(normRange, normReserveRatio, dataStandardization, normReserveRatio, dataStandardization);
    using (StreamReader streamReader = new StreamReader(new FileStream(fileName, FileMode.Open)))
    {
        List<int> fieldIndexes = new List<int>();
        List<double[]> allData = new List<double[]>();
        //First row contains column names (data fields)
        string delimitedColumnNames = streamReader.ReadLine();
        //What data delimiter is used?
        char csvDelimiter = DelimitedStringValues.RecognizeDelimiter(delimitedColumnNames);
        //Split column names
        DelimitedStringValues columnNames = new DelimitedStringValues(csvDelimiter);
        columnNames.LoadFromString(delimitedColumnNames);
        //Check if the recognized data delimiter works properly
        if (columnNames.NumOfStringValues < inputFieldNameCollection.Count)
        {
            throw new FormatException("1st row of the file doesn't contain delimited column names or the value delimiter was not properly recognized.");
        }
        //Define fields
        foreach (string name in inputFieldNameCollection)
        {
            if (!bundleNormalizer.IsFieldDefined(name))
            {
                bundleNormalizer.DefineField(name, singleNormalizer ? "COMMON" : name);
                fieldIndexes.Add(columnNames.IndexOf(name));
            }
            bundleNormalizer.DefineInputField(name);
        }
        foreach (string name in outputFieldNameCollection)
        {
            if (!bundleNormalizer.IsFieldDefined(name))
            {
                bundleNormalizer.DefineField(name, singleNormalizer ? "COMMON" : name);
                fieldIndexes.Add(columnNames.IndexOf(name));
            }
            bundleNormalizer.DefineOutputField(name);
        }
        //Finalize structure
        bundleNormalizer.FinalizeStructure();
        //Load all relevant data
        DelimitedStringValues dataRow = new DelimitedStringValues(csvDelimiter);
        while (!streamReader.EndOfStream)
        {
            dataRow.LoadFromString(streamReader.ReadLine());
            double[] vector = new double[fieldIndexes.Count];
            for (int i = 0; i < fieldIndexes.Count; i++)
            {
                vector[i] = dataRow.GetValue(fieldIndexes[i]).ParseDouble(true, $"Can't parse double value {dataRow.GetValue(fieldIndexes[i])}.");
            }
            allData.Add(vector);
        }
        //Create data bundle
        remainingInputVector = bundleNormalizer.CreateBundleFromVectorCollection(allData, true, out bundle);
    }
    return bundle;
} //Load
//Static methods
/// <summary>
/// Loads the data and prepares a PatternBundle.
/// The 1st row of the file must start with the #RepetitiveGroupOfAttributes keyword followed by
/// the attribute names.
/// The 2nd row of the file must start with the #Outputs keyword followed by
/// the output field names.
/// The 3rd+ rows are the data rows.
/// A data row must begin with at least one complete set of values for the defined repetitive attributes.
/// A data row must end with the values of the defined output fields.
/// </summary>
/// <param name="fileName"> Data file name </param>
/// <param name="inputFieldNameCollection"> Input fields to be extracted from the file </param>
/// <param name="outputFieldNameCollection"> Output fields to be extracted from the file </param>
public static PatternBundle LoadFromCsv(string fileName,
                                        List<string> inputFieldNameCollection,
                                        List<string> outputFieldNameCollection
                                        )
{
    PatternBundle bundle = new PatternBundle();
    using (StreamReader streamReader = new StreamReader(new FileStream(fileName, FileMode.Open)))
    {
        List<int> inputFieldGrpIndexes = new List<int>();
        List<int> outputFieldIndexes = new List<int>();
        //The first row contains the "#RepetitiveGroupOfAttributes" keyword followed by name(s) of attribute(s)
        string delimitedRepetitiveGroupOfAttributes = streamReader.ReadLine();
        if (!delimitedRepetitiveGroupOfAttributes.StartsWith("#RepetitiveGroupOfAttributes"))
        {
            throw new FormatException("1st row of the file doesn't start with the #RepetitiveGroupOfAttributes keyword.");
        }
        //What data delimiter is used?
        char csvDelimiter = DelimitedStringValues.RecognizeDelimiter(delimitedRepetitiveGroupOfAttributes);
        //Split column names
        DelimitedStringValues repetitiveGroupOfAttributes = new DelimitedStringValues(csvDelimiter);
        repetitiveGroupOfAttributes.LoadFromString(delimitedRepetitiveGroupOfAttributes);
        repetitiveGroupOfAttributes.RemoveTrailingWhites();
        //Check if the recognized data delimiter works properly
        if (repetitiveGroupOfAttributes.NumOfStringValues < 2)
        {
            throw new FormatException("The value delimiter was not recognized or the repetitive attribute name(s) are missing.");
        }
        //Remove the #RepetitiveGroupOfAttributes keyword from the collection
        repetitiveGroupOfAttributes.RemoveAt(0);
        //Check if the attribute names match the input fields collection
        if (repetitiveGroupOfAttributes.NumOfStringValues < inputFieldNameCollection.Count)
        {
            throw new FormatException("Inconsistent number of attributes in the file and number of specified input fields.");
        }
        foreach (string inputFieldName in inputFieldNameCollection)
        {
            int index = repetitiveGroupOfAttributes.IndexOf(inputFieldName);
            if (index < 0)
            {
                throw new FormatException($"Input field name {inputFieldName} was not found among the repetitive attributes specified in the file.");
            }
            inputFieldGrpIndexes.Add(index);
        }
        //The second row contains the "#Outputs" keyword followed by name(s) of output class(es) or value(s)
        string delimitedOutputNames = streamReader.ReadLine();
        if (!delimitedOutputNames.StartsWith("#Outputs"))
        {
            throw new FormatException("2nd row of the file doesn't start with the #Outputs keyword.");
        }
        DelimitedStringValues outputNames = new DelimitedStringValues(csvDelimiter);
        outputNames.LoadFromString(delimitedOutputNames);
        outputNames.RemoveTrailingWhites();
        //Remove the #Outputs keyword from the collection
        outputNames.RemoveAt(0);
        //Check if there is at least one output name
        if (outputNames.NumOfStringValues < 1)
        {
            throw new FormatException("Missing output name(s).");
        }
        //Check if the output names match the output fields collection
        if (outputNames.NumOfStringValues < outputFieldNameCollection.Count)
        {
            throw new FormatException("Inconsistent number of outputs in the file and number of specified output fields.");
        }
        foreach (string outputFieldName in outputFieldNameCollection)
        {
            int index = outputNames.IndexOf(outputFieldName);
            if (index < 0)
            {
                throw new FormatException($"Output field name {outputFieldName} was not found among the outputs specified in the file.");
            }
            outputFieldIndexes.Add(index);
        }
        //Load data
        DelimitedStringValues dataRow = new DelimitedStringValues(csvDelimiter);
        while (!streamReader.EndOfStream)
        {
            dataRow.LoadFromString(streamReader.ReadLine());
            dataRow.RemoveTrailingWhites();
            //Check data length
            if (dataRow.NumOfStringValues < repetitiveGroupOfAttributes.NumOfStringValues + outputNames.NumOfStringValues ||
                ((dataRow.NumOfStringValues - outputNames.NumOfStringValues) % repetitiveGroupOfAttributes.NumOfStringValues) != 0)
            {
                throw new FormatException("Incorrect length of data row.");
            }
            //Pattern data
            List<double[]> patternData = new List<double[]>();
            for (int grpIdx = 0; grpIdx < (dataRow.NumOfStringValues - outputNames.NumOfStringValues) / repetitiveGroupOfAttributes.NumOfStringValues; grpIdx++)
            {
                double[] inputVector = new double[inputFieldGrpIndexes.Count];
                for (int i = 0; i < inputFieldGrpIndexes.Count; i++)
                {
                    inputVector[i] = dataRow.GetValue(grpIdx * repetitiveGroupOfAttributes.NumOfStringValues + inputFieldGrpIndexes[i]).ParseDouble(true, $"Can't parse double data value {dataRow.GetValue(grpIdx * repetitiveGroupOfAttributes.NumOfStringValues + inputFieldGrpIndexes[i])}.");
                }
                patternData.Add(inputVector);
            } //grpIdx
            //Output data
            double[] outputVector = new double[outputFieldIndexes.Count];
            int dataRowStartIdx = dataRow.NumOfStringValues - outputNames.NumOfStringValues;
            for (int i = 0; i < outputFieldIndexes.Count; i++)
            {
                outputVector[i] = dataRow.GetValue(dataRowStartIdx + outputFieldIndexes[i]).ParseDouble(true, $"Can't parse double value {dataRow.GetValue(dataRowStartIdx + outputFieldIndexes[i])}.");
            }
            bundle.AddPair(patternData, outputVector);
        } //while !EOF
    } //using streamReader
    return bundle;
} //LoadFromCsv
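//Illustrative csv layout and usage sketch (not part of the original source). The file name,
//the attribute/output names, the sample values and the PatternBundle class qualifier are
//assumptions for demonstration only; the layout follows the format described in the summary.
//
//    #RepetitiveGroupOfAttributes;Attr1;Attr2
//    #Outputs;ClassA;ClassB
//    0.12;0.34;0.56;0.78;1;0    <- two repetitions of {Attr1, Attr2} followed by the two outputs
//    0.21;0.43;0;1              <- one repetition of {Attr1, Attr2} followed by the two outputs
//
//    PatternBundle patterns = PatternBundle.LoadFromCsv("patterns.csv",
//                                                       new List<string> { "Attr1", "Attr2" },
//                                                       new List<string> { "ClassA", "ClassB" }
//                                                       );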
//Static methods
/// <summary>
/// Loads the data and prepares a TimeSeriesBundle.
/// The first line of the csv file must contain the field names. These field names must
/// match the names of the input and output fields.
/// </summary>
/// <param name="fileName"> Data file name </param>
/// <param name="inputFieldNameCollection"> Input field names </param>
/// <param name="outputFieldNameCollection"> Output field names </param>
/// <param name="outputFieldTaskCollection">
/// Neural task related to each output field.
/// The classification task means the output field contains a binary value, so data
/// standardization and the normalizer reserve are suppressed.
/// </param>
/// <param name="normRange"> Range of normalized values </param>
/// <param name="normReserveRatio">
/// Reserve held by a normalizer to cover cases where future data exceeds the known range of the sample data.
/// </param>
/// <param name="dataStandardization"> Specifies whether to apply data standardization </param>
/// <param name="bundleNormalizer"> Returns an initialized instance of BundleNormalizer </param>
/// <param name="remainingInputVector"> Returns the last input vector, which is not included in the bundle </param>
public static TimeSeriesBundle LoadFromCsv(string fileName,
                                           List<string> inputFieldNameCollection,
                                           List<string> outputFieldNameCollection,
                                           List<CommonEnums.TaskType> outputFieldTaskCollection,
                                           Interval normRange,
                                           double normReserveRatio,
                                           bool dataStandardization,
                                           out BundleNormalizer bundleNormalizer,
                                           out double[] remainingInputVector
                                           )
{
    TimeSeriesBundle bundle = null;
    remainingInputVector = null;
    bundleNormalizer = new BundleNormalizer(normRange);
    using (StreamReader streamReader = new StreamReader(new FileStream(fileName, FileMode.Open)))
    {
        List<int> inputFieldIndexes = new List<int>();
        List<int> outputFieldIndexes = new List<int>();
        //First row contains column names (data fields)
        string delimitedColumnNames = streamReader.ReadLine();
        //What data delimiter is used?
        char csvDelimiter = DelimitedStringValues.RecognizeDelimiter(delimitedColumnNames);
        //Split column names
        DelimitedStringValues columnNames = new DelimitedStringValues(csvDelimiter);
        columnNames.LoadFromString(delimitedColumnNames);
        //Check if the recognized data delimiter works properly
        if (columnNames.NumOfStringValues < inputFieldNameCollection.Count)
        {
            throw new FormatException("1st row of the file doesn't contain delimited column names or the value delimiter was not properly recognized.");
        }
        //Define fields
        foreach (string name in inputFieldNameCollection)
        {
            if (!bundleNormalizer.IsFieldDefined(name))
            {
                bundleNormalizer.DefineField(name, name, normReserveRatio, dataStandardization);
                inputFieldIndexes.Add(columnNames.IndexOf(name));
            }
            bundleNormalizer.DefineInputField(name);
        }
        for (int i = 0; i < outputFieldNameCollection.Count; i++)
        {
            if (!bundleNormalizer.IsFieldDefined(outputFieldNameCollection[i]))
            {
                bundleNormalizer.DefineField(outputFieldNameCollection[i],
                                             outputFieldNameCollection[i],
                                             outputFieldTaskCollection[i] == CommonEnums.TaskType.Classification ? 0 : normReserveRatio,
                                             outputFieldTaskCollection[i] == CommonEnums.TaskType.Classification ? false : dataStandardization
                                             );
            }
            outputFieldIndexes.Add(columnNames.IndexOf(outputFieldNameCollection[i]));
            bundleNormalizer.DefineOutputField(outputFieldNameCollection[i]);
        }
        //Finalize structure
        bundleNormalizer.FinalizeStructure();
        //Load full data in string form
        List<DelimitedStringValues> fullData = new List<DelimitedStringValues>();
        while (!streamReader.EndOfStream)
        {
            DelimitedStringValues row = new DelimitedStringValues(csvDelimiter);
            row.LoadFromString(streamReader.ReadLine());
            fullData.Add(row);
        }
        //Prepare input and output vectors
        List<double[]> inputVectorCollection = new List<double[]>(fullData.Count);
        List<double[]> outputVectorCollection = new List<double[]>(fullData.Count);
        for (int i = 0; i < fullData.Count; i++)
        {
            //Input vector
            double[] inputVector = new double[inputFieldIndexes.Count];
            for (int j = 0; j < inputFieldIndexes.Count; j++)
            {
                inputVector[j] = fullData[i].GetValue(inputFieldIndexes[j]).ParseDouble(true, $"Can't parse double value {fullData[i].GetValue(inputFieldIndexes[j])}.");
            }
            if (i < fullData.Count - 1)
            {
                //Within the bundle
                inputVectorCollection.Add(inputVector);
            }
            else
            {
                //Remaining input vector out of the bundle
                remainingInputVector = inputVector;
            }
            if (i > 0)
            {
                //Output vector
                double[] outputVector = new double[outputFieldIndexes.Count];
                for (int j = 0; j < outputFieldIndexes.Count; j++)
                {
                    outputVector[j] = fullData[i].GetValue(outputFieldIndexes[j]).ParseDouble(true, $"Can't parse double value {fullData[i].GetValue(outputFieldIndexes[j])}.");
                }
                outputVectorCollection.Add(outputVector);
            }
        }
        //Create bundle
        bundle = new TimeSeriesBundle(inputVectorCollection, outputVectorCollection);
        //Normalize bundle and remaining input vector
        bundleNormalizer.Normalize(bundle);
        bundleNormalizer.NormalizeInputVector(remainingInputVector);
    }
    return bundle;
} //LoadFromCsv
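//Illustrative usage sketch (not part of the original source). The file name, the field names,
//the Interval constructor arguments and the TimeSeriesBundle class qualifier are assumptions
//for demonstration only; CommonEnums.TaskType.Classification is the only task value confirmed
//by the code above and is used here with a hypothetical binary output column "Signal".
//
//    BundleNormalizer normalizer;
//    double[] pendingInput;
//    TimeSeriesBundle series = TimeSeriesBundle.LoadFromCsv("data.csv",
//                                                           new List<string> { "High", "Low" },
//                                                           new List<string> { "Signal" },
//                                                           new List<CommonEnums.TaskType> { CommonEnums.TaskType.Classification },
//                                                           new Interval(-1, 1),   //normRange
//                                                           0.1,                   //normReserveRatio
//                                                           true,                  //dataStandardization (suppressed for the classification output)
//                                                           out normalizer,
//                                                           out pendingInput
//                                                           );
//    //The returned bundle and pendingInput are already normalized by the returned normalizer instance.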