Exemple #1
0
        /// <summary>
        /// Loads the data and prepares PatternBundle.
        /// 1st row of the file must start with the #RepetitiveGroupOfAttributes keyword followed by
        /// attribute names.
        /// 2nd row of the file must start with the #Outputs keyword followed by
        /// output field names.
        /// 3rd+ rows are the data rows.
        /// The data row must begin with at least one set of values for defined repetitive attributes.
        /// The data row must end with a value for each defined output.
        /// </summary>
        /// <param name="classification">
        /// In case of classification the standardization and reserve ratio are not applied on output fields.
        /// </param>
        /// <param name="fileName">
        /// Data file name
        /// </param>
        /// <param name="inputFieldNameCollection">
        /// Input fields
        /// </param>
        /// <param name="outputFieldNameCollection">
        /// Output fields
        /// </param>
        /// <param name="normRange">
        /// Range of normalized values
        /// </param>
        /// <param name="normReserveRatio">
        /// Reserve held by a normalizer to cover cases where future data exceeds a known range of sample data.
        /// </param>
        /// <param name="dataStandardization">
        /// Specifies whether to apply data standardization to input data.
        /// Output data is never standardized.
        /// </param>
        /// <param name="bundleNormalizer">
        /// Returned initialized instance of BundleNormalizer.
        /// </param>
        public static PatternBundle Load(bool classification,
                                         string fileName,
                                         List <string> inputFieldNameCollection,
                                         List <string> outputFieldNameCollection,
                                         Interval normRange,
                                         double normReserveRatio,
                                         bool dataStandardization,
                                         out BundleNormalizer bundleNormalizer
                                         )
        {
            PatternBundle bundle = new PatternBundle();

            bundleNormalizer = new BundleNormalizer(normRange, normReserveRatio, dataStandardization, classification ? 0 : normReserveRatio, classification ? false : dataStandardization);
            using (StreamReader streamReader = new StreamReader(new FileStream(fileName, FileMode.Open)))
            {
                //The first row contains the "#RepetitiveGroupOfAttributes" keyword followed by name(s) of attribute(s)
                string delimitedRepetitiveGroupOfAttributes = streamReader.ReadLine();
                if (!delimitedRepetitiveGroupOfAttributes.StartsWith("#RepetitiveGroupOfAttributes"))
                {
                    throw new FormatException("1st row of the file doesn't start with the #RepetitiveGroupOfAttributes keyword.");
                }
                //What data delimiter is used?
                char csvDelimiter = DelimitedStringValues.RecognizeDelimiter(delimitedRepetitiveGroupOfAttributes);
                //Split column names
                DelimitedStringValues repetitiveGroupOfAttributes = new DelimitedStringValues(csvDelimiter);
                repetitiveGroupOfAttributes.LoadFromString(delimitedRepetitiveGroupOfAttributes);
                repetitiveGroupOfAttributes.RemoveTrailingWhites();
                //Check if the recognized data delimiter works properly
                if (repetitiveGroupOfAttributes.NumOfStringValues < 2)
                {
                    throw new FormatException("The value delimiter was not recognized or missing repetitive attribute(s) name(s).");
                }
                //Remove the #RepetitiveGroupOfAttributes keyword from the collection
                repetitiveGroupOfAttributes.RemoveAt(0);
                //Check if attribute names match with the input fields collection
                if (repetitiveGroupOfAttributes.NumOfStringValues != inputFieldNameCollection.Count)
                {
                    throw new FormatException("Different number of attributes in the file and number of specified input fields.");
                }
                foreach (string inputFieldName in inputFieldNameCollection)
                {
                    if (repetitiveGroupOfAttributes.IndexOf(inputFieldName) < 0)
                    {
                        throw new FormatException($"Input field name {inputFieldName} was not found among the repetitive attributes specified in the file.");
                    }
                }
                //The second row contains the "#Outputs" keyword followed by name(s) of output class(es) or values(s)
                string delimitedOutputNames = streamReader.ReadLine();
                if (!delimitedOutputNames.StartsWith("#Outputs"))
                {
                    throw new FormatException("2nd row of the file doesn't start with the #Outputs keyword.");
                }
                DelimitedStringValues outputNames = new DelimitedStringValues(csvDelimiter);
                outputNames.LoadFromString(delimitedOutputNames);
                outputNames.RemoveTrailingWhites();
                //Check if the there is at least one output name
                if (outputNames.NumOfStringValues < 2)
                {
                    throw new FormatException("Missing output name(es).");
                }
                //Remove the #Outputs keyword from the collection
                outputNames.RemoveAt(0);
                //Check if output names match with the output fields collection
                if (outputNames.NumOfStringValues != outputFieldNameCollection.Count)
                {
                    throw new FormatException("Different number of outputs in the file and number of specified output fields.");
                }
                foreach (string outputFieldName in outputFieldNameCollection)
                {
                    if (outputNames.IndexOf(outputFieldName) < 0)
                    {
                        throw new FormatException($"Output field name {outputFieldName} was not found among the outputs specified in the file.");
                    }
                }
                //Bundle handler setup
                foreach (string attrName in repetitiveGroupOfAttributes.StringValueCollection)
                {
                    bundleNormalizer.DefineField(attrName, attrName);
                    bundleNormalizer.DefineInputField(attrName);
                }
                foreach (string outputName in outputNames.StringValueCollection)
                {
                    bundleNormalizer.DefineField(outputName, outputName);
                    bundleNormalizer.DefineOutputField(outputName);
                }
                bundleNormalizer.FinalizeStructure();
                //Load data
                DelimitedStringValues dataRow = new DelimitedStringValues(csvDelimiter);
                while (!streamReader.EndOfStream)
                {
                    dataRow.LoadFromString(streamReader.ReadLine());
                    dataRow.RemoveTrailingWhites();
                    //Check data length
                    if (dataRow.NumOfStringValues < repetitiveGroupOfAttributes.NumOfStringValues + outputNames.NumOfStringValues ||
                        ((dataRow.NumOfStringValues - outputNames.NumOfStringValues) % repetitiveGroupOfAttributes.NumOfStringValues) != 0)
                    {
                        throw new FormatException("Incorrect length of data row.");
                    }
                    //Pattern data
                    List <double[]> patternData = new List <double[]>();
                    for (int grpIdx = 0; grpIdx < (dataRow.NumOfStringValues - outputNames.NumOfStringValues) / repetitiveGroupOfAttributes.NumOfStringValues; grpIdx++)
                    {
                        double[] inputVector = new double[repetitiveGroupOfAttributes.NumOfStringValues];
                        for (int attrIdx = 0; attrIdx < repetitiveGroupOfAttributes.NumOfStringValues; attrIdx++)
                        {
                            inputVector[attrIdx] = dataRow.GetValue(grpIdx * repetitiveGroupOfAttributes.NumOfStringValues + attrIdx).ParseDouble(true, "Can't parse double data value.");
                        } //attrIdx
                        patternData.Add(inputVector);
                    }     //grpIdx
                    //Output data
                    double[] outputVector = new double[outputNames.NumOfStringValues];
                    for (int outputIdx = (dataRow.NumOfStringValues - outputNames.NumOfStringValues), i = 0; outputIdx < dataRow.NumOfStringValues; outputIdx++, i++)
                    {
                        outputVector[i] = dataRow.GetValue(outputIdx).ParseDouble(true, $"Can't parse double value {dataRow.GetValue(outputIdx)}.");
                    } //outputIdx
                    bundle.AddPair(patternData, outputVector);
                }     //while !EOF
            }         //using streamReader
            //Data normalization
            bundleNormalizer.Normalize(bundle);
            return(bundle);
        } //Load
Exemple #2
0
        //Static methods
        /// <summary>
        /// Loads the data and prepares PatternBundle.
        /// 1st row of the file must start with the #RepetitiveGroupOfAttributes keyword followed by
        /// attribute names.
        /// 2nd row of the file must start with the #Outputs keyword followed by
        /// output field names.
        /// 3rd+ rows are the data rows.
        /// The data row must begin with at least one complete set of values for defined repetitive attributes.
        /// The data row must end with values of defined output fields.
        /// </summary>
        /// <param name="fileName"> Data file name </param>
        /// <param name="inputFieldNameCollection"> Input fields to be extracted from a file</param>
        /// <param name="outputFieldNameCollection"> Output fields to be extracted from a file</param>
        public static PatternBundle LoadFromCsv(string fileName,
                                                List <string> inputFieldNameCollection,
                                                List <string> outputFieldNameCollection
                                                )
        {
            PatternBundle bundle = new PatternBundle();

            using (StreamReader streamReader = new StreamReader(new FileStream(fileName, FileMode.Open)))
            {
                List <int> inputFieldGrpIndexes = new List <int>();
                List <int> outputFieldIndexes   = new List <int>();
                //The first row contains the "#RepetitiveGroupOfAttributes" keyword followed by name(s) of attribute(s)
                string delimitedRepetitiveGroupOfAttributes = streamReader.ReadLine();
                if (!delimitedRepetitiveGroupOfAttributes.StartsWith("#RepetitiveGroupOfAttributes"))
                {
                    throw new FormatException("1st row of the file doesn't start with the #RepetitiveGroupOfAttributes keyword.");
                }
                //What data delimiter is used?
                char csvDelimiter = DelimitedStringValues.RecognizeDelimiter(delimitedRepetitiveGroupOfAttributes);
                //Split column names
                DelimitedStringValues repetitiveGroupOfAttributes = new DelimitedStringValues(csvDelimiter);
                repetitiveGroupOfAttributes.LoadFromString(delimitedRepetitiveGroupOfAttributes);
                repetitiveGroupOfAttributes.RemoveTrailingWhites();
                //Check if the recognized data delimiter works properly
                if (repetitiveGroupOfAttributes.NumOfStringValues < 2)
                {
                    throw new FormatException("The value delimiter was not recognized or missing repetitive attribute(s) name(s).");
                }
                //Remove the #RepetitiveGroupOfAttributes keyword from the collection
                repetitiveGroupOfAttributes.RemoveAt(0);
                //Check if attribute names match with the input fields collection
                if (repetitiveGroupOfAttributes.NumOfStringValues < inputFieldNameCollection.Count)
                {
                    throw new FormatException("Inconsistent number of attributes in the file and number of specified input fields.");
                }
                foreach (string inputFieldName in inputFieldNameCollection)
                {
                    int index = repetitiveGroupOfAttributes.IndexOf(inputFieldName);
                    if (index < 0)
                    {
                        throw new FormatException($"Input field name {inputFieldName} was not found among the repetitive attributes specified in the file.");
                    }
                    inputFieldGrpIndexes.Add(index);
                }
                //The second row contains the "#Outputs" keyword followed by name(s) of output class(es) or values(s)
                string delimitedOutputNames = streamReader.ReadLine();
                if (!delimitedOutputNames.StartsWith("#Outputs"))
                {
                    throw new FormatException("2nd row of the file doesn't start with the #Outputs keyword.");
                }
                DelimitedStringValues outputNames = new DelimitedStringValues(csvDelimiter);
                outputNames.LoadFromString(delimitedOutputNames);
                outputNames.RemoveTrailingWhites();
                //Remove the #Outputs keyword from the collection
                outputNames.RemoveAt(0);
                //Check if the there is at least one output name
                if (outputNames.NumOfStringValues < 1)
                {
                    throw new FormatException("Missing output name(es).");
                }
                //Check if output names match with the output fields collection
                if (outputNames.NumOfStringValues < outputFieldNameCollection.Count)
                {
                    throw new FormatException("Inconsistent number of outputs in the file and number of specified output fields.");
                }
                foreach (string outputFieldName in outputFieldNameCollection)
                {
                    int index = outputNames.IndexOf(outputFieldName);
                    if (index < 0)
                    {
                        throw new FormatException($"Output field name {outputFieldName} was not found among the outputs specified in the file.");
                    }
                    outputFieldIndexes.Add(index);
                }
                //Load data
                DelimitedStringValues dataRow = new DelimitedStringValues(csvDelimiter);
                while (!streamReader.EndOfStream)
                {
                    dataRow.LoadFromString(streamReader.ReadLine());
                    dataRow.RemoveTrailingWhites();
                    //Check data length
                    if (dataRow.NumOfStringValues < repetitiveGroupOfAttributes.NumOfStringValues + outputNames.NumOfStringValues ||
                        ((dataRow.NumOfStringValues - outputNames.NumOfStringValues) % repetitiveGroupOfAttributes.NumOfStringValues) != 0)
                    {
                        throw new FormatException("Incorrect length of data row.");
                    }
                    //Pattern data
                    List <double[]> patternData = new List <double[]>();
                    for (int grpIdx = 0; grpIdx < (dataRow.NumOfStringValues - outputNames.NumOfStringValues) / repetitiveGroupOfAttributes.NumOfStringValues; grpIdx++)
                    {
                        double[] inputVector = new double[inputFieldGrpIndexes.Count];
                        for (int i = 0; i < inputFieldGrpIndexes.Count; i++)
                        {
                            inputVector[i] = dataRow.GetValue(grpIdx * repetitiveGroupOfAttributes.NumOfStringValues + inputFieldGrpIndexes[i]).ParseDouble(true, $"Can't parse double data value {dataRow.GetValue(grpIdx * repetitiveGroupOfAttributes.NumOfStringValues + inputFieldGrpIndexes[i])}.");
                        }
                        patternData.Add(inputVector);
                    }//grpIdx
                    //Output data
                    double[] outputVector    = new double[outputFieldIndexes.Count];
                    int      dataRowStartIdx = dataRow.NumOfStringValues - outputNames.NumOfStringValues;
                    for (int i = 0; i < outputFieldIndexes.Count; i++)
                    {
                        outputVector[i] = dataRow.GetValue(dataRowStartIdx + outputFieldIndexes[i]).ParseDouble(true, $"Can't parse double value {dataRow.GetValue(dataRowStartIdx + outputFieldIndexes[i])}.");
                    }
                    bundle.AddPair(patternData, outputVector);
                } //while !EOF
            }     //using streamReader
            return(bundle);
        }         //LoadFromCsv