/// <summary>
/// Extract fields from the current row of a CSV file into a numeric array
/// for machine learning.
/// </summary>
///
/// <param name="analyst">The analyst to use; supplies the normalized field
/// definitions and the missing-value handler.</param>
/// <param name="headers">The headers for the input data, used to map each
/// field name to a column index.</param>
/// <param name="csv">The CSV that holds the input data, positioned on the
/// row to extract.</param>
/// <param name="outputLength">The length of the returned array.</param>
/// <param name="skipOutput">True if fields flagged as output should be
/// skipped.</param>
/// <returns>The encoded data, or null when the missing-value handler
/// returns null for a field, which signals that the entire row should be
/// skipped.</returns>
public static double[] ExtractFields(EncogAnalyst analyst, CSVHeaders headers, ReadCSV csv, int outputLength, bool skipOutput)
{
    var output = new double[outputLength];
    int outputIndex = 0;

    foreach (AnalystField stat in analyst.Script.Normalize.NormalizedFields)
    {
        if (stat.Action == NormalizationAction.Ignore)
        {
            continue;
        }

        if (stat.Output && skipOutput)
        {
            continue;
        }

        // NOTE(review): the result of Find is used unchecked; confirm every
        // non-ignored field is guaranteed to have a matching header.
        int index = headers.Find(stat.Name);

        // Trim before testing for a missing value so that padded cells such
        // as " ? " or whitespace-only cells are routed to the missing-value
        // handler instead of being parsed or encoded as real data.
        string str = csv.Get(index).Trim();

        // is this an unknown value?
        if (str.Equals("?") || str.Length == 0)
        {
            IHandleMissingValues handler = analyst.Script.Normalize.MissingValues;
            double[] replacement = handler.HandleMissing(analyst, stat);

            // a null replacement means the entire row should be skipped
            if (replacement == null)
            {
                return null;
            }

            // copy the returned values in place of the missing values
            foreach (double element in replacement)
            {
                output[outputIndex++] = element;
            }
        }
        else if (stat.Action == NormalizationAction.Normalize)
        {
            // known value, normalized into a single column
            double value = csv.Format.Parse(str);
            output[outputIndex++] = stat.Normalize(value);
        }
        else
        {
            // known value, encoded into one or more columns
            double[] encoded = stat.Encode(str);
            foreach (double element in encoded)
            {
                output[outputIndex++] = element;
            }
        }
    }

    return output;
}
/// <summary>
/// Process the file: read each input row, evaluate it with the supplied
/// method and write the row, with the computed output columns filled in,
/// to the output file.
/// </summary>
///
/// <param name="outputFile">The output file.</param>
/// <param name="method">The method to use. It is used as a classifier when
/// it implements IMLClassification but not IMLRegression; otherwise it is
/// cast to IMLRegression.</param>
public void Process(FileInfo outputFile, IMLMethod method)
{
    var csv = new ReadCSV(InputFilename.ToString(), ExpectInputHeaders, Format);

    // The reader and the writer are closed in finally blocks so that an
    // exception during setup, evaluation or writing does not leak them.
    try
    {
        foreach (AnalystField field in _analyst.Script.Normalize.NormalizedFields)
        {
            field.Init();
        }

        int outputLength = _analyst.DetermineTotalInputFieldCount();

        StreamWriter tw = PrepareOutputFile(outputFile);
        try
        {
            ResetStatus();

            while (csv.Next())
            {
                UpdateStatus(false);
                var row = new LoadedRow(csv, _outputColumns);

                double[] inputArray = AnalystNormalizeCSV.ExtractFields(
                    _analyst, _analystHeaders, csv, outputLength, true);

                // NOTE(review): inputArray may be null here (checked just
                // below), yet it is still handed to _series.Process when
                // TotalDepth > 1 - confirm that method tolerates null.
                if (_series.TotalDepth > 1)
                {
                    inputArray = _series.Process(inputArray);
                }

                if (inputArray != null)
                {
                    IMLData input = new BasicMLData(inputArray);
                    IMLData output;

                    // evaluation data
                    if ((method is IMLClassification) && !(method is IMLRegression))
                    {
                        // classification only: wrap the class index in a
                        // single-element data object
                        var tmp = new BasicMLData(1);
                        tmp[0] = ((IMLClassification)method).Classify(input);
                        output = tmp;
                    }
                    else
                    {
                        // regression
                        output = ((IMLRegression)method).Compute(input);
                    }

                    // skip file data: results are written after the
                    // columns that came from the input file
                    int index = _fileColumns;
                    int outputIndex = 0;

                    // display output
                    foreach (AnalystField field in _analyst.Script.Normalize.NormalizedFields)
                    {
                        if (_analystHeaders.Find(field.Name) == -1 || !field.Output)
                        {
                            continue;
                        }

                        if (field.Classify)
                        {
                            // classification
                            ClassItem cls = field.DetermineClass(outputIndex, output);
                            outputIndex += field.ColumnsNeeded;
                            row.Data[index++] = (cls == null) ? "?Unknown?" : cls.Name;
                        }
                        else
                        {
                            // regression
                            double n = output[outputIndex++];
                            n = field.DeNormalize(n);
                            row.Data[index++] = Format.Format(n, Precision);
                        }
                    }
                }

                // the row is written even when no output was computed
                WriteRow(tw, row);
            }

            ReportDone(false);
        }
        finally
        {
            tw.Close();
        }
    }
    finally
    {
        csv.Close();
    }
}