public void TestClassification()
{
    FileInfo rawFile = TEMP_DIR.CreateFile("simple.csv");
    FileInfo egaFile = TEMP_DIR.CreateFile("simple.ega");
    FileInfo outputFile = TEMP_DIR.CreateFile("simple_output.csv");

    FileUtil.CopyResource("Encog.Resources.simple.csv", rawFile);
    FileUtil.CopyResource("Encog.Resources.simple-c.ega", egaFile);

    EncogAnalyst analyst = new EncogAnalyst();
    analyst.AddAnalystListener(new ConsoleAnalystListener());
    analyst.Load(egaFile);
    analyst.ExecuteTask("task-full");

    // Every row of the output file should have matching values in
    // columns 3 and 4 (the expected and predicted class).
    ReadCSV csv = new ReadCSV(outputFile.ToString(), true, CSVFormat.English);
    while (csv.Next())
    {
        Assert.AreEqual(csv.Get(3), csv.Get(4));
    }
    csv.Close();

    Assert.AreEqual(4, analyst.Script.Fields.Length);
    Assert.AreEqual(3, analyst.Script.Fields[3].ClassMembers.Count);
}
/// <summary>
/// This method is called to determine the birth year for a person. It
/// obtains 100 web pages that Yahoo returns for that person. Each of these
/// pages is then searched for the birth year of that person. Whichever year
/// is selected the largest number of times is chosen as the birth year.
/// </summary>
public void Process()
{
    // Headers are needed so columns can be read by name below.
    ReadCSV famous = new ReadCSV("famous.csv", true, CSVFormat.English);
    Report("Building training data from list of famous people.");
    DateTime started = DateTime.Now; // capture the start time, not default(DateTime)
    this.totalTasks = 0;

    while (famous.Next())
    {
        String name = famous.Get("Person");
        int year = famous.GetInt("Year");
        CollectionWorker worker = new CollectionWorker(this, name, year);
        worker.Call();
        this.totalTasks++;
    }

    // Let TimeSpan do the tick arithmetic rather than dividing raw ticks.
    long minutes = (long) (DateTime.Now - started).TotalMinutes;
    Console.WriteLine("Took " + minutes + " minutes to collect training data from the Internet.");
    Console.WriteLine("Writing training file");
    WriteTrainingFile();
}
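// For reference, a minimal sketch of the elapsed-time pattern used above,
// relying only on System.DateTime and System.TimeSpan (no Encog types).
// The method name is hypothetical.
public static void TimedWork(Action work)
{
    DateTime started = DateTime.Now; // capture the start, not default(DateTime)
    work();
    // TimeSpan handles the tick arithmetic; no manual division is needed.
    TimeSpan elapsed = DateTime.Now - started;
    Console.WriteLine("Took " + (long) elapsed.TotalMinutes + " minutes.");
}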
/// <summary>
/// Construct a loaded row.
/// </summary>
/// <param name="csv">The CSV file to use.</param>
/// <param name="extra">The number of extra columns to add.</param>
public LoadedRow(ReadCSV csv, int extra)
{
    int count = csv.GetCount();
    _data = new String[count + extra];
    for (int i = 0; i < count; i++)
    {
        _data[i] = csv.Get(i);
    }
}
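// A hypothetical call site for the constructor above: wrap each CSV row in
// a LoadedRow, reserving one extra slot for a value computed later. The
// file name is an assumption; ReadCSV and CSVFormat are used as shown
// elsewhere in this section.
var csv = new ReadCSV("people.csv", true, CSVFormat.English);
while (csv.Next())
{
    var row = new LoadedRow(csv, 1); // copies the row, plus one empty column
    // the extra slot could hold a derived value before the row is written back out
}
csv.Close();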
// Note: async void is fire-and-forget; exceptions thrown here cannot be
// awaited, so this should only be invoked from an event-style entry point.
private async void BeginLoadAsync()
{
    tokenSource = new CancellationTokenSource();
    BaseClassList = await ReadCSV.Get(Path, _progress, tokenSource.Token);
    if (BaseClassList.Count != 0)
    {
        ProgressMessage = "Loading completed";
    }
    await Task.Delay(1000);
    ProgressMessage = "Ready";
}
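// A matching cancel handler is a small sketch: CancellationTokenSource.Cancel()
// signals the token that BeginLoadAsync passed into ReadCSV.Get. The method
// name is hypothetical; tokenSource and ProgressMessage are the members used above.
private void CancelLoad()
{
    tokenSource?.Cancel();
    ProgressMessage = "Cancelled";
}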
/// <summary>
/// Get the data for a specific column.
/// </summary>
/// <param name="name">The column to read.</param>
/// <param name="csv">The CSV file to read from.</param>
/// <returns>The column data.</returns>
public String GetColumnData(String name, ReadCSV csv)
{
    if (!_columnMapping.ContainsKey(name))
    {
        return null;
    }

    BaseCachedColumn column = _columnMapping[name];
    if (!(column is FileData))
    {
        return null;
    }

    var fd = (FileData) column;
    return csv.Get(fd.Index);
}
/// <inheritdoc />
public string[] ReadLine()
{
    if (_reader == null)
    {
        throw new EncogError("Please call rewind before reading the file.");
    }

    if (_reader.Next())
    {
        int len = _reader.ColumnCount;
        var result = new string[len];
        for (int i = 0; i < result.Length; i++)
        {
            result[i] = _reader.Get(i);
        }
        return result;
    }

    // End of file: release the reader and signal completion with null.
    _reader.Close();
    return null;
}
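// A hypothetical consumer of ReadLine() above: drain rows until the null
// end-of-file marker. 'source' stands in for an instance of the containing
// class, already rewound.
string[] line;
while ((line = source.ReadLine()) != null)
{
    Console.WriteLine(string.Join(",", line));
}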
private void DetermineInputFieldValue(IInputField field, int index, bool headers)
{
    if (field is InputFieldCSV)
    {
        var fieldCSV = (InputFieldCSV) field;
        ReadCSV csv = _csvMap[field];
        field.CurrentValueRaw = csv.Get(fieldCSV.ColumnName);
        try
        {
            field.CurrentValue = CSVFormatUsed.Parse((string) field.CurrentValueRaw);
        }
        catch (FormatException)
        {
            field.CurrentValue = double.NaN;
        }
    }
    else if (field is InputFieldMLDataSet)
    {
        var mlField = (InputFieldMLDataSet) field;
        MLDataFieldHolder holder = _dataSetFieldMap[field];
        IMLDataPair pair = holder.Pair;
        int offset = mlField.Offset;

        if (offset < pair.Input.Count)
        {
            field.CurrentValue = pair.Input[offset];
            field.CurrentValueRaw = pair.Input[offset];
        }
        else
        {
            offset -= pair.Input.Count;
            field.CurrentValue = pair.Ideal[offset];
            field.CurrentValueRaw = pair.Ideal[offset];
        }
    }
    else
    {
        field.CurrentValueRaw = field.GetValue(index);
        field.CurrentValue = (double) field.CurrentValueRaw;
    }
}
/// <summary>
/// Read the CSV file.
/// </summary>
private void ReadFile()
{
    ReadCSV csv = null;
    try
    {
        csv = new ReadCSV(InputFilename.ToString(), ExpectInputHeaders, Format);
        ResetStatus();
        int row = 0;
        while (csv.Next() && !ShouldStop())
        {
            UpdateStatus("Reading data");
            foreach (BaseCachedColumn column in Columns)
            {
                if (column is FileData && column.Input)
                {
                    var fd = (FileData) column;
                    String str = csv.Get(fd.Index);
                    double d = Format.Parse(str);
                    fd.Data[row] = d;
                }
            }
            row++;
        }
    }
    finally
    {
        ReportDone("Reading data");
        if (csv != null)
        {
            csv.Close();
        }
    }
}
/// <summary>
/// Program entry point.
/// </summary>
/// <param name="app">Holds arguments and other info.</param>
public void Execute(IExampleInterface app)
{
    ErrorCalculation.Mode = ErrorCalculationMode.RMS;

    // Download the data that we will attempt to model.
    string filename = DownloadData(app.Args);

    // Define the format of the data file.
    // This area will change, depending on the columns and
    // format of the file that you are trying to model.
    var format = new CSVFormat('.', ' '); // decimal point and space separated
    IVersatileDataSource source = new CSVDataSource(filename, true, format);

    var data = new VersatileMLDataSet(source);
    data.NormHelper.Format = format;

    ColumnDefinition columnSSN = data.DefineSourceColumn("SSN", ColumnType.Continuous);
    ColumnDefinition columnDEV = data.DefineSourceColumn("DEV", ColumnType.Continuous);

    // Analyze the data, determine the min/max/mean/sd of every column.
    data.Analyze();

    // Use SSN & DEV to predict SSN. For time-series it is okay to have
    // SSN both as an input and an output.
    data.DefineInput(columnSSN);
    data.DefineInput(columnDEV);
    data.DefineOutput(columnSSN);

    // Create a feedforward neural network as the model type.
    // MLMethodFactory.TYPE_FEEDFORWARD.
    // You could also use other model types, such as:
    // MLMethodFactory.SVM: Support Vector Machine (SVM)
    // MLMethodFactory.TYPE_RBFNETWORK: RBF Neural Network
    // MLMethodFactory.TYPE_NEAT: NEAT Neural Network
    // MLMethodFactory.TYPE_PNN: Probabilistic Neural Network
    var model = new EncogModel(data);
    model.SelectMethod(data, MLMethodFactory.TypeFeedforward);

    // Send any output to the console.
    model.Report = new ConsoleStatusReportable();

    // Now normalize the data. Encog will automatically determine the
    // correct normalization type based on the model you chose in the
    // last step.
    data.Normalize();

    // Set time series.
    data.LeadWindowSize = 1;
    data.LagWindowSize = WindowSize;

    // Hold back some data for a final validation.
    // Do not shuffle the data into a random ordering (never shuffle
    // time series).
    // Use a seed of 1001 so that we always use the same holdback and
    // will get more consistent results.
    model.HoldBackValidation(0.3, false, 1001);

    // Choose whatever is the default training type for this model.
    model.SelectTrainingType(data);

    // Use a 5-fold cross-validated train. Return the best method found.
    // (Never shuffle time series.)
    var bestMethod = (IMLRegression) model.Crossvalidate(5, false);

    // Display the training and validation errors.
    Console.WriteLine(@"Training error: " + model.CalculateError(bestMethod, model.TrainingDataset));
    Console.WriteLine(@"Validation error: " + model.CalculateError(bestMethod, model.ValidationDataset));

    // Display our normalization parameters.
    NormalizationHelper helper = data.NormHelper;
    Console.WriteLine(helper.ToString());

    // Display the final model.
    Console.WriteLine(@"Final model: " + bestMethod);

    // Loop over the entire, original, dataset and feed it through the
    // model. This also shows how you would process new data that was
    // not part of your training set. You do not need to retrain; simply
    // use the NormalizationHelper class. After you train, you can save
    // the NormalizationHelper to later normalize and denormalize your
    // data.
    source.Close();
    var csv = new ReadCSV(filename, true, format);
    var line = new String[2];

    // Create a vector to hold each time-slice, as we build them.
    // These will be grouped together into windows.
    var slice = new double[2];
    var window = new VectorWindow(WindowSize + 1);

    IMLData input = helper.AllocateInputVector(WindowSize + 1);

    // Only display the first 100.
    int stopAfter = 100;
    while (csv.Next() && stopAfter > 0)
    {
        var result = new StringBuilder();

        line[0] = csv.Get(2); // ssn
        line[1] = csv.Get(3); // dev
        helper.NormalizeInputVector(line, slice, false);

        // Enough data to build a full window?
        if (window.IsReady())
        {
            window.CopyWindow(((BasicMLData) input).Data, 0);
            String correct = csv.Get(2); // trying to predict SSN
            IMLData output = bestMethod.Compute(input);
            String predicted = helper.DenormalizeOutputVectorToString(output)[0];
            result.Append(string.Join(",", line)); // Append(line) would print the array's type name
            result.Append(" -> predicted: ");
            result.Append(predicted);
            result.Append(" (correct: ");
            result.Append(correct);
            result.Append(")");
            Console.WriteLine(result.ToString());
        }

        // Add the normalized slice to the window. We do this just after
        // checking whether the window is ready, so that the window is
        // always one behind the current row. This is because we are
        // trying to predict the next row.
        window.Add(slice);

        stopAfter--;
    }
    csv.Close();

    // Delete the data file and shut down.
    File.Delete(filename);
    EncogFramework.Instance.Shutdown();
}
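// The VectorWindow used above is Encog's; as a rough illustration of the
// same idea, here is a minimal stand-alone sliding window (a sketch, not
// Encog's implementation): keep the last N slices and flatten them into
// one input vector. Requires System and System.Collections.Generic.
public class SlidingWindowSketch
{
    private readonly int _size;
    private readonly Queue<double[]> _slices = new Queue<double[]>();

    public SlidingWindowSketch(int size)
    {
        _size = size;
    }

    public bool IsReady()
    {
        return _slices.Count == _size;
    }

    public void Add(double[] slice)
    {
        // Copy the slice; the caller reuses its buffer on every row.
        _slices.Enqueue((double[]) slice.Clone());
        if (_slices.Count > _size)
        {
            _slices.Dequeue();
        }
    }

    public void CopyWindow(double[] target, int offset)
    {
        foreach (double[] slice in _slices)
        {
            Array.Copy(slice, 0, target, offset, slice.Length);
            offset += slice.Length;
        }
    }
}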
/// <summary>
/// Extract fields from a file into a numeric array for machine learning.
/// </summary>
/// <param name="analyst">The analyst to use.</param>
/// <param name="headers">The headers for the input data.</param>
/// <param name="csv">The CSV that holds the input data.</param>
/// <param name="outputLength">The length of the returned array.</param>
/// <param name="skipOutput">True if the output should be skipped.</param>
/// <returns>The encoded data, or null if the row should be skipped.</returns>
public static double[] ExtractFields(EncogAnalyst analyst, CSVHeaders headers,
                                     ReadCSV csv, int outputLength, bool skipOutput)
{
    var output = new double[outputLength];
    int outputIndex = 0;

    foreach (AnalystField stat in analyst.Script.Normalize.NormalizedFields)
    {
        if (stat.Action == NormalizationAction.Ignore)
        {
            continue;
        }

        if (stat.Output && skipOutput)
        {
            continue;
        }

        int index = headers.Find(stat.Name);
        String str = csv.Get(index);

        // Is this an unknown value?
        if (str.Equals("?") || str.Length == 0)
        {
            IHandleMissingValues handler = analyst.Script.Normalize.MissingValues;
            double[] d = handler.HandleMissing(analyst, stat);

            // Should we skip the entire row?
            if (d == null)
            {
                return null;
            }

            // Copy the returned values in place of the missing values.
            for (int i = 0; i < d.Length; i++)
            {
                output[outputIndex++] = d[i];
            }
        }
        else
        {
            // Known value.
            if (stat.Action == NormalizationAction.Normalize)
            {
                double d = csv.Format.Parse(str.Trim());
                d = stat.Normalize(d);
                output[outputIndex++] = d;
            }
            else
            {
                double[] d = stat.Encode(str.Trim());
                foreach (double element in d)
                {
                    output[outputIndex++] = element;
                }
            }
        }
    }

    return output;
}
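// A hypothetical row loop around ExtractFields above: a null return means
// the missing-value handler asked for the whole row to be skipped.
// 'analyst', 'headers', 'csv' and 'outputLength' are assumed to be set up
// as described by the documented parameters.
while (csv.Next())
{
    double[] encoded = ExtractFields(analyst, headers, csv, outputLength, false);
    if (encoded == null)
    {
        continue; // row skipped by the missing-value policy
    }
    // 'encoded' is now ready to feed into an ML method
}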
/// <summary>
/// Program entry point.
/// </summary>
/// <param name="app">Holds arguments and other info.</param>
public void Execute(IExampleInterface app)
{
    // Download the data that we will attempt to model.
    string irisFile = DownloadData(app.Args);

    // Define the format of the data file.
    // This area will change, depending on the columns and
    // format of the file that you are trying to model.
    IVersatileDataSource source = new CSVDataSource(irisFile, false, CSVFormat.DecimalPoint);
    var data = new VersatileMLDataSet(source);
    data.DefineSourceColumn("sepal-length", 0, ColumnType.Continuous);
    data.DefineSourceColumn("sepal-width", 1, ColumnType.Continuous);
    data.DefineSourceColumn("petal-length", 2, ColumnType.Continuous);
    data.DefineSourceColumn("petal-width", 3, ColumnType.Continuous);

    // Define the column that we are trying to predict.
    ColumnDefinition outputColumn = data.DefineSourceColumn("species", 4, ColumnType.Nominal);

    // Analyze the data, determine the min/max/mean/sd of every column.
    data.Analyze();

    // Map the prediction column to the output of the model, and all
    // other columns to the input.
    data.DefineSingleOutputOthersInput(outputColumn);

    // Create a feedforward neural network as the model type.
    // MLMethodFactory.TYPE_FEEDFORWARD.
    // You could also use other model types, such as:
    // MLMethodFactory.SVM: Support Vector Machine (SVM)
    // MLMethodFactory.TYPE_RBFNETWORK: RBF Neural Network
    // MLMethodFactory.TYPE_NEAT: NEAT Neural Network
    // MLMethodFactory.TYPE_PNN: Probabilistic Neural Network
    var model = new EncogModel(data);
    model.SelectMethod(data, MLMethodFactory.TypeFeedforward);

    // Send any output to the console.
    model.Report = new ConsoleStatusReportable();

    // Now normalize the data. Encog will automatically determine the
    // correct normalization type based on the model you chose in the
    // last step.
    data.Normalize();

    // Hold back some data for a final validation.
    // Shuffle the data into a random ordering.
    // Use a seed of 1001 so that we always use the same holdback and
    // will get more consistent results.
    model.HoldBackValidation(0.3, true, 1001);

    // Choose whatever is the default training type for this model.
    model.SelectTrainingType(data);

    // Use a 5-fold cross-validated train. Return the best method found.
    var bestMethod = (IMLRegression) model.Crossvalidate(5, true);

    // Display the training and validation errors.
    Console.WriteLine(@"Training error: " + model.CalculateError(bestMethod, model.TrainingDataset));
    Console.WriteLine(@"Validation error: " + model.CalculateError(bestMethod, model.ValidationDataset));

    // Display our normalization parameters.
    NormalizationHelper helper = data.NormHelper;
    Console.WriteLine(helper.ToString());

    // Display the final model.
    Console.WriteLine(@"Final model: " + bestMethod);

    // Loop over the entire, original, dataset and feed it through the
    // model. This also shows how you would process new data that was not
    // part of your training set. You do not need to retrain; simply use
    // the NormalizationHelper class. After you train, you can save the
    // NormalizationHelper to later normalize and denormalize your data.
    source.Close();
    var csv = new ReadCSV(irisFile, false, CSVFormat.DecimalPoint);
    var line = new String[4];
    IMLData input = helper.AllocateInputVector();

    while (csv.Next())
    {
        var result = new StringBuilder();
        line[0] = csv.Get(0);
        line[1] = csv.Get(1);
        line[2] = csv.Get(2);
        line[3] = csv.Get(3);
        String correct = csv.Get(4);
        helper.NormalizeInputVector(line, ((BasicMLData) input).Data, false);
        IMLData output = bestMethod.Compute(input);
        String irisChosen = helper.DenormalizeOutputVectorToString(output)[0];

        result.Append(string.Join(",", line)); // Append(line) would print the array's type name
        result.Append(" -> predicted: ");
        result.Append(irisChosen);
        result.Append(" (correct: ");
        result.Append(correct);
        result.Append(")");
        Console.WriteLine(result.ToString());
    }
    csv.Close();

    // Delete the data file and shut down.
    File.Delete(irisFile);
    EncogFramework.Instance.Shutdown();
}
/// <summary>
/// Perform the analysis.
/// </summary>
/// <param name="target">The Encog analyst object to analyze.</param>
public void Process(EncogAnalyst target)
{
    int count = 0;
    CSVFormat csvFormat = ConvertStringConst.ConvertToCSVFormat(_format);
    var csv = new ReadCSV(_filename, _headers, csvFormat);

    // Pass one: calculate the min/max.
    while (csv.Next())
    {
        if (_fields == null)
        {
            GenerateFields(csv);
        }

        for (int i = 0; i < csv.ColumnCount; i++)
        {
            if (_fields != null)
            {
                _fields[i].Analyze1(csv.Get(i));
            }
        }
        count++;
    }

    if (count == 0)
    {
        throw new AnalystError("Can't analyze file, it is empty.");
    }

    if (_fields != null)
    {
        foreach (AnalyzedField field in _fields)
        {
            field.CompletePass1();
        }
    }
    csv.Close();

    // Pass two: standard deviation.
    csv = new ReadCSV(_filename, _headers, csvFormat);
    while (csv.Next())
    {
        for (int i = 0; i < csv.ColumnCount; i++)
        {
            if (_fields != null)
            {
                _fields[i].Analyze2(csv.Get(i));
            }
        }
    }

    if (_fields != null)
    {
        foreach (AnalyzedField field in _fields)
        {
            field.CompletePass2();
        }
    }
    csv.Close();

    String str = _script.Properties.GetPropertyString(
        ScriptProperties.SetupConfigAllowedClasses) ?? "";
    bool allowInt = str.Contains("int");
    bool allowReal = str.Contains("real") || str.Contains("double");
    bool allowString = str.Contains("string");

    // Remove any classes that did not qualify.
    foreach (AnalyzedField field in _fields)
    {
        if (field.Class)
        {
            if (!allowInt && field.Integer)
            {
                field.Class = false;
            }

            if (!allowString && (!field.Integer && !field.Real))
            {
                field.Class = false;
            }

            if (!allowReal && field.Real && !field.Integer)
            {
                field.Class = false;
            }
        }
    }

    // Merge with the existing fields.
    if ((target.Script.Fields != null) && (_fields.Length == target.Script.Fields.Length))
    {
        for (int i = 0; i < _fields.Length; i++)
        {
            // Copy the old field name.
            _fields[i].Name = target.Script.Fields[i].Name;

            if (_fields[i].Class)
            {
                IList<AnalystClassItem> t = _fields[i].AnalyzedClassMembers;
                IList<AnalystClassItem> s = target.Script.Fields[i].ClassMembers;

                if (s.Count == t.Count)
                {
                    for (int j = 0; j < s.Count; j++)
                    {
                        if (t[j].Code.Equals(s[j].Code))
                        {
                            t[j].Name = s[j].Name;
                        }
                    }
                }
            }
        }
    }

    // Now copy the fields.
    var df = new DataField[_fields.Length];
    for (int i = 0; i < df.Length; i++)
    {
        df[i] = _fields[i].FinalizeField();
    }
    target.Script.Fields = df;
}
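// The two passes above mirror the classic two-pass standard deviation:
// pass one accumulates the mean, pass two the squared deviations. A
// minimal stand-alone sketch (plain .NET, not Encog's AnalyzedField):
public static double TwoPassStdDev(double[] values)
{
    double sum = 0;
    foreach (double v in values)
    {
        sum += v; // pass one: accumulate for the mean
    }
    double mean = sum / values.Length;

    double sumSq = 0;
    foreach (double v in values)
    {
        sumSq += (v - mean) * (v - mean); // pass two: squared deviations
    }
    return Math.Sqrt(sumSq / values.Length);
}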
/// <summary>
/// Private constructor.
/// </summary>
private PropertyConstraints()
{
    _data = new Dictionary<String, List<PropertyEntry>>();
    try
    {
        Stream mask0 = ResourceLoader.CreateStream("Encog.Resources.analyst.csv");
        var csv = new ReadCSV(mask0, false, CSVFormat.EgFormat);

        while (csv.Next())
        {
            String sectionStr = csv.Get(0);
            String nameStr = csv.Get(1);
            String typeStr = csv.Get(2);

            // Determine the type.
            PropertyType t;
            if ("boolean".Equals(typeStr, StringComparison.InvariantCultureIgnoreCase))
            {
                t = PropertyType.TypeBoolean;
            }
            else if ("real".Equals(typeStr, StringComparison.InvariantCultureIgnoreCase))
            {
                t = PropertyType.TypeDouble;
            }
            else if ("format".Equals(typeStr, StringComparison.InvariantCultureIgnoreCase))
            {
                t = PropertyType.TypeFormat;
            }
            else if ("int".Equals(typeStr, StringComparison.InvariantCultureIgnoreCase))
            {
                t = PropertyType.TypeInteger;
            }
            else if ("list-string".Equals(typeStr, StringComparison.InvariantCultureIgnoreCase))
            {
                t = PropertyType.TypeListString;
            }
            else if ("string".Equals(typeStr, StringComparison.InvariantCultureIgnoreCase))
            {
                t = PropertyType.TypeString;
            }
            else
            {
                throw new AnalystError("Unknown type constraint: " + typeStr);
            }

            var entry = new PropertyEntry(t, nameStr, sectionStr);
            List<PropertyEntry> list;

            if (_data.ContainsKey(sectionStr))
            {
                list = _data[sectionStr];
            }
            else
            {
                list = new List<PropertyEntry>();
                _data[sectionStr] = list;
            }

            list.Add(entry);
        }

        csv.Close();
        mask0.Close();
    }
    catch (IOException e)
    {
        throw new EncogError(e);
    }
}
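// The if/else chain above could also be table-driven; a sketch under the
// assumption that the PropertyType values are exactly those listed above
// (the map and helper are hypothetical, not Encog code):
private static readonly Dictionary<string, PropertyType> TypeMap =
    new Dictionary<string, PropertyType>(StringComparer.InvariantCultureIgnoreCase)
    {
        { "boolean", PropertyType.TypeBoolean },
        { "real", PropertyType.TypeDouble },
        { "format", PropertyType.TypeFormat },
        { "int", PropertyType.TypeInteger },
        { "list-string", PropertyType.TypeListString },
        { "string", PropertyType.TypeString }
    };

private static PropertyType ParsePropertyType(string typeStr)
{
    PropertyType t;
    if (!TypeMap.TryGetValue(typeStr, out t))
    {
        throw new AnalystError("Unknown type constraint: " + typeStr);
    }
    return t;
}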
/// <summary>
/// Analyze the input file.
/// </summary>
/// <param name="input">The input file.</param>
/// <param name="headers">True, if there are headers.</param>
/// <param name="format">The format of the CSV data.</param>
public virtual void Analyze(FileInfo input, bool headers, CSVFormat format)
{
    ResetStatus();
    InputFilename = input;
    ExpectInputHeaders = headers;
    Format = format;
    _columnMapping.Clear();
    _columns.Clear();

    // First count the rows.
    TextReader reader = null;
    try
    {
        int recordCount = 0;
        reader = new StreamReader(InputFilename.OpenRead());
        while (reader.ReadLine() != null)
        {
            UpdateStatus(true);
            recordCount++;
        }

        if (headers)
        {
            recordCount--;
        }
        RecordCount = recordCount;
    }
    catch (IOException ex)
    {
        throw new QuantError(ex);
    }
    finally
    {
        ReportDone(true);
        if (reader != null)
        {
            try
            {
                reader.Close();
            }
            catch (IOException e)
            {
                throw new QuantError(e);
            }
        }
    }

    // Now analyze the columns.
    ReadCSV csv = null;
    try
    {
        csv = new ReadCSV(input.ToString(), headers, format);
        if (!csv.Next())
        {
            throw new QuantError("File is empty");
        }

        for (int i = 0; i < csv.ColumnCount; i++)
        {
            String name;
            if (headers)
            {
                name = AttemptResolveName(csv.ColumnNames[i]);
            }
            else
            {
                name = "Column-" + (i + 1);
            }

            // Determine if it should be an input/output field: a column
            // is treated as numeric I/O only if its first value parses.
            String str = csv.Get(i);
            bool io = false;
            try
            {
                Format.Parse(str);
                io = true;
            }
            catch (FormatException ex)
            {
                EncogLogging.Log(ex);
            }

            AddColumn(new FileData(name, i, io, io));
        }
    }
    finally
    {
        if (csv != null)
        {
            csv.Close();
        }
        Analyzed = true;
    }
}
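// A hypothetical call site for Analyze() above: count the rows and probe
// the columns of a CSV before reading it. 'cached' stands in for an
// instance of the containing class; the file name is an assumption.
cached.Analyze(new FileInfo("input.csv"), true, CSVFormat.English);
Console.WriteLine("Rows counted: " + cached.RecordCount);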