public void FeatureNormalizationTransformer_Transform_Vector()
{
    var sut = new MeanZeroFeatureTransformer();
    var matrix = new F64Matrix(new double[] { 123, 12, 41, 120, 124, 122 }, 3, 2);

    // create the transformation (the first call computes and stores the column means)
    sut.Transform(matrix);

    // use the vector transform on each row
    var actual = new F64Matrix(3, 2);
    for (int i = 0; i < actual.RowCount; i++)
    {
        var row = sut.Transform(matrix.Row(i));
        for (int j = 0; j < actual.ColumnCount; j++)
        {
            actual[i, j] = row[j];
        }
    }

    var expected = new F64Matrix(new double[] { 27, -72.666666666666671, -55, 35.333333333333329, 28, 37.333333333333329 }, 3, 2);
    Assert.AreEqual(expected, actual);
}
public void FeatureNormalizationTransformer_Transform_Matrix()
{
    var sut = new MeanZeroFeatureTransformer();
    var matrix = new F64Matrix(new double[] { 123, 12, 41, 120, 124, 122 }, 3, 2);

    var actual = new F64Matrix(3, 2);
    sut.Transform(matrix, actual);

    var expected = new F64Matrix(new double[] { 27, -72.666666666666671, -55, 35.333333333333329, 28, 37.333333333333329 }, 3, 2);
    Assert.AreEqual(expected, actual);
}
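// For reference, the expected matrices in both tests above follow directly from the
// definition of a mean-zero transform: subtract each column's mean from every entry
// in that column. A plain-C# sanity check of those numbers (System.Linq only, no
// SharpLearning types; the method name is illustrative):
public void MeanZeroTransform_ExpectedValues_SanityCheck()
{
    // the 3x2 matrix from the tests, written out row by row
    double[][] rows =
    {
        new[] { 123.0, 12.0 },
        new[] { 41.0, 120.0 },
        new[] { 124.0, 122.0 },
    };

    for (int col = 0; col < 2; col++)
    {
        var mean = rows.Average(r => r[col]); // 96 for column 0, 84.666... for column 1
        foreach (var row in rows)
        {
            row[col] -= mean; // column 0: 27, -55, 28; column 1: -72.666..., 35.333..., 37.333...
        }
    }

    // rows now equals the "expected" matrix asserted in both tests
}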
public void Regression_Standard_Neural_Net_FeatureTransform_Normalization()
{
    #region Read Data
    // Use StreamReader(filepath) when running from filesystem
    var parser = new CsvParser(() => new StringReader(Resources.winequality_white));
    var targetName = "quality";

    // read feature matrix
    var observations = parser.EnumerateRows(c => c != targetName)
        .ToF64Matrix();

    // read regression targets
    var targets = parser.EnumerateRows(targetName)
        .ToF64Vector();
    #endregion

    // scale each feature to be between 0 and 1,
    // then shift each feature to have a mean value of zero.
    var minMaxTransformer = new MinMaxTransformer(0.0, 1.0);
    var meanZeroTransformer = new MeanZeroFeatureTransformer();

    minMaxTransformer.Transform(observations, observations);
    meanZeroTransformer.Transform(observations, observations);

    var numberOfFeatures = observations.ColumnCount;

    // define the neural net.
    var net = new NeuralNet();
    net.Add(new InputLayer(inputUnits: numberOfFeatures));
    net.Add(new DropoutLayer(0.2));
    net.Add(new DenseLayer(800, Activation.Relu));
    net.Add(new DropoutLayer(0.5));
    net.Add(new DenseLayer(800, Activation.Relu));
    net.Add(new DropoutLayer(0.5));
    net.Add(new SquaredErrorRegressionLayer());

    // use only 10 iterations to make the example run faster.
    // squared error is used as the error metric; this is only used for reporting progress.
    var learner = new RegressionNeuralNetLearner(net, iterations: 10, loss: new SquareLoss());
    var model = learner.Learn(observations, targets);

    var metric = new MeanSquaredErrorRegressionMetric();
    var predictions = model.Predict(observations);

    Trace.WriteLine("Training Error: " + metric.Error(targets, predictions));
}
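// Note on the example above: both transformers keep the parameters computed on the
// first call (the vector test at the top relies on exactly this behaviour), so any
// unseen rows must be pushed through the same instances before Predict. A hedged
// sketch of what could sit at the end of the method; "newObservations" is a
// hypothetical F64Matrix with the same columns as the training matrix:
//
//     minMaxTransformer.Transform(newObservations, newObservations);   // same min/max as training
//     meanZeroTransformer.Transform(newObservations, newObservations); // same means as training
//     var newPredictions = model.Predict(newObservations);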
public void Predict(int iterations = DefaultNNIterations, int targetOffset = 1, string targetName = DefaultTargetName, bool pauseAtEnd = false)
{
    _iterations = iterations;
    _targetName = targetName;
    _targetOffset = targetOffset;

    Program.StatusLogger.Info($"Iterations: {_iterations}");
    Program.StatusLogger.Info($"Target: {_targetName}");
    Program.StatusLogger.Info($"Offset: {_targetOffset}");

    var data = new ConcurrentDictionary<int, ModelData>();
    if (File.Exists(Path()))
    {
        data = JsonConvert.DeserializeObject<ConcurrentDictionary<int, ModelData>>(File.ReadAllText(Path()));
        //data = TypeSerializer.DeserializeFromReader<ConcurrentDictionary<int, ModelData>>(new StreamReader(Path()));
        Program.StatusLogger.Info("Cached data was loaded.");
    }
    else
    {
        //http://publicdata.landregistry.gov.uk/market-trend-data/house-price-index-data/UK-HPI-full-file-2019-07.csv
        var header = File.ReadLines("UK-HPI-full-file-2019-07.csv").First();
        var columnNames = header.Split(",");
        var parser = new CsvParser(() => new StringReader(File.ReadAllText("UK-HPI-full-file-2019-07.csv")), ',', false, true);

        var creditData = _creditDataExtractor.Extract();
        var populationData = _populationDataExtractor.Extract();
        var otherPopulationData = _otherPopulationDataExtractor.Extract();
        var densityData = _londonDensityDataExtractor.Extract();
        var gvaData = _gvaDataExtractor.Extract();

        var featureRows = parser.EnumerateRows().ToArray();
        var targets = parser.EnumerateRows(_targetName).ToArray();

        string previousKey = null;
        for (int i = 0; i < featureRows.Length; i++)
        {
            var item = featureRows[i];
            var key = item.GetValue("RegionName");
            var date = DateTime.ParseExact(item.GetValue("Date"), "dd/MM/yyyy", new CultureInfo("en-GB"), DateTimeStyles.AssumeLocal);

            if (key != previousKey)
            {
                Program.StatusLogger.Info($"Processing {key}");
            }
            previousKey = key;

            var regionFeatures = item.GetValues(columnNames.Except(excludeColumns).ToArray()).Select(s => ParseRowValue(s));

            var creditDataKey = _creditDataExtractor.GetKey(date, creditData.Keys.ToArray());
            if (!creditData.ContainsKey(creditDataKey))
            {
                // pad with -1 so the feature count stays constant when credit data is missing
                regionFeatures = regionFeatures.Concat(Enumerable.Repeat(-1d, creditData.Values.First().Length));
                Trace.WriteLine($"Credit data not found: {creditDataKey}");
            }
            else
            {
                regionFeatures = regionFeatures.Concat(creditData[creditDataKey]);
            }

            var modelData = new ModelData
            {
                Name = key,
                Code = item.GetValue("AreaCode"),
                Date = date,
                Observations = regionFeatures.ToArray(),
                OriginalTarget = ParseTarget(item.GetValue(_targetName))
            };

            modelData.Observations = modelData.Observations
                .Concat(_populationDataExtractor.Get(populationData, modelData))
                .Concat(_londonDensityDataExtractor.Get(densityData, modelData))
                .Concat(_otherPopulationDataExtractor.Get(otherPopulationData, modelData))
                .Concat(_gvaDataExtractor.Get(gvaData, modelData))
                .ToArray();

            data.TryAdd(i, modelData);
        }

        _targetCalculator.Calculate(data, _targetOffset);

        //TypeSerializer.SerializeToWriter<ConcurrentDictionary<int, ModelData>>(data, new StreamWriter(Path()));
        var json = JsonConvert.SerializeObject(data, Formatting.Indented);
        File.WriteAllText(Path(), json);
    }

    var itemCount = 0;
    Parallel.ForEach(data.OrderBy(o => o.Value.Date).GroupBy(g => g.Value.Name).AsParallel(),
        new ParallelOptions { MaxDegreeOfParallelism = -1 },
        (grouping) =>
    {
        var lastDate = grouping.Last().Value.Date;
        var dataWithTarget = grouping.Where(s => s.Value.OriginalTarget.HasValue && s.Value.Target != -1);
        if (dataWithTarget.Any())
        {
            var allObservations = dataWithTarget.Select(s => s.Value.Observations).ToArray();
            var allTargets = dataWithTarget.Select(s => s.Value.Target).ToArray();

            //var validation = new TimeSeriesCrossValidation<double>((int)(allObservationsExceptLast.RowCount * 0.8), 0, 1);
            //var validationPredictions = validation.Validate((IIndexedLearner<double>)learner, allObservationsExceptLast, allTargetsExceptLast);
            //var crossMetric = new MeanSquaredErrorRegressionMetric();
            //var crossError = crossMetric.Error(validation.GetValidationTargets(allTargetsExceptLast), validationPredictions);
            //_totalCrossError += crossError;

            var meanZeroTransformer = new MeanZeroFeatureTransformer();
            var minMaxTransformer = new MinMaxTransformer(0d, 1d);

            // append the row to predict before transforming, so it is centered and
            // scaled with the same parameters as the labeled rows
            var lastObservations = grouping.Last().Value.Observations;
            F64Matrix allTransformed = minMaxTransformer.Transform(meanZeroTransformer.Transform(allObservations.Append(lastObservations).ToArray()));

            // all rows except the appended prediction row
            var transformed = new F64Matrix(
                allTransformed.Rows(Enumerable.Range(0, allTransformed.RowCount - 1).ToArray()).Data(),
                allTransformed.RowCount - 1,
                allTransformed.ColumnCount);

            var splitter = new RandomTrainingTestIndexSplitter<double>(trainingPercentage: 0.7, seed: 24);
            var trainingTestSplit = splitter.SplitSet(transformed, allTargets);
            transformed = trainingTestSplit.TrainingSet.Observations;
            var testSet = trainingTestSplit.TestSet;

            //var learner = GetRandomForest();
            //var learner = GetAda();
            //var learner = GetNeuralNet(grouping.First().Value.Observations.Length, transformed.RowCount);
            var learner = GetEnsemble(grouping.First().Value.Observations.Length, transformed.RowCount);

            Program.StatusLogger.Info("Learning commenced " + grouping.First().Value.Name);
            var model = learner.Learn(transformed, trainingTestSplit.TrainingSet.Targets);
            Program.StatusLogger.Info("Learning completed " + grouping.First().Value.Name);

            if (model.GetRawVariableImportance().Any(a => a > 0))
            {
                var importanceSummary = string.Join(",\r\n", model.GetRawVariableImportance().Select((d, i) => i.ToString() + ":" + d.ToString()));
                Program.StatusLogger.Info("Raw variable importance:\r\n" + importanceSummary);
            }

            // the appended prediction row is the final row of allTransformed
            // (transformed was reassigned to the training split above, so its
            // RowCount no longer indexes the last row)
            var lastTransformed = allTransformed.Row(allTransformed.RowCount - 1);
            var prediction = model.Predict(lastTransformed);
            //var before = item.Value.Item2[transformed.RowCount - _targetOffset - 1];
            var change = -1; //Math.Round(prediction / before, 2);

            var testPrediction = model.Predict(testSet.Observations);
            var metric = new MeanSquaredErrorRegressionMetric();
            var error = metric.Error(testSet.Targets, testPrediction);

            var averageError = 0d;
            lock (Locker)
            {
                _totalError += error;
                itemCount++;
                averageError = Math.Round(_totalError / itemCount, 3);
            }

            var isLondon = London.Contains(grouping.First().Value.Name);
            var message = $"TotalError: {Math.Round(_totalError, 3)}, AverageError: {averageError}, Target: {_targetName}, Offset: {_targetOffset}, Region: {grouping.First().Value.Name}, London: {isLondon}, Error: {Math.Round(error, 3)}, Next: {Math.Round(prediction, 3)}, Change: {change}";
            Program.Logger.Info(message);
        }
    });

    if (pauseAtEnd)
    {
        Console.WriteLine("Press any key to continue");
        Console.ReadKey();
    }
}
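// The append-then-slice pattern in the method above is easy to miss: the unlabeled
// final row is appended *before* the transformers run, so it gets the same centering
// and scaling as the labeled rows, and is then sliced back off before learning.
// In isolation (hypothetical names, same SharpLearning calls as above):
//
//     var all = labeledRows.Append(rowToPredict).ToArray();
//     var allTransformed = minMaxTransformer.Transform(meanZeroTransformer.Transform(all));
//     var trainingRows = new F64Matrix(
//         allTransformed.Rows(Enumerable.Range(0, allTransformed.RowCount - 1).ToArray()).Data(),
//         allTransformed.RowCount - 1,
//         allTransformed.ColumnCount);                                     // rows 0..n-2: learn on these
//     var predictionRow = allTransformed.Row(allTransformed.RowCount - 1); // row n-1: predict on this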
public void Predict(int iterations = DefaultIterations)
{
    _iterations = iterations;

    Program.StatusLogger.Info($"Iterations: {_iterations}");
    Program.StatusLogger.Info($"Target: {_targetName}");
    Program.StatusLogger.Info($"Offset: {_targetOffset}");

    var data = new ConcurrentDictionary<int, ModelData>();
    if (File.Exists(Path()))
    {
        data = JsonConvert.DeserializeObject<ConcurrentDictionary<int, ModelData>>(File.ReadAllText(Path()));
        //data = TypeSerializer.DeserializeFromReader<ConcurrentDictionary<int, ModelData>>(new StreamReader(Path()));
        Program.StatusLogger.Info("Cached data was loaded.");
    }
    else
    {
        //http://publicdata.landregistry.gov.uk/market-trend-data/house-price-index-data/UK-HPI-full-file-2019-07.csv
        var header = File.ReadLines("UK-HPI-full-file-2019-07.csv").First();
        var columnNames = header.Split(",");
        var parser = new CsvParser(() => new StringReader(File.ReadAllText("UK-HPI-full-file-2019-07.csv")), ',', false, true);

        var creditData = _creditDataExtractor.ExtractQuarter();

        var featureRows = parser.EnumerateRows().ToArray();
        var targets = parser.EnumerateRows(_targetName).ToArray();

        string previousKey = null;
        for (int i = 0; i < featureRows.Length; i++)
        {
            var item = featureRows[i];
            var key = item.GetValue("RegionName");
            var date = DateTime.ParseExact(item.GetValue("Date"), "dd/MM/yyyy", new CultureInfo("en-GB"), DateTimeStyles.AssumeLocal);

            if (key != previousKey)
            {
                Program.StatusLogger.Info($"Processing {key}");
            }
            previousKey = key;

            var regionFeatures = item.GetValues(columnNames.Except(excludeColumns).ToArray()).Select(s => ParseRowValue(s));

            var creditDataKey = _creditDataExtractor.GetMonthOfPreviousQuarter(date);
            if (!creditData.ContainsKey(creditDataKey))
            {
                // pad with -1 so the feature count stays constant when credit data is missing
                regionFeatures = regionFeatures.Concat(Enumerable.Repeat(-1d, creditData.Values.First().Length));
                Trace.WriteLine($"Credit data not found: {creditDataKey}");
            }
            else
            {
                regionFeatures = regionFeatures.Concat(creditData[creditDataKey]);
            }

            data.TryAdd(i, new ModelData
            {
                Name = key,
                Date = date,
                Observations = regionFeatures.ToArray(),
                OriginalTarget = ParseRowValue(item.GetValue(_targetName))
            });
        }

        _targetExtractor.Calculate(data, _targetOffset);

        //TypeSerializer.SerializeToWriter<ConcurrentDictionary<int, ModelData>>(data, new StreamWriter(Path()));
        var json = JsonConvert.SerializeObject(data, Formatting.Indented);
        File.WriteAllText(Path(), json);
    }

    // encode each region name as a binary feature vector and append it to the observations
    var regionNames = _binaryFeatureEncoder.Encode(data.Select(s => s.Value.Name));
    for (int i = 0; i < data.Count; i++)
    {
        data[i].Observations = data[i].Observations.Concat(regionNames[data[i].Name]).ToArray();
    }

    //data.Where(d => d.Value.Target != -1)
    data = new ConcurrentDictionary<int, ModelData>(data.OrderBy(o => o.Value.Date));

    var itemCount = 0;
    //var numberOfFeatures = transformed.ColumnCount;
    //var learner = GetRandomForest();
    //var learner = GetNeuralnet(numberOfFeatures);
    var learner = GetAda();

    var lastDate = data.Last().Value.Date;
    var dataWithTarget = data.Where(s => s.Value.Target != -1);
    var allObservations = dataWithTarget.Select(s => s.Value.Observations).ToArray();
    var allTargets = dataWithTarget.Select(s => s.Value.Target).ToArray();

    //var splitter = new NoShuffleTrainingTestIndexSplitter<double>(0.8);
    //var split = splitter.SplitSet(dateSortedData.Select(s => s.First).ToArray(), dateSortedData.Select(s => s.Second).ToArray());

    var meanZeroTransformer = new MeanZeroFeatureTransformer();
    F64Matrix transformed = meanZeroTransformer.Transform(allObservations);

    Program.StatusLogger.Info("Learning commenced");
    var model = learner.Learn(transformed, allTargets);
    Program.StatusLogger.Info("Learning completed");

    var importanceSummary = string.Join(",\r\n", model.GetRawVariableImportance().Select((d, i) => i.ToString() + ":" + d.ToString()));
    Program.StatusLogger.Info("Raw variable importance:\r\n" + importanceSummary);

    // predict the next value for each region's most recent row
    var lastObservations = data.Where(s => s.Value.Date == lastDate).Select(s => s.Value.Observations).ToArray();
    var prediction = model.Predict(lastObservations);
    //var before = item.Targets[transformed.RowCount - _targetOffset - 1];
    //var change = Math.Round(prediction / before, 2);

    // training error only; this variant holds out no test set
    var allPrediction = model.Predict(transformed);
    var metric = new MeanSquaredErrorRegressionMetric();
    var error = metric.Error(allTargets, allPrediction);
    _totalError = error;
    itemCount++;

    foreach (var item in lastObservations.Zip(prediction))
    {
        var regionName = _binaryFeatureEncoder.Decode(item.First);
        var isLondon = London.Contains(regionName);
        //var message = $"TotalError: {(int)(_totalError / itemCount)}, TotalCrossError: {(_totalCrossError / itemCount)}, Region: {item.Key}, London: {isLondon}, Error: {error}, CrossError: {crossError}, Next: {prediction}, Change: {change}";
        //var message = $"TotalError: {(int)(_totalError / itemCount)}, Region: {item.Key}, London: {isLondon}, Error: {error}, Next: {prediction}, Change: {change}";
        var message = $"TotalError: {Math.Round(_totalError, 6)}, Region: {regionName}, London: {isLondon}, Error: -1, Next: {item.Second}, Change: -1";
        Program.Logger.Info(message);
    }

    Program.StatusLogger.Info("Prediction completed");
    Console.ReadKey();
}