public void FeatureNormalizationTransformer_Transform_Vector()
        {
            // Three rows, two feature columns.
            var transformer = new MeanZeroFeatureTransformer();
            var input = new F64Matrix(new double[] { 123, 12,
                                                     41, 120,
                                                     124, 122 }, 3, 2);

            // Fit the transformation on the full matrix first.
            transformer.Transform(input);

            // Then apply the vector overload row by row and collect the results.
            var actual = new F64Matrix(3, 2);

            for (var rowIndex = 0; rowIndex < actual.RowCount; rowIndex++)
            {
                var transformedRow = transformer.Transform(input.Row(rowIndex));
                for (var columnIndex = 0; columnIndex < actual.ColumnCount; columnIndex++)
                {
                    actual[rowIndex, columnIndex] = transformedRow[columnIndex];
                }
            }

            // Row-wise vector transforms must match the matrix transform result.
            var expected = new F64Matrix(new double[] { 27, -72.666666666666671,
                                                        -55, 35.333333333333329,
                                                        28, 37.333333333333329 }, 3, 2);

            Assert.AreEqual(expected, actual);
        }
        public void FeatureNormalizationTransformer_Transform_Matrix()
        {
            // Three rows, two feature columns.
            var transformer = new MeanZeroFeatureTransformer();
            var input = new F64Matrix(new double[] { 123, 12,
                                                     41, 120,
                                                     124, 122 }, 3, 2);

            // Apply the matrix overload, writing into a preallocated output matrix.
            var actual = new F64Matrix(3, 2);
            transformer.Transform(input, actual);

            // Each column should be shifted so its mean becomes zero.
            var expected = new F64Matrix(
                new double[]
                {
                    27, -72.666666666666671,
                    -55, 35.333333333333329,
                    28, 37.333333333333329
                }, 3, 2);

            Assert.AreEqual(expected, actual);
        }
// Example #3
        public void FeatureTransformationExtensions_MatrixTransform()
        {
            // Three rows, two feature columns.
            var input = new F64Matrix(new double[] { 123, 12,
                                                     41, 120,
                                                     124, 122 }, 3, 2);

            // Chain two transformations via the matrix Transform extension:
            // first shift each feature to zero mean, then rescale into [-1, 1].
            var meanZero = new MeanZeroFeatureTransformer();
            var minMax = new MinMaxTransformer(-1.0, 1.0);

            var actual = input
                .Transform(meanZero.Transform)
                .Transform(minMax.Transform);

            var expected = new F64Matrix(
                new double[] { 0.97590361445783125, -1, -1, 0.96363636363636362, 1, 1 }, 3, 2);

            Assert.AreEqual(expected, actual);
        }
// Example #4
        public void Regression_Standard_Neural_Net_FeatureTransform_Normalization()
        {
            #region Read Data

            // Use StreamReader(filepath) when running from filesystem
            var parser = new CsvParser(() => new StringReader(Resources.winequality_white));
            var targetName = "quality";

            // Load the feature matrix (every column except the target).
            var observations = parser.EnumerateRows(c => c != targetName)
                .ToF64Matrix();

            // Load the regression targets.
            var targets = parser.EnumerateRows(targetName)
                .ToF64Vector();

            #endregion

            // Scale each feature into [0, 1] and then shift each feature to a
            // mean of zero, transforming the observation matrix in place.
            var minMaxTransformer = new MinMaxTransformer(0.0, 1.0);
            var meanZeroTransformer = new MeanZeroFeatureTransformer();

            minMaxTransformer.Transform(observations, observations);
            meanZeroTransformer.Transform(observations, observations);

            // Define the neural net: two ReLU hidden layers with dropout.
            var net = new NeuralNet();
            net.Add(new InputLayer(inputUnits: observations.ColumnCount));
            net.Add(new DropoutLayer(0.2));
            net.Add(new DenseLayer(800, Activation.Relu));
            net.Add(new DropoutLayer(0.5));
            net.Add(new DenseLayer(800, Activation.Relu));
            net.Add(new DropoutLayer(0.5));
            net.Add(new SquaredErrorRegressionLayer());

            // Only 10 iterations to keep the example fast; square loss is used
            // purely for reporting progress.
            var learner = new RegressionNeuralNetLearner(net, iterations: 10, loss: new SquareLoss());
            var model = learner.Learn(observations, targets);

            // Report the error on the training data itself.
            var metric = new MeanSquaredErrorRegressionMetric();
            var predictions = model.Predict(observations);

            Trace.WriteLine("Training Error: " + metric.Error(targets, predictions));
        }
        /// <summary>
        /// Loads (or builds and caches) per-region model data from the UK HPI csv plus
        /// auxiliary data sources, then learns and evaluates one regression model per
        /// region in parallel, logging the test error and the prediction for the most
        /// recent period of each region.
        /// </summary>
        /// <param name="iterations">Number of learner iterations.</param>
        /// <param name="targetOffset">Number of periods ahead the target is calculated for.</param>
        /// <param name="targetName">Name of the target column in the csv.</param>
        /// <param name="pauseAtEnd">When true, waits for a key press before returning.</param>
        public void Predict(int iterations = DefaultNNIterations, int targetOffset = 1, string targetName = DefaultTargetName, bool pauseAtEnd = false)
        {
            _iterations   = iterations;
            _targetName   = targetName;
            _targetOffset = targetOffset;

            Program.StatusLogger.Info($"Iterations: {_iterations}");
            Program.StatusLogger.Info($"Target: {_targetName}");
            Program.StatusLogger.Info($"Offset: {_targetOffset}");

            var data = new ConcurrentDictionary<int, ModelData>();

            if (File.Exists(Path()))
            {
                // Reuse previously assembled data instead of re-parsing everything.
                data = JsonConvert.DeserializeObject<ConcurrentDictionary<int, ModelData>>(File.ReadAllText(Path()));

                Program.StatusLogger.Info("Cached data was loaded.");
            }
            else
            {
                //http://publicdata.landregistry.gov.uk/market-trend-data/house-price-index-data/UK-HPI-full-file-2019-07.csv
                var header      = File.ReadLines("UK-HPI-full-file-2019-07.csv").First();
                var columnNames = header.Split(",");

                var parser = new CsvParser(() => new StringReader(File.ReadAllText("UK-HPI-full-file-2019-07.csv")), ',', false, true);

                var creditData          = _creditDataExtractor.Extract();
                var populationData      = _populationDataExtractor.Extract();
                var otherPopulationData = _otherPopulationDataExtractor.Extract();
                var densityData         = _londonDensityDataExtractor.Extract();
                var gvaData             = _gvaDataExtractor.Extract();

                var featureRows = parser.EnumerateRows().ToArray();

                string previousKey = null;

                for (int i = 0; i < featureRows.Length; i++)
                {
                    var item = featureRows[i];
                    var key  = item.GetValue("RegionName");
                    var date = DateTime.ParseExact(item.GetValue("Date"), "dd/MM/yyyy", new CultureInfo("en-GB"), DateTimeStyles.AssumeLocal);

                    // Rows arrive grouped by region, so only log when the region changes.
                    if (key != previousKey)
                    {
                        Program.StatusLogger.Info($"Processing {key}");
                    }
                    previousKey = key;

                    var regionFeatures = item.GetValues(columnNames.Except(excludeColumns).ToArray()).Select(s => ParseRowValue(s));

                    // Join in credit data; pad with -1 sentinels when no entry exists for the date.
                    var creditDataKey = _creditDataExtractor.GetKey(date, creditData.Keys.ToArray());
                    if (!creditData.ContainsKey(creditDataKey))
                    {
                        regionFeatures = regionFeatures.Concat(Enumerable.Repeat(-1d, creditData.Values.First().Length));
                        Trace.WriteLine($"Credit data not found: {creditDataKey}");
                    }
                    else
                    {
                        regionFeatures = regionFeatures.Concat(creditData[creditDataKey]);
                    }

                    var modelData = new ModelData
                    {
                        Name           = key,
                        Code           = item.GetValue("AreaCode"),
                        Date           = date,
                        Observations   = regionFeatures.ToArray(),
                        OriginalTarget = ParseTarget(item.GetValue(_targetName))
                    };

                    // Append the remaining auxiliary features for this region/date.
                    modelData.Observations = modelData.Observations
                                             .Concat(_populationDataExtractor.Get(populationData, modelData))
                                             .Concat(_londonDensityDataExtractor.Get(densityData, modelData))
                                             .Concat(_otherPopulationDataExtractor.Get(otherPopulationData, modelData))
                                             .Concat(_gvaDataExtractor.Get(gvaData, modelData))
                                             .ToArray();

                    data.TryAdd(i, modelData);
                }

                _targetCalculator.Calculate(data, _targetOffset);

                // Cache the assembled data so subsequent runs skip the build step.
                var json = JsonConvert.SerializeObject(data, Formatting.Indented);
                File.WriteAllText(Path(), json);
            }

            var itemCount = 0;

            Parallel.ForEach(data.OrderBy(o => o.Value.Date).GroupBy(g => g.Value.Name).AsParallel(), new ParallelOptions {
                MaxDegreeOfParallelism = -1
            }, (grouping) =>
            {
                // Only rows with a calculated target can be used for learning.
                var dataWithTarget = grouping.Where(s => s.Value.OriginalTarget.HasValue && s.Value.Target != -1);

                if (dataWithTarget.Any())
                {
                    var allObservations = dataWithTarget.Select(s => s.Value.Observations).ToArray();
                    var allTargets      = dataWithTarget.Select(s => s.Value.Target).ToArray();

                    // Fit the transformations on the training rows plus the most recent
                    // observation row, so the prediction input is scaled consistently.
                    var meanZeroTransformer  = new MeanZeroFeatureTransformer();
                    var minMaxTransformer    = new MinMaxTransformer(0d, 1d);
                    var lastObservations     = grouping.Last().Value.Observations;
                    F64Matrix allTransformed = minMaxTransformer.Transform(meanZeroTransformer.Transform(allObservations.Append(lastObservations).ToArray()));
                    var transformed          = new F64Matrix(allTransformed.Rows(Enumerable.Range(0, allTransformed.RowCount - 1).ToArray()).Data(), allTransformed.RowCount - 1, allTransformed.ColumnCount);

                    var splitter = new RandomTrainingTestIndexSplitter<double>(trainingPercentage: 0.7, seed: 24);

                    var trainingTestSplit = splitter.SplitSet(transformed, allTargets);
                    transformed           = trainingTestSplit.TrainingSet.Observations;
                    var testSet           = trainingTestSplit.TestSet;

                    // Alternative learners: GetRandomForest(), GetAda(),
                    // GetNeuralNet(grouping.First().Value.Observations.Length, transformed.RowCount).
                    var learner = GetEnsemble(grouping.First().Value.Observations.Length, transformed.RowCount);

                    Program.StatusLogger.Info("Learning commenced " + grouping.First().Value.Name);

                    var model = learner.Learn(transformed, trainingTestSplit.TrainingSet.Targets);

                    Program.StatusLogger.Info("Learning completed " + grouping.First().Value.Name);

                    if (model.GetRawVariableImportance().Any(a => a > 0))
                    {
                        var importanceSummary = string.Join(",\r\n", model.GetRawVariableImportance().Select((d, i) => i.ToString() + ":" + d.ToString()));
                        Program.StatusLogger.Info("Raw variable importance:\r\n" + importanceSummary);
                    }

                    // The appended most-recent observation is always the LAST row of
                    // allTransformed. Fix: the previous index, transformed.RowCount,
                    // stopped pointing at that row once transformed was reassigned to
                    // the smaller training split above.
                    var lastTransformed = allTransformed.Row(allTransformed.RowCount - 1);
                    var prediction      = model.Predict(lastTransformed);

                    // Change versus the previous period is not currently calculated.
                    var change = -1;

                    var testPrediction = model.Predict(testSet.Observations);

                    var metric       = new MeanSquaredErrorRegressionMetric();
                    var error        = metric.Error(testSet.Targets, testPrediction);
                    var averageError = 0d;
                    lock (Locker)
                    {
                        // _totalError and itemCount are shared across the parallel region loops.
                        _totalError += error;
                        itemCount++;
                        averageError = Math.Round(_totalError / itemCount, 3);
                    }
                    var isLondon = London.Contains(grouping.First().Value.Name);

                    var message = $"TotalError: {Math.Round(_totalError, 3)}, AverageError: {averageError}, Target: {_targetName}, Offset: {_targetOffset}, Region: {grouping.First().Value.Name}, London: {isLondon}, Error: {Math.Round(error, 3)}, Next: {Math.Round(prediction, 3)}, Change: {change}";

                    Program.Logger.Info(message);
                }
            });

            if (pauseAtEnd)
            {
                Console.WriteLine("Press any key to continue");
                Console.ReadKey();
            }
        }
        /// <summary>
        /// Loads (or builds and caches) model data for every region, learns a single
        /// model across all regions (with binary-encoded region names appended as
        /// features) and logs a prediction for each region's most recent period.
        /// </summary>
        /// <param name="iterations">Number of learner iterations.</param>
        public void Predict(int iterations = DefaultIterations)
        {
            _iterations = iterations;

            Program.StatusLogger.Info($"Iterations: {_iterations}");
            Program.StatusLogger.Info($"Target: {_targetName}");
            Program.StatusLogger.Info($"Offset: {_targetOffset}");

            var data = new ConcurrentDictionary<int, ModelData>();

            if (File.Exists(Path()))
            {
                // Reuse previously assembled data instead of re-parsing everything.
                data = JsonConvert.DeserializeObject<ConcurrentDictionary<int, ModelData>>(File.ReadAllText(Path()));

                Program.StatusLogger.Info("Cached data was loaded.");
            }
            else
            {
                //http://publicdata.landregistry.gov.uk/market-trend-data/house-price-index-data/UK-HPI-full-file-2019-07.csv
                var header      = File.ReadLines("UK-HPI-full-file-2019-07.csv").First();
                var columnNames = header.Split(",");

                var parser = new CsvParser(() => new StringReader(File.ReadAllText("UK-HPI-full-file-2019-07.csv")), ',', false, true);

                var creditData = _creditDataExtractor.ExtractQuarter();

                var featureRows = parser.EnumerateRows().ToArray();

                string previousKey = null;

                for (int i = 0; i < featureRows.Length; i++)
                {
                    var item = featureRows[i];
                    var key  = item.GetValue("RegionName");
                    var date = DateTime.ParseExact(item.GetValue("Date"), "dd/MM/yyyy", new CultureInfo("en-GB"), DateTimeStyles.AssumeLocal);

                    // Rows arrive grouped by region, so only log when the region changes.
                    if (key != previousKey)
                    {
                        Program.StatusLogger.Info($"Processing {key}");
                    }
                    previousKey = key;

                    var regionFeatures = item.GetValues(columnNames.Except(excludeColumns).ToArray()).Select(s => ParseRowValue(s));

                    // Join in quarterly credit data; pad with -1 sentinels when missing.
                    var creditDataKey = _creditDataExtractor.GetMonthOfPreviousQuarter(date);
                    if (!creditData.ContainsKey(creditDataKey))
                    {
                        regionFeatures = regionFeatures.Concat(Enumerable.Repeat(-1d, creditData.Values.First().Length));
                        Trace.WriteLine($"Credit data not found: {creditDataKey}");
                    }
                    else
                    {
                        regionFeatures = regionFeatures.Concat(creditData[creditDataKey]);
                    }

                    data.TryAdd(i, new ModelData {
                        Name = key, Date = date, Observations = regionFeatures.ToArray(), OriginalTarget = ParseRowValue(item.GetValue(_targetName))
                    });
                }

                _targetExtractor.Calculate(data, _targetOffset);

                // Cache the assembled data so subsequent runs skip the build step.
                var json = JsonConvert.SerializeObject(data, Formatting.Indented);
                File.WriteAllText(Path(), json);
            }

            // Append the binary-encoded region name to each observation row so one
            // model can be learned across all regions.
            var regionNames = _binaryFeatureEncoder.Encode(data.Select(s => s.Value.Name));

            // Use the Count property rather than LINQ Count() on every iteration.
            for (int i = 0; i < data.Count; i++)
            {
                data[i].Observations = data[i].Observations.Concat(regionNames[data[i].Name]).ToArray();
            }

            data = new ConcurrentDictionary<int, ModelData>(data.OrderBy(o => o.Value.Date));

            var itemCount = 0;

            var learner = GetAda();

            var lastDate = data.Last().Value.Date;

            // Only rows with a calculated target can be used for learning.
            var dataWithTarget = data.Where(s => s.Value.Target != -1);

            var allObservations = dataWithTarget.Select(s => s.Value.Observations).ToArray();
            var allTargets      = dataWithTarget.Select(s => s.Value.Target).ToArray();

            // The rows for the most recent date are the ones predicted for below.
            var lastObservations = data.Where(s => s.Value.Date == lastDate).Select(s => s.Value.Observations).ToArray();

            // Fit the mean-zero transformation over the training rows AND the
            // prediction rows together so both are shifted by the same per-feature
            // means. (Fix: previously the prediction rows were fed to the model
            // untransformed while the model was trained on transformed data.)
            var meanZeroTransformer = new MeanZeroFeatureTransformer();
            F64Matrix combined = meanZeroTransformer.Transform(allObservations.Concat(lastObservations).ToArray());

            var trainingRowCount = allObservations.Length;
            var transformed = new F64Matrix(
                combined.Rows(Enumerable.Range(0, trainingRowCount).ToArray()).Data(),
                trainingRowCount, combined.ColumnCount);
            var lastTransformed = new F64Matrix(
                combined.Rows(Enumerable.Range(trainingRowCount, lastObservations.Length).ToArray()).Data(),
                lastObservations.Length, combined.ColumnCount);

            Program.StatusLogger.Info("Learning commenced");
            var model = learner.Learn(transformed, allTargets);

            Program.StatusLogger.Info("Learning completed");

            var importanceSummary = string.Join(",\r\n", model.GetRawVariableImportance().Select((d, i) => i.ToString() + ":" + d.ToString()));

            Program.StatusLogger.Info("Raw variable importance:\r\n" + importanceSummary);

            var prediction = model.Predict(lastTransformed);

            // Training error over the rows the model was learned on.
            var allPrediction = model.Predict(transformed);

            var metric = new MeanSquaredErrorRegressionMetric();
            var error  = metric.Error(allTargets, allPrediction);

            _totalError = error;
            itemCount++;

            // Decode the raw (untransformed) observation rows back to region names
            // for reporting, paired with their predictions.
            foreach (var item in lastObservations.Zip(prediction))
            {
                var regionName = _binaryFeatureEncoder.Decode(item.First);
                var isLondon   = London.Contains(regionName);

                var message = $"TotalError: {Math.Round(_totalError, 6)}, Region: {regionName}, London: {isLondon}, Error: -1, Next: {item.Second}, Change: -1";
                Program.Logger.Info(message);
            }

            Program.StatusLogger.Info("Prediction completed");
            Console.ReadKey();
        }