コード例 #1
0
        /// <summary>
        /// Predicts every row of the observation matrix using the ensembled probabilities.
        /// Note this can yield a different result than using regular predict;
        /// usually this gives more accurate predictions.
        /// </summary>
        /// <param name="observations">Observation matrix; one observation per row.</param>
        /// <returns>Probability prediction for each row.</returns>
        public ProbabilityPrediction[] PredictProbability(F64Matrix observations)
        {
            var rowCount = observations.RowCount;
            var results  = new ProbabilityPrediction[rowCount];

            for (var row = 0; row < rowCount; row++)
            {
                results[row] = PredictProbability(observations.Row(row));
            }

            return results;
        }
コード例 #2
0
        /// <summary>
        /// Predicts every row of the observation matrix using majority vote.
        /// </summary>
        /// <param name="observations">Observation matrix; one observation per row.</param>
        /// <returns>Prediction for each row.</returns>
        public double[] Predict(F64Matrix observations)
        {
            var rowCount = observations.RowCount;
            var results  = new double[rowCount];

            for (var row = 0; row < rowCount; row++)
            {
                results[row] = Predict(observations.Row(row));
            }

            return results;
        }
コード例 #3
0
        /// <summary>
        /// Predicts the observation subset provided by indices.
        /// </summary>
        /// <param name="observations">Observation matrix; one observation per row.</param>
        /// <param name="indices">Row indices of the observations to predict.</param>
        /// <returns>Prediction for each index, in the order given.</returns>
        public double[] Predict(F64Matrix observations, int[] indices)
        {
            // Note: the result length follows indices, not the matrix row count.
            var predictions = new double[indices.Length];

            for (int i = 0; i < indices.Length; i++)
            {
                predictions[i] = Tree.Predict(observations.Row(indices[i]));
            }

            return predictions;
        }
        /// <summary>
        /// Predicts the observation subset provided by indices with probabilities.
        /// </summary>
        /// <param name="observations">Observation matrix; one observation per row.</param>
        /// <param name="indices">Row indices of the observations to predict.</param>
        /// <returns>Probability prediction for each index, in the order given.</returns>
        public ProbabilityPrediction[] PredictProbability(F64Matrix observations, int[] indices)
        {
            // Note: the result length follows indices, not the matrix row count.
            var predictions = new ProbabilityPrediction[indices.Length];

            for (int i = 0; i < indices.Length; i++)
            {
                predictions[i] = Tree.PredictProbability(observations.Row(indices[i]));
            }

            return predictions;
        }
コード例 #5
0
        /// <summary>
        /// Batch-prediction for subset of the given genomes.
        /// </summary>
        /// <param name="genomes">The genomes.</param>
        /// <param name="indices">The row indices of genomes to predict for.</param>
        /// <returns>The <see cref="T:double[]"/> prediction for each given index.</returns>
        public double[] Predict(F64Matrix genomes, int[] indices)
        {
            var results = new double[indices.Length];

            for (var position = 0; position < indices.Length; position++)
            {
                var rowIndex      = indices[position];
                results[position] = this.Predict(genomes.Row(rowIndex));
            }

            return results;
        }
コード例 #6
0
        /// <summary>
        /// Predicts a set of observations.
        /// </summary>
        /// <param name="observations">Observation matrix; one observation per row.</param>
        /// <returns>Prediction for each row.</returns>
        public double[] Predict(F64Matrix observations)
        {
            var results = new double[observations.RowCount];
            // Reusable row buffer avoids allocating a fresh array per observation.
            var buffer  = new double[observations.ColumnCount];

            for (var row = 0; row < observations.RowCount; row++)
            {
                observations.Row(row, buffer);
                results[row] = Predict(buffer);
            }

            return results;
        }
コード例 #7
0
        /// <summary>
        /// Time series cross-validation. Based on rolling validation using the original order of the data.
        /// Using the specified initial size of the training set, a model is trained.
        /// The model predicts the first observation following the training data.
        /// Following, this data point is included in the training and a new model is trained,
        /// which predicts the next observation. This continues until all observations following the initial
        /// training size have been validated.
        /// </summary>
        /// <param name="learner">Learner used to train each rolling model.</param>
        /// <param name="observations">Observation matrix; one observation per row.</param>
        /// <param name="targets">Target value for each observation row.</param>
        /// <returns>The validated predictions, following the initial training size</returns>
        public TPrediction[] Validate(IIndexedLearner <TPrediction> learner, F64Matrix observations, double[] targets)
        {
            if (observations.RowCount != targets.Length)
            {
                throw new ArgumentException($"observation row count {observations.RowCount} " +
                                            $"must match target length {targets.Length}");
            }

            if (m_initialTrainingSize >= observations.RowCount)
            {
                throw new ArgumentException($"observation row count {observations.RowCount} " +
                                            $"is smaller than initial training size {m_initialTrainingSize}");
            }

            var trainingIndices  = Enumerable.Range(0, m_initialTrainingSize).ToArray();
            var predictionLength = targets.Length - trainingIndices.Length;
            var predictions      = new TPrediction[predictionLength];

            var observation       = new double[observations.ColumnCount];
            var lastTrainingIndex = trainingIndices.Last();

            var model = learner.Learn(observations, targets, trainingIndices);

            for (int i = 0; i < predictions.Length; i++)
            {
                // Only train a new model at each retrain interval.
                // (When m_retrainInterval == 1, i % m_retrainInterval is 0 for every i.)
                if ((i != 0) && ((i % m_retrainInterval) == 0))
                {
                    // Dispose the previous model only when it is actually being replaced.
                    // BUGFIX: the old code disposed the model at the end of every iteration,
                    // so with m_retrainInterval > 1 a disposed model was reused for prediction.
                    ModelDisposer.DisposeIfDisposable(model);
                    model = learner.Learn(observations, targets, trainingIndices);
                }

                var predictionIndex = lastTrainingIndex + 1;
                observations.Row(predictionIndex, observation);
                predictions[i] = model.Predict(observation);

                lastTrainingIndex++;

                // determine start index and length of the training period, if maxTrainingSetSize is specified.
                var startIndex = m_maxTrainingSetSize != 0 ?
                                 Math.Max(0, (lastTrainingIndex + 1) - m_maxTrainingSetSize) : 0;

                var length = m_maxTrainingSetSize != 0 ?
                             Math.Min(m_maxTrainingSetSize, lastTrainingIndex) : lastTrainingIndex;

                trainingIndices = Enumerable.Range(startIndex, length).ToArray();
            }

            // Dispose the final model so every learned model is released, as before.
            ModelDisposer.DisposeIfDisposable(model);

            return(predictions);
        }
コード例 #8
0
        /// <summary>
        /// Predicts a set of observations using the ensembled probabilities.
        /// </summary>
        /// <param name="observations">Observation matrix; one observation per row.</param>
        /// <returns>Probability prediction for each row.</returns>
        public ProbabilityPrediction[] PredictProbability(F64Matrix observations)
        {
            var rowCount    = observations.RowCount;
            var columnCount = observations.ColumnCount;
            var results     = new ProbabilityPrediction[rowCount];
            // Reusable row buffer avoids allocating a fresh array per observation.
            var buffer      = new double[columnCount];

            for (var row = 0; row < rowCount; row++)
            {
                observations.Row(row, buffer);
                results[row] = PredictProbability(buffer);
            }

            return results;
        }
コード例 #9
0
        /// <summary>
        /// Predicts every row of the observation matrix.
        /// </summary>
        /// <param name="observations">Observation matrix; one observation per row.</param>
        /// <returns>Prediction for each row.</returns>
        public double[] Predict(F64Matrix observations)
        {
            var rowCount = observations.RowCount;
            var results  = new double[rowCount];
            // Reusable row buffer avoids allocating a fresh array per observation.
            var buffer   = new double[observations.ColumnCount];

            for (var i = 0; i < rowCount; i++)
            {
                observations.Row(i, buffer);
                results[i] = Predict(buffer);
            }

            return results;
        }
コード例 #10
0
        // Estimates the error over the observations selected by indices, comparing
        // the resulting predictions against m_indexedTargets via m_errorMetric.
        double ErrorEstimate(F64Matrix observations, int[] indices)
        {
            var count       = indices.Length;
            var predictions = new double[count];

            for (var i = 0; i < count; i++)
            {
                predictions[i] = Predict(observations.Row(indices[i]));
            }

            return m_errorMetric.Error(m_indexedTargets, predictions);
        }
コード例 #11
0
        /// <summary>
        /// Cross validated predictions.
        /// Only crossValidates within the provided indices.
        /// The predictions are returned in the predictions array.
        /// </summary>
        /// <param name="learner">Learner used to train a model for each fold.</param>
        /// <param name="observations">Observation matrix; one observation per row.</param>
        /// <param name="targets">Target value for each observation row.</param>
        /// <param name="crossValidationIndices">Row indices to cross validate within.</param>
        /// <param name="crossValidatedPredictions">Output array receiving one prediction per index.</param>
        public void CrossValidate(IIndexedLearner <TPrediction> learner,
                                  F64Matrix observations,
                                  double[] targets,
                                  int[] crossValidationIndices,
                                  TPrediction[] crossValidatedPredictions)
        {
            var rows = crossValidatedPredictions.Length;

            if (m_crossValidationFolds > rows)
            {
                throw new ArgumentException("Too few observations: " + rows +
                                            " for number of cross validation folds: " + m_crossValidationFolds);
            }

            var indices = crossValidationIndices.ToArray();

            // Map the provided crossValidationIndices to crossValidatedPredictions.
            // Indices from crossValidationIndices can be larger than crossValidatedPredictions length
            // since crossValidatedPredictions might be a subset of the provided observations and targets.
            var predictionIndexByRow = Enumerable.Range(0, crossValidatedPredictions.Length)
                                       .ToDictionary(i => indices[i], i => i);

            var indexSets = CrossValidationUtilities.GetKFoldCrossValidationIndexSets(
                m_indexedSampler, m_crossValidationFolds, targets, indices);

            // Reusable row buffer avoids allocating a fresh array per observation.
            var buffer = new double[observations.ColumnCount];

            foreach (var(trainingIndices, validationIndices) in indexSets)
            {
                var model = learner.Learn(observations, targets, trainingIndices);

                // Predict each validation row and scatter it straight into the output slot.
                for (var v = 0; v < validationIndices.Length; v++)
                {
                    observations.Row(validationIndices[v], buffer);
                    crossValidatedPredictions[predictionIndexByRow[validationIndices[v]]] = model.Predict(buffer);
                }

                ModelDisposer.DisposeIfDisposable(model);
            }
        }
コード例 #12
0
        /// <summary>
        /// Builds (or loads from a JSON cache) per-region model data from the UK house price
        /// index CSV combined with credit/population/density/GVA extractor features, then
        /// trains one model per region in parallel, logging a next-period prediction and
        /// the test-set error for each region.
        /// </summary>
        /// <param name="iterations">Learner iteration count; stored in _iterations.</param>
        /// <param name="targetOffset">Row offset between an observation and its target; stored in _targetOffset.</param>
        /// <param name="targetName">CSV column name used as the prediction target; stored in _targetName.</param>
        /// <param name="pauseAtEnd">When true, waits for a key press before returning.</param>
        public void Predict(int iterations = DefaultNNIterations, int targetOffset = 1, string targetName = DefaultTargetName, bool pauseAtEnd = false)
        {
            _iterations   = iterations;
            _targetName   = targetName;
            _targetOffset = targetOffset;

            Program.StatusLogger.Info($"Iterations: {_iterations}");
            Program.StatusLogger.Info($"Target: {_targetName}");
            Program.StatusLogger.Info($"Offset: {_targetOffset}");

            var data = new ConcurrentDictionary <int, ModelData>();

            // Fast path: reuse the previously extracted feature data from the JSON cache file.
            if (File.Exists(Path()))
            {
                data = JsonConvert.DeserializeObject <ConcurrentDictionary <int, ModelData> >(File.ReadAllText(Path()));
                //data = TypeSerializer.DeserializeFromReader<ConcurrentDictionary<int, ModelData>>(new StreamReader(Path()));

                Program.StatusLogger.Info("Cached data was loaded.");
            }
            else
            {
                // Slow path: parse the raw CSV and join in the auxiliary feature sources.
                //http://publicdata.landregistry.gov.uk/market-trend-data/house-price-index-data/UK-HPI-full-file-2019-07.csv
                var header      = File.ReadLines("UK-HPI-full-file-2019-07.csv").First();
                var columnNames = header.Split(",");

                var parser = new CsvParser(() => new StringReader(File.ReadAllText("UK-HPI-full-file-2019-07.csv")), ',', false, true);

                // Auxiliary feature sources, appended to each region's CSV features below.
                var creditData          = _creditDataExtractor.Extract();
                var populationData      = _populationDataExtractor.Extract();
                var otherPopulationData = _otherPopulationDataExtractor.Extract();
                var densityData         = _londonDensityDataExtractor.Extract();
                var gvaData             = _gvaDataExtractor.Extract();

                var featureRows = parser.EnumerateRows().ToArray();
                var targets     = parser.EnumerateRows(_targetName).ToArray();

                string previousKey = null;

                for (int i = 0; i < featureRows.Length; i++)
                {
                    var item = featureRows[i];
                    var key  = item.GetValue("RegionName");
                    var date = DateTime.ParseExact(item.GetValue("Date"), "dd/MM/yyyy", new CultureInfo("en-GB"), DateTimeStyles.AssumeLocal);

                    // Rows arrive grouped by region; only log when the region changes.
                    if (key != previousKey)
                    {
                        Program.StatusLogger.Info($"Processing {key}");
                    }
                    previousKey = key;

                    var regionFeatures = item.GetValues(columnNames.Except(excludeColumns).ToArray()).Select(s => ParseRowValue(s));

                    // Pad with -1 placeholders when no credit data exists for this date key,
                    // keeping the feature vector length constant across rows.
                    var creditDataKey = _creditDataExtractor.GetKey(date, creditData.Keys.ToArray());
                    if (!creditData.ContainsKey(creditDataKey))
                    {
                        regionFeatures = regionFeatures.Concat(Enumerable.Repeat(-1d, creditData.Values.First().Length));
                        Trace.WriteLine($"Credit data not found: {creditDataKey}");
                    }
                    else
                    {
                        regionFeatures = regionFeatures.Concat(creditData[creditDataKey]);
                    }

                    var modelData = new ModelData
                    {
                        Name           = key,
                        Code           = item.GetValue("AreaCode"),
                        Date           = date,
                        Observations   = regionFeatures.ToArray(),
                        OriginalTarget = ParseTarget(item.GetValue(_targetName))
                    };

                    // Append the remaining extractor features to the observation vector.
                    modelData.Observations = modelData.Observations
                                             .Concat(_populationDataExtractor.Get(populationData, modelData))
                                             .Concat(_londonDensityDataExtractor.Get(densityData, modelData))
                                             .Concat(_otherPopulationDataExtractor.Get(otherPopulationData, modelData))
                                             .Concat(_gvaDataExtractor.Get(gvaData, modelData))
                                             .ToArray();

                    data.TryAdd(i, modelData);
                }

                _targetCalculator.Calculate(data, _targetOffset);


                // Persist the extracted data so subsequent runs can take the fast path above.
                //TypeSerializer.SerializeToWriter<ConcurrentDictionary<int, ModelData>>(data, new StreamWriter(Path()));
                var json = JsonConvert.SerializeObject(data, Formatting.Indented);
                File.WriteAllText(Path(), json);
            }

            var itemCount = 0;

            // Train and evaluate one model per region (grouped by name, ordered by date).
            Parallel.ForEach(data.OrderBy(o => o.Value.Date).GroupBy(g => g.Value.Name).AsParallel(), new ParallelOptions {
                MaxDegreeOfParallelism = -1
            }, (grouping) =>
            {
                var lastDate       = grouping.Last().Value.Date;
                // Only rows with a usable target participate in training/evaluation.
                var dataWithTarget = grouping.Where(s => s.Value.OriginalTarget.HasValue && s.Value.Target != -1);

                if (dataWithTarget.Any())
                {
                    var allObservations = dataWithTarget.Select(s => s.Value.Observations).ToArray();
                    var allTargets      = dataWithTarget.Select(s => s.Value.Target).ToArray();

                    //var validation = new TimeSeriesCrossValidation<double>((int)(allObservationsExceptLast.RowCount * 0.8), 0, 1);
                    //var validationPredictions = validation.Validate((IIndexedLearner<double>)learner, allObservationsExceptLast, allTargetsExceptLast);
                    //var crossMetric = new MeanSquaredErrorRegressionMetric();
                    //var crossError = crossMetric.Error(validation.GetValidationTargets(allTargetsExceptLast), validationPredictions);
                    //_totalCrossError += crossError;
                    // The region's latest observation is appended before transforming so it is
                    // normalized consistently, then split back off for the final prediction.
                    var meanZeroTransformer  = new MeanZeroFeatureTransformer();
                    var minMaxTransformer    = new MinMaxTransformer(0d, 1d);
                    var lastObservations     = grouping.Last().Value.Observations;
                    F64Matrix allTransformed = minMaxTransformer.Transform(meanZeroTransformer.Transform(allObservations.Append(lastObservations).ToArray()));
                    var transformed          = new F64Matrix(allTransformed.Rows(Enumerable.Range(0, allTransformed.RowCount - 1).ToArray()).Data(), allTransformed.RowCount - 1, allTransformed.ColumnCount);

                    var splitter = new RandomTrainingTestIndexSplitter <double>(trainingPercentage: 0.7, seed: 24);

                    var trainingTestSplit = splitter.SplitSet(transformed, allTargets);
                    transformed           = trainingTestSplit.TrainingSet.Observations;
                    var testSet           = trainingTestSplit.TestSet;

                    //var learner = GetRandomForest();
                    //var learner = GetAda();
                    //var learner = GetNeuralNet(grouping.First().Value.Observations.Length, transformed.RowCount);
                    var learner = GetEnsemble(grouping.First().Value.Observations.Length, transformed.RowCount);

                    Program.StatusLogger.Info("Learning commenced " + grouping.First().Value.Name);

                    var model = learner.Learn(transformed, trainingTestSplit.TrainingSet.Targets);

                    Program.StatusLogger.Info("Learning completed " + grouping.First().Value.Name);

                    if (model.GetRawVariableImportance().Any(a => a > 0))
                    {
                        var importanceSummary = string.Join(",\r\n", model.GetRawVariableImportance().Select((d, i) => i.ToString() + ":" + d.ToString()));
                        Program.StatusLogger.Info("Raw variable importance:\r\n" + importanceSummary);
                    }

                    // NOTE(review): 'transformed' was reassigned to the 70% training split above,
                    // so this row index is the training-set row count, not the appended last row
                    // (allTransformed.RowCount - 1) — verify this picks the intended row.
                    var lastTransformed = allTransformed.Row(transformed.RowCount);
                    var prediction      = model.Predict(lastTransformed);

                    //var before = item.Value.Item2[transformed.RowCount - _targetOffset - 1];
                    var change = -1; //Math.Round(prediction / before, 2);

                    var testPrediction = model.Predict(testSet.Observations);

                    var metric       = new MeanSquaredErrorRegressionMetric();
                    var error        = metric.Error(testSet.Targets, testPrediction);
                    var averageError = 0d;
                    // Shared error accumulators are updated under a lock; the loop body runs in parallel.
                    lock (Locker)
                    {
                        _totalError += error;
                        itemCount++;
                        averageError = Math.Round(_totalError / itemCount, 3);
                    }
                    var isLondon = London.Contains(grouping.First().Value.Name);

                    var message = $"TotalError: {Math.Round(_totalError, 3)}, AverageError: {averageError}, Target: {_targetName}, Offset: {_targetOffset}, Region: {grouping.First().Value.Name}, London: {isLondon}, Error: {Math.Round(error, 3)}, Next: {Math.Round(prediction, 3)}, Change: {change}";

                    Program.Logger.Info(message);
                }
            });

            if (pauseAtEnd)
            {
                Console.WriteLine("Press any key to continue");
                Console.ReadKey();
            }
        }
コード例 #13
0
        /// <summary>
        /// Cross validated predictions.
        /// Only crossValidates within the provided indices.
        /// The predictions are returned in the predictions array.
        /// </summary>
        /// <param name="learner">Learner used to train a model for each fold.</param>
        /// <param name="observations">Observation matrix; one observation per row.</param>
        /// <param name="targets">Target value for each observation row.</param>
        /// <param name="crossValidationIndices">Row indices to cross validate within.</param>
        /// <param name="crossValidatedPredictions">Output array receiving one prediction per index.</param>
        public void CrossValidate(IIndexedLearner <TPrediction> learner,
                                  F64Matrix observations,
                                  double[] targets,
                                  int[] crossValidationIndices,
                                  TPrediction[] crossValidatedPredictions)
        {
            var rows = crossValidatedPredictions.Length;

            if (m_crossValidationFolds > rows)
            {
                throw new ArgumentException("Too few observations: " + rows +
                                            " for number of cross validation folds: " + m_crossValidationFolds);
            }

            var holdOutSamples = new int[m_crossValidationFolds][];
            var samplesPrFold  = rows / m_crossValidationFolds;

            var indices = crossValidationIndices.ToArray();

            // Map the provided crossValidationIndices to crossValidatedPredictions
            // Indices from crossValidationIndices can be larger than crossValidatedPredictions length
            // since crossValidatedPredictions might be a subset of the provided observations and targets
            var cvPredictionIndiceMap = Enumerable.Range(0, crossValidatedPredictions.Length)
                                        .ToDictionary(i => indices[i], i => i);

            // Partition the indices into disjoint hold-out samples, one per fold.
            // The sampler draws samplesPrFold indices per fold; drawn indices are removed
            // from the pool so no index appears in more than one fold.
            for (int i = 0; i < m_crossValidationFolds; i++)
            {
                if (i == m_crossValidationFolds - 1)
                {
                    // last fold. Add remaining indices.
                    holdOutSamples[i] = indices.ToArray();
                }
                else
                {
                    var holdoutSample = m_indexedSampler.Sample(targets, samplesPrFold, indices);
                    holdOutSamples[i] = holdoutSample;
                    indices           = indices.Except(holdoutSample).ToArray();
                }
            }

            // Reusable row buffer avoids allocating a fresh array per observation.
            var observation = new double[observations.ColumnCount];

            for (int i = 0; i < m_crossValidationFolds; i++)
            {
                // Train on everything outside this fold's hold-out sample,
                // then predict the held-out rows.
                var holdoutIndices  = holdOutSamples[i];
                var trainingIndices = crossValidationIndices.Except(holdoutIndices).ToArray();
                var model           = learner.Learn(observations, targets, trainingIndices);
                var predictions     = new TPrediction[holdoutIndices.Length];

                for (int l = 0; l < predictions.Length; l++)
                {
                    observations.Row(holdoutIndices[l], observation);
                    predictions[l] = model.Predict(observation);
                }

                // Scatter the fold's predictions into the caller-visible output array.
                for (int j = 0; j < holdoutIndices.Length; j++)
                {
                    crossValidatedPredictions[cvPredictionIndiceMap[holdoutIndices[j]]] = predictions[j];
                }

                ModelDisposer.DisposeIfDisposable(model);
            }
        }
コード例 #14
0
 /// <summary>
 /// Get a row as csv.
 /// </summary>
 /// <param name="matrix">
 /// The matrix.
 /// </param>
 /// <param name="row">
 /// The row index.
 /// </param>
 /// <param name="sep">
 /// The separator.
 /// </param>
 /// <returns>
 /// The <see cref="string"/> csv representation of the row.
 /// </returns>
 internal static string GetRowAsCsv(this F64Matrix matrix, int row, string sep)
 {
     // Invariant culture keeps the csv output independent of the current locale.
     var values = matrix.Row(row)
                  .Select(value => value.ToString(CultureInfo.InvariantCulture));

     return string.Join(sep, values);
 }
コード例 #15
0
        /// <summary>
        /// Returns the augmented version of the data, excluding the original.
        /// Each feature in the dataset must be scaled/normalized between 0.0 and 1.0
        /// before the method works.
        /// Rows are paired greedily: each unvisited row is matched with its closest
        /// unvisited neighbour (by GetDistance), and per feature the pair's values are
        /// either randomly resampled around each other or copied unchanged.
        /// </summary>
        /// <param name="dataset">Dataset to augment; one observation per row.</param>
        /// <returns>A new matrix of the same shape containing the augmented observations.</returns>
        public F64Matrix Agument(F64Matrix dataset)
        {
            var orgCols = dataset.ColumnCount;
            var orgRows = dataset.RowCount;

            var augmentation   = new F64Matrix(dataset.RowCount, dataset.ColumnCount);
            var indicesVisited = new HashSet <int>();

            // Reusable row buffers for the current sample and candidate partner rows.
            var sample    = new double[orgCols];
            var candidate = new double[orgCols];

            indicesVisited.Clear();

            for (int j = 0; j < orgRows; j++)
            {
                // Skip rows already consumed as a partner in an earlier pairing.
                if (indicesVisited.Contains(j))
                {
                    continue;
                }
                dataset.Row(j, sample);

                var closestDistance = double.MaxValue;
                var closestIndex    = -1;
                indicesVisited.Add(j);

                // Linear scan for the nearest unvisited row to pair with row j.
                for (int f = 0; f < orgRows; f++)
                {
                    if (indicesVisited.Contains(f))
                    {
                        continue;
                    }
                    dataset.Row(f, candidate);

                    var distance = GetDistance(sample, candidate);
                    if (distance < closestDistance)
                    {
                        closestDistance = distance;
                        closestIndex    = f;
                    }
                }

                // NOTE(review): when no unvisited partner remains (e.g. the last row of an
                // odd-sized dataset), closestIndex stays -1 and row j's augmented values are
                // never written — confirm the matrix default of 0.0 is intended there.
                if (closestIndex != -1)
                {
                    dataset.Row(closestIndex, candidate);
                    indicesVisited.Add(closestIndex);

                    for (int h = 0; h < sample.Length; h++)
                    {
                        var sampleValue   = sample[h];
                        var candiateValue = candidate[h];

                        // NextDouble() is drawn on every feature so the random stream
                        // advances regardless of which branch is taken.
                        if (m_random.NextDouble() <= m_probabilityParameter && m_probabilityParameter != 0.0)
                        {
                            // Swap-style resample: each row gets a value drawn around its
                            // partner's value, using the pair's scaled difference as std.
                            var std = (sampleValue - candiateValue) / m_localVariance;

                            augmentation.At(j, h, SampleRandom(candiateValue, std));
                            augmentation.At(closestIndex, h, SampleRandom(sampleValue, std));
                        }
                        else // keep values
                        {
                            augmentation.At(j, h, sampleValue);
                            augmentation.At(closestIndex, h, candiateValue);
                        }
                    }
                }
            }

            return(augmentation);
        }