/// <summary>
/// Predicts each row of the observation matrix using the ensembled probabilities.
/// Note this can yield a different result than using regular predict;
/// usually this will be a more accurate prediction.
/// </summary>
/// <param name="observations">Matrix with one observation per row.</param>
/// <returns>One probability prediction per row.</returns>
public ProbabilityPrediction[] PredictProbability(F64Matrix observations)
{
    var rowCount = observations.RowCount;
    var results = new ProbabilityPrediction[rowCount];
    for (var row = 0; row < rowCount; row++)
    {
        results[row] = PredictProbability(observations.Row(row));
    }
    return results;
}
/// <summary>
/// Predicts each row of the observation matrix using majority vote.
/// </summary>
/// <param name="observations">Matrix with one observation per row.</param>
/// <returns>One prediction per row.</returns>
public double[] Predict(F64Matrix observations)
{
    var rowCount = observations.RowCount;
    var results = new double[rowCount];
    for (var row = 0; row < rowCount; row++)
    {
        results[row] = Predict(observations.Row(row));
    }
    return results;
}
/// <summary>
/// Predicts the observation subset provided by indices.
/// </summary>
/// <param name="observations">Matrix with one observation per row.</param>
/// <param name="indices">Row indices of the observations to predict.</param>
/// <returns>One prediction per provided index, in the same order as <paramref name="indices"/>.</returns>
public double[] Predict(F64Matrix observations, int[] indices)
{
    // Fix: removed the unused local 'rows' (observations.RowCount) — the output
    // size follows indices.Length, not the matrix row count.
    var predictions = new double[indices.Length];
    for (int i = 0; i < indices.Length; i++)
    {
        predictions[i] = Tree.Predict(observations.Row(indices[i]));
    }
    return predictions;
}
/// <summary>
/// Predicts the observation subset provided by indices with probabilities.
/// </summary>
/// <param name="observations">Matrix with one observation per row.</param>
/// <param name="indices">Row indices of the observations to predict.</param>
/// <returns>One probability prediction per provided index, in the same order as <paramref name="indices"/>.</returns>
public ProbabilityPrediction[] PredictProbability(F64Matrix observations, int[] indices)
{
    // Fix: removed the unused local 'rows' (observations.RowCount) — the output
    // size follows indices.Length, not the matrix row count.
    var predictions = new ProbabilityPrediction[indices.Length];
    for (int i = 0; i < indices.Length; i++)
    {
        predictions[i] = Tree.PredictProbability(observations.Row(indices[i]));
    }
    return predictions;
}
/// <summary>
/// Batch-prediction for a subset of the given genomes.
/// </summary>
/// <param name="genomes">
/// The genomes.
/// </param>
/// <param name="indices">
/// The row indices of genomes to predict for.
/// </param>
/// <returns>
/// The <see cref="T:double[]"/> prediction for each given index.
/// </returns>
public double[] Predict(F64Matrix genomes, int[] indices)
{
    var results = new double[indices.Length];
    for (var position = 0; position < indices.Length; position++)
    {
        results[position] = this.Predict(genomes.Row(indices[position]));
    }
    return results;
}
/// <summary>
/// Predicts a set of observations.
/// </summary>
/// <param name="observations">Matrix with one observation per row.</param>
/// <returns>One prediction per row.</returns>
public double[] Predict(F64Matrix observations)
{
    var rowCount = observations.RowCount;
    var results = new double[rowCount];
    // Single reusable buffer avoids allocating a fresh row array per observation.
    var rowBuffer = new double[observations.ColumnCount];
    for (var row = 0; row < rowCount; row++)
    {
        observations.Row(row, rowBuffer);
        results[row] = Predict(rowBuffer);
    }
    return results;
}
/// <summary>
/// Time series cross-validation. Based on rolling validation using the original order of the data.
/// Using the specified initial size of the training set, a model is trained.
/// The model predicts the first observation following the training data.
/// Following, this data point is included in the training and a new model is trained,
/// which predicts the next observation. This continues until all observations following
/// the initial training size have been validated.
/// </summary>
/// <param name="learner">Learner used to fit a model on each rolling training window.</param>
/// <param name="observations">Observation matrix, one row per time step, in time order.</param>
/// <param name="targets">Target value per row of <paramref name="observations"/>.</param>
/// <returns>The validated predictions, following the initial training size.</returns>
public TPrediction[] Validate(IIndexedLearner<TPrediction> learner, F64Matrix observations, double[] targets)
{
    if (observations.RowCount != targets.Length)
    {
        throw new ArgumentException($"observation row count {observations.RowCount} " +
            $"must match target length {targets.Length}");
    }

    if (m_initialTrainingSize >= observations.RowCount)
    {
        throw new ArgumentException($"observation row count {observations.RowCount} " +
            $"is smaller than initial training size {m_initialTrainingSize}");
    }

    var trainingIndices = Enumerable.Range(0, m_initialTrainingSize).ToArray();
    var predictionLength = targets.Length - trainingIndices.Length;
    var predictions = new TPrediction[predictionLength];

    // Reused row buffer to avoid allocating per prediction.
    var observation = new double[observations.ColumnCount];
    var lastTrainingIndex = trainingIndices.Last();

    var model = learner.Learn(observations, targets, trainingIndices);
    for (int i = 0; i < predictions.Length; i++)
    {
        // Only train a new model at each retrain interval.
        if (((m_retrainInterval == 1) || ((i % m_retrainInterval) == 0)) && (i != 0))
        {
            // BUGFIX: dispose the outgoing model only when it is being replaced.
            // The original disposed the model at the end of *every* iteration,
            // so a model that was not retrained on the next iteration was used
            // after disposal.
            ModelDisposer.DisposeIfDisposable(model);
            model = learner.Learn(observations, targets, trainingIndices);
        }

        var predictionIndex = lastTrainingIndex + 1;
        observations.Row(predictionIndex, observation);
        predictions[i] = model.Predict(observation);

        // The just-validated point becomes part of the training window.
        lastTrainingIndex++;

        // Determine start index and length of the training period, if maxTrainingSetSize is specified.
        var startIndex = m_maxTrainingSetSize != 0 ?
            Math.Max(0, (lastTrainingIndex + 1) - m_maxTrainingSetSize) : 0;

        // BUGFIX: use lastTrainingIndex + 1 so the window includes the just-validated
        // point at index lastTrainingIndex. The previous length stopped one index short
        // in the uncapped case (Range(0, lastTrainingIndex) ends at lastTrainingIndex - 1),
        // while the capped branch already reached lastTrainingIndex inclusive.
        var length = m_maxTrainingSetSize != 0 ?
            Math.Min(m_maxTrainingSetSize, lastTrainingIndex + 1) : lastTrainingIndex + 1;

        trainingIndices = Enumerable.Range(startIndex, length).ToArray();
    }

    // Dispose the final model once all predictions are made.
    ModelDisposer.DisposeIfDisposable(model);

    return predictions;
}
/// <summary>
/// Predicts a set of observations using the ensembled probabilities.
/// </summary>
/// <param name="observations">Matrix with one observation per row.</param>
/// <returns>One probability prediction per row.</returns>
public ProbabilityPrediction[] PredictProbability(F64Matrix observations)
{
    var rowCount = observations.RowCount;
    var results = new ProbabilityPrediction[rowCount];
    // Single reusable buffer avoids allocating a fresh row array per observation.
    var rowBuffer = new double[observations.ColumnCount];
    for (var row = 0; row < rowCount; row++)
    {
        observations.Row(row, rowBuffer);
        results[row] = PredictProbability(rowBuffer);
    }
    return results;
}
/// <summary>
/// Predicts a set of observations.
/// </summary>
/// <param name="observations">Matrix with one observation per row.</param>
/// <returns>One prediction per row.</returns>
public double[] Predict(F64Matrix observations)
{
    var rowCount = observations.RowCount;
    var results = new double[rowCount];
    // Single reusable buffer avoids allocating a fresh row array per observation.
    var rowBuffer = new double[observations.ColumnCount];
    for (var i = 0; i < rowCount; i++)
    {
        observations.Row(i, rowBuffer);
        results[i] = Predict(rowBuffer);
    }
    return results;
}
// Computes the error metric over the predictions for the given index subset.
// m_indexedTargets is assumed to line up one-to-one with 'indices' — it is the
// target array for the indexed subset, not the full target array.
double ErrorEstimate(F64Matrix observations, int[] indices)
{
    var subsetPredictions = new double[indices.Length];
    for (var i = 0; i < indices.Length; i++)
    {
        subsetPredictions[i] = Predict(observations.Row(indices[i]));
    }
    return m_errorMetric.Error(m_indexedTargets, subsetPredictions);
}
/// <summary>
/// Cross validated predictions.
/// Only cross-validates within the provided indices.
/// The predictions are returned in the crossValidatedPredictions array.
/// </summary>
/// <param name="learner">Learner trained on each fold's training indices.</param>
/// <param name="observations">Full observation matrix; rows are addressed by index.</param>
/// <param name="targets">Targets for all observations.</param>
/// <param name="crossValidationIndices">Subset of row indices to cross-validate within.</param>
/// <param name="crossValidatedPredictions">Output array receiving one prediction per provided index.</param>
public void CrossValidate(IIndexedLearner<TPrediction> learner, F64Matrix observations, double[] targets, int[] crossValidationIndices, TPrediction[] crossValidatedPredictions)
{
    var predictionCount = crossValidatedPredictions.Length;
    if (m_crossValidationFolds > predictionCount)
    {
        throw new ArgumentException("Too few observations: " + predictionCount +
            " for number of cross validation folds: " + m_crossValidationFolds);
    }

    var indexSubset = crossValidationIndices.ToArray();

    // Map each provided index to its slot in crossValidatedPredictions.
    // Indices can be larger than the output length, since the output may cover
    // only a subset of the provided observations and targets.
    var outputSlotByIndex = Enumerable.Range(0, crossValidatedPredictions.Length)
        .ToDictionary(slot => indexSubset[slot], slot => slot);

    var folds = CrossValidationUtilities.GetKFoldCrossValidationIndexSets(
        m_indexedSampler, m_crossValidationFolds, targets, indexSubset);

    // Reused row buffer to avoid allocating per prediction.
    var rowBuffer = new double[observations.ColumnCount];

    foreach (var (trainingIndices, validationIndices) in folds)
    {
        var model = learner.Learn(observations, targets, trainingIndices);

        var foldPredictions = new TPrediction[validationIndices.Length];
        for (var v = 0; v < foldPredictions.Length; v++)
        {
            observations.Row(validationIndices[v], rowBuffer);
            foldPredictions[v] = model.Predict(rowBuffer);
        }

        // Scatter the fold's predictions back to their output slots.
        for (var v = 0; v < validationIndices.Length; v++)
        {
            crossValidatedPredictions[outputSlotByIndex[validationIndices[v]]] = foldPredictions[v];
        }

        ModelDisposer.DisposeIfDisposable(model);
    }
}
/// <summary>
/// Builds (or loads from a JSON cache) the regional feature set from the UK
/// house-price-index CSV plus the auxiliary extractors, then trains one model
/// per region in parallel, logging test error and the next-period prediction.
/// </summary>
/// <param name="iterations">Iteration count, stored in _iterations.</param>
/// <param name="targetOffset">Periods-ahead offset for the target, stored in _targetOffset.</param>
/// <param name="targetName">CSV column used as the prediction target.</param>
/// <param name="pauseAtEnd">When true, waits for a key press before returning.</param>
public void Predict(int iterations = DefaultNNIterations, int targetOffset = 1, string targetName = DefaultTargetName, bool pauseAtEnd = false)
{
    _iterations = iterations;
    _targetName = targetName;
    _targetOffset = targetOffset;

    Program.StatusLogger.Info($"Iterations: {_iterations}");
    Program.StatusLogger.Info($"Target: {_targetName}");
    Program.StatusLogger.Info($"Offset: {_targetOffset}");

    var data = new ConcurrentDictionary<int, ModelData>();

    if (File.Exists(Path()))
    {
        // Reuse previously extracted features rather than re-parsing the CSV.
        data = JsonConvert.DeserializeObject<ConcurrentDictionary<int, ModelData>>(File.ReadAllText(Path()));
        //data = TypeSerializer.DeserializeFromReader<ConcurrentDictionary<int, ModelData>>(new StreamReader(Path()));
        Program.StatusLogger.Info("Cached data was loaded.");
    }
    else
    {
        //http://publicdata.landregistry.gov.uk/market-trend-data/house-price-index-data/UK-HPI-full-file-2019-07.csv
        var header = File.ReadLines("UK-HPI-full-file-2019-07.csv").First();
        var columnNames = header.Split(",");
        var parser = new CsvParser(() => new StringReader(File.ReadAllText("UK-HPI-full-file-2019-07.csv")), ',', false, true);

        var creditData = _creditDataExtractor.Extract();
        var populationData = _populationDataExtractor.Extract();
        var otherPopulationData = _otherPopulationDataExtractor.Extract();
        var densityData = _londonDensityDataExtractor.Extract();
        var gvaData = _gvaDataExtractor.Extract();

        var featureRows = parser.EnumerateRows().ToArray();
        // NOTE(review): 'targets' is never read below — the target is re-read per row
        // via item.GetValue(_targetName). Kept to preserve existing behavior.
        var targets = parser.EnumerateRows(_targetName).ToArray();

        string previousKey = null;
        for (int i = 0; i < featureRows.Length; i++)
        {
            var item = featureRows[i];
            var key = item.GetValue("RegionName");
            var date = DateTime.ParseExact(item.GetValue("Date"), "dd/MM/yyyy", new CultureInfo("en-GB"), DateTimeStyles.AssumeLocal);

            // Rows are grouped by region in the file; log once per region change.
            if (key != previousKey)
            {
                Program.StatusLogger.Info($"Processing {key}");
            }
            previousKey = key;

            var regionFeatures = item.GetValues(columnNames.Except(excludeColumns).ToArray()).Select(s => ParseRowValue(s));

            var creditDataKey = _creditDataExtractor.GetKey(date, creditData.Keys.ToArray());
            if (!creditData.ContainsKey(creditDataKey))
            {
                // Pad with -1 sentinels so every row keeps a constant feature width.
                regionFeatures = regionFeatures.Concat(Enumerable.Repeat(-1d, creditData.Values.First().Length));
                Trace.WriteLine($"Credit data not found: {creditDataKey}");
            }
            else
            {
                regionFeatures = regionFeatures.Concat(creditData[creditDataKey]);
            }

            var modelData = new ModelData
            {
                Name = key,
                Code = item.GetValue("AreaCode"),
                Date = date,
                Observations = regionFeatures.ToArray(),
                OriginalTarget = ParseTarget(item.GetValue(_targetName))
            };

            // Append extractor-derived features for this region/date.
            modelData.Observations = modelData.Observations
                .Concat(_populationDataExtractor.Get(populationData, modelData))
                .Concat(_londonDensityDataExtractor.Get(densityData, modelData))
                .Concat(_otherPopulationDataExtractor.Get(otherPopulationData, modelData))
                .Concat(_gvaDataExtractor.Get(gvaData, modelData))
                .ToArray();

            data.TryAdd(i, modelData);
        }

        _targetCalculator.Calculate(data, _targetOffset);

        //TypeSerializer.SerializeToWriter<ConcurrentDictionary<int, ModelData>>(data, new StreamWriter(Path()));
        var json = JsonConvert.SerializeObject(data, Formatting.Indented);
        File.WriteAllText(Path(), json);
    }

    var itemCount = 0;

    // One model per region; region groups are processed in parallel.
    Parallel.ForEach(data.OrderBy(o => o.Value.Date).GroupBy(g => g.Value.Name).AsParallel(), new ParallelOptions { MaxDegreeOfParallelism = -1 }, (grouping) =>
    {
        var lastDate = grouping.Last().Value.Date;
        var dataWithTarget = grouping.Where(s => s.Value.OriginalTarget.HasValue && s.Value.Target != -1);
        if (dataWithTarget.Any())
        {
            var allObservations = dataWithTarget.Select(s => s.Value.Observations).ToArray();
            var allTargets = dataWithTarget.Select(s => s.Value.Target).ToArray();

            //var validation = new TimeSeriesCrossValidation<double>((int)(allObservationsExceptLast.RowCount * 0.8), 0, 1);
            //var validationPredictions = validation.Validate((IIndexedLearner<double>)learner, allObservationsExceptLast, allTargetsExceptLast);
            //var crossMetric = new MeanSquaredErrorRegressionMetric();
            //var crossError = crossMetric.Error(validation.GetValidationTargets(allTargetsExceptLast), validationPredictions);
            //_totalCrossError += crossError;

            var meanZeroTransformer = new MeanZeroFeatureTransformer();
            var minMaxTransformer = new MinMaxTransformer(0d, 1d);

            // The most recent observation (target not yet known) is appended so it is
            // scaled with the same transform, then split back off below.
            var lastObservations = grouping.Last().Value.Observations;
            F64Matrix allTransformed = minMaxTransformer.Transform(meanZeroTransformer.Transform(allObservations.Append(lastObservations).ToArray()));
            var transformed = new F64Matrix(allTransformed.Rows(Enumerable.Range(0, allTransformed.RowCount - 1).ToArray()).Data(), allTransformed.RowCount - 1, allTransformed.ColumnCount);

            var splitter = new RandomTrainingTestIndexSplitter<double>(trainingPercentage: 0.7, seed: 24);
            var trainingTestSplit = splitter.SplitSet(transformed, allTargets);
            transformed = trainingTestSplit.TrainingSet.Observations;
            var testSet = trainingTestSplit.TestSet;

            //var learner = GetRandomForest();
            //var learner = GetAda();
            //var learner = GetNeuralNet(grouping.First().Value.Observations.Length, transformed.RowCount);
            var learner = GetEnsemble(grouping.First().Value.Observations.Length, transformed.RowCount);

            Program.StatusLogger.Info("Learning commenced " + grouping.First().Value.Name);
            var model = learner.Learn(transformed, trainingTestSplit.TrainingSet.Targets);
            Program.StatusLogger.Info("Learning completed " + grouping.First().Value.Name);

            if (model.GetRawVariableImportance().Any(a => a > 0))
            {
                var importanceSummary = string.Join(",\r\n", model.GetRawVariableImportance().Select((d, i) => i.ToString() + ":" + d.ToString()));
                Program.StatusLogger.Info("Raw variable importance:\r\n" + importanceSummary);
            }

            // BUGFIX: 'transformed' was reassigned to the 70% training subset above, so
            // allTransformed.Row(transformed.RowCount) no longer addressed the appended
            // most-recent row. The scaled 'lastObservations' is the LAST row of
            // allTransformed — address it directly.
            var lastTransformed = allTransformed.Row(allTransformed.RowCount - 1);
            var prediction = model.Predict(lastTransformed);
            //var before = item.Value.Item2[transformed.RowCount - _targetOffset - 1];
            var change = -1; //Math.Round(prediction / before, 2);

            var testPrediction = model.Predict(testSet.Observations);
            var metric = new MeanSquaredErrorRegressionMetric();
            var error = metric.Error(testSet.Targets, testPrediction);

            var averageError = 0d;
            lock (Locker)
            {
                _totalError += error;
                itemCount++;
                averageError = Math.Round(_totalError / itemCount, 3);
            }

            var isLondon = London.Contains(grouping.First().Value.Name);
            var message = $"TotalError: {Math.Round(_totalError, 3)}, AverageError: {averageError}, Target: {_targetName}, Offset: {_targetOffset}, Region: {grouping.First().Value.Name}, London: {isLondon}, Error: {Math.Round(error, 3)}, Next: {Math.Round(prediction, 3)}, Change: {change}";
            Program.Logger.Info(message);
        }
    });

    if (pauseAtEnd)
    {
        Console.WriteLine("Press any key to continue");
        Console.ReadKey();
    }
}
/// <summary>
/// Cross validated predictions.
/// Only crossValidates within the provided indices.
/// The predictions are returned in the crossValidatedPredictions array.
/// </summary>
/// <param name="learner">Learner used to train one model per fold.</param>
/// <param name="observations">Full observation matrix; rows are addressed by index.</param>
/// <param name="targets">Targets for all observations.</param>
/// <param name="crossValidationIndices">The subset of row indices to cross validate within.</param>
/// <param name="crossValidatedPredictions">Output array; receives one prediction per provided index.</param>
public void CrossValidate(IIndexedLearner<TPrediction> learner, F64Matrix observations, double[] targets, int[] crossValidationIndices, TPrediction[] crossValidatedPredictions)
{
    var rows = crossValidatedPredictions.Length;
    if (m_crossValidationFolds > rows)
    {
        throw new ArgumentException("Too few observations: " + rows +
            " for number of cross validation folds: " + m_crossValidationFolds);
    }

    var holdOutSamples = new int[m_crossValidationFolds][];
    var samplesPrFold = rows / m_crossValidationFolds;
    var indices = crossValidationIndices.ToArray();

    // Map the provided crossValidationIndices to crossValidatedPredictions.
    // Indices from crossValidationIndices can be larger than crossValidatedPredictions length
    // since crossValidatedPredictions might be a subset of the provided observations and targets.
    var cvPredictionIndiceMap = Enumerable.Range(0, crossValidatedPredictions.Length)
        .ToDictionary(i => indices[i], i => i);

    // Partition the provided indices into m_crossValidationFolds disjoint hold-out
    // samples: 'indices' shrinks as each sample is drawn, so folds never overlap.
    for (int i = 0; i < m_crossValidationFolds; i++)
    {
        if (i == m_crossValidationFolds - 1)
        {
            // last fold. Add remaining indices.
            holdOutSamples[i] = indices.ToArray();
        }
        else
        {
            var holdoutSample = m_indexedSampler.Sample(targets, samplesPrFold, indices);
            holdOutSamples[i] = holdoutSample;
            indices = indices.Except(holdoutSample).ToArray();
        }
    }

    // Reused row buffer to avoid allocating a fresh row array per prediction.
    var observation = new double[observations.ColumnCount];

    for (int i = 0; i < m_crossValidationFolds; i++)
    {
        var holdoutIndices = holdOutSamples[i];
        // Train on everything in the provided indices except the current hold-out fold.
        var trainingIndices = crossValidationIndices.Except(holdoutIndices).ToArray();

        var model = learner.Learn(observations, targets, trainingIndices);
        var predictions = new TPrediction[holdoutIndices.Length];
        for (int l = 0; l < predictions.Length; l++)
        {
            observations.Row(holdoutIndices[l], observation);
            predictions[l] = model.Predict(observation);
        }

        // Scatter this fold's predictions back to their slots in the output array.
        for (int j = 0; j < holdoutIndices.Length; j++)
        {
            crossValidatedPredictions[cvPredictionIndiceMap[holdoutIndices[j]]] = predictions[j];
        }

        ModelDisposer.DisposeIfDisposable(model);
    }
}
/// <summary>
/// Get a row as csv.
/// </summary>
/// <param name="matrix">
/// The matrix.
/// </param>
/// <param name="row">
/// The row.
/// </param>
/// <param name="sep">
/// The seperator.
/// </param>
/// <returns>
/// The <see cref="string"/>.
/// </returns>
internal static string GetRowAsCsv(this F64Matrix matrix, int row, string sep)
{
    // Invariant culture keeps the numeric format stable regardless of locale.
    var formattedValues = matrix.Row(row)
        .Select(value => string.Format(CultureInfo.InvariantCulture, "{0}", value));
    return string.Join(sep, formattedValues);
}
/// <summary>
/// Returns the augmented version of the data, excluding the original.
/// Each feature in the dataset must be scaled/normalized between 0.0 and 1.0
/// before the method works.
/// </summary>
/// <param name="dataset">Scaled dataset to augment, one sample per row.</param>
/// <returns>A matrix of the same shape containing the augmented samples.</returns>
public F64Matrix Agument(F64Matrix dataset)
{
    var orgCols = dataset.ColumnCount;
    var orgRows = dataset.RowCount;
    var augmentation = new F64Matrix(dataset.RowCount, dataset.ColumnCount);

    // Tracks rows already paired so each row is used at most once.
    // Fix: removed the redundant indicesVisited.Clear() that immediately
    // followed construction of the (already empty) set.
    var indicesVisited = new HashSet<int>();
    var sample = new double[orgCols];
    var candidate = new double[orgCols];

    for (int j = 0; j < orgRows; j++)
    {
        if (indicesVisited.Contains(j))
        {
            continue;
        }

        dataset.Row(j, sample);
        var closestDistance = double.MaxValue;
        var closestIndex = -1;
        indicesVisited.Add(j);

        // Find the nearest not-yet-paired neighbour of row j.
        for (int f = 0; f < orgRows; f++)
        {
            if (indicesVisited.Contains(f))
            {
                continue;
            }

            dataset.Row(f, candidate);
            var distance = GetDistance(sample, candidate);
            if (distance < closestDistance)
            {
                closestDistance = distance;
                closestIndex = f;
            }
        }

        // NOTE(review): when orgRows is odd the final unpaired row finds no
        // neighbour (closestIndex == -1) and its augmentation row stays zero —
        // confirm this is intended.
        if (closestIndex != -1)
        {
            dataset.Row(closestIndex, candidate);
            indicesVisited.Add(closestIndex);

            for (int h = 0; h < sample.Length; h++)
            {
                var sampleValue = sample[h];
                var candiateValue = candidate[h];

                // With probability m_probabilityParameter, swap-sample the pair's
                // values around each other; otherwise keep the originals.
                if (m_random.NextDouble() <= m_probabilityParameter && m_probabilityParameter != 0.0)
                {
                    // NOTE(review): std is negative when sampleValue < candiateValue —
                    // confirm SampleRandom tolerates a negative spread.
                    var std = (sampleValue - candiateValue) / m_localVariance;
                    augmentation.At(j, h, SampleRandom(candiateValue, std));
                    augmentation.At(closestIndex, h, SampleRandom(sampleValue, std));
                }
                else // keep values
                {
                    augmentation.At(j, h, sampleValue);
                    augmentation.At(closestIndex, h, candiateValue);
                }
            }
        }
    }

    return augmentation;
}