/// <summary>
/// Applies a preprocessor across every cross validation split (or once over the whole
/// set when this set is not cross validated) and returns the preprocessed sample set.
/// </summary>
/// <param name="preprocessor">Preprocessor head to copy, (optionally) train, and apply per split.</param>
/// <param name="sampleProcessing">Projection applying a trained preprocessor to one sample.</param>
/// <param name="xValidationStart">First cross validation index.</param>
/// <param name="xValidationLength">Number of cross validation indices.</param>
/// <returns>A new <see cref="SampleSet{T}"/> containing the preprocessed samples.</returns>
/// <exception cref="ArgumentException">
/// Thrown when a trainable preprocessor has not been trained and this set is not cross validated.
/// </exception>
protected SampleSet<T> ApplyCrossValidated<P>(P preprocessor, Func<T, P, T> sampleProcessing, int xValidationStart, int xValidationLength) where P : Preprocessor
{
    // You need to train your Trainable preprocessor externally if you aren't using
    // cross validation (cross validation will apply its own training).
    if (!IsCrossValidatedSampleSet() && preprocessor is ITrainable untrained && !untrained.IsTrained())
    {
        throw new ArgumentException("Trainable preprocessor must be trained for non-cross validated sets");
    }

    // Perform preprocessor training in parallel: each validation index gets its own
    // copy of the preprocessor so the parallel lambdas never share mutable state.
    // (The original code also made an extra top-level Copy() that was never used;
    // it has been removed.)
    // TODO - include imposterTrainingSize
    ParallelQuery<int> crossValidationRange = Enumerable.Range(xValidationStart, xValidationLength).AsParallel();
    Dictionary<int, Preprocessor> trainedPreprocessors = crossValidationRange
        .Select(x =>
        {
            Preprocessor head = preprocessor.Copy();
            // Only do training when cross validated; trainingSize is assumed non-null
            // for cross validated sets — NOTE(review): confirm that invariant holds.
            if (head is ITrainable trainable && IsCrossValidatedSampleSet())
            {
                head = trainable.Train<P>(SampleSetHelpers.GetSampleSetTrainingSamples(this, trainingSize.Value, x));
            }
            // Set the current preprocessor head (if any) as the predecessor of the
            // newly applied one for initializing the new SampleSet.
            head = head.SetPredecessor(preprocessorHeads.TryGetValue(x, out Preprocessor predecessor) ? predecessor : null);
            return new { x, head };
        })
        .ToDictionary(trained => trained.x, trained => trained.head);

    // Apply the preprocessor head to each cross validation split (or to the total
    // sample set if this is not a cross validated set).
    SampleSet<T> preprocessedSampleSet = new SampleSet<T>(trainedPreprocessors, trainingSize);
    if (IsCrossValidatedSampleSet())
    {
        preprocessedSampleSet.AddRange(crossValidationRange.SelectMany(x =>
            SampleSetHelpers.GetCrossValidation(this as SampleSet<CrossValidatedSample>, x)
                .Cast<T>()
                .Select(sample => sampleProcessing.Invoke(sample, (P)trainedPreprocessors[x]))));
    }
    else
    {
        preprocessedSampleSet.AddRange(this.Select(sample =>
            sampleProcessing.Invoke(sample, (P)trainedPreprocessors[DEFAULT_PREPROCESSOR_KEY])));
    }
    return preprocessedSampleSet;
}
public P Train <P>(List <ISample> trainingSamples) where P : Preprocessor { List <int> dimensionCounts = trainingSamples.Select(sample => sample.GetDimensionCount()).Distinct().ToList(); if (dimensionCounts.Count != 1) { throw new ArgumentException("All samples must have the same number of dimensions"); // TODO - create parent or global method assertEqualDimensionality(samples) } trainingDimensions = trainingSamples[0].GetDimensionCount(); // calculate the per feature means string[] identifiers = SampleSetHelpers.GetIdentifiers(trainingSamples, true).ToArray(); Dictionary <string, double[]> means = CalculateDimensionMeans(trainingSamples, trainingDimensions.Value, identifiers); // calculate membership score for each feature then order by best associated indices featureRankings = Enumerable.Range(0, trainingDimensions.Value).AsParallel().Select(featureIndex => new { featureIndex, score = CalculateMembership(featureIndex, trainingSamples, means, identifiers) }) .OrderBy(member => member.score) .Reverse() .Take(numberOfOutputs) .Select(member => member.featureIndex).ToArray(); return(this as P); }
public ClassifierInstance CreateInstance(List <ISample> trainingSamples, double standardMin, double standardMax) { // don't train imposter samples (if you know the sample's identifier they are not an "imposter" as far as classifier training is concerned) List <ISample> samplesToTrain = trainingSamples.Where(sample => !sample.IsImposter()).ToList(); // validate training data ValidateTrainingData(samplesToTrain); // generate scaling/shift per feature to acheive a standard min/max for the training data double range = standardMax - standardMin; double[] featureMin = SampleSetHelpers.GetMinimumFeatureValues(samplesToTrain); double[] featureMax = SampleSetHelpers.GetMaximumFeatureValues(samplesToTrain); double[] featureScaling = featureMax.Select((max, i) => range / (max - featureMin[i])).ToArray(); double[] featureShift = featureScaling.Select((scale, i) => standardMin - (featureMin[i] * scale)).ToArray(); // apply scaling/shift to the features in each training sample List <ISample> standardizedSamples = samplesToTrain.Select(sample => sample.AcceptVisitor( new SampleScaledDimensionVisitor(featureScaling, featureShift))).ToList(); string[] trainingIdentifiers = SampleSetHelpers.GetIdentifiers(samplesToTrain, false); return(CreateInstance(standardizedSamples, featureScaling, featureShift, trainingIdentifiers)); }
public ClassifierFactory <T> Train(SampleSet <T> trainingSamples, int trainingSize, int xValidationStart = 0, int xValidationLength = 1) { // train with a dataset that may require additional cross validation classifier instances xClassifierInstances = Enumerable.Range(xValidationStart, xValidationLength).AsParallel().Select(x => new { x, instance = CreateInstance(SampleSetHelpers.GetSampleSetTrainingSamples(trainingSamples, trainingSize, x), standardMin, standardMax) }) .ToDictionary(validation => validation.x, validation => validation.instance); return(this); }
private static Tuple <ErrorRatePair, List <ErrorRatePair> > CalculateErrorRate <T>(SampleSet <T> sampleSet, ClassifierFactory <T> classifier, int trainingSize, double threshold, int xValidationStart, int xValidationLength) where T : ISample { // test classifier instances with testing samples and threshold asynchronously to get cross validated result List <ErrorRatePair> errorRates = Enumerable.Range(xValidationStart, xValidationLength).AsParallel() .Select(x => CalculateErrorRate(classifier.GetInstance(x), SampleSetHelpers.GetSampleSetTestingSamples(sampleSet, trainingSize, x), threshold)) .ToList(); ErrorRatePair totalError = errorRates[0]; for (int i = 1; i < errorRates.Count; i++) { totalError = totalError + errorRates[i]; } return(new Tuple <ErrorRatePair, List <ErrorRatePair> >(totalError, errorRates)); }