Example #1
0
        /// <summary>
        /// Applies <paramref name="preprocessor"/> to every sample in this set, producing a new set.
        /// For cross validated sets, one independently trained copy of the preprocessor is built per
        /// cross validation slot; for plain sets a single (already trained) copy is used.
        /// </summary>
        /// <param name="preprocessor">Preprocessor head to copy/train/apply. If this set is NOT cross
        /// validated and the preprocessor is <see cref="ITrainable"/>, it must already be trained.</param>
        /// <param name="sampleProcessing">Maps (sample, trained preprocessor) to the processed sample.</param>
        /// <param name="xValidationStart">First cross validation index.</param>
        /// <param name="xValidationLength">Number of cross validation indices.</param>
        /// <returns>A new <see cref="SampleSet{T}"/> containing the processed samples.</returns>
        /// <exception cref="ArgumentException">Thrown when a trainable preprocessor is untrained on a non-cross-validated set.</exception>
        protected SampleSet <T> ApplyCrossValidated <P>(P preprocessor, Func <T, P, T> sampleProcessing, int xValidationStart, int xValidationLength) where P : Preprocessor
        {
            // you need to train your Trainable preprocessor externally if you aren't using cross validation (cross validation will apply its own training)
            if (!IsCrossValidatedSampleSet() && preprocessor is ITrainable && !((ITrainable)preprocessor).IsTrained())
            {
                throw new ArgumentException("Trainable preprocessor must be trained for non-cross validated sets");
            }

            // perform preprocessor training in parallel; each cross validation index gets its own copy so
            // training state never leaks between validations
            // TODO - include imposterTrainingSize
            ParallelQuery <int>            crossValidationRange = Enumerable.Range(xValidationStart, xValidationLength).AsParallel();
            Dictionary <int, Preprocessor> trainedPreprocessors = crossValidationRange.Select(x => new { x, preprocessorHead = preprocessor.Copy() })
                                                                  .Select(p => new { p.x, trainablePreprocessor = p.preprocessorHead as ITrainable, p.preprocessorHead })
                                                                  .Select(t => new { t.x, preprocessorHead = t.trainablePreprocessor != null && IsCrossValidatedSampleSet() ? t.trainablePreprocessor.Train <P>(SampleSetHelpers.GetSampleSetTrainingSamples(this, trainingSize.Value, t.x)) : t.preprocessorHead }) // only do training when cross validated
                                                                  .Select(trained => new { trained.x, preprocessorHead = trained.preprocessorHead.SetPredecessor(preprocessorHeads.ContainsKey(trained.x) ? preprocessorHeads[trained.x] : null) })                                                                  // set the current preprocessor as the predecessor to the newly applied one for initializing the new SampleSet
                                                                  .ToDictionary(trained => trained.x, trained => trained.preprocessorHead);

            // apply preprocessor head to each cross validation (or total sample set if this is not a cross validated set)
            SampleSet <T> preprocessedSampleSet = new SampleSet <T>(trainedPreprocessors, trainingSize);

            if (IsCrossValidatedSampleSet())
            {
                // NOTE(review): assumes T is CrossValidatedSample (or castable) when cross validated — confirm against callers
                preprocessedSampleSet.AddRange(crossValidationRange
                                               .SelectMany(x => SampleSetHelpers.GetCrossValidation(this as SampleSet <CrossValidatedSample>, x).Cast <T>().Select(sample => sampleProcessing.Invoke(sample, (P)trainedPreprocessors[x]))));
            }
            else
            {
                // non-cross-validated sets use the single preprocessor stored under the default key
                preprocessedSampleSet.AddRange(this.Select(sample => sampleProcessing.Invoke(sample, (P)trainedPreprocessors[DEFAULT_PREPROCESSOR_KEY])));
            }
            return(preprocessedSampleSet);
        }
        /// <summary>
        /// Trains this feature-selection preprocessor: scores every feature dimension by membership and
        /// keeps the indices of the top <c>numberOfOutputs</c> scoring features in <c>featureRankings</c>.
        /// </summary>
        /// <param name="trainingSamples">Samples to train on; all must share one dimensionality.</param>
        /// <returns>This instance cast to <typeparamref name="P"/> (null if the cast fails).</returns>
        /// <exception cref="ArgumentException">Thrown when samples disagree on dimension count (or the list is empty).</exception>
        public P Train <P>(List <ISample> trainingSamples) where P : Preprocessor
        {
            // per-feature statistics only make sense when every sample has the same number of dimensions
            List <int> dimensionCounts = trainingSamples.Select(sample => sample.GetDimensionCount()).Distinct().ToList();

            if (dimensionCounts.Count != 1)
            {
                throw new ArgumentException("All samples must have the same number of dimensions"); // TODO - create parent or global method assertEqualDimensionality(samples)
            }
            trainingDimensions = trainingSamples[0].GetDimensionCount();

            // calculate the per feature means
            string[] identifiers = SampleSetHelpers.GetIdentifiers(trainingSamples, true).ToArray();
            Dictionary <string, double[]> means = CalculateDimensionMeans(trainingSamples, trainingDimensions.Value, identifiers);

            // calculate membership score for each feature then keep the best-scoring indices;
            // OrderByDescending replaces the previous OrderBy(...).Reverse() pair — one sort, no extra
            // full-sequence reversal, same descending-by-score ordering
            featureRankings = Enumerable.Range(0, trainingDimensions.Value).AsParallel().Select(featureIndex => new { featureIndex, score = CalculateMembership(featureIndex, trainingSamples, means, identifiers) })
                              .OrderByDescending(member => member.score)
                              .Take(numberOfOutputs)
                              .Select(member => member.featureIndex).ToArray();
            return(this as P);
        }
        /// <summary>
        /// Builds a classifier instance from the non-imposter training samples, first standardizing every
        /// feature into the [<paramref name="standardMin"/>, <paramref name="standardMax"/>] range.
        /// </summary>
        /// <param name="trainingSamples">Candidate training samples; imposters are filtered out.</param>
        /// <param name="standardMin">Target minimum after per-feature scaling.</param>
        /// <param name="standardMax">Target maximum after per-feature scaling.</param>
        /// <returns>A trained <see cref="ClassifierInstance"/> carrying the scaling/shift used.</returns>
        public ClassifierInstance CreateInstance(List <ISample> trainingSamples, double standardMin, double standardMax)
        {
            // don't train imposter samples (if you know the sample's identifier they are not an "imposter" as far as classifier training is concerned)
            List <ISample> samplesToTrain = trainingSamples.Where(sample => !sample.IsImposter()).ToList();

            // validate training data
            ValidateTrainingData(samplesToTrain);

            // generate scaling/shift per feature to achieve a standard min/max for the training data
            double range = standardMax - standardMin;

            double[] featureMin = SampleSetHelpers.GetMinimumFeatureValues(samplesToTrain);
            double[] featureMax = SampleSetHelpers.GetMaximumFeatureValues(samplesToTrain);

            // guard constant features (max == min): the unguarded formula divided by zero and propagated
            // Infinity/NaN into every standardized sample; a zero scale pins such features at standardMin
            double[] featureScaling = featureMax.Select((max, i) => max == featureMin[i] ? 0.0 : range / (max - featureMin[i])).ToArray();
            double[] featureShift   = featureScaling.Select((scale, i) => standardMin - (featureMin[i] * scale)).ToArray();

            // apply scaling/shift to the features in each training sample
            List <ISample> standardizedSamples = samplesToTrain.Select(sample => sample.AcceptVisitor(
                                                                           new SampleScaledDimensionVisitor(featureScaling, featureShift))).ToList();

            string[] trainingIdentifiers = SampleSetHelpers.GetIdentifiers(samplesToTrain, false);
            return(CreateInstance(standardizedSamples, featureScaling, featureShift, trainingIdentifiers));
        }
 /// <summary>
 /// Trains one classifier instance per cross validation index (in parallel) and stores them
 /// keyed by index, then returns this factory for call chaining.
 /// </summary>
 /// <param name="trainingSamples">Sample set supplying each validation's training slice.</param>
 /// <param name="trainingSize">Number of samples used to train each instance.</param>
 /// <param name="xValidationStart">First cross validation index (default 0).</param>
 /// <param name="xValidationLength">Number of cross validation indices (default 1).</param>
 /// <returns>This factory, with <c>xClassifierInstances</c> populated.</returns>
 public ClassifierFactory <T> Train(SampleSet <T> trainingSamples, int trainingSize, int xValidationStart = 0, int xValidationLength = 1)
 {
     // build the per-validation instances directly into the dictionary; each key is the
     // validation index, each value is an instance trained on that validation's slice
     xClassifierInstances = Enumerable.Range(xValidationStart, xValidationLength)
                            .AsParallel()
                            .ToDictionary(x => x,
                                          x => CreateInstance(SampleSetHelpers.GetSampleSetTrainingSamples(trainingSamples, trainingSize, x), standardMin, standardMax));
     return(this);
 }
Example #5
0
        /// <summary>
        /// Tests every cross validation slot's classifier instance against that slot's held-out
        /// testing samples at the given threshold, in parallel, and combines the per-slot results.
        /// </summary>
        /// <returns>The combined error rate paired with the list of per-validation error rates.</returns>
        private static Tuple <ErrorRatePair, List <ErrorRatePair> > CalculateErrorRate <T>(SampleSet <T> sampleSet, ClassifierFactory <T> classifier, int trainingSize, double threshold, int xValidationStart, int xValidationLength) where T : ISample
        {
            // test classifier instances with testing samples and threshold asynchronously to get cross validated result
            List <ErrorRatePair> errorRates = Enumerable.Range(xValidationStart, xValidationLength)
                                              .AsParallel()
                                              .Select(x => CalculateErrorRate(classifier.GetInstance(x), SampleSetHelpers.GetSampleSetTestingSamples(sampleSet, trainingSize, x), threshold))
                                              .ToList();

            // fold the remaining rates onto the first using ErrorRatePair's + operator
            ErrorRatePair totalError = errorRates[0];

            foreach (ErrorRatePair rate in errorRates.Skip(1))
            {
                totalError += rate;
            }
            return(new Tuple <ErrorRatePair, List <ErrorRatePair> >(totalError, errorRates));
        }