Example #1
        internal override InternalRegressionTree TrainingIteration(IChannel ch, bool[] activeFeatures)
        {
            Contracts.CheckValue(ch, nameof(ch));
            AgdScoreTracker trainingScores = TrainingScores as AgdScoreTracker;
            // First, let's make XK = YK, since we want to fit YK and line search on YK,
            // and call the base class, which fits XK (in our case it will fit YK, thanks to the swap).
            var xk = trainingScores.XK;

            trainingScores.XK = trainingScores.YK;
            trainingScores.YK = null;

            // Invoke standard gradient descent on YK rather than XK (the scores).
            InternalRegressionTree tree = base.TrainingIteration(ch, activeFeatures);

            // Reverse the XK/YK swap.
            trainingScores.YK = trainingScores.XK;
            trainingScores.XK = xk;

            if (tree == null)
            {
                return null; // No tree was actually learnt. Give up.
            }
            // ... and update the training scores that we omitted from the update
            // in AcceleratedGradientDescent.UpdateScores.
            // Here we could compute the training scores faster by taking advantage of the scores
            // precomputed by LineSearch, but that would make the code even more complex.
            trainingScores.AddScores(tree, TreeLearner.Partitioning, 1.0);

            // Now rescale all previous trees by the ratio new_desired_tree_scale / previous_tree_scale.
            for (int t = 0; t < Ensemble.NumTrees - 1; t++)
            {
                Ensemble.GetTreeAt(t).ScaleOutputsBy(AgdScoreTracker.TreeMultiplier(t, Ensemble.NumTrees) / AgdScoreTracker.TreeMultiplier(t, Ensemble.NumTrees - 1));
            }
            return tree;
        }
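The rescaling loop above exists because, in accelerated gradient descent, each tree's effective coefficient depends on both its index and the current ensemble size, so adding a tree changes the desired scale of every earlier tree. Below is a minimal sketch of that rescaling idea, using a hypothetical Multiplier(t, k) schedule in place of the real AgdScoreTracker.TreeMultiplier:

    using System;

    static class RescaleSketch
    {
        // Hypothetical per-tree coefficient: the weight of tree t when the ensemble has k trees.
        static double Multiplier(int t, int k) => (t + 1.0) / k;

        static void Main()
        {
            double[] treeScales = { 1.0, 1.0, 1.0 }; // current output scales of trees 0..2
            int numTrees = 4;                        // a fourth tree was just added

            // Rescale every previous tree by new_desired_tree_scale / previous_tree_scale.
            for (int t = 0; t < numTrees - 1; t++)
            {
                treeScales[t] *= Multiplier(t, numTrees) / Multiplier(t, numTrees - 1);
                Console.WriteLine($"tree {t}: scale -> {treeScales[t]:F4}");
            }
        }
    }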
Example #2
 internal RegressionTreeNodeDocuments(InternalRegressionTree tree, DocumentPartitioning partitioning, int nodeIndex)
 {
     Tree           = tree;
     Partitioning   = partitioning;
     NodeIndex      = nodeIndex;
     _documentCount = -1;
 }
Example #3
        internal override void AddScores(InternalRegressionTree tree, DocumentPartitioning partitioning, double multiplier)
        {
            _k++;
            double coeff = (_k - 1.0) / (_k + 2.0);

            Parallel.For(0, tree.NumLeaves,
                new ParallelOptions { MaxDegreeOfParallelism = BlockingThreadPool.NumThreads },
                (int leaf) =>
            {
                int[] documents;
                int begin;
                int count;
                partitioning.ReferenceLeafDocuments(leaf, out documents, out begin, out count);
                double output = tree.LeafValue(leaf) * multiplier;
                for (int i = begin; i < begin + count; ++i)
                {
                    int doc      = documents[i];
                    double newXK = YK[doc] + output;
                    double newYK = newXK + coeff * (newXK - XK[doc]);
                    XK[doc]      = newXK;
                    YK[doc]      = newYK;
                }
            });
            SendScoresUpdatedMessage();
        }
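The per-document update in this tracker is the accelerated (Nesterov-style) recurrence: the new iterate XK is the previous look-ahead point YK plus the tree's contribution, and the new look-ahead YK extrapolates past XK with momentum coefficient (k - 1) / (k + 2). A scalar sketch of the same recurrence, with a made-up step sequence standing in for the tree outputs:

    using System;

    static class AcceleratedUpdateSketch
    {
        static void Main()
        {
            double xk = 0, yk = 0;
            for (int k = 1; k <= 5; k++)
            {
                double coeff = (k - 1.0) / (k + 2.0);
                double output = 1.0 / k;                     // stand-in for tree.LeafValue(leaf) * multiplier
                double newXK = yk + output;                  // step taken from the look-ahead point
                double newYK = newXK + coeff * (newXK - xk); // extrapolate with momentum
                xk = newXK;
                yk = newYK;
                Console.WriteLine($"k={k} coeff={coeff:F3} xk={xk:F4} yk={yk:F4}");
            }
        }
    }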
Example #4
            public void AdjustTreeOutputs(IChannel ch, InternalRegressionTree tree,
                                          DocumentPartitioning partitioning, ScoreTracker trainingScores)
            {
                const double epsilon    = 1.4e-45;
                double       multiplier = LearningRate * Shrinkage;

                double[] means = null;
                if (!BestStepRankingRegressionTrees)
                {
                    means = _parallelTraining.GlobalMean(Dataset, tree, partitioning, Weights, false);
                }
                for (int l = 0; l < tree.NumLeaves; ++l)
                {
                    double output = tree.GetOutput(l);

                    if (BestStepRankingRegressionTrees)
                    {
                        output *= multiplier;
                    }
                    else
                    {
                        output = multiplier * (output + epsilon) / (means[l] + epsilon);
                    }

                    if (output > MaxTreeOutput)
                    {
                        output = MaxTreeOutput;
                    }
                    else if (output < -MaxTreeOutput)
                    {
                        output = -MaxTreeOutput;
                    }
                    tree.SetOutput(l, output);
                }
            }
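A small sketch of the non-BestStep branch above: each leaf output is rescaled by its ratio to the per-leaf mean, with epsilon guarding the 0/0 case, and the result is clamped to ±MaxTreeOutput. All values here are toy numbers:

    using System;

    static class AdjustOutputSketch
    {
        static void Main()
        {
            const double epsilon = 1.4e-45;
            double multiplier = 0.2 * 1.0;          // LearningRate * Shrinkage
            double maxTreeOutput = 100;
            double[] outputs = { 0.5, -2.0, 0.0 };  // raw leaf outputs
            double[] means   = { 0.25, -0.5, 0.0 }; // per-leaf means

            for (int l = 0; l < outputs.Length; l++)
            {
                double output = multiplier * (outputs[l] + epsilon) / (means[l] + epsilon);
                output = Math.Max(-maxTreeOutput, Math.Min(maxTreeOutput, output)); // clamp
                Console.WriteLine($"leaf {l}: {output:F4}");
            }
        }
    }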
Example #5
        internal override void AddScores(InternalRegressionTree tree, double multiplier)
        {
            _k++;
            double coeff = (_k - 1.0) / (_k + 2.0);

            int innerLoopSize = 1 + Dataset.NumDocs / BlockingThreadPool.NumThreads;   // +1 is to make sure we don't have a few left over at the end
            // REVIEW: This partitioning doesn't look optimal.
            // It probably makes sense to investigate better ways of splitting the data.
            var actions     = new Action[(int)Math.Ceiling(1.0 * Dataset.NumDocs / innerLoopSize)];
            var actionIndex = 0;

            for (int d = 0; d < Dataset.NumDocs; d += innerLoopSize)
            {
                var fromDoc = d;
                var toDoc   = Math.Min(d + innerLoopSize, Dataset.NumDocs);
                actions[actionIndex++] = () =>
                {
                    var featureBins = Dataset.GetFeatureBinRowwiseIndexer();
                    for (int doc = fromDoc; doc < toDoc; doc++)
                    {
                        double output = multiplier * tree.GetOutput(featureBins[doc]);
                        double newXK  = YK[doc] + output;
                        double newYK  = newXK + coeff * (newXK - XK[doc]);
                        XK[doc] = newXK;
                        YK[doc] = newYK;
                    }
                };
            }
            Parallel.Invoke(new ParallelOptions {
                MaxDegreeOfParallelism = BlockingThreadPool.NumThreads
            }, actions);
            SendScoresUpdatedMessage();
        }
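The chunking arithmetic above deserves a closer look: innerLoopSize = 1 + NumDocs / NumThreads rounds the chunk size up, so ceil(NumDocs / innerLoopSize) chunks always cover every document, including the pathological case where NumDocs < NumThreads. A quick standalone sketch that prints the resulting chunk ranges:

    using System;

    static class ChunkingSketch
    {
        static void Main()
        {
            int numDocs = 10, numThreads = 4;
            int innerLoopSize = 1 + numDocs / numThreads; // +1 avoids a few left over at the end
            int numChunks = (int)Math.Ceiling(1.0 * numDocs / innerLoopSize);
            Console.WriteLine($"chunk size {innerLoopSize}, {numChunks} chunks");

            for (int d = 0; d < numDocs; d += innerLoopSize)
            {
                int fromDoc = d, toDoc = Math.Min(d + innerLoopSize, numDocs);
                Console.WriteLine($"[{fromDoc}, {toDoc})"); // half-open document range
            }
        }
    }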
Example #6
 internal override void UpdateScores(ScoreTracker t, InternalRegressionTree tree)
 {
     if (t != TrainingScores)
     {
         base.UpdateScores(t, tree);
     }
 }
Example #7
 double[] IParallelTraining.GlobalMean(Dataset dataset, InternalRegressionTree tree, DocumentPartitioning partitioning, double[] weights, bool filterZeroLambdas)
 {
     double[] means = new double[tree.NumLeaves];
     for (int l = 0; l < tree.NumLeaves; ++l)
     {
         means[l] = partitioning.Mean(weights, dataset.SampleWeights, l, filterZeroLambdas);
     }
     return means;
 }
Example #8
        internal override InternalRegressionTree TrainingIteration(IChannel ch, bool[] activeFeatures)
        {
            Contracts.CheckValue(ch, nameof(ch));
            // Fit a regression tree to the gradient using least squares.
            InternalRegressionTree tree = TreeLearner.FitTargets(ch, activeFeatures, AdjustTargetsAndSetWeights(ch));

            if (tree == null)
            {
                return null; // Could not learn a tree. Exit.
            }
            // Adjust output values of tree by performing a Newton step.

            // REVIEW: This should be part of OptimizingAlgorithm.
            using (Timer.Time(TimerEvent.TreeLearnerAdjustTreeOutputs))
            {
                double[] backupScores = null;
                // When doing dropouts, we need to replace TrainingScores with the scores without the dropped trees.
                if (DropoutRate > 0)
                {
                    backupScores          = TrainingScores.Scores;
                    TrainingScores.Scores = _scores;
                }

                if (AdjustTreeOutputsOverride != null)
                {
                    AdjustTreeOutputsOverride.AdjustTreeOutputs(ch, tree, TreeLearner.Partitioning, TrainingScores);
                }
                else if (ObjectiveFunction is IStepSearch)
                {
                    (ObjectiveFunction as IStepSearch).AdjustTreeOutputs(ch, tree, TreeLearner.Partitioning, TrainingScores);
                }
                else
                {
                    throw ch.Except("No AdjustTreeOutputs defined. Objective function should define IStepSearch or AdjustTreeOutputsOverride should be set");
                }
                if (DropoutRate > 0)
                {
                    // Returning the original scores.
                    TrainingScores.Scores = backupScores;
                }
            }
            if (Smoothing != 0.0)
            {
                SmoothTree(tree, Smoothing);
                UseFastTrainingScoresUpdate = false;
            }
            if (DropoutRate > 0)
            {
                // Don't do shrinkage if you do dropouts.
                double scaling = (1.0 / (1.0 + _numberOfDroppedTrees));
                tree.ScaleOutputsBy(scaling);
                _treeScores.Add(tree.GetOutputs(TrainingScores.Dataset));
            }
            UpdateAllScores(ch, tree);
            Ensemble.AddTree(tree);
            return tree;
        }
Example #9
            public void AdjustTreeOutputs(IChannel ch, InternalRegressionTree tree, DocumentPartitioning partitioning, ScoreTracker trainingScores)
            {
                double shrinkage = LearningRate * Shrinkage;

                for (int l = 0; l < tree.NumLeaves; ++l)
                {
                    double output = tree.GetOutput(l) * shrinkage;
                    tree.SetOutput(l, output);
                }
            }
Example #10
        // Divides the leaf output values by the bag count.
        // This brings the final scores generated by the model back to the same
        // range as when bagging was not used.
        internal void ScaleEnsembleLeaves(int numTrees, int bagSize, InternalTreeEnsemble ensemble)
        {
            int bagCount = GetBagCount(numTrees, bagSize);

            for (int t = 0; t < ensemble.NumTrees; t++)
            {
                InternalRegressionTree tree = ensemble.GetTreeAt(t);
                tree.ScaleOutputsBy(1.0 / bagCount);
            }
        }
Example #11
            public void AdjustTreeOutputs(IChannel ch, InternalRegressionTree tree, DocumentPartitioning partitioning, ScoreTracker trainingScores)
            {
                double shrinkage = LearningRate * Shrinkage;
                var    scores    = trainingScores.Scores;
                var    weights   = trainingScores.Dataset.SampleWeights;

                // Following equation 18, and line 2c of algorithm 1 in the source paper.
                for (int l = 0; l < tree.NumLeaves; ++l)
                {
                    Double num   = 0;
                    Double denom = 0;

                    if (_index1 == 0)
                    {
                        // The index == 1 Poisson case.
                        foreach (int i in partitioning.DocumentsInLeaf(l))
                        {
                            var s = scores[i];
                            var w = weights == null ? 1 : weights[i];
                            num   += w * _labels[i];
                            denom += w * Math.Exp(s);
                        }
                    }
                    else
                    {
                        // The index in (1,2] case.
                        foreach (int i in partitioning.DocumentsInLeaf(l))
                        {
                            var s = scores[i];
                            var w = weights == null ? 1 : weights[i];
                            num   += w * _labels[i] * Math.Exp(_index1 * s);
                            denom += w * Math.Exp(_index2 * s);
                        }
                    }

                    var step = shrinkage * (Math.Log(num) - Math.Log(denom));
                    if (num == 0 && denom == 0)
                    {
                        step = 0;
                    }
                    // If we do not clamp, it is entirely possible for num to be 0 (with 0 labels), which
                    // means that we will have negative infinities in the leaf nodes. This has a number of
                    // bad negative effects we'd prefer to avoid. Nonetheless, we do give up a substantial
                    // amount of "gain" for those examples.
                    if (step < -_maxClamp)
                    {
                        step = -_maxClamp;
                    }
                    else if (step > _maxClamp)
                    {
                        step = _maxClamp;
                    }
                    tree.SetOutput(l, step);
                }
            }
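For the index == 1 Poisson case above, the leaf step is the closed-form Newton step step = shrinkage * (log(sum_i w_i * y_i) - log(sum_i w_i * exp(s_i))), clamped to ±maxClamp. A toy computation of that step for a single leaf:

    using System;

    static class PoissonStepSketch
    {
        static void Main()
        {
            double shrinkage = 0.1, maxClamp = 5;
            double[] labels  = { 2, 0, 3 };        // y_i for the documents in one leaf
            double[] scores  = { 0.1, -0.2, 0.4 }; // current scores s_i
            double[] weights = { 1, 1, 1 };

            double num = 0, denom = 0;
            for (int i = 0; i < labels.Length; i++)
            {
                num   += weights[i] * labels[i];
                denom += weights[i] * Math.Exp(scores[i]);
            }

            double step = shrinkage * (Math.Log(num) - Math.Log(denom));
            if (num == 0 && denom == 0)
                step = 0;
            step = Math.Max(-maxClamp, Math.Min(maxClamp, step)); // clamp against log(0) = -infinity
            Console.WriteLine($"leaf step = {step:F4}");
        }
    }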
Example #12
 internal RecursiveRegressionTree(InternalRegressionTree t, DocumentPartitioning p, int n)
     : base(t, p, n)
 {
     _weightedOutput = double.NaN;
     _nodeCount      = int.MaxValue;
     if (!IsLeaf)
     {
         LteNode = new RecursiveRegressionTree(Tree, Partitioning, Tree.GetLteChildForNode(NodeIndex));
         GtNode  = new RecursiveRegressionTree(Tree, Partitioning, Tree.GetGtChildForNode(NodeIndex));
     }
 }
Example #13
 internal override void UpdateScores(ScoreTracker t, InternalRegressionTree tree)
 {
     if (t == TrainingScores)
     {
         // A special optimized routine for updating TrainingScores is implemented as part of TrainingIteration.
         return;
     }
     base.UpdateScores(t, tree);
 }
Example #14
        internal RegressionTreeBase(InternalRegressionTree tree)
        {
            _tree = tree;

            _lteChild = ImmutableArray.Create(_tree.LteChild, 0, _tree.NumNodes);
            _gtChild  = ImmutableArray.Create(_tree.GtChild, 0, _tree.NumNodes);

            _numericalSplitFeatureIndexes = ImmutableArray.Create(_tree.SplitFeatures, 0, _tree.NumNodes);
            _numericalSplitThresholds     = ImmutableArray.Create(_tree.RawThresholds, 0, _tree.NumNodes);
            _categoricalSplitFlags        = ImmutableArray.Create(_tree.CategoricalSplit, 0, _tree.NumNodes);
            _leafValues = ImmutableArray.Create(_tree.LeafValues, 0, _tree.NumLeaves);
        }
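The constructor above copies the tree into parallel, structure-of-arrays buffers: per-node child indices, split features and thresholds, plus per-leaf values. Below is a minimal sketch of evaluating such a tree on dense features, assuming the usual ML.NET convention that a negative child index encodes a leaf whose index is the bitwise complement (~child); only numerical splits are handled:

    using System;

    static class TreeEvalSketch
    {
        // Walk from the root until a negative child index is reached; ~node is then the leaf index.
        static double Evaluate(int[] lteChild, int[] gtChild, int[] splitFeature,
                               float[] threshold, double[] leafValue, float[] features)
        {
            int node = 0;
            while (node >= 0)
            {
                node = features[splitFeature[node]] <= threshold[node]
                    ? lteChild[node]
                    : gtChild[node];
            }
            return leafValue[~node];
        }

        static void Main()
        {
            // A one-node tree: feature 0 <= 0.5 goes to leaf 0, otherwise leaf 1.
            double value = Evaluate(new[] { ~0 }, new[] { ~1 }, new[] { 0 },
                                    new[] { 0.5f }, new[] { -1.0, 1.0 }, new[] { 0.7f });
            Console.WriteLine(value); // prints 1
        }
    }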
Example #15
 internal virtual void UpdateAllScores(IChannel ch, InternalRegressionTree tree)
 {
     if (PreScoreUpdateEvent != null)
     {
         PreScoreUpdateEvent(ch);
     }
     using (Timer.Time(TimerEvent.UpdateScores))
     {
         foreach (ScoreTracker t in TrackedScores)
         {
             UpdateScores(t, tree);
         }
     }
 }
Example #16
 // Creates the linear combination scores1 + tree * multiplier.
 internal void Initialize(ScoreTracker scores1, InternalRegressionTree tree, DocumentPartitioning partitioning, double multiplier)
 {
     InitScores = null;
     if (Scores == null || Scores.Length != scores1.Scores.Length)
     {
         Scores = (double[])scores1.Scores.Clone();
     }
     else
     {
         Array.Copy(scores1.Scores, Scores, Scores.Length);
     }
     AddScores(tree, partitioning, multiplier);
     SendScoresUpdatedMessage();
 }
Example #17
 public double[] GlobalMean(Dataset dataset, InternalRegressionTree tree, DocumentPartitioning partitioning, double[] weights, bool filterZeroLambdas)
 {
     Assert.True(_isInitEnv);
     Assert.True(_isInitTreeLearner);
     Assert.NotNull(dataset);
     Assert.NotNull(tree);
     Assert.NotNull(partitioning);
     double[] means = new double[tree.NumLeaves];
     for (int l = 0; l < tree.NumLeaves; ++l)
     {
         means[l] = partitioning.Mean(weights, dataset.SampleWeights, l, filterZeroLambdas);
     }
     return means;
 }
Example #18
        /// <summary>
        /// Regularize a regression tree with smoothing parameter alpha.
        /// </summary>
        protected virtual void SmoothTree(InternalRegressionTree tree, double smoothing)
        {
            if (smoothing == 0.0)
            {
                return;
            }

            // Create the recursive structure of the tree, starting from the root node.
            var regularizer = new RecursiveRegressionTree(tree, TreeLearner.Partitioning, 0);

            // Perform a bottom-up computation of the weighted interior-node outputs,
            double rootNodeOutput = regularizer.GetWeightedOutput();

            // followed by a top-down propagation of each parent's output value.
            regularizer.SmoothLeafOutputs(rootNodeOutput, smoothing);
        }
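A sketch of the smoothing idea, under the assumption (my reading of the comments above, not shown in this snippet) that the bottom-up pass computes document-weighted node outputs and the top-down pass blends each leaf with the output propagated from its parent as (1 - alpha) * leaf + alpha * parent. Toy numbers throughout:

    using System;

    static class SmoothingSketch
    {
        // Assumed blend of a leaf's output with the weighted output from its parent chain.
        static double SmoothLeaf(double leafOutput, double parentOutput, double alpha)
            => (1 - alpha) * leafOutput + alpha * parentOutput;

        static void Main()
        {
            // Bottom-up pass: the root's weighted output is the document-count-weighted mean of its leaves.
            double[] leafOutputs  = { 2.0, -1.0 };
            int[]    leafDocCount = { 30, 70 };
            double rootOutput = (leafOutputs[0] * leafDocCount[0] + leafOutputs[1] * leafDocCount[1])
                                / (double)(leafDocCount[0] + leafDocCount[1]);

            // Top-down pass: pull each leaf toward the propagated parent output.
            double alpha = 0.3; // smoothing parameter
            for (int l = 0; l < leafOutputs.Length; l++)
                Console.WriteLine($"leaf {l}: {SmoothLeaf(leafOutputs[l], rootOutput, alpha):F4}");
        }
    }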
Example #19
        internal override InternalRegressionTree TrainingIteration(IChannel ch, bool[] activeFeatures)
        {
            Contracts.CheckValue(ch, nameof(ch));

            double[] sampleWeights      = null;
            double[] targets            = GetGradient(ch);
            double[] weightedTargets    = _gradientWrapper.AdjustTargetAndSetWeights(targets, ObjectiveFunction, out sampleWeights);
            InternalRegressionTree tree = ((RandomForestLeastSquaresTreeLearner)TreeLearner).FitTargets(ch, activeFeatures, weightedTargets,
                                                                                                        targets, sampleWeights);

            if (tree != null)
            {
                Ensemble.AddTree(tree);
            }
            return tree;
        }
Example #20
        // Use a faster method for score updates with DocumentPartitioning,
        // suitable for the training set.
        internal virtual void AddScores(InternalRegressionTree tree, DocumentPartitioning partitioning, double multiplier)
        {
            Parallel.For(0, tree.NumLeaves, new ParallelOptions {
                MaxDegreeOfParallelism = BlockingThreadPool.NumThreads
            }, (leaf) =>
            {
                int[] documents;
                int begin;
                int count;
                partitioning.ReferenceLeafDocuments(leaf, out documents, out begin, out count);
                double output = tree.LeafValue(leaf) * multiplier;
                for (int i = begin; i < begin + count; ++i)
                {
                    Scores[documents[i]] += output;
                }
            });

            SendScoresUpdatedMessage();
        }
Example #21
        private InternalTreeEnsemble GetEnsembleFromSolution(LassoFit fit, int solutionIdx, InternalTreeEnsemble originalEnsemble)
        {
            InternalTreeEnsemble ensemble = new InternalTreeEnsemble();

            int weightsCount = fit.NumberOfWeights[solutionIdx];

            for (int i = 0; i < weightsCount; i++)
            {
                double weight = fit.CompressedWeights[solutionIdx][i];
                if (weight != 0)
                {
                    InternalRegressionTree tree = originalEnsemble.GetTreeAt(fit.Indices[i]);
                    tree.Weight = weight;
                    ensemble.AddTree(tree);
                }
            }

            ensemble.Bias = fit.Intercepts[solutionIdx];
            return ensemble;
        }
Example #22
 internal virtual void UpdateScores(ScoreTracker t, InternalRegressionTree tree)
 {
     if (t == TrainingScores)
     {
         IFastTrainingScoresUpdate fastUpdate = AdjustTreeOutputsOverride as IFastTrainingScoresUpdate;
         ScoreTracker updatedScores           = (UseFastTrainingScoresUpdate && fastUpdate != null) ? fastUpdate.GetUpdatedTrainingScores() : null;
         if (updatedScores != null)
         {
             t.SetScores(updatedScores.Scores);
         }
         else
         {
             t.AddScores(tree, TreeLearner.Partitioning, 1.0);
         }
     }
     else
     {
         t.AddScores(tree, 1.0);
     }
 }
Example #23
        public InternalTreeEnsemble(ModelLoadContext ctx, bool usingDefaultValues, bool categoricalSplits)
        {
            // REVIEW: Verify the contents of the ensemble, both during building,
            // and during deserialization.

            // *** Binary format ***
            // int: Number of trees
            // Regression trees (num trees of these)
            // double: Bias
            // int: Id to InputInitializationContent string, currently ignored

            _trees = new List<InternalRegressionTree>();
            int numTrees = ctx.Reader.ReadInt32();

            Contracts.CheckDecode(numTrees >= 0);
            for (int t = 0; t < numTrees; ++t)
            {
                AddTree(InternalRegressionTree.Load(ctx, usingDefaultValues, categoricalSplits));
            }
            Bias = ctx.Reader.ReadDouble();
            _firstInputInitializationContent = ctx.LoadStringOrNull();
        }
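The comment block documents the binary layout the loader expects. A self-contained sketch of a writer for that layout using a plain BinaryWriter (the WriteTree placeholder is hypothetical; real tree serialization lives in InternalRegressionTree):

    using System;
    using System.IO;

    static class EnsembleFormatSketch
    {
        // Mirrors the documented layout: tree count, the trees, the bias, then a string id slot.
        static void Write(BinaryWriter writer, int numTrees, double bias, string contentId)
        {
            writer.Write(numTrees);        // int: Number of trees
            for (int t = 0; t < numTrees; t++)
                WriteTree(writer, t);      // Regression trees (num trees of these)
            writer.Write(bias);            // double: Bias
            writer.Write(contentId ?? ""); // stand-in for the id-to-string slot
        }

        // Hypothetical placeholder for serializing one tree.
        static void WriteTree(BinaryWriter writer, int t) => writer.Write(t);

        static void Main()
        {
            using var ms = new MemoryStream();
            using (var writer = new BinaryWriter(ms, System.Text.Encoding.UTF8, leaveOpen: true))
                Write(writer, numTrees: 2, bias: 0.5, contentId: null);
            Console.WriteLine($"{ms.Length} bytes written");
        }
    }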
Example #24
        void IStepSearch.AdjustTreeOutputs(IChannel ch, InternalRegressionTree tree, DocumentPartitioning partitioning,
                                           ScoreTracker previousScores)
        {
            _lo.Initialize(tree, partitioning, previousScores);
            _hi.Initialize(tree, partitioning, previousScores);
            _left.Initialize(tree, partitioning, previousScores);
            _right.Initialize(tree, partitioning, previousScores);

            _lo.Step   = _historicStepSize / _phi;
            _left.Step = _historicStepSize;

            if (_lo.Loss.CompareTo(_left.Loss) == 1) // backtrack
            {
                do
                {
                    Rotate(ref _hi, ref _left, ref _lo);
                    if (_hi.Step <= _minStepSize)
                    {
                        goto FINISHED;
                    }
                    _lo.Step = _left.Step / _phi;
                } while (_lo.Loss.CompareTo(_left.Loss) == 1);
            }
            else // extend (or stay)
            {
                _hi.Step = _historicStepSize * _phi;
                while (_hi.Loss.CompareTo(_left.Loss) == 1)
                {
                    Rotate(ref _lo, ref _left, ref _hi);
                    _hi.Step = _left.Step * _phi;
                }
            }

            if (_numPostbracketSteps > 0)
            {
                _right.Step = _lo.Step + (_hi.Step - _lo.Step) / _phi;
                for (int step = 0; step < _numPostbracketSteps; ++step)
                {
                    int cmp = _right.Loss.CompareTo(_left.Loss);
                    if (cmp == 0)
                    {
                        break;
                    }

                    if (cmp == 1) // move right
                    {
                        Rotate(ref _lo, ref _left, ref _right);
                        _right.Step = _lo.Step + (_hi.Step - _lo.Step) / _phi;
                    }
                    else // move left
                    {
                        Rotate(ref _hi, ref _right, ref _left);
                        if (_hi.Step <= _minStepSize)
                        {
                            goto FINISHED;
                        }
                        _left.Step = _hi.Step - (_hi.Step - _lo.Step) / _phi;
                    }
                }

                // prepare to return _left
                if (_right.Loss.CompareTo(_left.Loss) == 1)
                {
                    Swap(ref _left, ref _right);
                }
            }

FINISHED:
            if (_hi.Step < _minStepSize)
            {
                _left.Step = _minStepSize;
            }
            else if (_hi.Step == _minStepSize)
            {
                Swap(ref _hi, ref _left);
            }

            double bestStep = _left.Step;

            ch.Info("multiplier: {0}", bestStep);
            _historicStepSize = bestStep;
            tree.ScaleOutputsBy(bestStep);
        }
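The bracketing logic above is a golden-section line search over the step multiplier: it backtracks or extends by factors of phi until the minimum is bracketed, then optionally refines inside the bracket. For intuition, here is a self-contained sketch of plain golden-section minimization of a one-dimensional loss (not the tracker-based machinery used above):

    using System;

    static class GoldenSectionSketch
    {
        // Minimize a unimodal function f on [lo, hi] by shrinking the bracket by 1/phi per step.
        static double Minimize(Func<double, double> f, double lo, double hi, int iterations)
        {
            double invPhi = (Math.Sqrt(5) - 1) / 2; // 1/phi, about 0.618
            double x1 = hi - invPhi * (hi - lo), x2 = lo + invPhi * (hi - lo);
            double f1 = f(x1), f2 = f(x2);

            for (int i = 0; i < iterations; i++)
            {
                if (f1 < f2) // the minimum lies in [lo, x2]
                {
                    hi = x2; x2 = x1; f2 = f1;
                    x1 = hi - invPhi * (hi - lo); f1 = f(x1);
                }
                else         // the minimum lies in [x1, hi]
                {
                    lo = x1; x1 = x2; f1 = f2;
                    x2 = lo + invPhi * (hi - lo); f2 = f(x2);
                }
            }
            return (lo + hi) / 2;
        }

        static void Main()
        {
            // Toy loss with its minimum at step = 0.7.
            double best = Minimize(step => (step - 0.7) * (step - 0.7), 0, 4, 50);
            Console.WriteLine($"multiplier: {best:F6}");
        }
    }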
Example #25
 public void Initialize(InternalRegressionTree tree, DocumentPartitioning partitioning, ScoreTracker previousScores)
 {
     _tree           = tree;
     _partitioning   = partitioning;
     _previousScores = previousScores;
 }
Example #26
        IPredictor IModelCombiner.CombineModels(IEnumerable<IPredictor> models)
        {
            _host.CheckValue(models, nameof(models));

            var  ensemble         = new InternalTreeEnsemble();
            int  modelCount       = 0;
            int  featureCount     = -1;
            bool binaryClassifier = false;

            foreach (var model in models)
            {
                modelCount++;

                var predictor = model;
                _host.CheckValue(predictor, nameof(models), "One of the models is null");

                var calibrated = predictor as IWeaklyTypedCalibratedModelParameters;
                double paramA = 1;
                if (calibrated != null)
                {
                    _host.Check(calibrated.WeeklyTypedCalibrator is PlattCalibrator,
                                "Combining FastTree models can only be done when the models are calibrated with the Platt calibrator");

                    predictor = calibrated.WeeklyTypedSubModel;
                    paramA = -((PlattCalibrator)calibrated.WeeklyTypedCalibrator).Slope;
                }

                var tree = predictor as TreeEnsembleModelParameters;

                if (tree == null)
                {
                    throw _host.Except("Model is not a tree ensemble");
                }
                foreach (var t in tree.TrainedEnsemble.Trees)
                {
                    var bytes    = new byte[t.SizeInBytes()];
                    int position = -1;
                    t.ToByteArray(bytes, ref position);
                    position = -1;
                    var tNew = new InternalRegressionTree(bytes, ref position);
                    if (paramA != 1)
                    {
                        for (int i = 0; i < tNew.NumLeaves; i++)
                        {
                            tNew.SetOutput(i, tNew.LeafValues[i] * paramA);
                        }
                    }
                    ensemble.AddTree(tNew);
                }

                if (modelCount == 1)
                {
                    binaryClassifier = calibrated != null;
                    featureCount     = tree.InputType.GetValueCount();
                }
                else
                {
                    _host.Check((calibrated != null) == binaryClassifier, "Ensemble contains both calibrated and uncalibrated models");
                    _host.Check(featureCount == tree.InputType.GetValueCount(), "Found models with different number of features");
                }
            }

            var scale = 1 / (double)modelCount;

            foreach (var t in ensemble.Trees)
            {
                for (int i = 0; i < t.NumLeaves; i++)
                {
                    t.SetOutput(i, t.LeafValues[i] * scale);
                }
            }

            switch (_kind)
            {
            case PredictionKind.BinaryClassification:
                if (!binaryClassifier)
                {
                    return new FastTreeBinaryModelParameters(_host, ensemble, featureCount, null);
                }

                var cali = new PlattCalibrator(_host, -1, 0);
                var fastTreeModel = new FastTreeBinaryModelParameters(_host, ensemble, featureCount, null);
                return new FeatureWeightsCalibratedModelParameters<FastTreeBinaryModelParameters, PlattCalibrator>(_host, fastTreeModel, cali);

            case PredictionKind.Regression:
                return new FastTreeRegressionModelParameters(_host, ensemble, featureCount, null);

            case PredictionKind.Ranking:
                return new FastTreeRankingModelParameters(_host, ensemble, featureCount, null);

            default:
                _host.Assert(false);
                throw _host.ExceptNotSupp();
            }
        }
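After all trees are collected, scaling every leaf by 1 / modelCount makes the combined ensemble's score the average of the member models' scores, since a tree ensemble's output is the sum of its trees' outputs. A tiny numeric check of that property:

    using System;

    static class AveragingSketch
    {
        static void Main()
        {
            // Scores of three member models on one example.
            double[] modelScores = { 0.9, 0.3, 0.6 };
            double scale = 1.0 / modelScores.Length;

            double combined = 0;
            foreach (double s in modelScores)
                combined += s * scale; // each model's trees contribute, scaled by 1/modelCount

            Console.WriteLine($"combined = {combined:F4}, average = {(0.9 + 0.3 + 0.6) / 3:F4}");
        }
    }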
Example #27
 public void AddTreeAt(InternalRegressionTree tree, int index) => _trees.Insert(index, tree);
Example #28
        /// <summary>
        /// Constructs a partitioning object based on the documents and the RegressionTree splits.
        /// NOTE: It has been optimized for speed and multiple processors, with a 10x gain over a naive LINQ implementation.
        /// </summary>
        internal DocumentPartitioning(InternalRegressionTree tree, Dataset dataset)
            : this(dataset.NumDocs, tree.NumLeaves)
        {
            using (Timer.Time(TimerEvent.DocumentPartitioningConstruction))
            {
                // figure out which leaf each document belongs to
                // NOTE: break it up into NumThreads chunks. This minimizes the number of re-computations necessary in
                // the row-wise indexer.
                int innerLoopSize = 1 + dataset.NumDocs / BlockingThreadPool.NumThreads; // +1 is to make sure we don't have a few left over at the end

                // figure out the exact number of chunks, needed in pathological cases when NumDocs < NumThreads
                int numChunks = dataset.NumDocs / innerLoopSize;
                if (dataset.NumDocs % innerLoopSize != 0)
                {
                    ++numChunks;
                }
                var perChunkDocumentLists = new List<int>[numChunks][];
                // REVIEW: This partitioning doesn't look optimal.
                // It probably makes sense to investigate better ways of splitting the data.
                var actions     = new Action[(int)Math.Ceiling(1.0 * dataset.NumDocs / innerLoopSize)];
                var actionIndex = 0;
                for (int docStart = 0; docStart < dataset.NumDocs; docStart += innerLoopSize)
                {
                    var fromDoc    = docStart;
                    var toDoc      = Math.Min(docStart + innerLoopSize, dataset.NumDocs);
                    var chunkIndex = docStart / innerLoopSize;
                    actions[actionIndex++] = () =>
                    {
                        Contracts.Assert(perChunkDocumentLists[chunkIndex] == null);

                        var featureBins = dataset.GetFeatureBinRowwiseIndexer();

                        List<int>[] perLeafDocumentLists = Enumerable.Range(0, tree.NumLeaves)
                                                            .Select(x => new List<int>(innerLoopSize / tree.NumLeaves))
                                                            .ToArray();

                        for (int d = fromDoc; d < toDoc; d++)
                        {
                            int leaf = tree.GetLeaf(featureBins[d]);
                            perLeafDocumentLists[leaf].Add(d);
                        }

                        perChunkDocumentLists[chunkIndex] = perLeafDocumentLists;
                    };
                }
                Parallel.Invoke(new ParallelOptions {
                    MaxDegreeOfParallelism = BlockingThreadPool.NumThreads
                }, actions);

                // establish leaf starts and document counts
                _leafCount = Enumerable.Range(0, tree.NumLeaves)
                             .Select(leaf => Enumerable.Range(0, perChunkDocumentLists.Length)
                                     .Select(thread => perChunkDocumentLists[thread][leaf].Count)
                                     .Sum())
                             .ToArray();

                var cumulativeLength = _leafCount.CumulativeSum<int>().Take(tree.NumLeaves - 1);
                _leafBegin = Enumerable.Range(0, 1).Concat(cumulativeLength).ToArray();

                // move all documents that belong to the same leaf together
                Contracts.Assert(_documents.Length == _leafBegin[tree.NumLeaves - 1] + _leafCount[tree.NumLeaves - 1]);
                actions     = new Action[tree.NumLeaves];
                actionIndex = 0;
                for (int leaf = 0; leaf < tree.NumLeaves; leaf++)
                {
                    var l = leaf;
                    actions[actionIndex++] = () =>
                    {
                        int documentPos = _leafBegin[l];
                        for (int chunkIndex = 0; chunkIndex < perChunkDocumentLists.Length; chunkIndex++)
                        {
                            foreach (int d in perChunkDocumentLists[chunkIndex][l])
                            {
                                _documents[documentPos++] = d;
                            }
                            perChunkDocumentLists[chunkIndex][l] = null;
                        }
                    };
                }
                Parallel.Invoke(new ParallelOptions {
                    MaxDegreeOfParallelism = BlockingThreadPool.NumThreads
                }, actions);
            }
        }
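The _leafBegin construction above is a cumulative sum over the per-leaf counts: leaf 0 starts at offset 0 and each subsequent leaf starts where the previous one ends, which is what lets the per-chunk document lists be copied into one contiguous array. A compact sketch of that offset computation:

    using System;

    static class LeafBeginSketch
    {
        static void Main()
        {
            int[] leafCount = { 3, 0, 5, 2 }; // documents per leaf
            int[] leafBegin = new int[leafCount.Length];

            // leafBegin[l] is the sum of the counts of all leaves before l.
            for (int l = 1; l < leafCount.Length; l++)
                leafBegin[l] = leafBegin[l - 1] + leafCount[l - 1];

            for (int l = 0; l < leafCount.Length; l++)
                Console.WriteLine($"leaf {l}: documents [{leafBegin[l]}, {leafBegin[l] + leafCount[l]})");
        }
    }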
Example #29
 internal RegressionTree(InternalRegressionTree tree) : base(tree)
 {
 }
Example #30
 public void AddTree(InternalRegressionTree tree) => _trees.Add(tree);