Example #1
 public Test(string datasetName, Dataset set, double[] initScores)
     : this(new ScoreTracker(datasetName, set, initScores))
 {
 }
Example #2
 protected abstract ScoreTracker ConstructScoreTracker(string name, Dataset set, double[] initScores);
Example #3
 public void SetTrainingData(Dataset trainData, double[] initTrainScores)
 {
     TrainingScores   = ConstructScoreTracker("train", trainData, initTrainScores);
     TrackedScores[0] = TrainingScores;
 }
Example #4
 // REVIEW: When the FastTree application is decoupled from the tree learner and boosting logic, this class should be removed.
 public RandomForestOptimizer(TreeEnsemble ensemble, Dataset trainData, double[] initTrainScores, IGradientAdjuster gradientWrapper)
     : base(ensemble, trainData, initTrainScores, gradientWrapper)
 {
     _gradientWrapper = gradientWrapper;
 }
Example #5
 protected override ScoreTracker ConstructScoreTracker(string name, Dataset set, double[] initScores)
 {
     // REVIEW: This is not necessary. We can remove this by creating a dummy scorer.
     return new ScoreTracker(name, set, initScores);
 }
Example #6
 public ConjugateGradientDescent(TreeEnsemble ensemble, Dataset trainData, double[] initTrainScores, IGradientAdjuster gradientWrapper)
     : base(ensemble, trainData, initTrainScores, gradientWrapper)
 {
     _currentDk = new double[trainData.NumDocs];
 }
Example #7
 public RankingBaggingProvider(Dataset completeTrainingSet, int maxLeaves, int randomSeed, double trainFraction) :
     base(completeTrainingSet, maxLeaves, randomSeed, trainFraction)
 {
 }
Example #8
        // Thread worker: computes win/loss surplus statistics for a single query.
        private void WinLossRangeWorkerFromScores(Dataset dataset, short[] labels, double[] scores, double[] result, int query, int threadIndex)
        {
            int begin = dataset.Boundaries[query];
            int count = dataset.Boundaries[query + 1] - begin;

            int[] permutation = _permutationBuffers[threadIndex];
            DcgPermutationComparer comparer = _comparers[threadIndex];

            // set values for the comparer
            comparer.Scores       = scores;
            comparer.Labels       = labels;
            comparer.ScoresOffset = begin;
            comparer.LabelsOffset = begin;

            // calculate the permutation
            Array.Copy(_oneTwoThree, permutation, count);
            Array.Sort(permutation, 0, count, comparer);

            int surplus       = 0;
            int maxsurplus    = 0;
            int maxsurpluspos = 0;

            for (int t = 0; t < count; ++t)
            {
                // Nonzero labels add their (signed) value to the surplus; zero labels count as a loss.
                if (labels[begin + permutation[t]] != 0)
                {
                    surplus += labels[begin + permutation[t]];
                }
                else
                {
                    surplus--;
                }
                if (surplus > maxsurplus)
                {
                    maxsurplus    = surplus;
                    maxsurpluspos = t;
                }
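                // Snapshot the running surplus at fixed rank cutoffs (100, 200, 300, 400, 500, 1000).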
                if (t == 100)
                {
                    Utils.InterlockedAdd(ref result[0], surplus);
                }
                if (t == 200)
                {
                    Utils.InterlockedAdd(ref result[1], surplus);
                }
                if (t == 300)
                {
                    Utils.InterlockedAdd(ref result[2], surplus);
                }
                if (t == 400)
                {
                    Utils.InterlockedAdd(ref result[3], surplus);
                }
                if (t == 500)
                {
                    Utils.InterlockedAdd(ref result[4], surplus);
                }
                if (t == 1000)
                {
                    Utils.InterlockedAdd(ref result[5], surplus);
                }
            }
            Utils.InterlockedAdd(ref result[6], maxsurplus);
            Utils.InterlockedAdd(ref result[7], maxsurpluspos);
            Utils.InterlockedAdd(ref result[8], count);
        }
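Note: Utils.InterlockedAdd above accumulates into the shared result array across worker threads. Interlocked has no built-in atomic add for doubles, so such a helper is typically built on a compare-exchange retry loop. The sketch below illustrates that pattern; it is an assumption about the helper's shape, not the library's actual implementation:
 using System.Threading;

 internal static class AtomicDouble
 {
     // Atomically adds 'value' to 'target' and returns the new total, retrying
     // until no other thread modified 'target' between the read and the write.
     public static double InterlockedAdd(ref double target, double value)
     {
         double initial, computed;
         do
         {
             initial = target;
             computed = initial + value;
         }
         while (Interlocked.CompareExchange(ref target, computed, initial) != initial);
         return computed;
     }
 }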
Example #9
 public GradientDescent(TreeEnsemble ensemble, Dataset trainData, double[] initTrainScores, IGradientAdjuster gradientWrapper)
     : base(ensemble, trainData, initTrainScores)
 {
     _gradientWrapper = gradientWrapper;
     _treeScores      = new List<double[]>();
 }
Example #10
#if !NO_STORE
        public Dataset GetSubDataset(int[] docIndices, bool destroyThisDataset, FileObjectStore<IntArrayFormatter> newBinsCache)
#else
        public Dataset GetSubDataset(int[] docIndices, bool destroyThisDataset)
#endif
        {
            int[]   queryIndices   = docIndices.Select(d => DocToQuery[d]).ToArray();
            ulong[] uniqueQueryIds = queryIndices.Distinct().Select(q => QueryIds[q]).ToArray();
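            // NOTE: assumes docIndices is grouped by query, so each query's documents remain contiguous.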

            // calculate boundaries
            int[] boundaries = new int[uniqueQueryIds.Length + 1];
            boundaries[0] = 0;
            int queryIndex = 1;
            for (int q = 1; q < queryIndices.Length; ++q)
            {
                if (queryIndices[q] != queryIndices[q - 1])
                {
                    boundaries[queryIndex++] = q;
                }
            }
            boundaries[uniqueQueryIds.Length] = queryIndices.Length;

            // construct skeleton
            DatasetSkeleton datasetSkeleton = new DatasetSkeleton(docIndices.Select(d => Ratings[d]).ToArray(),
                                                                  boundaries,
                                                                  uniqueQueryIds,
                                                                  docIndices.Select(d => DocIds[d]).ToArray());

            // create features
            FeatureFlockBase[] features   = new FeatureFlockBase[NumFlocks];
            int[][]            assignment = new int[][] { docIndices };
            Parallel.For(0, NumFlocks, new ParallelOptions {
                MaxDegreeOfParallelism = BlockingThreadPool.NumThreads
            },
                         (int flockIndex) =>
            {
#if !NO_STORE
                GetSubDataset_ThreadWorker(features, flockIndex, assignment, destroyThisDataset, newBinsCache);
#else
                GetSubDatasetThreadWorker(features, flockIndex, assignment, destroyThisDataset);
#endif
            });

            uint[] filteredDupeIds = null;

            // Filter the dupe ids, if any
            if (DupeIds != null)
            {
                uint[] dupeIds = DupeIds;
                filteredDupeIds = docIndices.Select(i => dupeIds[i]).ToArray();
            }

            // auxiliary data
            Dictionary<string, DatasetSkeletonQueryDocData> auxData    = _datasetSkeleton.AuxiliaryData;
            Dictionary<string, DatasetSkeletonQueryDocData> newAuxData = new Dictionary<string, DatasetSkeletonQueryDocData>();
            foreach (KeyValuePair<string, DatasetSkeletonQueryDocData> pair in auxData)
            {
                newAuxData[pair.Key] = pair.Value.GetSubset(pair.Value.IsQueryLevel ? queryIndices.Distinct().ToArray() : docIndices);
            }
            datasetSkeleton.AuxiliaryData = newAuxData;

            // create new Dataset
            Dataset dataset = new Dataset(datasetSkeleton, features);
            dataset.DupeIds = filteredDupeIds;
            return dataset;
        }
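The boundary bookkeeping above (one offset per query plus a final end marker) is the heart of GetSubDataset. Here is a self-contained sketch of the same idea, using a hypothetical ComputeBoundaries helper that is not part of the Dataset API:
 using System;
 using System.Linq;

 internal static class BoundaryDemo
 {
     // Given query indices grouped by query (e.g. 0,0,1,1,1,2), returns the offset
     // where each query's documents begin, plus a final end marker.
     private static int[] ComputeBoundaries(int[] queryIndices)
     {
         int numQueries = queryIndices.Distinct().Count();
         int[] boundaries = new int[numQueries + 1];
         int queryIndex = 1;
         for (int q = 1; q < queryIndices.Length; ++q)
         {
             if (queryIndices[q] != queryIndices[q - 1])
             {
                 boundaries[queryIndex++] = q;
             }
         }
         boundaries[numQueries] = queryIndices.Length;
         return boundaries;
     }

     private static void Main()
     {
         // Prints: 0 2 5 6
         Console.WriteLine(string.Join(" ", ComputeBoundaries(new[] { 0, 0, 1, 1, 1, 2 })));
     }
 }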
Example #11
        /// <summary>
        /// Constructs a partitioning object based on the documents and the RegressionTree splits.
        /// NOTE: Optimized for speed and multiple processors, with a 10x gain over a naive LINQ implementation.
        /// </summary>
        internal DocumentPartitioning(InternalRegressionTree tree, Dataset dataset)
            : this(dataset.NumDocs, tree.NumLeaves)
        {
            using (Timer.Time(TimerEvent.DocumentPartitioningConstruction))
            {
                // figure out which leaf each document belongs to
                // NOTE: break it up into NumThreads chunks. This minimizes the number of re-computations necessary in
                // the row-wise indexer.
                int innerLoopSize = 1 + dataset.NumDocs / BlockingThreadPool.NumThreads; // +1 is to make sure we don't have a few left over at the end

                // figure out the exact number of chunks, needed in pathological cases when NumDocs < NumThreads
                int numChunks = dataset.NumDocs / innerLoopSize;
                if (dataset.NumDocs % innerLoopSize != 0)
                {
                    ++numChunks;
                }
                var perChunkDocumentLists = new List<int>[numChunks][];
                // REVIEW: This partitioning doesn't look optimal.
                // Probably makes sense to investigate better ways of splitting the data?
                var actions     = new Action[(int)Math.Ceiling(1.0 * dataset.NumDocs / innerLoopSize)];
                var actionIndex = 0;
                for (int docStart = 0; docStart < dataset.NumDocs; docStart += innerLoopSize)
                {
                    var fromDoc    = docStart;
                    var toDoc      = Math.Min(docStart + innerLoopSize, dataset.NumDocs);
                    var chunkIndex = docStart / innerLoopSize;
                    actions[actionIndex++] = () =>
                    {
                        Contracts.Assert(perChunkDocumentLists[chunkIndex] == null);

                        var featureBins = dataset.GetFeatureBinRowwiseIndexer();

                        List<int>[] perLeafDocumentLists = Enumerable.Range(0, tree.NumLeaves)
                                                            .Select(x => new List<int>(innerLoopSize / tree.NumLeaves))
                                                            .ToArray();

                        for (int d = fromDoc; d < toDoc; d++)
                        {
                            int leaf = tree.GetLeaf(featureBins[d]);
                            perLeafDocumentLists[leaf].Add(d);
                        }

                        perChunkDocumentLists[chunkIndex] = perLeafDocumentLists;
                    };
                }
                Parallel.Invoke(new ParallelOptions {
                    MaxDegreeOfParallelism = BlockingThreadPool.NumThreads
                }, actions);

                // establish leaf starts and document counts
                _leafCount = Enumerable.Range(0, tree.NumLeaves)
                             .Select(leaf => Enumerable.Range(0, perChunkDocumentLists.Length)
                                     .Select(thread => perChunkDocumentLists[thread][leaf].Count)
                                     .Sum())
                             .ToArray();

                var cumulativeLength = _leafCount.CumulativeSum<int>().Take(tree.NumLeaves - 1);
                _leafBegin = Enumerable.Range(0, 1).Concat(cumulativeLength).ToArray();

                // move all documents that belong to the same leaf together
                Contracts.Assert(_documents.Length == _leafBegin[tree.NumLeaves - 1] + _leafCount[tree.NumLeaves - 1]);
                actions     = new Action[tree.NumLeaves];
                actionIndex = 0;
                for (int leaf = 0; leaf < tree.NumLeaves; leaf++)
                {
                    var l = leaf;
                    actions[actionIndex++] = () =>
                    {
                        int documentPos = _leafBegin[l];
                        for (int chunkIndex = 0; chunkIndex < perChunkDocumentLists.Length; chunkIndex++)
                        {
                            foreach (int d in perChunkDocumentLists[chunkIndex][l])
                            {
                                _documents[documentPos++] = d;
                            }
                            perChunkDocumentLists[chunkIndex][l] = null;
                        }
                    };
                }
                Parallel.Invoke(new ParallelOptions {
                    MaxDegreeOfParallelism = BlockingThreadPool.NumThreads
                }, actions);
            }
        }
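CumulativeSum<int> above comes from the surrounding codebase; the actual helper is generic. A minimal sketch of what such an extension might look like, specialized to int for clarity (an assumption, not the real implementation):
 using System.Collections.Generic;

 internal static class EnumerableExtensions
 {
     // Running-total sequence: [3, 1, 2] yields [3, 4, 6]. Combined with the
     // leading 0 and Take(NumLeaves - 1) above, this gives each leaf's starting
     // offset within the _documents array.
     public static IEnumerable<int> CumulativeSum(this IEnumerable<int> source)
     {
         int sum = 0;
         foreach (int value in source)
         {
             sum += value;
             yield return sum;
         }
     }
 }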
Example #12
 public AcceleratedGradientDescent(TreeEnsemble ensemble, Dataset trainData, double[] initTrainScores, IGradientAdjuster gradientWrapper)
     : base(ensemble, trainData, initTrainScores, gradientWrapper)
 {
     UseFastTrainingScoresUpdate = false;
 }
Example #13
 public ScoreTracker(string datasetName, Dataset set, double[] initScores)
 {
     Initialize(datasetName, set, initScores);
 }
Example #14
 // XK is an alias to the scores array.
 public AgdScoreTracker(string datasetName, Dataset set, double[] initScores)
     : base(datasetName, set, initScores)
 {
     _k = 0;
     YK = (double[])XK.Clone();
 }
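For context, AgdScoreTracker keeps two score sequences (XK and YK), as in Nesterov-style accelerated gradient methods. Below is a generic sketch of that family of update rules; the momentum coefficient k/(k+3) is one common choice, and FastTree's exact scheme may differ:
 internal static class AgdSketch
 {
     // One accelerated-gradient step over score vectors: take a gradient step
     // from the lookahead point y, then extrapolate past the new iterate.
     public static void Step(double[] xPrev, double[] y, double[] gradient, double stepSize, int k,
                             double[] xNext, double[] yNext)
     {
         double momentum = (double)k / (k + 3);
         for (int i = 0; i < y.Length; i++)
         {
             xNext[i] = y[i] - stepSize * gradient[i];
             yNext[i] = xNext[i] + momentum * (xNext[i] - xPrev[i]);
         }
     }
 }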
Example #15
 public LogLossObjectiveFunction(Dataset trainSet, LogLossCommandLineArgs cmd)
     : base(trainSet, trainSet.Ratings, cmd)
 {
     _mode = cmd.loglossmode;
     _coef = cmd.loglosscoef;
 }
Example #16
 protected override ScoreTracker ConstructScoreTracker(string name, Dataset set, double[] initScores)
 {
     return new ScoreTracker(name, set, initScores);
 }
Example #17
 protected TreeLearner(Dataset trainData, int numLeaves)
 {
     TrainData    = trainData;
     NumLeaves    = numLeaves;
     Partitioning = new DocumentPartitioning(TrainData.NumDocs, numLeaves);
 }
Example #18
 public WinLossSurplusObjectiveFunction(Dataset trainSet, short[] labels, WinLossSurplusCommandLineArgs cmd)
     : base(trainSet, labels, cmd)
 {
 }