Example #1
        public override void AddScores(RegressionTree tree, double multiplier)
        {
            _k++;
            // coefficient for the momentum (accelerated-gradient) term used in the update below
            double coeff = (_k - 1.0) / (_k + 2.0);

            int innerLoopSize = 1 + Dataset.NumDocs / BlockingThreadPool.NumThreads;   // +1 is to make sure we don't have a few left over at the end
            // REVIEW: This partitioning doesn't look optimal.
            // It probably makes sense to investigate better ways of splitting the data.
            var actions     = new Action[(int)Math.Ceiling(1.0 * Dataset.NumDocs / innerLoopSize)];
            var actionIndex = 0;

            for (int d = 0; d < Dataset.NumDocs; d += innerLoopSize)
            {
                var fromDoc = d;
                var toDoc   = Math.Min(d + innerLoopSize, Dataset.NumDocs);
                actions[actionIndex++] = () =>
                {
                    var featureBins = Dataset.GetFeatureBinRowwiseIndexer();
                    for (int doc = fromDoc; doc < toDoc; doc++)
                    {
                        double output = multiplier * tree.GetOutput(featureBins[doc]);
                        double newXK  = YK[doc] + output;
                        double newYK  = newXK + coeff * (newXK - XK[doc]);
                        XK[doc] = newXK;
                        YK[doc] = newYK;
                    }
                };
            }
            Parallel.Invoke(new ParallelOptions {
                MaxDegreeOfParallelism = BlockingThreadPool.NumThreads
            }, actions);
            SendScoresUpdatedMessage();
        }
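The update above keeps two score vectors per document, XK and YK, and combines the new tree's output with a momentum term. As a hypothetical, single-threaded sketch of just that bookkeeping (AcceleratedUpdateSketch and treeOutput are stand-ins, not part of the original code), the core loop could look like this:

        using System;

        // Minimal sketch, not the original implementation: the same x/y update applied to
        // plain arrays, with the tree lookup replaced by a caller-supplied delegate.
        public static class AcceleratedUpdateSketch
        {
            public static void AddScores(double[] xk, double[] yk, Func<int, double> treeOutput,
                                         double multiplier, int k)
            {
                double coeff = (k - 1.0) / (k + 2.0);                  // same momentum coefficient as above
                for (int doc = 0; doc < xk.Length; doc++)
                {
                    double output = multiplier * treeOutput(doc);      // stands in for tree.GetOutput(featureBins[doc])
                    double newXK = yk[doc] + output;                   // step taken from the extrapolated point y
                    double newYK = newXK + coeff * (newXK - xk[doc]);  // extrapolate past the new point x
                    xk[doc] = newXK;
                    yk[doc] = newYK;
                }
            }
        }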
Example #2
        public void GetOutputs(Dataset dataset, double[] outputs, int prefix)
        {
            if (prefix > _trees.Count || prefix < 0)
            {
                prefix = _trees.Count;
            }

            int innerLoopSize = 1 + dataset.NumDocs / BlockingThreadPool.NumThreads;  // minimize number of times we have to skip forward in the sparse arrays
            // REVIEW: This partitioning doesn't look optimal.
            // It probably makes sense to investigate better ways of splitting the data.
            var actions     = new Action[(int)Math.Ceiling(1.0 * dataset.NumDocs / innerLoopSize)];
            var actionIndex = 0;

            for (int d = 0; d < dataset.NumDocs; d += innerLoopSize)
            {
                // Copy the loop variable into locals so each closure captures its own range;
                // capturing d directly would make every action see the final value of d.
                var fromDoc = d;
                var toDoc   = Math.Min(d + innerLoopSize, dataset.NumDocs);
                actions[actionIndex++] = () =>
                {
                    var featureBins = dataset.GetFeatureBinRowwiseIndexer();
                    for (int doc = fromDoc; doc < toDoc; doc++)
                    {
                        outputs[doc] = GetOutput(featureBins[doc], prefix);
                    }
                };
            }
            Parallel.Invoke(new ParallelOptions {
                MaxDegreeOfParallelism = BlockingThreadPool.NumThreads
            }, actions);
        }
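Both examples size their work the same way: innerLoopSize documents per chunk, and ceil(NumDocs / innerLoopSize) actions, which also covers the pathological case of fewer documents than threads. A small, hypothetical sketch of that arithmetic (ChunkingSketch is not part of the original code):

        using System;

        // Sketch only: the chunking arithmetic shared by these examples.
        public static class ChunkingSketch
        {
            public static (int innerLoopSize, int numChunks) Plan(int numDocs, int numThreads)
            {
                int innerLoopSize = 1 + numDocs / numThreads;                     // +1 avoids a tiny leftover chunk
                int numChunks = (int)Math.Ceiling(1.0 * numDocs / innerLoopSize);
                return (innerLoopSize, numChunks);
            }

            public static void Demo()
            {
                Console.WriteLine(Plan(numDocs: 10, numThreads: 4));   // (3, 4): chunks of 3, 3, 3, 1
                Console.WriteLine(Plan(numDocs: 2, numThreads: 8));    // (1, 2): fewer chunks than threads
            }
        }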
Example #3
        /// <summary>
        /// Constructs a partitioning object based on the documents and the RegressionTree splits.
        /// NOTE: It has been optimized for speed and multiprocessing, with roughly a 10x gain over a naive LINQ implementation.
        /// </summary>
        public DocumentPartitioning(RegressionTree tree, Dataset dataset)
            : this(dataset.NumDocs, tree.NumLeaves)
        {
            using (Timer.Time(TimerEvent.DocumentPartitioningConstruction))
            {
                // figure out which leaf each document belongs to
                // NOTE: break it up into NumThreads chunks. This minimizes the number of re-computations necessary in
                // the row-wise indexer.
                int innerLoopSize = 1 + dataset.NumDocs / BlockingThreadPool.NumThreads; // +1 is to make sure we don't have a few left over at the end

                // figure out the exact number of chunks, needed in pathological cases when NumDocs < NumThreads
                int numChunks = dataset.NumDocs / innerLoopSize;
                if (dataset.NumDocs % innerLoopSize != 0)
                {
                    ++numChunks;
                }
                var perChunkDocumentLists = new List<int>[numChunks][];
                // REVIEW: This partitioning doesn't look optimal.
                // It probably makes sense to investigate better ways of splitting the data.
                var actions     = new Action[(int)Math.Ceiling(1.0 * dataset.NumDocs / innerLoopSize)];
                var actionIndex = 0;
                for (int docStart = 0; docStart < dataset.NumDocs; docStart += innerLoopSize)
                {
                    var fromDoc    = docStart;
                    var toDoc      = Math.Min(docStart + innerLoopSize, dataset.NumDocs);
                    var chunkIndex = docStart / innerLoopSize;
                    actions[actionIndex++] = () =>
                    {
                        Contracts.Assert(perChunkDocumentLists[chunkIndex] == null);

                        var featureBins = dataset.GetFeatureBinRowwiseIndexer();

                        List<int>[] perLeafDocumentLists = Enumerable.Range(0, tree.NumLeaves)
                                                            .Select(x => new List<int>(innerLoopSize / tree.NumLeaves))
                                                            .ToArray();

                        for (int d = fromDoc; d < toDoc; d++)
                        {
                            int leaf = tree.GetLeaf(featureBins[d]);
                            perLeafDocumentLists[leaf].Add(d);
                        }

                        perChunkDocumentLists[chunkIndex] = perLeafDocumentLists;
                    };
                }
                Parallel.Invoke(new ParallelOptions {
                    MaxDegreeOfParallelism = BlockingThreadPool.NumThreads
                }, actions);

                // establish leaf starts and document counts
                _leafCount = Enumerable.Range(0, tree.NumLeaves)
                             .Select(leaf => Enumerable.Range(0, perChunkDocumentLists.Length)
                                     .Select(thread => perChunkDocumentLists[thread][leaf].Count)
                                     .Sum())
                             .ToArray();

                var cumulativeLength = _leafCount.CumulativeSum<int>().Take(tree.NumLeaves - 1);
                _leafBegin = Enumerable.Range(0, 1).Concat(cumulativeLength).ToArray();

                // move all documents that belong to the same leaf together
                Contracts.Assert(_documents.Length == _leafBegin[tree.NumLeaves - 1] + _leafCount[tree.NumLeaves - 1]);
                actions     = new Action[tree.NumLeaves];
                actionIndex = 0;
                for (int leaf = 0; leaf < tree.NumLeaves; leaf++)
                {
                    var l = leaf;
                    actions[actionIndex++] = () =>
                    {
                        int documentPos = _leafBegin[l];
                        for (int chunkIndex = 0; chunkIndex < perChunkDocumentLists.Length; chunkIndex++)
                        {
                            foreach (int d in perChunkDocumentLists[chunkIndex][l])
                            {
                                _documents[documentPos++] = d;
                            }
                            perChunkDocumentLists[chunkIndex][l] = null;
                        }
                    };
                }
                Parallel.Invoke(new ParallelOptions {
                    MaxDegreeOfParallelism = BlockingThreadPool.NumThreads
                }, actions);
            }
        }
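Stripped of the threading, this constructor is essentially a counting sort by leaf: bucket each document into the leaf it falls into, compute per-leaf counts and their prefix sums, then lay the documents out contiguously. A hypothetical single-threaded sketch (PartitioningSketch and leafOfDoc are stand-ins for the tree/dataset machinery above):

        using System;
        using System.Collections.Generic;
        using System.Linq;

        // Sketch only: single-threaded version of the bucket-then-lay-out-contiguously idea.
        public static class PartitioningSketch
        {
            public static (int[] documents, int[] leafBegin, int[] leafCount)
                Partition(int numDocs, int numLeaves, Func<int, int> leafOfDoc)
            {
                // Phase 1: bucket documents by leaf (leafOfDoc stands in for tree.GetLeaf(featureBins[d])).
                var perLeaf = Enumerable.Range(0, numLeaves).Select(_ => new List<int>()).ToArray();
                for (int d = 0; d < numDocs; d++)
                    perLeaf[leafOfDoc(d)].Add(d);

                var leafCount = perLeaf.Select(l => l.Count).ToArray();

                // leafBegin[l] = sum of counts of leaves 0..l-1 (the role of _leafBegin above).
                var leafBegin = new int[numLeaves];
                for (int l = 1; l < numLeaves; l++)
                    leafBegin[l] = leafBegin[l - 1] + leafCount[l - 1];

                // Phase 2: copy each bucket into its contiguous slice of the documents array.
                var documents = new int[numDocs];
                for (int l = 0; l < numLeaves; l++)
                {
                    int pos = leafBegin[l];
                    foreach (int d in perLeaf[l])
                        documents[pos++] = d;
                }
                return (documents, leafBegin, leafCount);
            }
        }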