Example #1
#pragma warning restore TLC_GeneralName

        public unsafe void SumupCPlusPlus(SumupInputData input, FeatureHistogram histogram)
        {
            using (Timer.Time(TimerEvent.SumupSegment))
            {
                fixed(FloatType *pSumTargetsByBin = histogram.SumTargetsByBin)
                fixed(FloatType * pSampleOutputs  = input.Outputs)
                fixed(double *pSumWeightsByBin     = histogram.SumWeightsByBin)
                fixed(double *pSampleOutputWeights = input.Weights)
                fixed(uint *pData      = _data)
                fixed(byte *pSegType   = _segType)
                fixed(int *pSegLength  = _segLength)
                fixed(int *pIndices    = input.DocIndices)
                fixed(int *pCountByBin = histogram.CountByBin)
                {
                    int rv =
#if USE_SINGLE_PRECISION
                        C_SumupSegment_float
#else
                        C_SumupSegment_double
#endif
                            (pData, pSegType, pSegLength, pIndices, pSampleOutputs, pSampleOutputWeights,
                            pSumTargetsByBin,
                            pSumWeightsByBin, pCountByBin, input.TotalCount,
                            input.SumTargets);

                    if (rv < 0)
                    {
                        throw Contracts.Except("CSumup returned error {0}", rv);
                    }
                }
            }
        }
Example #2
        public void SumupWeighted(int numDocsInLeaf, double sumTargets, double sumWeights, FloatType[] outputs, double[] weights, int[] docIndices)
        {
            using (Timer.Time(TimerEvent.Sumup))
            {
#if TLC_REVISION
                Array.Clear(SumWeightedTargetsByBin, 0, SumWeightedTargetsByBin.Length);
#else
                Array.Clear(SumTargetsByBin, 0, SumTargetsByBin.Length);
#endif

                if (SumWeightsByBin != null)
                {
                    Array.Clear(SumWeightsByBin, 0, SumWeightsByBin.Length);
                }

                Array.Clear(CountByBin, 0, CountByBin.Length);

                if (numDocsInLeaf > 0)
                {
                    SumupInputData input = new SumupInputData(
                        numDocsInLeaf,
                        sumTargets,
                        sumWeights,
                        outputs,
                        weights,
                        docIndices);

                    _bins.Sumup(input, this);
                }
            }
        }
Example #3
        public virtual double[] GetGradient(IChannel ch, double[] scores)
        {
            Scores = scores;
            int sampleIndex = _rnd.Next(GradSamplingRate);

            using (Timer.Time(TimerEvent.ObjectiveFunctionGetDerivatives))
            {
                // REVIEW: This partitioning doesn't look optimal.
                // Probably makes sense to investigate better ways of splitting data.
                var actions     = new Action[(int)Math.Ceiling((double)Dataset.NumQueries / QueryThreadChunkSize)];
                var actionIndex = 0;
                var queue       = new ConcurrentQueue <int>(Enumerable.Range(0, BlockingThreadPool.NumThreads));
                // fill the vectors with their correct values, query-by-query
                for (int q = 0; q < Dataset.NumQueries; q += QueryThreadChunkSize)
                {
                    int start = q;
                    actions[actionIndex++] = () =>
                    {
                        var threadIndex = 0;
                        Contracts.Check(queue.TryDequeue(out threadIndex));
                        GetGradientChunk(start, start + Math.Min(QueryThreadChunkSize, Dataset.NumQueries - start), GradSamplingRate, sampleIndex, threadIndex);
                        queue.Enqueue(threadIndex);
                    };
                }

                Parallel.Invoke(new ParallelOptions()
                {
                    MaxDegreeOfParallelism = BlockingThreadPool.NumThreads
                }, actions);
            }
            return(Gradient);
        }
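
The loop above hands out per-thread scratch space through a pool of slot indices: a ConcurrentQueue is pre-filled with one index per worker thread, every chunk action dequeues an index before it runs and returns it when it finishes, so GetGradientChunk can write into thread-local buffers without locking. A minimal standalone sketch of that slot-pool pattern, using hypothetical names (Run, processChunk) that are not part of the code above:

using System;
using System.Collections.Concurrent;
using System.Linq;
using System.Threading.Tasks;

static class ThreadSlotPoolSketch
{
    // Runs chunkCount work items with at most slotCount in flight; each item borrows a
    // slot index so it can use a private scratch buffer without locking.
    public static void Run(int chunkCount, int slotCount, Action<int, int> processChunk)
    {
        var slots = new ConcurrentQueue<int>(Enumerable.Range(0, slotCount));

        var actions = Enumerable.Range(0, chunkCount).Select<int, Action>(chunk => () =>
        {
            int slot;
            if (!slots.TryDequeue(out slot))
                throw new InvalidOperationException("No free slot available");
            try { processChunk(chunk, slot); }
            finally { slots.Enqueue(slot); }
        }).ToArray();

        Parallel.Invoke(new ParallelOptions { MaxDegreeOfParallelism = slotCount }, actions);
    }
}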
Example #4
        /// <summary>
        /// Training algorithm for the single-feature functions f(x)
        /// </summary>
        /// <param name="ch">The channel to write to</param>
        private void TrainMainEffectsModel(IChannel ch)
        {
            Contracts.AssertValue(ch);
            int iterations = Args.NumIterations;

            ch.Info("Starting to train ...");

            using (var pch = Host.StartProgressChannel("GAM training"))
            {
                _objectiveFunction = CreateObjectiveFunction();
                var sumWeights = HasWeights ? TrainSet.SampleWeights.Sum() : 0;

                int iteration = 0;
                pch.SetHeader(new ProgressHeader("iterations"), e => e.SetProgress(0, iteration, iterations));
                for (int i = iteration; iteration < iterations; iteration++)
                {
                    using (Timer.Time(TimerEvent.Iteration))
                    {
                        var gradient   = _objectiveFunction.GetGradient(ch, TrainSetScore.Scores);
                        var sumTargets = gradient.Sum();

                        SumUpsAcrossFlocks(gradient, sumTargets, sumWeights);
                        TrainOnEachFeature(gradient, TrainSetScore.Scores, sumTargets, sumWeights, iteration);
                        UpdateScores(iteration);
                    }
                }
            }

            CombineGraphs(ch);
        }
Example #5
        protected static unsafe void SumupCPlusPlusDense(SumupInputData input, FeatureHistogram histogram,
                                                         byte *data, int numBits)
        {
            using (Timer.Time(TimerEvent.SumupCppDense))
            {
                fixed(FloatType *pSumTargetsByBin = histogram.SumTargetsByBin)
                fixed(FloatType * pSampleOutputs = input.Outputs)
                fixed(double *pSumWeightsByBin   = histogram.SumWeightsByBin)
                fixed(double *pSampleWeights     = input.Weights)
                fixed(int *pIndices    = input.DocIndices)
                fixed(int *pCountByBin = histogram.CountByBin)
                {
                    int rv =
#if USE_SINGLE_PRECISION
                        C_Sumup_float
#else
                        C_Sumup_double
#endif
                            (numBits, data, pIndices, pSampleOutputs, pSampleWeights,
                            pSumTargetsByBin, pSumWeightsByBin, pCountByBin,
                            input.TotalCount, input.SumTargets, input.SumWeights);

                    if (rv < 0)
                    {
                        throw Contracts.Except("CSumup returned error {0}", rv);
                    }
                }
            }
        }
Example #6
        internal override InternalRegressionTree TrainingIteration(IChannel ch, bool[] activeFeatures)
        {
            Contracts.CheckValue(ch, nameof(ch));
            // Fit a regression tree to the gradient using least squares.
            InternalRegressionTree tree = TreeLearner.FitTargets(ch, activeFeatures, AdjustTargetsAndSetWeights(ch));

            if (tree == null)
            {
                return(null); // Could not learn a tree. Exit.
            }
            // Adjust output values of tree by performing a Newton step.

            // REVIEW: This should be part of OptimizingAlgorithm.
            using (Timer.Time(TimerEvent.TreeLearnerAdjustTreeOutputs))
            {
                double[] backupScores = null;
                // when doing dropouts we need to replace the TrainingScores with the scores without the dropped trees
                if (DropoutRate > 0)
                {
                    backupScores          = TrainingScores.Scores;
                    TrainingScores.Scores = _scores;
                }

                if (AdjustTreeOutputsOverride != null)
                {
                    AdjustTreeOutputsOverride.AdjustTreeOutputs(ch, tree, TreeLearner.Partitioning, TrainingScores);
                }
                else if (ObjectiveFunction is IStepSearch)
                {
                    (ObjectiveFunction as IStepSearch).AdjustTreeOutputs(ch, tree, TreeLearner.Partitioning, TrainingScores);
                }
                else
                {
                    throw ch.Except("No AdjustTreeOutputs defined. Objective function should define IStepSearch or AdjustTreeOutputsOverride should be set");
                }
                if (DropoutRate > 0)
                {
                    // Returning the original scores.
                    TrainingScores.Scores = backupScores;
                }
            }
            if (Smoothing != 0.0)
            {
                SmoothTree(tree, Smoothing);
                UseFastTrainingScoresUpdate = false;
            }
            if (DropoutRate > 0)
            {
                // Don't do shrinkage if you do dropouts.
                double scaling = (1.0 / (1.0 + _numberOfDroppedTrees));
                tree.ScaleOutputsBy(scaling);
                _treeScores.Add(tree.GetOutputs(TrainingScores.Dataset));
            }
            UpdateAllScores(ch, tree);
            Ensemble.AddTree(tree);
            return(tree);
        }
Example #7
        /// <summary>
        /// Construct a sparse int array from index, value pairs.
        /// </summary>
        /// <param name="length">The total length of the constructed array.</param>
        /// <param name="bitsPerItem">The number of bits required to store the values.</param>
        /// <param name="nonZeroValues">An ordered enumerable of (index,value) pairs.
        /// The indices must be strictly increasing as the enumeration proceeds.</param>
        public DeltaSparseIntArray(int length, IntArrayBits bitsPerItem, IEnumerable <KeyValuePair <int, int> > nonZeroValues)
        {
            using (Timer.Time(TimerEvent.SparseConstruction))
            {
                List <int>  tempValueList = new List <int>();
                List <byte> tempDeltaList = new List <byte>();

                int currentIndex = 0;
                foreach (KeyValuePair <int, int> pair in nonZeroValues)
                {
                    int index = pair.Key;
                    int value = pair.Value;
                    if (index <= currentIndex && (index < 0 || tempValueList.Count > 0))
                    {
                        throw Contracts.Except("index {0} occurred after {1}", index, currentIndex);
                    }
                    while (index - currentIndex > byte.MaxValue)
                    {
                        tempDeltaList.Add(byte.MaxValue);
                        tempValueList.Add(0);
                        currentIndex += byte.MaxValue;
                    }
                    tempDeltaList.Add((byte)(index - currentIndex));
                    tempValueList.Add(value);
                    currentIndex = index;
                }
                // Add the final chunks of 0's if it ended early
                while (length - currentIndex > byte.MaxValue)
                {
                    tempDeltaList.Add(byte.MaxValue);
                    tempValueList.Add(0);
                    currentIndex += byte.MaxValue;
                }
                if (currentIndex >= length && currentIndex > 0)
                {
                    throw Contracts.Except("Index {0} inconsistent with length {1}", currentIndex, length);
                }
                _length = length;

                // It is faster not to use a 4-bit dense array here. The memory difference is minor, since it's just
                //  the sparse values that are saved on.
                // TODO: Implement a special iterator for 4-bit array, and change this code to use the iterator, which
                //          may be faster
                if (bitsPerItem == IntArrayBits.Bits0)
                {
                    throw Contracts.Except("Use dense arrays for 0 bits");
                }
                if (bitsPerItem <= IntArrayBits.Bits8)
                {
                    bitsPerItem = IntArrayBits.Bits8;
                }

                _values = IntArray.New(tempValueList.Count, IntArrayType.Dense, bitsPerItem, tempValueList) as DenseIntArray;
                _deltas = tempDeltaList.ToArray();
            }
        }
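
The constructor above delta-encodes the sparse array: each stored entry records the byte-sized gap from the previously stored index, and gaps larger than 255 are bridged by inserting padding entries of (255, 0). A minimal standalone sketch of the same encoding, assuming strictly increasing indices; the names here are illustrative and not part of the class above:

using System.Collections.Generic;

static class DeltaEncodingSketch
{
    // Encodes strictly increasing (index, value) pairs as byte deltas plus values.
    public static (byte[] Deltas, int[] Values) Encode(IEnumerable<KeyValuePair<int, int>> nonZero)
    {
        var deltas = new List<byte>();
        var values = new List<int>();
        int current = 0;
        foreach (var pair in nonZero)
        {
            int gap = pair.Key - current;
            while (gap > byte.MaxValue)
            {
                deltas.Add(byte.MaxValue);  // padding entry with value 0 bridges the large gap
                values.Add(0);
                gap -= byte.MaxValue;
            }
            deltas.Add((byte)gap);
            values.Add(pair.Value);
            current = pair.Key;
        }
        return (deltas.ToArray(), values.ToArray());
    }
}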
Example #8
 private void Initialize(IChannel ch)
 {
     using (Timer.Time(TimerEvent.InitializeTraining))
     {
         InitializeGamHistograms();
         _subGraph            = new SubGraph(TrainSet.NumFeatures, Args.NumIterations);
         _leafSplitCandidates = new LeastSquaresRegressionTreeLearner.LeafSplitCandidates(TrainSet);
         _leafSplitHelper     = new LeafSplitHelper(HasWeights);
     }
 }
Example #9
        public DeltaRepeatIntArray(int length, IntArrayBits bitsPerItem, IEnumerable <int> values)
        {
            using (Timer.Time(TimerEvent.SparseConstruction))
            {
                List <int>  tempValueList = new List <int>();
                List <byte> tempDeltaList = new List <byte>();

                _length = 0;

                byte delta   = 0;
                int  lastVal = -1;

                foreach (int val in values)
                {
                    if (val != lastVal || delta == byte.MaxValue)
                    {
                        tempValueList.Add(val);
                        lastVal = val;
                        if (_length != 0)
                        {
                            tempDeltaList.Add(delta);
                        }
                        delta = 0;
                    }
                    ++delta;
                    ++_length;
                }
                if (delta > 0)
                {
                    tempDeltaList.Add(delta);
                }

                if (_length != length)
                {
                    throw Contracts.Except("Length provided to repeat vector is inconsistent with value enumeration");
                }

                // It is faster not to use a 4-bit dense array here. The memory difference is minor, since it's just
                //  the sparse values that are saved on.
                // TODO: Implement a special iterator for 4-bit array, and change this code to use the iterator, which
                //          may be faster
                if (bitsPerItem == IntArrayBits.Bits0)
                {
                    throw Contracts.Except("Use dense arrays for 0 bits");
                }
                if (bitsPerItem <= IntArrayBits.Bits8)
                {
                    bitsPerItem = IntArrayBits.Bits8;
                }

                _values = IntArray.New(tempValueList.Count, IntArrayType.Dense, bitsPerItem, tempValueList) as DenseIntArray;
                _deltas = tempDeltaList.ToArray();
            }
        }
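
DeltaRepeatIntArray is a run-length encoding: each new value is stored once along with a byte-sized run length, and a run longer than 255 is split into several entries. A minimal standalone sketch of that encoding, with illustrative names:

using System.Collections.Generic;

static class RunLengthSketch
{
    public static (int[] Values, byte[] RunLengths) Encode(IEnumerable<int> values)
    {
        var runValues = new List<int>();
        var runLengths = new List<byte>();
        int last = 0;
        byte run = 0;
        bool any = false;
        foreach (int v in values)
        {
            if (!any || v != last || run == byte.MaxValue)
            {
                if (any)
                    runLengths.Add(run);   // close the previous run
                runValues.Add(v);
                last = v;
                run = 0;
                any = true;
            }
            ++run;
            }
        if (run > 0)
            runLengths.Add(run);
        return (runValues.ToArray(), runLengths.ToArray());
    }
}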
Example #10
 private void TrainCore(IChannel ch)
 {
     Contracts.CheckValue(ch, nameof(ch));
     // REVIEW: Get rid of this lock once we completely remove all static classes, such as BlockingThreadPool, from Gam.
     lock (FastTreeShared.TrainLock)
     {
         using (Timer.Time(TimerEvent.TotalInitialization))
             Initialize(ch);
         using (Timer.Time(TimerEvent.TotalTrain))
             TrainMainEffectsModel(ch);
     }
 }
Example #11
        public override unsafe void Sumup(SumupInputData input, FeatureHistogram histogram)
        {
            using (Timer.Time(TimerEvent.SumupDense10))
            {
                if (input.DocIndices == null)
                {
                    SumupRoot(histogram, input.Outputs, input.Weights);
                    return;
                }

                int fval = 0;
                fixed(uint *pData = _data)
                fixed(int *pCountByBin             = histogram.CountByBin)
                fixed(int *pDocIndices             = input.DocIndices)
                fixed(FloatType * pSumTargetsByBin = histogram.SumTargetsByBin)
                fixed(FloatType * pTargets         = input.Outputs)
                {
                    if (histogram.SumWeightsByBin != null)
                    {
                        fixed(double *pSumWeightsByBin = histogram.SumWeightsByBin)
                        fixed(double *pWeights = input.Weights)
                        {
                            for (int ii = 0; ii < input.TotalCount; ++ii)
                            {
                                long offset = pDocIndices[ii];
                                offset = (offset << 3) + (offset << 1);
                                int minor = (int)(offset & 0x1f);
                                int major = (int)(offset >> 5);
                                fval = (int)(((*(ulong *)(pData + major)) >> minor) & _mask);
                                pSumTargetsByBin[fval] += pTargets[ii];
                                pSumWeightsByBin[fval] += pWeights[ii];
                                ++pCountByBin[fval];
                            }
                        }
                    }
                    else
                    {
                        int end = input.TotalCount;
                        for (int ii = 0; ii < end; ++ii)
                        {
                            long offset = pDocIndices[ii];
                            offset = (offset << 3) + (offset << 1);
                            int minor = (int)(offset & 0x1f);
                            int major = (int)(offset >> 5);
                            fval = (int)(((*(ulong *)(pData + major)) >> minor) & _mask);
                            pSumTargetsByBin[fval] += pTargets[ii];
                            ++pCountByBin[fval];
                        }
                    }
                }
            }
        }
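
The offset arithmetic above is a packed 10-bit read: (offset << 3) + (offset << 1) is offset * 10, the bit position of the document's bin; the low five bits give the shift inside a 64-bit window and the remaining bits select the starting 32-bit word. A minimal standalone sketch of that extraction, assuming the buffer is padded so the 64-bit read stays in bounds and that _mask covers ten bits (names here are illustrative):

static class PackedTenBitSketch
{
    private const ulong Mask = (1UL << 10) - 1; // low 10 bits

    // Reads the index-th 10-bit value from a packed uint[] buffer.
    // Assumes the buffer has one extra trailing word so the 64-bit read below never
    // runs past the end, and a little-endian platform.
    public static unsafe int Read(uint[] packed, int index)
    {
        long bitOffset = (long)index * 10;     // same as (offset << 3) + (offset << 1) above
        int major = (int)(bitOffset >> 5);     // which 32-bit word the value starts in
        int minor = (int)(bitOffset & 0x1F);   // bit position within that word
        fixed (uint* pData = packed)
        {
            // A 64-bit read keeps values that straddle a 32-bit word boundary contiguous.
            return (int)((*(ulong*)(pData + major) >> minor) & Mask);
        }
    }
}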
Example #12
 public DatasetSkeleton(byte[] buffer, ref int position)
 {
     AuxiliaryData = new Dictionary <string, DatasetSkeletonQueryDocData>();
     using (Timer.Time(TimerEvent.ConstructFromByteArray))
     {
         _ratings    = buffer.ToShortArray(ref position);
         Boundaries  = buffer.ToIntArray(ref position);
         QueryIds    = buffer.ToULongArray(ref position);
         DocIds      = buffer.ToULongArray(ref position);
         MaxDcg      = buffer.ToDoubleJaggedArray(ref position);
         _docToQuery = buffer.ToIntArray(ref position);
     }
 }
Example #13
 public override void Sumup(SumupInputData input, FeatureHistogram histogram)
 {
     using (Timer.Time(TimerEvent.SumupRepeat))
     {
         if (input.DocIndices == null)
         {
             SumupRoot(input, histogram);
         }
         else
         {
             SumupLeaf(input, histogram);
         }
     }
 }
Example #14
 internal virtual void UpdateAllScores(IChannel ch, InternalRegressionTree tree)
 {
     if (PreScoreUpdateEvent != null)
     {
         PreScoreUpdateEvent(ch);
     }
     using (Timer.Time(TimerEvent.UpdateScores))
     {
         foreach (ScoreTracker t in TrackedScores)
         {
             UpdateScores(t, tree);
         }
     }
 }
Example #15
        public override void Sumup(SumupInputData input, FeatureHistogram histogram)
        {
            using (Timer.Time(TimerEvent.SumupSegment))
            {
                if (_length == 0)
                {
                    return;
                }
#if USE_FASTTREENATIVE
                SumupCPlusPlus(input, histogram);
#else
                base.Sumup(input, histogram);
#endif
            }
        }
Example #16
        /// <summary>
        /// Splits the documents of a specified leaf to its two children based on a feature and a threshold value
        /// </summary>
        /// <param name="leaf">the leaf being split</param>
        /// <param name="bins">Split feature flock's bin</param>
        /// <param name="categoricalIndices">Catgeorical feature indices</param>
        /// <param name="gtChildIndex">Index of child node that contains documents whose split
        /// feature value is greater than the split threshold</param>
        public unsafe void Split(int leaf, IntArray bins, HashSet <int> categoricalIndices, int gtChildIndex)
        {
            Contracts.Assert(bins != null);

            using (Timer.Time(TimerEvent.DocumentPartitioningSplit))
            {
                if (_tempDocuments == null)
                {
                    _tempDocuments = new int[_documents.Length];
                }

                // Note: lteChildIndex = leaf
                int begin         = _leafBegin[leaf];
                int end           = begin + _leafCount[leaf];
                int newEnd        = begin;
                int tempEnd       = begin;
                var flockBinIndex = bins.GetIndexer();
                fixed(int *pDocuments = _documents)
                fixed(int *pTempDocuments = _tempDocuments)
                {
                    for (int curr = begin; curr < end; ++curr)
                    {
                        int doc        = pDocuments[curr];
                        int hotFeature = flockBinIndex[doc];

                        if (categoricalIndices.Contains(hotFeature - 1))
                        {
                            pTempDocuments[tempEnd++] = doc;
                        }
                        else
                        {
                            pDocuments[newEnd++] = doc;
                        }
                    }
                }

                int newCount = newEnd - begin;
                int gtCount  = tempEnd - begin;
                Array.Copy(_tempDocuments, begin, _documents, newEnd, gtCount);

                _leafCount[leaf]         = newCount;
                _leafBegin[gtChildIndex] = newEnd;
                _leafCount[gtChildIndex] = gtCount;
            }
        }
Example #17
        private static IntArray ConcatBins(TsvFeature[] parts, uint[] concatValueMap)
        {
            using (Timer.Time(TimerEvent.ConcatBins))
            {
                int length = parts.Sum(x => x.Length);

                IntArrayBits  bitsPerItem = IntArray.NumBitsNeeded(concatValueMap.Length);
                DenseIntArray concatBins  = (DenseIntArray)IntArray.New(length, IntArrayType.Dense, bitsPerItem);

                int pos = 0;

                for (int partIndex = 0; partIndex < parts.Length; ++partIndex)
                {
                    IntArray bins = parts[partIndex].Bins;

                    if (concatValueMap.Length == parts[partIndex].ValueMap.Length)
                    {
                        foreach (int bin in bins)
                        {
                            concatBins[pos++] = bin;
                        }
                    }
                    else
                    {
                        int[] binMap = MakeBinMap(parts[partIndex]._valueMap, concatValueMap);

                        foreach (int bin in bins)
                        {
                            concatBins[pos++] = binMap[bin];
                        }
                    }
                }

                if (bitsPerItem != IntArrayBits.Bits0 && parts.All(x => x.Bins is DeltaSparseIntArray))
                {
                    return(new DeltaSparseIntArray(length, bitsPerItem, concatBins));
                }
                else
                {
                    return(concatBins);
                }
            }
        }
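
MakeBinMap is not shown in this example; it remaps a part's local bin indices into the concatenated value map. A minimal sketch of what such a mapping could look like, assuming each part's value map is a sorted subset of the concatenated value map (the body below is illustrative, not the original implementation):

static class BinMapSketch
{
    // Maps each entry of partValueMap to its position in concatValueMap.
    // Assumes both arrays are sorted ascending and every part value also
    // appears in the concatenated map.
    public static int[] MakeBinMap(uint[] partValueMap, uint[] concatValueMap)
    {
        var binMap = new int[partValueMap.Length];
        int j = 0;
        for (int i = 0; i < partValueMap.Length; i++)
        {
            while (concatValueMap[j] < partValueMap[i])
                j++;
            binMap[i] = j;  // concatValueMap[j] == partValueMap[i] under the subset assumption
        }
        return binMap;
    }
}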
Example #18
        public static Feature New(byte[] buffer, ref int position)
        {
            using (Timer.Time(TimerEvent.ConstructFromByteArray))
            {
                FeatureType type = (FeatureType)buffer.ToInt(ref position);

                switch (type)
                {
                case FeatureType.Raw:
                    TsvFeature tf = new TsvFeature(buffer, ref position);
#if !NO_STORE
                    tf.BinsCache = FileObjectStore <IntArrayFormatter> .GetDefaultInstance();
#endif
                    return(tf);

                default:
                    throw Contracts.Except("Impossible!");
                }
            }
        }
Example #19
        /// <summary>
        /// Splits the documents of a specified leaf to its two children based on a feature and a threshold value
        /// </summary>
        /// <param name="leaf">the leaf being split</param>
        /// <param name="indexer"></param>
        /// <param name="threshold">the threshold</param>
        /// <param name="gtChildIndex">Index of child node that contains documents whose split
        /// feature value is greater than the split threshold</param>
        public unsafe void Split(int leaf, IIntArrayForwardIndexer indexer, UInt32 threshold, int gtChildIndex)
        {
            using (Timer.Time(TimerEvent.DocumentPartitioningSplit))
            {
                if (_tempDocuments == null)
                {
                    _tempDocuments = new int[_documents.Length];
                }

                // Note: lteChildIndex = leaf
                int begin   = _leafBegin[leaf];
                int end     = begin + _leafCount[leaf];
                int newEnd  = begin;
                int tempEnd = begin;

                fixed(int *pDocuments = _documents)
                fixed(int *pTempDocuments = _tempDocuments)
                {
                    for (int curr = begin; curr < end; ++curr)
                    {
                        int doc = pDocuments[curr];
                        if (indexer[doc] > threshold)
                        {
                            pTempDocuments[tempEnd++] = doc;
                        }
                        else
                        {
                            pDocuments[newEnd++] = doc;
                        }
                    }
                }

                int newCount = newEnd - begin;
                int gtCount  = tempEnd - begin;
                Array.Copy(_tempDocuments, begin, _documents, newEnd, gtCount);

                _leafCount[leaf]         = newCount;
                _leafBegin[gtChildIndex] = newEnd;
                _leafCount[gtChildIndex] = gtCount;
            }
        }
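
Both Split overloads use the same stable in-place partition: documents staying in the leaf are compacted to the front of the leaf's segment while documents moving to the greater-than child are staged in a temporary buffer and copied back right after them. A minimal standalone sketch of that partition over a single segment, with illustrative names:

using System;

static class StablePartitionSketch
{
    // Partitions items[begin..begin+count) in place: items failing the predicate keep
    // their order at the front, items passing it follow, also in order.
    // Returns the number of items that stayed at the front.
    public static int Partition(int[] items, int begin, int count, int[] temp, Predicate<int> goesRight)
    {
        int newEnd = begin;
        int tempEnd = begin;
        for (int curr = begin; curr < begin + count; ++curr)
        {
            int item = items[curr];
            if (goesRight(item))
                temp[tempEnd++] = item;
            else
                items[newEnd++] = item;
        }
        Array.Copy(temp, begin, items, newEnd, tempEnd - begin);
        return newEnd - begin;
    }
}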
Example #20
        /// <summary>
        /// Get the document partitions of a specified leaf if it is split based on a feature and a threshold value.
        /// </summary>
        /// <param name="leaf">the leaf being split</param>
        /// <param name="indexer">the indexer to access the feature value</param>
        /// <param name="threshold">the threshold</param>
        /// <param name="leftDocuments">[out] the left documents split from the leaf</param>
        /// <param name="leftDocumentSize">[out] the size of left documents</param>
        /// <param name="rightDocuments">[out] the right documents split from the leaf</param>
        /// <param name="rightDocumentSize">[out] the size of right documents</param>
        public unsafe void GetLeafDocumentPartitions(
            int leaf,
            IIntArrayForwardIndexer indexer,
            UInt32 threshold,
            out int[] leftDocuments,
            out int leftDocumentSize,
            out int[] rightDocuments,
            out int rightDocumentSize)
        {
            using (Timer.Time(TimerEvent.DocumentPartitioningSplit))
            {
                leftDocuments    = new int[_leafCount[leaf]];
                leftDocumentSize = 0;

                rightDocuments    = new int[_leafCount[leaf]];
                rightDocumentSize = 0;

                int begin = _leafBegin[leaf];
                int end   = begin + _leafCount[leaf];

                fixed(int *pDocuments = _documents)
                fixed(int *pTempLeftDocIndices  = leftDocuments)
                fixed(int *pTempRightDocIndices = rightDocuments)
                {
                    for (int curr = begin; curr < end; ++curr)
                    {
                        int doc = pDocuments[curr];
                        if (indexer[doc] > threshold)
                        {
                            pTempRightDocIndices[rightDocumentSize++] = doc;
                        }
                        else
                        {
                            pTempLeftDocIndices[leftDocumentSize++] = doc;
                        }
                    }
                }
            }
        }
Example #21
        public SegmentIntArray(int length, IEnumerable <int> values)
        {
            using (Timer.Time(TimerEvent.SparseConstruction))
            {
                uint[] vals = new uint[length];
                uint   pos  = 0;
                uint   max  = 0;
                foreach (int v in values)
                {
                    if (pos >= length)
                    {
                        throw Contracts.Except("Length provided to segment vector is inconsistent with value enumeration");
                    }
                    vals[pos++] = (uint)v;
                    if ((uint)v > max)
                    {
                        max = (uint)v;
                    }
                }
                if (pos != length)
                {
                    throw Contracts.Except("Length provided to segment vector is inconsistent with value enumeration");
                }

                int  maxbits = BitsForValue(max);
                int  transitions;
                long bits;
                SegmentFindOptimalPath(vals, vals.Length, maxbits, out bits, out transitions);
                var b = FromWorkArray(vals, vals.Length, bits, transitions);
                _segType   = b._segType;
                _segLength = b._segLength;
                _data      = b._data;
                _length    = b._length;
                _bpi       = b._bpi;
            }
        }
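
BitsForValue above returns the number of bits needed to represent the largest observed value, which then seeds the optimal segmentation search. Its body is not shown in this example; a minimal sketch of the usual definition (the implementation here is assumed, not taken from the source):

static class BitsSketch
{
    // Number of bits needed to represent value (1 needs 1 bit, 255 needs 8, 256 needs 9, ...).
    public static int BitsForValue(uint value)
    {
        int bits = 0;
        while (value != 0)
        {
            value >>= 1;
            ++bits;
        }
        return bits;
    }
}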
Example #22
        /// <summary>
        /// Constructs partitioning object based on the documents and RegressionTree splits
        /// NOTE: It has been optimized for speed and multiprocessing, with a 10x gain over a naive LINQ implementation
        /// </summary>
        public DocumentPartitioning(RegressionTree tree, Dataset dataset)
            : this(dataset.NumDocs, tree.NumLeaves)
        {
            using (Timer.Time(TimerEvent.DocumentPartitioningConstruction))
            {
                // figure out which leaf each document belongs to
                // NOTE: break it up into NumThreads chunks. This minimizes the number of re-computations necessary in
                // the row-wise indexer.
                int innerLoopSize = 1 + dataset.NumDocs / BlockingThreadPool.NumThreads; // +1 is to make sure we don't have a few left over at the end

                // figure out the exact number of chunks, needed in pathological cases when NumDocs < NumThreads
                int numChunks = dataset.NumDocs / innerLoopSize;
                if (dataset.NumDocs % innerLoopSize != 0)
                {
                    ++numChunks;
                }
                var perChunkDocumentLists = new List <int> [numChunks][];
                // REVIEW: This partitioning doesn't look optimal.
                // Probably makes sense to investigate better ways of splitting data.
                var actions     = new Action[(int)Math.Ceiling(1.0 * dataset.NumDocs / innerLoopSize)];
                var actionIndex = 0;
                for (int docStart = 0; docStart < dataset.NumDocs; docStart += innerLoopSize)
                {
                    var fromDoc    = docStart;
                    var toDoc      = Math.Min(docStart + innerLoopSize, dataset.NumDocs);
                    var chunkIndex = docStart / innerLoopSize;
                    actions[actionIndex++] = () =>
                    {
                        Contracts.Assert(perChunkDocumentLists[chunkIndex] == null);

                        var featureBins = dataset.GetFeatureBinRowwiseIndexer();

                        List <int>[] perLeafDocumentLists = Enumerable.Range(0, tree.NumLeaves)
                                                            .Select(x => new List <int>(innerLoopSize / tree.NumLeaves))
                                                            .ToArray();

                        for (int d = fromDoc; d < toDoc; d++)
                        {
                            int leaf = tree.GetLeaf(featureBins[d]);
                            perLeafDocumentLists[leaf].Add(d);
                        }

                        perChunkDocumentLists[chunkIndex] = perLeafDocumentLists;
                    };
                }
                Parallel.Invoke(new ParallelOptions {
                    MaxDegreeOfParallelism = BlockingThreadPool.NumThreads
                }, actions);

                // establish leaf starts and document counts
                _leafCount = Enumerable.Range(0, tree.NumLeaves)
                             .Select(leaf => Enumerable.Range(0, perChunkDocumentLists.Length)
                                     .Select(thread => perChunkDocumentLists[thread][leaf].Count)
                                     .Sum())
                             .ToArray();

                var cumulativeLength = _leafCount.CumulativeSum <int>().Take(tree.NumLeaves - 1);
                _leafBegin = Enumerable.Range(0, 1).Concat(cumulativeLength).ToArray();

                // move all documents that belong to the same leaf together
                Contracts.Assert(_documents.Length == _leafBegin[tree.NumLeaves - 1] + _leafCount[tree.NumLeaves - 1]);
                actions     = new Action[tree.NumLeaves];
                actionIndex = 0;
                for (int leaf = 0; leaf < tree.NumLeaves; leaf++)
                {
                    var l = leaf;
                    actions[actionIndex++] = () =>
                    {
                        int documentPos = _leafBegin[l];
                        for (int chunkIndex = 0; chunkIndex < perChunkDocumentLists.Length; chunkIndex++)
                        {
                            foreach (int d in perChunkDocumentLists[chunkIndex][l])
                            {
                                _documents[documentPos++] = d;
                            }
                            perChunkDocumentLists[chunkIndex][l] = null;
                        }
                    };
                }
                Parallel.Invoke(new ParallelOptions {
                    MaxDegreeOfParallelism = BlockingThreadPool.NumThreads
                }, actions);
            }
        }
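
The constructor buckets documents in two parallel passes: each chunk first builds its own per-leaf lists, then per-leaf offsets are computed from the chunk counts, and a second parallel pass copies every leaf's documents into one contiguous range. A minimal standalone sketch of that two-pass bucketing, with illustrative names:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;

static class ChunkedBucketingSketch
{
    // Buckets item indices [0, count) by key, building per-chunk lists in parallel
    // and then concatenating them per bucket into a single flat array.
    public static int[] Bucket(int count, int chunkSize, int numBuckets, Func<int, int> keyOf,
        out int[] bucketBegin, out int[] bucketCount)
    {
        int numChunks = (count + chunkSize - 1) / chunkSize;
        var perChunk = new List<int>[numChunks][];

        // Pass 1: each chunk fills its own per-bucket lists, no shared writes.
        Parallel.For(0, numChunks, chunk =>
        {
            var lists = Enumerable.Range(0, numBuckets).Select(_ => new List<int>()).ToArray();
            int from = chunk * chunkSize;
            int to = Math.Min(from + chunkSize, count);
            for (int i = from; i < to; i++)
                lists[keyOf(i)].Add(i);
            perChunk[chunk] = lists;
        });

        // Per-bucket counts and starting offsets in the flat output.
        bucketCount = Enumerable.Range(0, numBuckets)
            .Select(b => perChunk.Sum(chunk => chunk[b].Count)).ToArray();
        bucketBegin = new int[numBuckets];
        for (int b = 1; b < numBuckets; b++)
            bucketBegin[b] = bucketBegin[b - 1] + bucketCount[b - 1];

        // Pass 2: each bucket copies its items into one contiguous range.
        var items = new int[count];
        var begins = bucketBegin;
        Parallel.For(0, numBuckets, b =>
        {
            int pos = begins[b];
            foreach (var chunk in perChunk)
                foreach (int i in chunk[b])
                    items[pos++] = i;
        });
        return items;
    }
}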