#if !NO_STORE
public Dataset GetSubDataset(int[] docIndices, bool destroyThisDataset, FileObjectStore<IntArrayFormatter> newBinsCache)
#else
public Dataset GetSubDataset(int[] docIndices, bool destroyThisDataset)
#endif
{
    int[] queryIndices = docIndices.Select(d => DocToQuery[d]).ToArray();
    ulong[] uniqueQueryIds = queryIndices.Distinct().Select(q => QueryIds[q]).ToArray();

    // calculate boundaries
    int[] boundaries = new int[uniqueQueryIds.Length + 1];
    boundaries[0] = 0;
    int queryIndex = 1;
    for (int q = 1; q < queryIndices.Length; ++q)
    {
        if (queryIndices[q] != queryIndices[q - 1])
            boundaries[queryIndex++] = q;
    }
    boundaries[uniqueQueryIds.Length] = queryIndices.Length;

    // construct skeleton
    DatasetSkeleton datasetSkeleton = new DatasetSkeleton(
        docIndices.Select(d => Ratings[d]).ToArray(),
        boundaries,
        uniqueQueryIds,
        docIndices.Select(d => DocIds[d]).ToArray());

    // create features
    FeatureFlockBase[] features = new FeatureFlockBase[NumFlocks];
    int[][] assignment = new int[][] { docIndices };
    Parallel.For(0, NumFlocks,
        new ParallelOptions { MaxDegreeOfParallelism = BlockingThreadPool.NumThreads },
        (int flockIndex) =>
        {
#if !NO_STORE
            GetSubDataset_ThreadWorker(features, flockIndex, assignment, destroyThisDataset, newBinsCache);
#else
            GetSubDatasetThreadWorker(features, flockIndex, assignment, destroyThisDataset);
#endif
        });

    // filter the dupe ids, if any
    uint[] filteredDupeIds = null;
    if (DupeIds != null)
    {
        uint[] dupeIds = DupeIds;
        filteredDupeIds = docIndices.Select(i => dupeIds[i]).ToArray();
    }

    // auxiliary data
    Dictionary<string, DatasetSkeletonQueryDocData> auxData = _datasetSkeleton.AuxiliaryData;
    Dictionary<string, DatasetSkeletonQueryDocData> newAuxData = new Dictionary<string, DatasetSkeletonQueryDocData>();
    foreach (KeyValuePair<string, DatasetSkeletonQueryDocData> pair in auxData)
    {
        newAuxData[pair.Key] = pair.Value.GetSubset(pair.Value.IsQueryLevel ? queryIndices.Distinct().ToArray() : docIndices);
    }
    datasetSkeleton.AuxiliaryData = newAuxData;

    // create the new Dataset
    Dataset dataset = new Dataset(datasetSkeleton, features);
    dataset.DupeIds = filteredDupeIds;
    return dataset;
}
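// A minimal, self-contained sketch of the boundary computation above, on
// hypothetical query indices. It assumes docIndices is ordered so that
// documents of the same query are contiguous, which is what the loop relies on.
int[] exampleQueryIndices = { 0, 0, 0, 2, 2, 5 };          // three distinct queries
int numQueries = exampleQueryIndices.Distinct().Count();   // 3
int[] exampleBoundaries = new int[numQueries + 1];
int next = 1;
for (int q = 1; q < exampleQueryIndices.Length; ++q)
{
    if (exampleQueryIndices[q] != exampleQueryIndices[q - 1])
        exampleBoundaries[next++] = q;
}
exampleBoundaries[numQueries] = exampleQueryIndices.Length;
// exampleBoundaries == { 0, 3, 5, 6 }: query 0 spans docs [0,3),
// query 2 spans [3,5), query 5 spans [5,6)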
// An Xk is an alias to scores.
public AgdScoreTracker(string datasetName, Dataset set, double[] initScores)
    : base(datasetName, set, initScores)
{
    _k = 0;
    YK = (double[])XK.Clone();
}
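// Illustrative sketch only: the textbook Nesterov accelerated-gradient update
// that motivates tracking both sequences. XK holds the main iterate (the
// scores) and YK the lookahead point; the momentum schedule k/(k+3) below is
// the classical choice, not necessarily the one this implementation uses.
void AcceleratedStep(double[] xk, double[] yk, double[] gradientAtY, double stepSize, int k)
{
    double momentum = (double)k / (k + 3);                // assumed schedule
    for (int i = 0; i < xk.Length; i++)
    {
        double xNew = yk[i] - stepSize * gradientAtY[i];  // gradient step at the lookahead point
        yk[i] = xNew + momentum * (xNew - xk[i]);         // extrapolate past the new iterate
        xk[i] = xNew;
    }
}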
internal AcceleratedGradientDescent(InternalTreeEnsemble ensemble, Dataset trainData, double[] initTrainScores, IGradientAdjuster gradientWrapper)
    : base(ensemble, trainData, initTrainScores, gradientWrapper)
{
    UseFastTrainingScoresUpdate = false;
}
void IParallelTraining.InitTreeLearner(Dataset trainData, int maxNumLeaves, int maxCatSplitPoints, ref int minDocInLeaf)
{
}
internal GradientDescent(InternalTreeEnsemble ensemble, Dataset trainData, double[] initTrainScores, IGradientAdjuster gradientWrapper)
    : base(ensemble, trainData, initTrainScores)
{
    _gradientWrapper = gradientWrapper;
    _treeScores = new List<double[]>();
}
// REVIEW: When the FastTree application is decoupled from the tree learner and boosting logic, this class should be removed.
internal RandomForestOptimizer(InternalTreeEnsemble ensemble, Dataset trainData, double[] initTrainScores, IGradientAdjuster gradientWrapper)
    : base(ensemble, trainData, initTrainScores, gradientWrapper)
{
    _gradientWrapper = gradientWrapper;
}
public Test(string datasetName, Dataset set, double[] initScores)
    : this(new ScoreTracker(datasetName, set, initScores))
{
}
public ObjectiveFunctionImpl(Dataset trainSet, bool[] trainSetLabels, Arguments args)
    : base(trainSet, args, args.MaxTreeOutput)
{
    _labels = trainSetLabels;
}
public void SetTrainingData(Dataset trainData, double[] initTrainScores)
{
    TrainingScores = ConstructScoreTracker("train", trainData, initTrainScores);
    TrackedScores[0] = TrainingScores;
}
// thread worker
private void WinLossRangeWorkerFromScores(Dataset dataset, short[] labels, double[] scores, double[] result, int query, int threadIndex)
{
    int begin = dataset.Boundaries[query];
    int count = dataset.Boundaries[query + 1] - begin;
    int[] permutation = _permutationBuffers[threadIndex];
    DcgPermutationComparer comparer = _comparers[threadIndex];

    // set values for the comparer
    comparer.Scores = scores;
    comparer.Labels = labels;
    comparer.ScoresOffset = begin;
    comparer.LabelsOffset = begin;

    // calculate the permutation (documents sorted by descending score)
    Array.Copy(_oneTwoThree, permutation, count);
    Array.Sort(permutation, 0, count, comparer);

    int surplus = 0;
    int maxsurplus = 0;
    int maxsurpluspos = 0;
    for (int t = 0; t < count; ++t)
    {
        // non-zero labels contribute their signed value; unlabeled (zero) documents count as a loss
        int label = labels[begin + permutation[t]];
        surplus += label != 0 ? label : -1;

        if (surplus > maxsurplus)
        {
            maxsurplus = surplus;
            maxsurpluspos = t;
        }

        // record the running surplus at fixed cutoffs
        if (t == 100)
            Utils.InterlockedAdd(ref result[0], surplus);
        if (t == 200)
            Utils.InterlockedAdd(ref result[1], surplus);
        if (t == 300)
            Utils.InterlockedAdd(ref result[2], surplus);
        if (t == 400)
            Utils.InterlockedAdd(ref result[3], surplus);
        if (t == 500)
            Utils.InterlockedAdd(ref result[4], surplus);
        if (t == 1000)
            Utils.InterlockedAdd(ref result[5], surplus);
    }

    Utils.InterlockedAdd(ref result[6], maxsurplus);
    Utils.InterlockedAdd(ref result[7], maxsurpluspos);
    Utils.InterlockedAdd(ref result[8], count);
}
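// A tiny self-contained sketch of the surplus scan above, on hypothetical
// labels already sorted by descending score: wins add the label value,
// unlabeled (zero) documents count as losses, and we track the best prefix.
short[] sortedLabels = { 1, 1, 0, -1, 1, 0 };
int runningSurplus = 0, bestSurplus = 0, bestPos = 0;
for (int t = 0; t < sortedLabels.Length; ++t)
{
    runningSurplus += sortedLabels[t] != 0 ? sortedLabels[t] : -1;
    if (runningSurplus > bestSurplus)
    {
        bestSurplus = runningSurplus;
        bestPos = t;
    }
}
// bestSurplus == 2 at bestPos == 1: the prefix { 1, 1 } maximizes wins minus losses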
protected abstract ScoreTracker ConstructScoreTracker(string name, Dataset set, double[] initScores);
public void InitTreeLearner(Dataset trainData, int maxNumLeaves, int maxCatSplitPoints, ref int minDocInLeaf)
{
}
protected override ScoreTracker ConstructScoreTracker(string name, Dataset set, double[] initScores)
{
    // REVIEW: This is not necessary. We can remove this by creating a dummy scorer.
    return new ScoreTracker(name, set, initScores);
}
public ScoreTracker(string datasetName, Dataset set, double[] initScores)
{
    Initialize(datasetName, set, initScores);
}
public ObjectiveFunctionImpl(Dataset trainSet, bool[] trainSetLabels, Options options)
    : base(trainSet, options, options.MaximumOutputMagnitudePerTree)
{
    _labels = trainSetLabels;
}
private IEnumerable<bool> GetClassificationLabelsFromRatings(Dataset set)
{
    // REVIEW: Historically FastTree has this test as >= 1. TLC however
    // generally uses > 0. Consider changing FastTree to be consistent.
    return set.Ratings.Select(x => x >= 1);
}
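// Quick illustration of the REVIEW note above, on hypothetical integer ratings:
short[] ratings = { 0, 1, 2 };
bool[] geOne = ratings.Select(x => x >= 1).ToArray();  // { false, true, true }
bool[] gtZero = ratings.Select(x => x > 0).ToArray();  // { false, true, true }
// For non-negative integer ratings the two tests coincide; they could only
// disagree for fractional label values in (0, 1).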
public ObjectiveFunctionImpl(Dataset trainSet, bool[] trainSetLabels, Options options)
    : base(trainSet, options, options.MaxTreeOutput)
{
    _labels = trainSetLabels;
}
public BinaryClassGamPredictor(IHostEnvironment env, int inputLength, Dataset trainset, double meanEffect, double[][] binEffects, int[] featureMap)
    : base(env, LoaderSignature, inputLength, trainset, meanEffect, binEffects, featureMap)
{
}
/// <summary>
/// Constructs the partitioning object based on the documents and the RegressionTree splits.
/// NOTE: It has been optimized for speed and multiprocs, with a 10x gain over a naive LINQ implementation.
/// </summary>
internal DocumentPartitioning(InternalRegressionTree tree, Dataset dataset)
    : this(dataset.NumDocs, tree.NumLeaves)
{
    using (Timer.Time(TimerEvent.DocumentPartitioningConstruction))
    {
        // figure out which leaf each document belongs to
        // NOTE: break it up into NumThreads chunks. This minimizes the number of re-computations necessary in
        // the row-wise indexer.
        int innerLoopSize = 1 + dataset.NumDocs / BlockingThreadPool.NumThreads; // +1 is to make sure we don't have a few left over at the end

        // figure out the exact number of chunks, needed in pathological cases when NumDocs < NumThreads
        int numChunks = dataset.NumDocs / innerLoopSize;
        if (dataset.NumDocs % innerLoopSize != 0)
            ++numChunks;

        var perChunkDocumentLists = new List<int>[numChunks][];

        // REVIEW: This partitioning doesn't look optimal.
        // It probably makes sense to investigate better ways of splitting the data.
        var actions = new Action[(int)Math.Ceiling(1.0 * dataset.NumDocs / innerLoopSize)];
        var actionIndex = 0;
        for (int docStart = 0; docStart < dataset.NumDocs; docStart += innerLoopSize)
        {
            var fromDoc = docStart;
            var toDoc = Math.Min(docStart + innerLoopSize, dataset.NumDocs);
            var chunkIndex = docStart / innerLoopSize;
            actions[actionIndex++] = () =>
            {
                Contracts.Assert(perChunkDocumentLists[chunkIndex] == null);
                var featureBins = dataset.GetFeatureBinRowwiseIndexer();
                List<int>[] perLeafDocumentLists = Enumerable.Range(0, tree.NumLeaves)
                    .Select(x => new List<int>(innerLoopSize / tree.NumLeaves))
                    .ToArray();
                for (int d = fromDoc; d < toDoc; d++)
                {
                    int leaf = tree.GetLeaf(featureBins[d]);
                    perLeafDocumentLists[leaf].Add(d);
                }
                perChunkDocumentLists[chunkIndex] = perLeafDocumentLists;
            };
        }
        Parallel.Invoke(new ParallelOptions { MaxDegreeOfParallelism = BlockingThreadPool.NumThreads }, actions);

        // establish leaf starts and document counts
        _leafCount = Enumerable.Range(0, tree.NumLeaves)
            .Select(leaf => Enumerable.Range(0, perChunkDocumentLists.Length)
                .Select(thread => perChunkDocumentLists[thread][leaf].Count)
                .Sum())
            .ToArray();

        var cumulativeLength = _leafCount.CumulativeSum<int>().Take(tree.NumLeaves - 1);
        _leafBegin = Enumerable.Range(0, 1).Concat(cumulativeLength).ToArray();

        // move all documents that belong to the same leaf together
        Contracts.Assert(_documents.Length == _leafBegin[tree.NumLeaves - 1] + _leafCount[tree.NumLeaves - 1]);
        actions = new Action[tree.NumLeaves];
        actionIndex = 0;
        for (int leaf = 0; leaf < tree.NumLeaves; leaf++)
        {
            var l = leaf;
            actions[actionIndex++] = () =>
            {
                int documentPos = _leafBegin[l];
                for (int chunkIndex = 0; chunkIndex < perChunkDocumentLists.Length; chunkIndex++)
                {
                    foreach (int d in perChunkDocumentLists[chunkIndex][l])
                    {
                        _documents[documentPos++] = d;
                    }
                    perChunkDocumentLists[chunkIndex][l] = null;
                }
            };
        }
        Parallel.Invoke(new ParallelOptions { MaxDegreeOfParallelism = BlockingThreadPool.NumThreads }, actions);
    }
}
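// A small sketch of the offset arithmetic above, with hypothetical counts: the
// per-leaf begin positions are the exclusive cumulative sum of the per-leaf
// counts, which is what CumulativeSum plus the leading 0 produce.
int[] exampleLeafCount = { 4, 0, 3, 2 };
int[] exampleLeafBegin = new int[exampleLeafCount.Length];
for (int leaf = 1; leaf < exampleLeafCount.Length; ++leaf)
    exampleLeafBegin[leaf] = exampleLeafBegin[leaf - 1] + exampleLeafCount[leaf - 1];
// exampleLeafBegin == { 0, 4, 4, 7 }: documents of leaf L end up in
// _documents[exampleLeafBegin[L] .. exampleLeafBegin[L] + exampleLeafCount[L])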
protected override ScoreTracker ConstructScoreTracker(string name, Dataset set, double[] initScores)
{
    return new ScoreTracker(name, set, initScores);
}
public ConjugateGradientDescent(InternalTreeEnsemble ensemble, Dataset trainData, double[] initTrainScores, IGradientAdjuster gradientWrapper)
    : base(ensemble, trainData, initTrainScores, gradientWrapper)
{
    _currentDk = new double[trainData.NumDocs];
}