public List<Tree> MakeTreesParallel(int ntrees)
{
    int cores = Environment.ProcessorCount;
    int treesPerCore = ntrees / cores; // integer division == floor
    int[] coreChunks = Yarr.Repeat(treesPerCore, cores);

    // Distribute the remainder: the first (ntrees % cores) workers build one extra tree.
    int diff = ntrees - (treesPerCore * cores);
    for (int i = 0; i < diff; i++)
    {
        coreChunks[i]++;
    }

    var tasks = new Task<List<Tree>>[cores];
    for (int i = 0; i < cores; i++)
    {
        int privateI = i; // capture a private copy so each lambda sees its own index
        tasks[i] = Task.Factory.StartNew(() => MakeTrees(coreChunks[privateI]));
    }
    Task.WaitAll(tasks);
    return tasks.SelectMany(t => t.Result).ToList();
}
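// Illustration (not from the original source): a minimal, self-contained sketch of the
// chunking scheme above. With ntrees = 10 on a 4-core machine the chunks come out as
// {3, 3, 2, 2}: every worker gets floor(10/4) = 2 trees and the first 10 % 4 = 2 workers
// each take one extra, so the chunks always sum to ntrees.
private static int[] DemoChunking(int ntrees, int cores)
{
    int perCore = ntrees / cores;
    int[] chunks = new int[cores];
    for (int i = 0; i < cores; i++)
    {
        chunks[i] = perCore;
    }
    int remainder = ntrees - (perCore * cores);
    for (int i = 0; i < remainder; i++)
    {
        chunks[i]++; // hand the leftover trees to the first few workers
    }
    return chunks; // DemoChunking(10, 4) -> {3, 3, 2, 2}
}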
private double[] CalcGlobalMaxs()
{
    double[] maxs = new double[this.NFeatures];
    for (int i = 0; i < this.NFeatures; i++)
    {
        maxs[i] = Yarr.Max(this._FeatureCols[i]);
    }
    return maxs;
}
private double[] CalcGlobalMins()
{
    double[] mins = new double[this.NFeatures];
    for (int featureIndex = 0; featureIndex < this.NFeatures; featureIndex++)
    {
        mins[featureIndex] = Yarr.Min(this._FeatureCols[featureIndex]);
    }
    return mins;
}
private List<Tree> Split()
{
    var splitInfo = FindBestSplit();
    //var splitInfo = FindBestRandomSplit();
    int bestLocalDimIndex = splitInfo.Item1;
    double bestSplit = splitInfo.Item2;
    double maxExpectedInfo = splitInfo.Item3;
    if (bestLocalDimIndex == -1)
    {
        return null; // no split produced an information gain; this node stays a leaf
    }

    int bestGlobalDimIndex = this.TargetFeatures[bestLocalDimIndex];
    bool[] filter = TrainPoints.FeatureCols[bestGlobalDimIndex].Geq(bestSplit);

    // The upper child keeps MaxCorner but raises its floor to the split value;
    // the lower child keeps MinCorner but lowers its ceiling to the split value.
    var upperMinCorner = new double[NDim];
    MinCorner.CopyTo(upperMinCorner, 0);
    upperMinCorner[bestLocalDimIndex] = bestSplit;

    var lowerMaxCorner = new double[NDim];
    MaxCorner.CopyTo(lowerMaxCorner, 0);
    lowerMaxCorner[bestLocalDimIndex] = bestSplit;

    var upperTree = new Tree(
        TrainPoints.Filter(filter),
        this.TargetFeatures,
        upperMinCorner,
        MaxCorner,
        includeMax: IncludeMax,
        normalizingConstant: NormalizingConstant
    );

    // The lower child excludes its new max boundary, so points exactly on the
    // split value belong to exactly one child.
    bool[] lowerIncludeMax = new bool[NDim];
    IncludeMax.CopyTo(lowerIncludeMax, 0);
    lowerIncludeMax[bestLocalDimIndex] = false;

    Yarr.InlineNot(filter);
    var lowerTree = new Tree(
        TrainPoints.Filter(filter),
        this.TargetFeatures,
        MinCorner,
        lowerMaxCorner,
        includeMax: lowerIncludeMax,
        normalizingConstant: NormalizingConstant
    );

    this.SplitDim = bestLocalDimIndex;
    this.SplitVal = bestSplit;
    return new List<Tree> { upperTree, lowerTree };
}
public char[] Classify(RecordSet data, bool parallel = true)
{
    Score scores = this.Scorer.Score(data, parallel);
    double[] ratios = Yarr.Div(scores.SScores, scores.BScores);
    char[] result = new char[data.NRows];
    for (int i = 0; i < data.NRows; i++)
    {
        // Label a row signal when its signal/background density ratio clears the cutoff.
        result[i] = (ratios[i] >= this.Cutoff) ? 's' : 'b';
    }
    return result;
}
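// Illustration (not from the original source): how the ratio cutoff behaves. A row whose
// signal density is 0.9 and background density is 0.3 has ratio 3.0, so with a cutoff of
// 1.0 it is labeled 's'; swap the two densities and the ratio of 1/3 falls below the
// cutoff, giving 'b'.
private static char DemoCutoff(double sScore, double bScore, double cutoff)
{
    double ratio = sScore / bScore;
    return (ratio >= cutoff) ? 's' : 'b'; // DemoCutoff(0.9, 0.3, 1.0) == 's'
}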
private Tree MakeTree()
{
    // Each tree trains on a random subset of the feature columns,
    // using only the rows that have no NaN in those columns.
    var cols = RandomUtils.Choice(this.ColIndices, this.ColsPerTree).ToArray();
    bool[] filter = Yarr.InlineNot(this.Data.HasNaN(cols));
    var filteredData = this.Data.Filter(filter);
    var result = new Tree(
        filteredData,
        cols,
        Yarr.FancyIndex(this.GlobalMinCorner, cols),
        Yarr.FancyIndex(this.GlobalMaxCorner, cols)
    );
    result.Train();
    return result;
}
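// Illustration (not from the original source): one way to sample k distinct column
// indices, approximating what RandomUtils.Choice is used for above. This is a partial
// Fisher-Yates shuffle, not necessarily the implementation the project uses.
private static int[] DemoSampleColumns(int[] indices, int k, Random rng)
{
    int[] pool = (int[])indices.Clone();
    for (int i = 0; i < k; i++)
    {
        int j = rng.Next(i, pool.Length);        // pick from the not-yet-chosen tail
        (pool[i], pool[j]) = (pool[j], pool[i]); // swap the pick into position i
    }
    int[] result = new int[k];
    Array.Copy(pool, result, k);
    return result;
}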
protected RecordSet(RecordSet copyFrom, bool[] filter)
{
    // A filtered "view": only the Index array is rebuilt; the underlying column
    // arrays are shared with the source RecordSet.
    this.Index = Yarr.Filter(copyFrom.Index, filter);
    this.NRows = this.Index.Length;
    this.NFeatures = copyFrom.NFeatures;
    this._EventIds = copyFrom._EventIds;
    this.EventIds = new Indexer<int>(this, this._EventIds);
    this._FeatureCols = copyFrom._FeatureCols;
    this.FeatureCols = new Indexer<double>[this.NFeatures];
    for (int featureNum = 0; featureNum < this.NFeatures; featureNum++)
    {
        this.FeatureCols[featureNum] = new Indexer<double>(this, this._FeatureCols[featureNum]);
    }
    this._GlobalMins = copyFrom._GlobalMins;
    this._GlobalMaxs = copyFrom._GlobalMaxs;
}
public Score Score(RecordSet data, bool parallel = true)
{
    // NOTE: the parallel parameter is ignored here
    double[] sScores = Yarr.Repeat(double.NaN, data.NRows);
    double[] bScores = Yarr.Repeat(double.NaN, data.NRows);
    bool[] filter = Yarr.InlineNot(data.HasNaN(this.TargetFeatures));
    var filteredData = data.Filter(filter);
    data = null; // unlikely to let anything be GC'ed (many references to the same object) but it can't hurt
    this._Score(
        filteredData,
        Yarr.Range(filteredData.NRows).MakeSlice(),
        sScores,
        bScores
    );
    return new Score(sScores, bScores);
}
private static double AMS(char[] predictions, TrainingRecordSet actual)
{
    bool[] predictedSignal = Yarr.Equ(predictions, 's');
    bool[] actualSignal = actual.Labels.Equ('s');
    bool[] actualBackground = Yarr.Not(actualSignal);
    double total_s = actual.Filter(actualSignal).Weights.Sum();
    double total_b = actual.Filter(actualBackground).Weights.Sum();

    // Rescale the weights so the signal and background totals match the full dataset.
    double[] scaledWeights = new double[predictions.Length];
    for (int i = 0; i < predictions.Length; i++)
    {
        if (actualSignal[i])
        {
            scaledWeights[i] = actual.Weights[i] * (TOTAL_S / total_s);
        }
        else
        {
            scaledWeights[i] = actual.Weights[i] * (TOTAL_B / total_b);
        }
    }

    bool[] truePositives = Yarr.And(predictedSignal, actualSignal);
    bool[] falsePositives = Yarr.And(predictedSignal, actualBackground);

    // Sum the rescaled weights of the true positives (s) and false positives (b).
    double s = 0.0;
    double b = 0.0;
    for (int i = 0; i < predictions.Length; i++)
    {
        if (truePositives[i])
        {
            s += scaledWeights[i];
        }
        else if (falsePositives[i])
        {
            b += scaledWeights[i];
        }
    }

    const double B_R = 10.0; // regularization term from the AMS definition
    double radicand = 2.0 * ((s + b + B_R) * Math.Log(1.0 + (s / (b + B_R))) - s);
    if (radicand < 0.0)
    {
        throw new Exception("radicand < 0.0, aborting");
    }
    return Math.Sqrt(radicand);
}
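// Illustration (not from the original source): the metric above is the approximate median
// significance, AMS = sqrt(2 * ((s + b + b_r) * ln(1 + s / (b + b_r)) - s)) with b_r = 10.
// For example, s = 100 weighted true positives against b = 400 weighted false positives
// gives AMS = sqrt(2 * (510 * ln(1 + 100/410) - 100)) ≈ 4.76.
private static double DemoAms(double s, double b)
{
    const double B_R = 10.0;
    double radicand = 2.0 * ((s + b + B_R) * Math.Log(1.0 + (s / (b + B_R))) - s);
    return Math.Sqrt(radicand); // DemoAms(100.0, 400.0) ≈ 4.76
}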
private double Entropy(params int[] setCounts)
{
    double sum = (double)Yarr.Sum(setCounts);
    if (sum == 0.0)
    {
        return 0.0;
    }
    double nent = 0.0;
    for (int i = 0; i < setCounts.Length; i++)
    {
        if (setCounts[i] == 0)
        {
            // prob * logProb == 0 * -inf == NaN, even though its limit is 0.
            // Better to be lazy and skip it.
            continue;
        }
        double prob = setCounts[i] / sum;
        double logProb = Math.Log(prob, 2.0);
        nent += prob * logProb;
    }
    return -nent;
}
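// Illustration (not from the original source): Entropy(5, 5) is 1.0 bit (a 50/50 class mix
// is maximally uncertain), Entropy(8, 2) is -(0.8*log2(0.8) + 0.2*log2(0.2)) ≈ 0.722 bits,
// and a pure node like Entropy(10, 0) is 0.0. A standalone version for two classes:
private static double DemoEntropy(int a, int b)
{
    double sum = a + b;
    double result = 0.0;
    foreach (int count in new[] { a, b })
    {
        if (count == 0) { continue; } // skip: 0 * log(0) is taken to be 0 here
        double p = count / sum;
        result -= p * Math.Log(p, 2.0);
    }
    return result; // DemoEntropy(8, 2) ≈ 0.722; DemoEntropy(5, 5) == 1.0
}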
public Tree(
    TrainingRecordSet trainPoints,
    int[] targetFeatures,
    double[] minCorner,
    double[] maxCorner,
    bool[] includeMax = null,
    double? normalizingConstant = null
)
{
    this.TargetFeatures = targetFeatures;
    int ndim = this.NDim = targetFeatures.Length;
    this.TrainPoints = trainPoints;
    int npoints = this.NTrainPoints = trainPoints.NRows;
    this.MinCorner = minCorner;
    this.MaxCorner = maxCorner;
    if (includeMax == null)
    {
        includeMax = Yarr.Repeat(true, ndim);
    }
    this.IncludeMax = includeMax;
    if (!normalizingConstant.HasValue)
    {
        normalizingConstant = npoints;
    }
    this.NormalizingConstant = normalizingConstant.Value;

    // Volume is scaled by NormalizingConstant (passed down from the parent node in
    // Split, defaulting to this node's own point count), so child densities stay
    // comparable within one tree.
    double normalizedVolume = CalcVolume() * this.NormalizingConstant;
    int nS = trainPoints.Labels.CountEqu('s');
    int nB = npoints - nS;
    this.SDensity = nS / normalizedVolume;
    this.BDensity = nB / normalizedVolume;
}
private Score GMean(IEnumerable<Score> scores, int nrows)
{
    double[] sSums = Yarr.Repeat<double>(0.0, nrows);
    double[] bSums = Yarr.Repeat<double>(0.0, nrows);
    int[] sCounts = Yarr.Repeat<int>(0, nrows);
    int[] bCounts = Yarr.Repeat<int>(0, nrows);
    foreach (Score score in scores)
    {
        for (int rowIndex = 0; rowIndex < nrows; rowIndex++)
        {
            double sScore = score.SScores[rowIndex];
            if (!double.IsNaN(sScore))
            {
                sSums[rowIndex] += Math.Log(sScore);
                sCounts[rowIndex]++;
            }
            double bScore = score.BScores[rowIndex];
            if (!double.IsNaN(bScore))
            {
                bSums[rowIndex] += Math.Log(bScore);
                bCounts[rowIndex]++;
            }
        }
    }
    // Geometric mean per row via exp of the average log, skipping NaN scores.
    double[] sScores = new double[nrows];
    double[] bScores = new double[nrows];
    for (int rowIndex = 0; rowIndex < nrows; rowIndex++)
    {
        sScores[rowIndex] = Math.Exp(sSums[rowIndex] / sCounts[rowIndex]);
        bScores[rowIndex] = Math.Exp(bSums[rowIndex] / bCounts[rowIndex]);
    }
    return new Score(sScores, bScores);
}
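// Illustration (not from the original source): exp(mean(log(x_i))) is the geometric mean,
// which is what GMean computes per row while skipping trees that abstained (NaN scores).
// For the values {2, 8}: exp((ln 2 + ln 8) / 2) == sqrt(2 * 8) == 4.
private static double DemoGeometricMean(double[] values)
{
    double logSum = 0.0;
    int count = 0;
    foreach (double v in values)
    {
        if (double.IsNaN(v)) { continue; } // ignore missing scores, as GMean does
        logSum += Math.Log(v);
        count++;
    }
    return Math.Exp(logSum / count); // DemoGeometricMean(new[] { 2.0, 8.0 }) == 4.0
}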
private Tuple<double, double> FindBestSplit(int localDimIndex)
{
    const int NSPLITS = 5;
    double totalEntropy = TotalEntropy();
    int globalDimIndex = this.TargetFeatures[localDimIndex];
    int[] globalDimIndices = Yarr.Repeat<int>(globalDimIndex, 1);
    double[] localMins = this.TrainPoints.CalcLocalMins(globalDimIndices);
    double[] localMaxs = this.TrainPoints.CalcLocalMaxs(globalDimIndices);
    double dimMin = localMins[0];
    double dimMax = localMaxs[0];

    // Try NSPLITS random split points in this dimension and keep the one with
    // the highest expected information gain.
    double[] splits = RandomUtils.RandBetween(dimMin, dimMax, NSPLITS);
    double maxExpectedInfo = 0.0;
    double bestSplit = double.NaN;
    for (int i = 0; i < NSPLITS; i++)
    {
        double split = splits[i];
        int nAbove, sAbove, bAbove;
        int nBelow, sBelow, bBelow;
        nAbove = sAbove = bAbove = nBelow = sBelow = bBelow = 0;
        for (int rowNum = 0; rowNum < NTrainPoints; rowNum++)
        {
            double val = TrainPoints.FeatureCols[globalDimIndex][rowNum];
            bool isSignal = TrainPoints.Labels[rowNum] == 's';
            if (val >= split)
            {
                nAbove++;
                if (isSignal) { sAbove++; } else { bAbove++; }
            }
            else
            {
                nBelow++;
                if (isSignal) { sBelow++; } else { bBelow++; }
            }
        }
        double probAbove = ((double)nAbove) / NTrainPoints;
        double probBelow = 1.0 - probAbove; // == ((double)nBelow) / NTrainPoints
        double entropyAbove = Entropy(sAbove, bAbove);
        double entropyBelow = Entropy(sBelow, bBelow);
        double expectedInfo = totalEntropy - ((probAbove * entropyAbove) + (probBelow * entropyBelow));
        if (expectedInfo > maxExpectedInfo)
        {
            maxExpectedInfo = expectedInfo;
            bestSplit = split;
        }
    }
    return new Tuple<double, double>(maxExpectedInfo, bestSplit);
}
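// Illustration (not from the original source): a worked information-gain example matching
// the computation above, using the DemoEntropy sketch defined earlier. Take 10 points
// (5 signal, 5 background), so total entropy is 1.0 bit. A split sending (4s, 1b) above
// and (1s, 4b) below has child entropies Entropy(4, 1) = Entropy(1, 4) ≈ 0.722, each
// weighted by probability 0.5, so the gain is 1.0 - (0.5*0.722 + 0.5*0.722) ≈ 0.278 bits.
private static double DemoInfoGain()
{
    double total = DemoEntropy(5, 5);           // 1.0
    double above = DemoEntropy(4, 1);           // ≈ 0.722
    double below = DemoEntropy(1, 4);           // ≈ 0.722
    return total - (0.5 * above + 0.5 * below); // ≈ 0.278
}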
private static void MainMain()
{
    Write("Running Random Forest ({0} trees)", NUM_MODELS);

    Write("loading training data");
    var traindata = Parser.LoadTrainData();
    var featureCols = CsvRecord.FEATURE_COLS;
    int[] colIndices = Yarr.Range(featureCols.Count);
    WriteDone();

    Write("creating random forest");
    var treeCreator = new TreeCreator(traindata, colIndices, COLS_PER_MODEL);
    var trees = treeCreator.MakeTreesParallel(NUM_MODELS);
    ScoreAverager<Tree> forest = new ScoreAverager<Tree>(trees);
    WriteDone();
    Console.WriteLine(string.Format("\t\tcreated {0} trees", trees.Count));

    Write("creating and tuning classifier (parallel2)");
    PlayDingSound();
    double bestCutoff = double.NaN;
    double bestExponent = double.NaN;
    double bestScore = double.NegativeInfinity;
    var classifier = new Classifier(new ScoreCacher(forest));
    // Grid-search the cutoff over e^-0.5 .. e^0.5 and keep the best training-set AMS.
    foreach (double exponent in Yarr.XRange(-0.5, 0.6, 0.1))
    {
        double cutoff = Math.Exp(exponent);
        classifier.Cutoff = cutoff;
        double score = AMS(classifier.Classify(traindata, parallel: PARALLEL), traindata);
        if (score > bestScore)
        {
            bestScore = score;
            bestCutoff = cutoff;
            bestExponent = exponent;
        }
    }
    classifier = new Classifier(forest);
    classifier.Cutoff = bestCutoff;
    WriteDone();
    Console.WriteLine(string.Format("\t\tpredicted ams: {0}", bestScore));
    Console.WriteLine(string.Format("\t\tcutoff: {0} (e^{1})", bestCutoff, bestExponent));
    if (bestScore < 3.5)
    {
        // Not worth scoring the test set; bail out early.
        WriteDone();
        PlayFailSound();
        return;
    }

    Write("loading test data");
    var testdata = Parser.LoadTestData();
    WriteDone();

    Write("scoring test data");
    var predictions = classifier.Classify(testdata, parallel: PARALLEL);
    var confidences = Yarr.Range(1, testdata.NRows + 1);
    WriteDone();

    Write("writing output");
    Parser.WritePredictions(testdata.EventIds, predictions, confidences);
    WriteDone();

    WriteDone(); // whole-method timer
    PlayWinSound();
}
private Score ParallelGmeaner(BlockingCollection<Score> scores, int nrows)
{
    double[] sSums = Yarr.Repeat<double>(0.0, nrows);
    double[] bSums = Yarr.Repeat<double>(0.0, nrows);
    int[] sCounts = Yarr.Repeat<int>(0, nrows);
    int[] bCounts = Yarr.Repeat<int>(0, nrows);
    int cores = Environment.ProcessorCount;
    var sScoresCollection = new BlockingCollection<double[]>(cores * 10);
    var bScoresCollection = new BlockingCollection<double[]>(cores * 10);

    // One consumer task per output side: drain score arrays off the queue and
    // accumulate log-sums and counts (NaN scores are skipped).
    Func<int[], double[], BlockingCollection<double[]>, Task> taskMaker =
        (counts, sums, scoreCollection) => Task.Factory.StartNew(() =>
        {
            double[] scoreArr;
            while (!scoreCollection.IsCompleted)
            {
                try
                {
                    scoreArr = scoreCollection.Take();
                }
                catch (InvalidOperationException)
                {
                    continue; // completed between the IsCompleted check and Take()
                }
                for (int rowIndex = 0; rowIndex < nrows; rowIndex++)
                {
                    double val = scoreArr[rowIndex];
                    if (!double.IsNaN(val))
                    {
                        sums[rowIndex] += Math.Log(val);
                        counts[rowIndex]++;
                    }
                }
            }
        });

    Task sTask = taskMaker(sCounts, sSums, sScoresCollection);
    Task bTask = taskMaker(bCounts, bSums, bScoresCollection);

    // This thread is the producer: split each incoming Score into the two queues.
    Score score;
    while (!scores.IsCompleted)
    {
        try
        {
            score = scores.Take();
        }
        catch (InvalidOperationException)
        {
            continue;
        }
        sScoresCollection.Add(score.SScores);
        bScoresCollection.Add(score.BScores);
    }
    sScoresCollection.CompleteAdding();
    bScoresCollection.CompleteAdding();
    Task.WaitAll(sTask, bTask);

    double[] sScores = new double[nrows];
    double[] bScores = new double[nrows];
    for (int rowIndex = 0; rowIndex < nrows; rowIndex++)
    {
        sScores[rowIndex] = Math.Exp(sSums[rowIndex] / sCounts[rowIndex]);
        bScores[rowIndex] = Math.Exp(bSums[rowIndex] / bCounts[rowIndex]);
    }
    return new Score(sScores, bScores);
}
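// Illustration (not from the original source): the minimal BlockingCollection
// producer/consumer pattern used above. Take() blocks until an item arrives and throws
// InvalidOperationException once CompleteAdding() has been called and the queue is
// drained, which is why the loops above treat that exception as a normal exit signal.
private static int DemoDrain(BlockingCollection<int> queue)
{
    int total = 0;
    while (!queue.IsCompleted)
    {
        int item;
        try
        {
            item = queue.Take();
        }
        catch (InvalidOperationException)
        {
            continue; // completed between the IsCompleted check and Take()
        }
        total += item;
    }
    return total;
}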
public RecordSet(List<CsvRecord> data, double nanValue = -999.0)
{
    NRows = data.Count;
    NFeatures = CsvRecord.NUM_FEATURES;
    Index = new int[NRows];
    _EventIds = new int[NRows];
    EventIds = new Indexer<int>(this, _EventIds);
    _FeatureCols = new double[NFeatures][];
    FeatureCols = new Indexer<double>[NFeatures];
    for (int featureIndex = 0; featureIndex < NFeatures; featureIndex++)
    {
        _FeatureCols[featureIndex] = new double[NRows];
        FeatureCols[featureIndex] = new Indexer<double>(this, _FeatureCols[featureIndex]);
    }

    #region fill arrays
    for (int rownum = 0; rownum < NRows; rownum++)
    {
        var row = data[rownum];
        _EventIds[rownum] = row.EventId;
        Index[rownum] = rownum;
        _FeatureCols[0][rownum] = row.DER_mass_MMC;
        _FeatureCols[1][rownum] = row.DER_mass_transverse_met_lep;
        _FeatureCols[2][rownum] = row.DER_mass_vis;
        _FeatureCols[3][rownum] = row.DER_pt_h;
        _FeatureCols[4][rownum] = row.DER_deltaeta_jet_jet;
        _FeatureCols[5][rownum] = row.DER_mass_jet_jet;
        _FeatureCols[6][rownum] = row.DER_prodeta_jet_jet;
        _FeatureCols[7][rownum] = row.DER_deltar_tau_lep;
        _FeatureCols[8][rownum] = row.DER_pt_tot;
        _FeatureCols[9][rownum] = row.DER_sum_pt;
        _FeatureCols[10][rownum] = row.DER_pt_ratio_lep_tau;
        _FeatureCols[11][rownum] = row.DER_met_phi_centrality;
        _FeatureCols[12][rownum] = row.DER_lep_eta_centrality;
        _FeatureCols[13][rownum] = row.PRI_tau_pt;
        _FeatureCols[14][rownum] = row.PRI_tau_eta;
        _FeatureCols[15][rownum] = row.PRI_tau_phi;
        _FeatureCols[16][rownum] = row.PRI_lep_pt;
        _FeatureCols[17][rownum] = row.PRI_lep_eta;
        _FeatureCols[18][rownum] = row.PRI_lep_phi;
        _FeatureCols[19][rownum] = row.PRI_met;
        _FeatureCols[20][rownum] = row.PRI_met_phi;
        _FeatureCols[21][rownum] = row.PRI_met_sumet;
        _FeatureCols[22][rownum] = row.PRI_jet_num;
        _FeatureCols[23][rownum] = row.PRI_jet_leading_pt;
        _FeatureCols[24][rownum] = row.PRI_jet_leading_eta;
        _FeatureCols[25][rownum] = row.PRI_jet_leading_phi;
        _FeatureCols[26][rownum] = row.PRI_jet_subleading_pt;
        _FeatureCols[27][rownum] = row.PRI_jet_subleading_eta;
        _FeatureCols[28][rownum] = row.PRI_jet_subleading_phi;
        _FeatureCols[29][rownum] = row.PRI_jet_all_pt;
    }
    #endregion

    // The dataset marks missing values with the -999.0 sentinel; replace it with NaN
    // so later checks (e.g. HasNaN) can detect missing features uniformly.
    for (int featureNum = 0; featureNum < NFeatures; featureNum++)
    {
        Yarr.InlineReplace(
            _FeatureCols[featureNum],
            nanValue,
            double.NaN
        );
    }
}
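// Illustration (not from the original source): the effect of the sentinel replacement in
// the constructor above, written against a plain array rather than Yarr.InlineReplace.
// Every occurrence of the sentinel value (here -999.0) becomes double.NaN in place.
private static void DemoReplaceSentinel(double[] col, double sentinel)
{
    for (int i = 0; i < col.Length; i++)
    {
        if (col[i] == sentinel)
        {
            col[i] = double.NaN;
        }
    }
}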