Пример #1
0
        /// <summary>
        /// Builds <paramref name="ntrees"/> trees, spreading the work over one task per
        /// logical processor reported by <c>Environment.ProcessorCount</c>.
        /// </summary>
        /// <param name="ntrees">Total number of trees to build.</param>
        /// <returns>The trees produced by every task, concatenated in task order.</returns>
        public List <Tree> MakeTreesParallel(int ntrees)
        {
            int workerCount = Environment.ProcessorCount;

            // Every worker gets the floor share (integer division); the first
            // `leftover` workers each take one extra tree so the total matches ntrees.
            int baseShare = ntrees / workerCount;
            int leftover  = ntrees - (baseShare * workerCount);

            var jobs = new Task <List <Tree> > [workerCount];
            for (int worker = 0; worker < workerCount; worker++)
            {
                // Declared inside the loop so each lambda captures its own value.
                int share = (worker < leftover) ? baseShare + 1 : baseShare;
                jobs[worker] = Task.Factory.StartNew(() => MakeTrees(share));
            }

            // Block until every worker has finished before collecting results.
            Task.WaitAll(jobs);

            return(jobs.SelectMany(job => job.Result).ToList());
        }
Пример #2
0
 /// <summary>
 /// Computes the maximum of every feature column using <c>Yarr.Max</c>.
 /// </summary>
 /// <returns>An array of length <c>NFeatures</c>; element i is the maximum of column i.</returns>
 private double[] CalcGlobalMaxs()
 {
     int columnCount = this.NFeatures;
     var columnMaxs  = new double[columnCount];

     for (int col = 0; col < columnCount; col++)
     {
         var column = this._FeatureCols[col];
         columnMaxs[col] = Yarr.Max(column);
     }

     return columnMaxs;
 }
Пример #3
0
 /// <summary>
 /// Computes the minimum of every feature column using <c>Yarr.Min</c>.
 /// </summary>
 /// <returns>An array of length <c>NFeatures</c>; element i is the minimum of column i.</returns>
 private double[] CalcGlobalMins()
 {
     int columnCount = this.NFeatures;
     var columnMins  = new double[columnCount];

     for (int col = 0; col < columnCount; col++)
     {
         var column = this._FeatureCols[col];
         columnMins[col] = Yarr.Min(column);
     }

     return columnMins;
 }
Пример #4
0
        /// <summary>
        /// Splits this tree's region in two along the best split found by <c>FindBestSplit()</c>
        /// and records the chosen dimension and value in <c>SplitDim</c> / <c>SplitVal</c>.
        /// </summary>
        /// <returns>
        /// A two-element list (upper child first, lower child second), or <c>null</c> when
        /// <c>FindBestSplit()</c> reports no usable dimension (index -1).
        /// </returns>
        private List <Tree> Split()
        {
            var splitInfo = FindBestSplit();
            //var splitInfo = FindBestRandomSplit();
            // Item1 = local dimension index, Item2 = split value, Item3 = expected information.
            int    bestLocalDimIndex = splitInfo.Item1;
            double bestSplit         = splitInfo.Item2;
            // NOTE(review): maxExpectedInfo is read here but never used in this method.
            double maxExpectedInfo   = splitInfo.Item3;

            // -1 signals that no split was found; the caller gets null instead of children.
            if (bestLocalDimIndex == -1)
            {
                return(null);
            }

            // Local indices address this tree's corner arrays; global indices address the record set's columns.
            int bestGlobalDimIndex = this.TargetFeatures[bestLocalDimIndex];

            // Row mask for the upper child (presumably value >= bestSplit -- confirm against Geq).
            bool[] filter = TrainPoints.FeatureCols[bestGlobalDimIndex].Geq(bestSplit);

            // Upper child: same box as this tree, with its minimum raised to the split value.
            var upperMinCorner = new double[NDim];

            MinCorner.CopyTo(upperMinCorner, 0);
            upperMinCorner[bestLocalDimIndex] = bestSplit;

            // Lower child: same box as this tree, with its maximum lowered to the split value.
            var lowerMaxCorner = new double[NDim];

            MaxCorner.CopyTo(lowerMaxCorner, 0);
            lowerMaxCorner[bestLocalDimIndex] = bestSplit;

            var upperTree = new Tree(
                TrainPoints.Filter(filter),
                this.TargetFeatures,
                upperMinCorner,
                MaxCorner,
                includeMax: IncludeMax,
                normalizingConstant: NormalizingConstant
                );

            // The lower child's includeMax flag is cleared on the split dimension; the upper child's
            // mask is built with Geq, so the split value itself is handled on the upper side.
            bool[] lowerIncludeMax = new bool[NDim];
            IncludeMax.CopyTo(lowerIncludeMax, 0);
            lowerIncludeMax[bestLocalDimIndex] = false;

            // The same mask array is reused for the lower child; InlineNot presumably negates it
            // in place (its return value is discarded), so this must run after the upper child
            // has been built from the original mask.
            Yarr.InlineNot(filter);
            var lowerTree = new Tree(
                TrainPoints.Filter(filter),
                this.TargetFeatures,
                MinCorner,
                lowerMaxCorner,
                includeMax: lowerIncludeMax,
                normalizingConstant: NormalizingConstant
                );

            // Remember where this node was split (stored as the local dimension index).
            this.SplitDim = bestLocalDimIndex;
            this.SplitVal = bestSplit;
            return(new List <Tree> {
                upperTree, lowerTree
            });
        }
Пример #5
0
        /// <summary>
        /// Labels every row of <paramref name="data"/> as signal ('s') or background ('b')
        /// by comparing the signal/background score ratio against <c>Cutoff</c>.
        /// </summary>
        /// <param name="data">Records to classify.</param>
        /// <param name="parallel">Forwarded unchanged to the underlying scorer.</param>
        /// <returns>One label per row: 's' when ratio &gt;= Cutoff, otherwise 'b'.</returns>
        public char[] Classify(RecordSet data, bool parallel = true)
        {
            Score    scored = this.Scorer.Score(data, parallel);
            double[] sOverB = Yarr.Div(scored.SScores, scored.BScores);

            int    rowCount = data.NRows;
            char[] labels   = new char[rowCount];

            for (int row = 0; row < rowCount; row++)
            {
                // Keep the >= form: a NaN ratio compares false and therefore falls through to 'b'.
                if (sOverB[row] >= this.Cutoff)
                {
                    labels[row] = 's';
                }
                else
                {
                    labels[row] = 'b';
                }
            }

            return labels;
        }
Пример #6
0
        /// <summary>
        /// Builds and trains a single tree on a random selection of <c>ColsPerTree</c> columns,
        /// using only the rows that <c>HasNaN</c> does not flag for those columns.
        /// </summary>
        /// <returns>The trained tree.</returns>
        private Tree MakeTree()
        {
            var chosenCols = RandomUtils.Choice(this.ColIndices, this.ColsPerTree).ToArray();

            // Rows flagged by HasNaN for the chosen columns are dropped: negate the mask, then filter.
            var    rowsWithNaN = this.Data.HasNaN(chosenCols);
            bool[] keepRow     = Yarr.InlineNot(rowsWithNaN);
            var    usableData  = this.Data.Filter(keepRow);

            // Restrict the global bounding box to the chosen columns.
            var minCorner = Yarr.FancyIndex(this.GlobalMinCorner, chosenCols);
            var maxCorner = Yarr.FancyIndex(this.GlobalMaxCorner, chosenCols);

            var tree = new Tree(usableData, chosenCols, minCorner, maxCorner);
            tree.Train();

            return tree;
        }
Пример #7
0
        /// <summary>
        /// Creates a filtered view over <paramref name="copyFrom"/>. Only the row index is
        /// filtered; the event-id array, the feature-column arrays and the global min/max
        /// arrays are shared with the source by reference rather than copied.
        /// </summary>
        /// <param name="copyFrom">Record set whose backing arrays are reused.</param>
        /// <param name="filter">Mask passed to <c>Yarr.Filter</c> to select entries of the source index.</param>
        protected RecordSet(RecordSet copyFrom, bool[] filter)
        {
            // The index must be set first: the Indexer instances below are handed `this`.
            Index     = Yarr.Filter(copyFrom.Index, filter);
            NRows     = Index.Length;
            NFeatures = copyFrom.NFeatures;

            _EventIds = copyFrom._EventIds;
            EventIds  = new Indexer <int>(this, _EventIds);

            _FeatureCols = copyFrom._FeatureCols;
            FeatureCols  = new Indexer <double> [NFeatures];
            for (int col = 0; col < NFeatures; col++)
            {
                FeatureCols[col] = new Indexer <double>(this, _FeatureCols[col]);
            }

            // Global extrema are taken from the source as-is, not recomputed for the filtered rows.
            _GlobalMins = copyFrom._GlobalMins;
            _GlobalMaxs = copyFrom._GlobalMaxs;
        }
Пример #8
0
        /// <summary>
        /// Scores <paramref name="data"/> with this model. Both output arrays have one slot per
        /// input row and start out as NaN; they are then handed to <c>_Score</c> together with
        /// the rows that <c>HasNaN</c> does not flag for the target features.
        /// </summary>
        /// <param name="data">Records to score.</param>
        /// <param name="parallel">Ignored by this implementation.</param>
        /// <returns>A <c>Score</c> wrapping the signal and background score arrays.</returns>
        public Score Score(RecordSet data, bool parallel = true)
        {
            //NOTE: the parallel parameter is ignored

            int      totalRows = data.NRows;
            double[] sScores   = Yarr.Repeat(double.NaN, totalRows);
            double[] bScores   = Yarr.Repeat(double.NaN, totalRows);

            var    rowsWithNaN = data.HasNaN(this.TargetFeatures);
            bool[] keepRow     = Yarr.InlineNot(rowsWithNaN);
            var    usableData  = data.Filter(keepRow);

            // Drop the local reference; unlikely to free anything (the object is referenced elsewhere) but harmless.
            data = null;

            var allUsableRows = Yarr.Range(usableData.NRows).MakeSlice();
            this._Score(usableData, allUsableRows, sScores, bScores);

            return new Score(sScores, bScores);
        }
Пример #9
0
        /// <summary>
        /// Computes the Approximate Median Significance (AMS) of <paramref name="predictions"/>
        /// against the labelled, weighted records in <paramref name="actual"/>.
        /// </summary>
        /// <remarks>
        /// Weights are rescaled per class so that the signal weights sum to <c>TOTAL_S</c> and the
        /// background weights sum to <c>TOTAL_B</c>. The previous implementation computed these
        /// rescaled weights but then summed the raw weights, so the rescaling had no effect;
        /// the sums below now use the rescaled values.
        /// AMS = sqrt(2 * ((s + b + b_r) * ln(1 + s / (b + b_r)) - s)), with b_r = 10.
        /// </remarks>
        /// <param name="predictions">Predicted label per row; 's' marks predicted signal.</param>
        /// <param name="actual">Ground-truth records; assumed to have one row per prediction.</param>
        /// <returns>The AMS value.</returns>
        /// <exception cref="Exception">Thrown when the radicand is negative.</exception>
        private static double AMS(char[] predictions, TrainingRecordSet actual)
        {
            bool[] predictedSignal  = Yarr.Equ(predictions, 's');
            bool[] actualSignal     = actual.Labels.Equ('s');
            bool[] actualBackground = Yarr.Not(actualSignal);

            // Per-class weight totals of the supplied records, used to renormalize the weights.
            double total_s = actual.Filter(actualSignal).Weights.Sum();
            double total_b = actual.Filter(actualBackground).Weights.Sum();

            // s = rescaled weight of true positives, b = rescaled weight of false positives.
            double s = 0.0;
            double b = 0.0;
            for (int i = 0; i < predictions.Length; i++)
            {
                if (!predictedSignal[i])
                {
                    continue;   // rows predicted as background contribute to neither sum
                }

                if (actualSignal[i])
                {
                    s += actual.Weights[i] * (TOTAL_S / total_s);
                }
                else
                {
                    b += actual.Weights[i] * (TOTAL_B / total_b);
                }
            }

            const double B_R = 10.0;   // regularization term added to b

            double radicand = 2.0 * ((s + b + B_R) * Math.Log(1.0 + (s / (b + B_R))) - s);

            if (radicand < 0.0)
            {
                throw new Exception("radicand < 0.0, aborting");
            }

            return(Math.Sqrt(radicand));
        }
Пример #10
0
        /// <summary>
        /// Computes the base-2 Shannon entropy of a partition described by its class counts.
        /// </summary>
        /// <param name="setCounts">Number of members in each class.</param>
        /// <returns>
        /// The negated sum of p * log2(p) over the classes with a non-zero count,
        /// or 0.0 when the counts sum to zero.
        /// </returns>
        private double Entropy(params int[] setCounts)
        {
            double total = (double)Yarr.Sum(setCounts);

            if (total == 0.0)
            {
                return 0.0;
            }

            double weightedLogSum = 0.0;

            foreach (int count in setCounts)
            {
                // Zero counts are skipped: 0 * log(0) would evaluate to NaN instead of the intended 0.
                if (count != 0)
                {
                    double p = count / total;
                    weightedLogSum += p * Math.Log(p, 2.0);
                }
            }

            return -weightedLogSum;
        }
Пример #11
0
        /// <summary>
        /// Creates a tree node covering the box [<paramref name="minCorner"/>, <paramref name="maxCorner"/>]
        /// and computes its signal and background densities from <paramref name="trainPoints"/>.
        /// </summary>
        /// <param name="trainPoints">Training records assigned to this node.</param>
        /// <param name="targetFeatures">Feature indices this tree uses; its length becomes <c>NDim</c>.</param>
        /// <param name="minCorner">Lower corner of the node's box.</param>
        /// <param name="maxCorner">Upper corner of the node's box.</param>
        /// <param name="includeMax">Per-dimension flags; defaults to all <c>true</c> when null.</param>
        /// <param name="normalizingConstant">Density normalizer; defaults to the number of training rows when null.</param>
        public Tree(
            TrainingRecordSet trainPoints,
            int[] targetFeatures,
            double[] minCorner,
            double[] maxCorner,
            bool[] includeMax          = null,
            double?normalizingConstant = null
            )
        {
            this.TargetFeatures = targetFeatures;
            int dimensionCount = targetFeatures.Length;
            this.NDim = dimensionCount;

            this.TrainPoints = trainPoints;
            int pointCount = trainPoints.NRows;
            this.NTrainPoints = pointCount;

            this.MinCorner = minCorner;
            this.MaxCorner = maxCorner;

            // Fall back to the defaults for any optional argument the caller left out.
            this.IncludeMax          = includeMax ?? Yarr.Repeat(true, dimensionCount);
            this.NormalizingConstant = normalizingConstant ?? pointCount;

            // CalcVolume() runs only after the fields above have been assigned.
            double normalizedVolume = (CalcVolume() * this.NormalizingConstant);

            // Every row that is not labelled 's' is counted as background.
            int signalCount     = trainPoints.Labels.CountEqu('s');
            int backgroundCount = pointCount - signalCount;

            this.SDensity = signalCount / normalizedVolume;
            this.BDensity = backgroundCount / normalizedVolume;
        }
Пример #12
0
//		private Score ScoreParallel(RecordSet data)
//		{
//
//		}

        /// <summary>
        /// Computes the per-row geometric mean of the signal and background scores in
        /// <paramref name="scores"/>, ignoring NaN entries.
        /// </summary>
        /// <param name="scores">Scores to combine; enumerated exactly once.</param>
        /// <param name="nrows">Number of rows read from every score array.</param>
        /// <returns>
        /// A score whose entries are exp(mean(ln(x))) over the non-NaN inputs of each row;
        /// a row with no non-NaN input evaluates to NaN (0.0 / 0).
        /// </returns>
        private Score GMean(IEnumerable <Score> scores, int nrows)
        {
            // Running sums of logarithms and the number of values that contributed to each.
            double[] sLogTotals = Yarr.Repeat <double>(0.0, nrows);
            double[] bLogTotals = Yarr.Repeat <double>(0.0, nrows);
            int[]    sHits      = Yarr.Repeat <int>(0, nrows);
            int[]    bHits      = Yarr.Repeat <int>(0, nrows);

            foreach (Score current in scores)
            {
                var sValues = current.SScores;
                var bValues = current.BScores;

                for (int row = 0; row < nrows; row++)
                {
                    double sValue = sValues[row];
                    if (!double.IsNaN(sValue))
                    {
                        sLogTotals[row] += Math.Log(sValue);
                        sHits[row]++;
                    }

                    double bValue = bValues[row];
                    if (!double.IsNaN(bValue))
                    {
                        bLogTotals[row] += Math.Log(bValue);
                        bHits[row]++;
                    }
                }
            }

            // Geometric mean = exp(arithmetic mean of the logs).
            var sMeans = new double[nrows];
            var bMeans = new double[nrows];
            for (int row = 0; row < nrows; row++)
            {
                sMeans[row] = Math.Exp(sLogTotals[row] / sHits[row]);
                bMeans[row] = Math.Exp(bLogTotals[row] / bHits[row]);
            }

            return new Score(sMeans, bMeans);
        }
Пример #13
0
        /// <summary>
        /// Evaluates a fixed number of random candidate split values along one dimension and
        /// returns the one with the highest expected information.
        /// </summary>
        /// <param name="localDimIndex">Index into <c>TargetFeatures</c> of the dimension to split on.</param>
        /// <returns>
        /// A tuple of (best expected information, best split value). When no candidate has an
        /// expected information greater than 0.0 the result is (0.0, NaN).
        /// </returns>
        private Tuple <double, double> FindBestSplit(int localDimIndex)
        {
            const int NSPLITS       = 5;
            double    parentEntropy = TotalEntropy();

            int   globalDimIndex = this.TargetFeatures[localDimIndex];
            int[] singleDim      = Yarr.Repeat <int>(globalDimIndex, 1);

            // Range of this dimension among the node's own training points.
            double dimMin = this.TrainPoints.CalcLocalMins(singleDim)[0];
            double dimMax = this.TrainPoints.CalcLocalMaxs(singleDim)[0];

            double[] candidates = RandomUtils.RandBetween(dimMin, dimMax, NSPLITS);

            var column = TrainPoints.FeatureCols[globalDimIndex];
            var labels = TrainPoints.Labels;

            double bestInfo  = 0.0;
            double bestSplit = double.NaN;

            for (int c = 0; c < NSPLITS; c++)
            {
                double candidate = candidates[c];

                // Signal/background tallies on each side of the candidate value.
                int sAbove = 0, bAbove = 0;
                int sBelow = 0, bBelow = 0;

                for (int row = 0; row < NTrainPoints; row++)
                {
                    bool onUpperSide = column[row] >= candidate;
                    bool isSignal    = labels[row] == 's';

                    if (onUpperSide)
                    {
                        if (isSignal) { sAbove++; } else { bAbove++; }
                    }
                    else
                    {
                        if (isSignal) { sBelow++; } else { bBelow++; }
                    }
                }

                int    nAbove    = sAbove + bAbove;
                double probAbove = ((double)nAbove) / NTrainPoints;
                double probBelow = 1.0 - probAbove;

                double childEntropy = (probAbove * Entropy(sAbove, bAbove))
                                      + (probBelow * Entropy(sBelow, bBelow));
                double expectedInfo = parentEntropy - childEntropy;

                if (expectedInfo > bestInfo)
                {
                    bestInfo  = expectedInfo;
                    bestSplit = candidate;
                }
            }

            return new Tuple <double, double>(bestInfo, bestSplit);
        }
Пример #14
0
        /// <summary>
        /// End-to-end driver: loads the training data, builds the random forest, tunes the
        /// classifier cutoff against the training data using the AMS metric, and -- if the best
        /// AMS reaches 3.5 -- classifies the test data and writes the predictions.
        /// </summary>
        private static void MainMain()
        {
            // NOTE(review): each Write(...) appears to be paired with a later WriteDone(); this first
            // one looks like the whole-method timer closed at the very end -- confirm against Write/WriteDone.
            Write("Running Random Forest ({0} trees)", NUM_MODELS);

            Write("loading training data");
            var traindata   = Parser.LoadTrainData();
            var featureCols = CsvRecord.FEATURE_COLS;

            // One column index per entry in FEATURE_COLS.
            int[] colIndices = Yarr.Range(featureCols.Count);
            WriteDone();

            Write("creating random forest");
            var treeCreator             = new TreeCreator(traindata, colIndices, COLS_PER_MODEL);
            var trees                   = treeCreator.MakeTreesParallel(NUM_MODELS);
            ScoreAverager <Tree> forest = new ScoreAverager <Tree>(trees);

            WriteDone();
            Console.WriteLine(string.Format("\t\tcreated {0} trees", trees.Count));

            Write("creating and tuning classifier (parallel2)");
            PlayDingSound();
            double bestCutoff   = double.NaN;
            double bestExponent = double.NaN;
            double bestScore    = double.NegativeInfinity;
            // During tuning the forest is wrapped in a ScoreCacher; the same classifier is
            // re-run below with a different cutoff on each iteration.
            var    classifier   = new Classifier(new ScoreCacher(forest));

            // Try cutoffs of the form e^exponent; whether the 0.6 endpoint is included depends
            // on Yarr.XRange, which is not visible here.
            foreach (double exponent in Yarr.XRange(-0.5, 0.6, 0.1))
            {
                double cutoff = Math.Exp(exponent);
                classifier.Cutoff = cutoff;

                // The cutoff is scored against the same data the forest was trained on.
                double score = AMS(classifier.Classify(traindata, parallel: PARALLEL), traindata);

                if (score > bestScore)
                {
                    bestScore    = score;
                    bestCutoff   = cutoff;
                    bestExponent = exponent;
                }
            }
            // The final classifier uses the forest directly (no ScoreCacher) with the best cutoff found.
            classifier        = new Classifier(forest);
            classifier.Cutoff = bestCutoff;
            WriteDone();
            Console.WriteLine(string.Format("\t\tpredicted ams: {0}", bestScore));
            Console.WriteLine(string.Format("\t\tcutoff: {0} (e^{1})", bestCutoff, bestExponent));

            // Give up before touching the test data when the tuned AMS is below 3.5.
            if (bestScore < 3.5)
            {
                WriteDone();
                PlayFailSound();
                return;
            }

            Write("loading test data");
            var testdata = Parser.LoadTestData();

            WriteDone();

            Write("scoring test data");
            var predictions = classifier.Classify(testdata, parallel: PARALLEL);
            // NOTE(review): confidences are not derived from the scores; this is just a range
            // built from 1 and NRows + 1 (presumably 1..NRows -- confirm against Yarr.Range).
            var confidences = Yarr.Range(1, testdata.NRows + 1);

            WriteDone();

            Write("writing output");
            Parser.WritePredictions(testdata.EventIds, predictions, confidences);
            WriteDone();

            WriteDone();             //whole-method timer
            PlayWinSound();
        }
Пример #15
0
        /// <summary>
        /// Computes the per-row geometric mean of the signal and background scores arriving on
        /// <paramref name="scores"/>. The calling thread forwards each score's two arrays to two
        /// bounded queues, and one background task per queue accumulates the logarithms.
        /// </summary>
        /// <param name="scores">
        /// Queue of scores; consumed until the producer has called <c>CompleteAdding</c> and it is empty.
        /// </param>
        /// <param name="nrows">Number of rows read from every score array.</param>
        /// <returns>
        /// A score whose entries are exp(mean(ln(x))) over the non-NaN inputs of each row;
        /// a row with no non-NaN input evaluates to NaN (0.0 / 0).
        /// </returns>
        private Score ParallelGmeaner(BlockingCollection <Score> scores, int nrows)
        {
            double[] sSums   = Yarr.Repeat <double>(0.0, nrows);
            double[] bSums   = Yarr.Repeat <double>(0.0, nrows);
            int[]    sCounts = Yarr.Repeat <int>(0, nrows);
            int[]    bCounts = Yarr.Repeat <int>(0, nrows);

            // Bounded queues so the forwarding loop cannot run arbitrarily far ahead of the accumulators.
            int cores = Environment.ProcessorCount;
            BlockingCollection <double[]> sScoresCollection = new BlockingCollection <double[]>(cores * 10);
            BlockingCollection <double[]> bScoresCollection = new BlockingCollection <double[]>(cores * 10);

            // Each accumulator task is the only writer of its own sums/counts arrays.
            Func <int[], double[], BlockingCollection <double[]>, Task> taskMaker =
                (counts, sums, scoreCollection) => Task.Factory.StartNew(
                    () =>
            {
                // GetConsumingEnumerable blocks while the queue is empty and ends once the queue
                // is completed and drained, so no exception-driven polling loop is needed.
                foreach (double[] scoreArr in scoreCollection.GetConsumingEnumerable())
                {
                    for (int rowIndex = 0; rowIndex < nrows; rowIndex++)
                    {
                        double val = scoreArr[rowIndex];
                        if (!double.IsNaN(val))
                        {
                            sums[rowIndex] += Math.Log(val);
                            counts[rowIndex]++;
                        }
                    }
                }
            }
                    );

            Task sTask = taskMaker(sCounts, sSums, sScoresCollection);
            Task bTask = taskMaker(bCounts, bSums, bScoresCollection);

            try
            {
                foreach (Score score in scores.GetConsumingEnumerable())
                {
                    sScoresCollection.Add(score.SScores);
                    bScoresCollection.Add(score.BScores);
                }
            }
            finally
            {
                // Always release the accumulator tasks, even if forwarding failed; otherwise
                // they would stay blocked on their queues forever.
                sScoresCollection.CompleteAdding();
                bScoresCollection.CompleteAdding();
            }

            // Waiting also guarantees the accumulators' writes are visible to this thread.
            Task.WaitAll(sTask, bTask);

            // Geometric mean = exp(arithmetic mean of the logs).
            double[] sScores = new double[nrows];
            double[] bScores = new double[nrows];
            for (int rowIndex = 0; rowIndex < nrows; rowIndex++)
            {
                sScores[rowIndex] = Math.Exp(sSums[rowIndex] / sCounts[rowIndex]);
                bScores[rowIndex] = Math.Exp(bSums[rowIndex] / bCounts[rowIndex]);
            }

            return(new Score(sScores, bScores));
        }
Пример #16
0
        /// <summary>
        /// Builds a column-oriented record set from parsed CSV rows. Each feature is stored in its
        /// own array, and every occurrence of <paramref name="nanValue"/> in the feature columns is
        /// replaced with <c>double.NaN</c>.
        /// </summary>
        /// <param name="data">Parsed CSV rows; row i becomes row i of the record set.</param>
        /// <param name="nanValue">Sentinel value in the input that is replaced with NaN.</param>
        public RecordSet(List <CsvRecord> data, double nanValue = -999.0)
        {
            int rowCount     = data.Count;
            int featureCount = CsvRecord.NUM_FEATURES;

            NRows     = rowCount;
            NFeatures = featureCount;

            Index = new int[rowCount];

            _EventIds = new int[rowCount];
            EventIds  = new Indexer <int>(this, _EventIds);

            // Allocate one backing array per feature plus an Indexer over it.
            _FeatureCols = new double[featureCount][];
            FeatureCols  = new Indexer <double> [featureCount];
            for (int f = 0; f < featureCount; f++)
            {
                var column = new double[rowCount];
                _FeatureCols[f] = column;
                FeatureCols[f]  = new Indexer <double>(this, column);
            }

            // Transpose the row-oriented input into the column arrays. The column order below is
            // fixed; it writes indices 0..29, so NUM_FEATURES must be at least 30.
            double[][] cols = _FeatureCols;
            for (int r = 0; r < rowCount; r++)
            {
                CsvRecord row = data[r];

                _EventIds[r] = row.EventId;
                Index[r]     = r;   // initially the identity mapping

                cols[0][r]  = row.DER_mass_MMC;
                cols[1][r]  = row.DER_mass_transverse_met_lep;
                cols[2][r]  = row.DER_mass_vis;
                cols[3][r]  = row.DER_pt_h;
                cols[4][r]  = row.DER_deltaeta_jet_jet;
                cols[5][r]  = row.DER_mass_jet_jet;
                cols[6][r]  = row.DER_prodeta_jet_jet;
                cols[7][r]  = row.DER_deltar_tau_lep;
                cols[8][r]  = row.DER_pt_tot;
                cols[9][r]  = row.DER_sum_pt;
                cols[10][r] = row.DER_pt_ratio_lep_tau;
                cols[11][r] = row.DER_met_phi_centrality;
                cols[12][r] = row.DER_lep_eta_centrality;
                cols[13][r] = row.PRI_tau_pt;
                cols[14][r] = row.PRI_tau_eta;
                cols[15][r] = row.PRI_tau_phi;
                cols[16][r] = row.PRI_lep_pt;
                cols[17][r] = row.PRI_lep_eta;
                cols[18][r] = row.PRI_lep_phi;
                cols[19][r] = row.PRI_met;
                cols[20][r] = row.PRI_met_phi;
                cols[21][r] = row.PRI_met_sumet;
                cols[22][r] = row.PRI_jet_num;
                cols[23][r] = row.PRI_jet_leading_pt;
                cols[24][r] = row.PRI_jet_leading_eta;
                cols[25][r] = row.PRI_jet_leading_phi;
                cols[26][r] = row.PRI_jet_subleading_pt;
                cols[27][r] = row.PRI_jet_subleading_eta;
                cols[28][r] = row.PRI_jet_subleading_phi;
                cols[29][r] = row.PRI_jet_all_pt;
            }

            // Swap the sentinel for NaN in every column once all rows are in place.
            foreach (double[] column in cols)
            {
                Yarr.InlineReplace(column, nanValue, double.NaN);
            }
        }