Example #1
0
        /// <summary>
        /// Scores every protein in <c>MyProteinList</c> with a Gaussian (Bayesian) discriminant
        /// trained on target versus decoy proteins, then removes the lowest-scoring proteins
        /// until the decoy-based false discovery rate (FDR) of the remaining list is at or
        /// below <c>myParams.ProteinFDR</c>.
        /// </summary>
        /// <remarks>
        /// A protein is treated as a decoy when its <c>Locus</c> starts with
        /// <c>myParams.LabeledDecoyTag</c> (ordinal comparison). Decoys are labeled -1 and
        /// targets +1 when training the model. After filtering,
        /// <c>RebuildScansFromModifiedProteinList</c> is called so scan data matches the
        /// trimmed protein list. If no prefix of the ranked list satisfies the FDR threshold,
        /// every protein is removed.
        /// </remarks>
        /// <exception cref="System.ArgumentException">
        /// Thrown when the trained model does not contain exactly two classes.
        /// </exception>
        public void BayesianCleaningAtProteinLevel()
        {
            const int minNoExamplesPerClass = 5;

            // Single definition of "decoy" so training, reporting and filtering cannot diverge.
            // Ordinal comparison: the tag is a machine identifier prefix, not linguistic text.
            Func<MyProtein, bool> isDecoy =
                protein => protein.Locus.StartsWith(myParams.LabeledDecoyTag, StringComparison.Ordinal);

            PatternTools.GaussianDiscriminant.Discriminant gd = new PatternTools.GaussianDiscriminant.Discriminant();

            // Indexes of the six features passed with each protein's InputVector.
            List<int> dims = new List<int> { 0, 1, 2, 3, 4, 5 };

            int negativeClassExampleCounter = 0;

            foreach (MyProtein protein in MyProteinList)
            {
                // Class label: -1 for decoys, +1 for targets.
                int label = 1;
                if (isDecoy(protein))
                {
                    label = -1;
                    negativeClassExampleCounter++;
                }

                gd.MySparseMatrix.addRow(new sparseMatrixRow(label, dims, protein.InputVector));
            }

            // NOTE: column normalization (NormalizeAllColumnsToRangeFrom0To1New) was tried here
            // and, per the original author, greatly degrades the classifier. Do not enable it.

            Console.WriteLine("Target examples for protein model = " + gd.MySparseMatrix.theMatrixInRows.FindAll(a => a.Lable == 1).Count);
            Console.WriteLine("Decoy examples for protein model = " + gd.MySparseMatrix.theMatrixInRows.FindAll(a => a.Lable == -1).Count);
            gd.Model(false, new List<int>(), minNoExamplesPerClass, true, false);

            // Both a target and a decoy class are required to compute a score difference.
            if (gd.ClassLableClassPackageDic.Keys.Count != 2)
            {
                throw new System.ArgumentException("Not enough examples to generate protein classification model.  No available negative datapoints: " + negativeClassExampleCounter);
            }

            // Each iteration writes only to its own protein.
            // NOTE(review): this relies on gd.Classify being safe to call concurrently - confirm.
            Parallel.ForEach(MyProteinList, r =>
            {
                // The result is ordered by class number.
                var results = gd.Classify(r.InputVector.ToArray());
                r.BayesianScore = results[0].Score - results[1].Score;
            });

            // Rank proteins from highest to lowest Bayesian score.
            MyProteinList.Sort((a, b) => b.BayesianScore.CompareTo(a.BayesianScore));

            // The list has not changed since training, so the decoy tally is still valid.
            int numberOfReverseProteins = negativeClassExampleCounter;

            // cutOffValue is the number of top-ranked proteins kept; indexes >= cutOffValue are removed.
            // Start with the full list so that a list already satisfying the FDR is left intact.
            int cutOffValue = MyProteinList.Count;

            while (cutOffValue > 0)
            {
                // FDR of the kept prefix: decoys / (targets + decoys), and targets + decoys == cutOffValue.
                double fdr = (double)numberOfReverseProteins / (double)cutOffValue;

                if (fdr <= myParams.ProteinFDR)
                {
                    break;
                }

                // Drop the lowest-scoring protein still kept and update the decoy tally.
                cutOffValue--;
                if (isDecoy(MyProteinList[cutOffValue]))
                {
                    numberOfReverseProteins--;
                }
            }

            MyProteinList.RemoveRange(cutOffValue, MyProteinList.Count - cutOffValue);

            // Must call this method to correct for the removed proteins.
            RebuildScansFromModifiedProteinList();
        }