/// <summary>
/// Applies the Bayesian Discriminant function at protein level.
/// Trains a Gaussian discriminant on every protein in <c>MyProteinList</c>
/// (label -1 when the locus starts with <c>myParams.LabeledDecoyTag</c>, label 1 otherwise),
/// stores the resulting score difference in each protein's <c>BayesianScore</c>,
/// sorts the list by that score in descending order and trims the tail of the list
/// until the decoy fraction of the remaining proteins is at most <c>myParams.ProteinFDR</c>.
/// Finally calls <c>RebuildScansFromModifiedProteinList</c> to account for the removed proteins.
/// </summary>
/// <exception cref="System.ArgumentException">
/// Thrown when the trained model does not contain exactly two classes.
/// </exception>
public void BayesianCleaningAtProteinLevel()
{
    int minNoExamplesPerClass = 5;

    // Single definition of "is this protein a decoy?" used for labeling and for FDR counting.
    Func<MyProtein, bool> isDecoy = p => p.Locus.StartsWith(myParams.LabeledDecoyTag);

    PatternTools.GaussianDiscriminant.Discriminant gd = new PatternTools.GaussianDiscriminant.Discriminant();

    // Dimension indexes handed to every sparse row.
    // NOTE(review): presumably InputVector holds exactly these six features - confirm.
    List<int> dims = new List<int> { 0, 1, 2, 3, 4, 5 };

    int negativeClassExampleCounter = 0;
    foreach (MyProtein p in MyProteinList)
    {
        // Find out what class this protein belongs to: decoys are -1, everything else is 1.
        int label = 1;
        if (isDecoy(p))
        {
            label = -1;
            negativeClassExampleCounter++;
        }

        gd.MySparseMatrix.addRow(new sparseMatrixRow(label, dims, p.InputVector));
    }

    // Do NOT call gd.MySparseMatrix.NormalizeAllColumnsToRangeFrom0To1New() here:
    // per the original author's note it greatly degrades the classifier.

    Console.WriteLine("Target examples for protein model = " + gd.MySparseMatrix.theMatrixInRows.FindAll(a => a.Lable == 1).Count);
    Console.WriteLine("Decoy examples for protein model = " + gd.MySparseMatrix.theMatrixInRows.FindAll(a => a.Lable == -1).Count);

    gd.Model(false, new List<int>(), minNoExamplesPerClass, true, false);

    // A usable model needs both a target and a decoy class.
    if (gd.ClassLableClassPackageDic.Keys.Count != 2)
    {
        throw new System.ArgumentException("Not enough examples to generate protein classification model. \nNo available negative datapoints: " + negativeClassExampleCounter);
    }

    // NOTE(review): assumes gd.Classify may be called concurrently - confirm.
    Parallel.ForEach(MyProteinList, r =>
    {
        // The result is ordered by class number.
        var results = gd.Classify(r.InputVector.ToArray());
        r.BayesianScore = results[0].Score - results[1].Score;
    });

    // Highest score first, so the filtering below trims from the end of the list.
    MyProteinList.Sort((a, b) => b.BayesianScore.CompareTo(a.BayesianScore));

    // Running counts for the prefix [0, cutOffValue) of the sorted list.
    int numberOfReverseProteins = MyProteinList.Count(isDecoy);
    int numberOfForwardProteins = MyProteinList.Count - numberOfReverseProteins;

    // Now lets do the filtering.
    // cutOffValue is the number of proteins kept. The FDR of the current prefix is
    // evaluated BEFORE dropping its last element, so a list that already satisfies
    // the threshold is left untouched (previously the last protein was always removed).
    int cutOffValue = MyProteinList.Count;
    while (cutOffValue > 0)
    {
        // The prefix is non-empty here, so the denominator is never zero.
        double fdr = (double)numberOfReverseProteins / ((double)numberOfForwardProteins + (double)numberOfReverseProteins);
        if (fdr <= myParams.ProteinFDR)
        {
            break;
        }

        // Drop the lowest-ranked protein of the prefix and update the counts.
        cutOffValue--;
        if (isDecoy(MyProteinList[cutOffValue]))
        {
            numberOfReverseProteins--;
        }
        else
        {
            numberOfForwardProteins--;
        }
    }

    MyProteinList.RemoveRange(cutOffValue, MyProteinList.Count - cutOffValue);

    // Must call this method to correct for the removed proteins.
    RebuildScansFromModifiedProteinList();
}