/// <summary>
/// Applies the Bayesian Discriminant function at protein level.
/// A two-class model (label 1 for targets, label -1 for proteins whose Locus starts with
/// <c>myParams.LabeledDecoyTag</c>) is trained on every protein's InputVector, each protein
/// receives a BayesianScore, the list is sorted by that score (best first) and the
/// lowest-scoring proteins are removed until the decoy-based FDR of the remaining
/// proteins is at or below <c>myParams.ProteinFDR</c>.
/// </summary>
/// <exception cref="System.ArgumentException">
/// Thrown when the trained model does not contain exactly two classes.
/// </exception>
public void BayesianCleaningAtProteinLevel()
{
    int minNoExamplesPerClass = 5;
    string decoyTag = myParams.LabeledDecoyTag;

    PatternTools.GaussianDiscriminant.Discriminant gd = new PatternTools.GaussianDiscriminant.Discriminant();
    List<int> dims = new List<int> { 0, 1, 2, 3, 4, 5 };

    //Build the training matrix: one row per protein, labeled by decoy / target
    int negativeClassExampleCounter = 0;
    foreach (MyProtein p in MyProteinList)
    {
        int label = 1;

        //The decoy tag is an identifier prefix, so it is compared ordinally (not linguistically)
        if (p.Locus.StartsWith(decoyTag, System.StringComparison.Ordinal))
        {
            label = -1;
            negativeClassExampleCounter++;
        }

        gd.MySparseMatrix.addRow(new sparseMatrixRow(label, dims, p.InputVector));
    }

    //Column normalization (NormalizeAllColumnsToRangeFrom0To1New) is deliberately NOT applied here:
    //it was found to greatly degrade the classifier.

    Console.WriteLine("Target examples for protein model = " + gd.MySparseMatrix.theMatrixInRows.FindAll(a => a.Lable == 1).Count);
    Console.WriteLine("Decoy examples for protein model = " + gd.MySparseMatrix.theMatrixInRows.FindAll(a => a.Lable == -1).Count);

    gd.Model(false, new List<int>(), minNoExamplesPerClass, true, false);

    if (gd.ClassLableClassPackageDic.Keys.Count != 2)
    {
        throw new System.ArgumentException("Not enough examples to generate protein classification model. No available negative datapoints: " + negativeClassExampleCounter);
    }

    //Score every protein; each iteration writes only to its own protein.
    //NOTE(review): this assumes gd.Classify can be called concurrently, as the original code did - confirm.
    Parallel.ForEach(MyProteinList, r =>
    {
        //The result is ordered by class number
        var results = gd.Classify(r.InputVector.ToArray());
        r.BayesianScore = results[0].Score - results[1].Score;
    }
    );

    //Best score first, so the proteins to discard end up at the tail of the list
    MyProteinList.Sort((a, b) => b.BayesianScore.CompareTo(a.BayesianScore));

    //The list membership has not changed since the training loop, so the counts can be reused
    int numberOfReverseProteins = negativeClassExampleCounter;
    int numberOfForwardProteins = MyProteinList.Count - negativeClassExampleCounter;

    //Now lets do the filtering.
    //cutOffValue is the number of proteins kept (indices 0 .. cutOffValue - 1).
    //The complete list is tested first, so nothing is discarded when it already satisfies the FDR;
    //afterwards the worst remaining protein is dropped and the FDR is tested again.
    int cutOffValue = MyProteinList.Count;
    while (cutOffValue > 0)
    {
        //forward + reverse == cutOffValue > 0 here, so the division is always defined
        double fdr = (double)numberOfReverseProteins / ((double)numberOfForwardProteins + (double)numberOfReverseProteins);

        if (fdr <= myParams.ProteinFDR)
        {
            break;
        }

        cutOffValue--;

        if (MyProteinList[cutOffValue].Locus.StartsWith(decoyTag, System.StringComparison.Ordinal))
        {
            numberOfReverseProteins--;
        }
        else
        {
            numberOfForwardProteins--;
        }
    }

    MyProteinList.RemoveRange(cutOffValue, MyProteinList.Count - cutOffValue);

    //Must call this method to correct for the removed proteins
    RebuildScansFromModifiedProteinList();
}
/// <summary>
/// Trains a Gaussian discriminant on the sparse matrix generated from the given scans and
/// scores every scan in place (BayesianScore and BayesianClass). When
/// <c>myParams.QFilterMahalanobisDistance</c> is set, rows whose Mahalanobis distance exceeds
/// <c>myParams.QFilterMahalanobisDistanceValue</c> are treated as outliers and, if enough
/// decoy rows remain, removed before the model is rebuilt.
/// </summary>
/// <param name="myScans">Scans to score. Their BayesianScore and BayesianClass are overwritten, and entries may be removed from their Bayes_InputVector.</param>
/// <param name="myParams">Run parameters; this method reads QFilterMahalanobisDistance, QFilterMahalanobisDistanceValue and FormsPriorForPeptides.</param>
/// <param name="considerPresence">Forwarded to Utils.GenerateSparseMatrix.</param>
/// <param name="considerForms">Forwarded to Utils.GenerateSparseMatrix; together with myParams.FormsPriorForPeptides it enables the per-scan priors.</param>
/// <param name="peptidePriors">Prior for the decoy class (-1) keyed by a scan's NoForms; only read when per-scan priors are enabled.</param>
/// <returns>The sparse matrix held by the final model (the training matrix).</returns>
public static SparseMatrix BayesianScoringPSM(List<SQTScan> myScans, Parameters myParams, bool considerPresence, bool considerForms, Dictionary<int, double> peptidePriors)
{
    int minNoExamplesForClassModel = 4;

    PatternTools.GaussianDiscriminant.Discriminant gd = new PatternTools.GaussianDiscriminant.Discriminant();

    List<sparseMatrixRow> theRows = new List<sparseMatrixRow>();
    Utils.GenerateSparseMatrix(myScans, myParams, theRows, considerPresence, considerForms);

    //we will just train accepting there are no outliers
    List<int> unstableDims = PatternTools.GaussianDiscriminant.StabilityVerifier.Verify(theRows);

    gd.MySparseMatrix.theMatrixInRows = theRows;
    gd.Model(false, new List<int>(), minNoExamplesForClassModel, true, false);

    if (myParams.QFilterMahalanobisDistance)
    {
        List<sparseMatrixRow> outlierRows = new List<sparseMatrixRow>();

        //Measure the MH distance for all vectors and collect the ones with MH greater than specified value
        foreach (sparseMatrixRow smr in gd.MySparseMatrix.theMatrixInRows)
        {
            double md = gd.MahalanobisDistance(smr);
            if (md > myParams.QFilterMahalanobisDistanceValue)
            {
                outlierRows.Add(smr);
            }
        }

        List<PatternTools.sparseMatrixRow> positiveRows = gd.MySparseMatrix.theMatrixInRows.FindAll(a => a.Lable == 1);
        List<PatternTools.sparseMatrixRow> negativeRows = gd.MySparseMatrix.theMatrixInRows.FindAll(a => a.Lable == -1);

        int outliersPositive = outlierRows.FindAll(a => a.Lable == 1).Count;
        int outliersNegative = outlierRows.FindAll(a => a.Lable == -1).Count;

        Console.WriteLine("\n===== Outlier detection with Mahalanobis Distance > {0} =====", myParams.QFilterMahalanobisDistanceValue);
        Console.WriteLine("= Target class : " + outliersPositive + " / " + positiveRows.Count);
        Console.WriteLine("= Labeled Decoy class : " + outliersNegative + " / " + negativeRows.Count);
        Console.WriteLine("=============================================================\n");

        //Make sure we have enough juice so to afford eliminating outliers
        //(only the decoy class is checked against the minimum number of examples)
        if ((negativeRows.Count - outliersNegative) > minNoExamplesForClassModel + 1)
        {
            //Drop the outlier rows from the training matrix
            foreach (sparseMatrixRow smr in outlierRows)
            {
                gd.MySparseMatrix.theMatrixInRows.Remove(smr);
            }

            //The statement order below is intentionally unchanged: the clone is taken before the
            //second stability verification.
            //NOTE(review): whether Verify modifies the rows it receives is not visible here - confirm.
            List<sparseMatrixRow> cleanedMatrix = PatternTools.ObjectCopier.Clone(gd.MySparseMatrix.theMatrixInRows);

            unstableDims.AddRange(PatternTools.GaussianDiscriminant.StabilityVerifier.Verify(theRows));
            unstableDims.Sort((a, b) => b.CompareTo(a));

            //generate a new model based on cloned and cleaned sparse matrix
            gd = new PatternTools.GaussianDiscriminant.Discriminant();
            gd.MySparseMatrix.theMatrixInRows = cleanedMatrix;
            gd.Model(false, new List<int>(), minNoExamplesForClassModel, true, false);

            Console.WriteLine("Done cleaning outliers and generating clean model");
        }
    }

    if (gd.ClassLableClassPackageDic.Keys.Count == 1)
    {
        //Only one class could be modeled, so nothing can be discriminated.
        //There are no negatives! all are positives
        //NOTE(review): this assumes the single modeled class is the target class (1) - confirm.
        foreach (SQTScan s in myScans)
        {
            //We do not provide maximum Bayesian Score so we can let other good spectra provide good examples for the protein classifier
            s.BayesianScore = 1;
            s.BayesianClass = 1;
        }
    }
    else
    {
        gd.ClassLableClassPackageDic[-1].Prior = 0.5;
        gd.ClassLableClassPackageDic[1].Prior = 0.5;

        foreach (SQTScan s in myScans)
        {
            //When the model rows have fewer values than the scan's vector, drop the unstable
            //dimensions from the scan so both have the same length.
            //NOTE(review): removing by index presumably relies on unstableDims being in descending
            //order, but the descending sort only happens in the outlier-cleaning branch - confirm.
            if (gd.MySparseMatrix.theMatrixInRows[0].Values.Count < s.Bayes_InputVector.Count)
            {
                foreach (int i in unstableDims)
                {
                    s.Bayes_InputVector.RemoveAt(i - 1);
                }
            }

            if (myParams.FormsPriorForPeptides && considerForms)
            {
                //calculate a penalty for the negative class
                double decoyPrior = peptidePriors[s.NoForms];
                gd.ClassLableClassPackageDic[-1].Prior = decoyPrior;
                gd.ClassLableClassPackageDic[1].Prior = 1 - decoyPrior;
            }

            //The result is ordered by class number
            var results = gd.Classify(s.Bayes_InputVector.ToArray());

            s.BayesianScore = results[0].Score - results[1].Score;

            s.BayesianClass = 1;
            if (results[0].Score < results[1].Score)
            {
                s.BayesianClass = -1;
            }
        }

        //Min-max normalize the scores to the range [0, 1]
        double BayesianMin = myScans.Min(a => a.BayesianScore);
        double BayesianMax = myScans.Max(a => a.BayesianScore);
        double BayesianDif = BayesianMax - BayesianMin;

        //When every scan received the same raw score the range is exactly zero and the division
        //below would yield NaN for all scans; dividing by 1 instead maps every score to 0.
        if (BayesianDif == 0)
        {
            BayesianDif = 1;
        }

        foreach (SQTScan s in myScans)
        {
            s.BayesianScore = (s.BayesianScore - BayesianMin) / BayesianDif;
        }

        Console.WriteLine("Class modeled");
    }

    return gd.MySparseMatrix;
}