/// <summary>
        /// Filters the tab-separated file "ProteinTotalSpectraAgainstExcludedSpectra.txt" down to the
        /// rows whose first column is one of the protein names returned by
        /// <c>ProteinProphetEvaluator.extractIdentifiedProteinNames</c> for the original protXML file.
        /// The first line of the input is copied through unconditionally, the surviving rows are written to
        /// "FilteredProteinWithExcludedSpectraCount.txt" in the run's output folder, and the number
        /// of surviving rows is printed to the console.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the input file name is relative, so it is resolved against the current
        /// working directory, whereas the output goes to <c>InputFileOrganizer.OutputFolderOfTheRun</c>.
        /// Confirm that this asymmetry is intended.
        /// </remarks>
        public static void FilterForConfidentlyIdentifiedProteinOnly()
        {
            // A hash set makes the per-row membership test O(1) instead of scanning a list for every row.
            HashSet <String> identifiedProteins = new HashSet <String>(
                ProteinProphetEvaluator.extractIdentifiedProteinNames(InputFileOrganizer.OriginalProtXMLFile));
            String excludedSpectraPerProteinAll = "ProteinTotalSpectraAgainstExcludedSpectra.txt";
            String outputFile = Path.Combine(InputFileOrganizer.OutputFolderOfTheRun, "FilteredProteinWithExcludedSpectraCount.txt");
            int    count      = 0;

            // 'using' guarantees both streams are flushed and closed even if reading or writing throws.
            using (StreamReader sr = new StreamReader(excludedSpectraPerProteinAll))
            using (StreamWriter sw = new StreamWriter(outputFile))
            {
                // The first line is copied through unconditionally. If the input is empty this
                // writes a blank line, exactly as before.
                String line = sr.ReadLine();
                sw.WriteLine(line);

                while ((line = sr.ReadLine()) != null)
                {
                    // The protein name is the first tab-separated field of the row.
                    String protName = line.Split('\t')[0];
                    if (identifiedProteins.Contains(protName))
                    {
                        sw.WriteLine(line);
                        count++;
                    }
                }
            }
            Console.WriteLine(count);
        }
// Example no. 2
// 0
        /// <summary>
        /// Splits the given identification features into training sets and writes three feature files:
        /// positive vs. negative, positive vs. non-positive, and positive vs. non-positive with every
        /// accession containing the decoy prefix removed.
        /// </summary>
        /// <param name="idf">Features to classify. Entries whose cardinality is not greater than 0 are ignored.</param>
        /// <remarks>
        /// Classification rules, for features with cardinality greater than 0:
        /// <list type="bullet">
        /// <item>Accession starts with <c>GlobalVar.DecoyPrefix</c>: negative and non-positive.</item>
        /// <item>Otherwise, accession is in the identified-protein list: positive; else non-positive.</item>
        /// <item>Otherwise, accession is in the negative-training list: also negative.</item>
        /// </list>
        /// </remarks>
        private static void writeFeatures(List <IdentificationFeatures> idf)
        {
            log.Info("Classifying positive and negative sets");
            // Proteins reported as identified by ProteinProphet (per the original author's note: 0.01 FDR).
            // Stored in a hash set so the per-feature lookups below are O(1) rather than a list scan.
            HashSet <String> identifiedProteins = new HashSet <String>(
                ProteinProphetEvaluator.extractIdentifiedProteinNames(InputFileOrganizer.OriginalProtXMLFile));
            // Proteins selected for the negative training set using the threshold argument 0.25.
            // NOTE(review): the original comment here said "above 20% FDR" while the code passes 0.25;
            // confirm which value is intended.
            HashSet <String> negativeTrainingSetProteins = new HashSet <String>(
                ProteinProphetEvaluator.extractNegativeTrainingSetProteinNames(InputFileOrganizer.OriginalProtXMLFile, 0.25));

            List <IdentificationFeatures> positiveTrainingSet    = new List <IdentificationFeatures>();
            List <IdentificationFeatures> negativeTrainingSet    = new List <IdentificationFeatures>();
            List <IdentificationFeatures> nonPositiveTrainingSet = new List <IdentificationFeatures>();

            // Determine which features are in the positive, negative and non-positive sets
            foreach (IdentificationFeatures i in idf)
            {
                String accession = i.getAccession();
                if (i.getCardinality() > 0)
                {
                    // Ordinal comparison: accessions are identifiers, not linguistic text, and this
                    // matches the ordinal Contains checks used for the no-decoy filtering below.
                    if (accession.StartsWith(GlobalVar.DecoyPrefix, StringComparison.Ordinal))
                    {
                        // Decoy protein: belongs to both the negative and the non-positive sets
                        negativeTrainingSet.Add(i);
                        nonPositiveTrainingSet.Add(i);
                    }
                    else
                    {
                        // Real protein
                        if (identifiedProteins.Contains(accession))
                        {
                            positiveTrainingSet.Add(i);
                        }
                        else
                        {
                            nonPositiveTrainingSet.Add(i);
                        }

                        // Checked independently of the positive/non-positive decision above
                        if (negativeTrainingSetProteins.Contains(accession))
                        {
                            negativeTrainingSet.Add(i);
                        }
                    }
                }
            }
            WriteIdentificationFeaturesFile(OutputFile_PositiveAndNegative, positiveTrainingSet, negativeTrainingSet);
            WriteIdentificationFeaturesFile(OutputFile_PositiveAndNonPositive, positiveTrainingSet, nonPositiveTrainingSet);

            // The "no decoy" variants drop any accession that contains the decoy prefix anywhere,
            // which is broader than the StartsWith test used during classification.
            List <IdentificationFeatures> positiveSetNoDecoy =
                positiveTrainingSet.FindAll(f => !f.getAccession().Contains(GlobalVar.DecoyPrefix));
            List <IdentificationFeatures> nonPositiveSetNoDecoy =
                nonPositiveTrainingSet.FindAll(f => !f.getAccession().Contains(GlobalVar.DecoyPrefix));

            WriteIdentificationFeaturesFile(OutputFile_PositiveAndNonPositive_NoDecoy, positiveSetNoDecoy, nonPositiveSetNoDecoy);
        }