/// <summary>
/// Loads the build options from <paramref name="parameterFile"/>, filters the PSMs of the
/// configured datasets by protein FDR and writes a report to a file whose name is derived
/// from the parameter file with the ".optimal" extension.
/// </summary>
/// <param name="parameterFile">Path of the parameter file used to construct the build options.</param>
/// <returns>
/// The accepted spectra together with the peptide FDR and protein FDR reported by the
/// protein FDR calculator.
/// </returns>
public IdentifiedSpectrumBuilderResult Build(string parameterFile)
        {
            Options = new BuildSummaryOptions(parameterFile);
            Options.DatasetList.RemoveDisabled();

            var fdrCalc = Options.FalseDiscoveryRate.GetFalseDiscoveryRateCalculator();

            BuildResult = new DatasetList();

            // Initialize the datasets from the configured options.
            BuildResult.InitFromOptions(Options.DatasetList, this.Progress, parameterFile);

            // NOTE(review): the returned count is not used anywhere in this method. The call is
            // kept only in case GetOptimalSpectrumCount has side effects - confirm and remove.
            BuildResult.GetOptimalSpectrumCount();

            string optimalResultFile = FileUtils.ChangeExtension(parameterFile, ".optimal");

            using (var sw = new StreamWriter(optimalResultFile))
            {
                // Dump the state of the datasets before any protein level filtering.
                new OptimalFileTextWriter().WriteToStream(sw, BuildResult);

                UniformProteinFdrOptimalResultCalculator proteinCalc = new UniformProteinFdrOptimalResultCalculator(fdrCalc, Options.GetDecoyGroupFilter())
                {
                    Progress = this.Progress
                };

                Progress.SetMessage("Filtering PSMs by protein fdr {0}, using peptide fdr {1}...", Options.FalseDiscoveryRate.FdrValue, Options.FalseDiscoveryRate.MaxPeptideFdr);

                // The one-hit-wonder group filter is optional; null means no additional group filter is passed.
                var groupFilter = Options.FalseDiscoveryRate.FilterOneHitWonder ? new IdentifiedProteinGroupSingleWonderPeptideCountFilter(Options.FalseDiscoveryRate.MinOneHitWonderPeptideCount) : null;
                var ret         = proteinCalc.GetOptimalResultForGroupFilter(BuildResult, Options.FalseDiscoveryRate.MaxPeptideFdr, Options.FalseDiscoveryRate.FdrValue, groupFilter);

                // Keep only the PSMs contained in proteins that passed the filter.
                BuildResult.KeepOptimalResultInSetOnly(ret.AcceptedSpectra);

                // Explicit collection after the rejected PSMs were dropped.
                GC.Collect();
                GC.WaitForPendingFinalizers();

                sw.WriteLine("After SimpleProteinFDR filter {0} with condition {1}, required peptide fdr = {2} ", ret.ProteinFdr, ret.ProteinCondition, ret.PeptideFdr);
                BuildResult.ForEach(ds =>
                {
                    sw.WriteLine("Dataset {0}", ds.Options.Name);
                    // f1 and f2 are not declared in this method; presumably members of the
                    // enclosing class - TODO confirm.
                    OptimalResultConditionUtils.WriteSpectrumBin(sw, ds, f1, f2);
                });

                return(new IdentifiedSpectrumBuilderResult()
                {
                    Spectra = ret.AcceptedSpectra.ToList(),
                    PeptideFDR = ret.PeptideFdr,
                    ProteinFDR = ret.ProteinFdr
                });
            }
        }
        /// <summary>
        /// Builds proteins, protein groups and an identified result from <paramref name="spectra"/>,
        /// writes that result to <paramref name="noredundantFile"/> and then writes
        /// <paramref name="iniFile"/> with one section per visible amino acid followed by the
        /// modification entries.
        /// </summary>
        /// <param name="shortModMap">Modifications to emit; their values are written ordered by symbol.</param>
        /// <param name="noredundantFile">Output path handed to <c>MascotResultTextFormat.WriteToFile</c>.</param>
        /// <param name="iniFile">Output path of the INI file.</param>
        /// <param name="spectra">Identified spectra the result is built from.</param>
        public void BuildResult(Dictionary <string, MaxQuantModificationItem> shortModMap, string noredundantFile, string iniFile, List <IIdentifiedSpectrum> spectra)
        {
            var proteinList      = new IdentifiedProteinBuilder().Build(spectra);
            var proteinGroups    = new IdentifiedProteinGroupBuilder().Build(proteinList);
            var identifiedResult = new IdentifiedResultBuilder(DefaultAccessNumberParser.GetInstance(), "").Build(proteinGroups);

            new MascotResultTextFormat().WriteToFile(noredundantFile, identifiedResult);

            Aminoacids sampleAcids, referenceAcids;

            InitializeAminoacids(out sampleAcids, out referenceAcids);

            using (var writer = new StreamWriter(iniFile))
            {
                // Both tables are read by the same index, one section per visible sample amino acid.
                for (var i = 0; i < sampleAcids.Count; i++)
                {
                    var sample    = sampleAcids[i];
                    var reference = referenceAcids[i];
                    if (!sample.Visible)
                    {
                        continue;
                    }

                    writer.WriteLine("<{0}>\tSAM\tREF", sample.OneName);
                    WriteAtom(writer, sample, reference, new[] { Atom.C }, "C");
                    WriteAtom(writer, sample, reference, new[] { Atom.H }, "H");
                    WriteAtom(writer, sample, reference, new[] { Atom.O }, "O");
                    WriteAtom(writer, sample, reference, new[] { Atom.N }, "N");
                    WriteAtom(writer, sample, reference, new[] { Atom.S }, "S");
                    WriteAtom(writer, sample, reference, new[] { Atom.P }, "P");
                    WriteAtom(writer, sample, reference, new[] { Atom.N15, Atom.Nx }, "15N");
                    WriteAtom(writer, sample, reference, new[] { Atom.H2, Atom.Hx }, "2H");
                    WriteAtom(writer, sample, reference, new[] { Atom.C13, Atom.Cx }, "13C");
                    WriteAtom(writer, sample, reference, new[] { Atom.O18, Atom.Ox }, "18O", false);
                    writer.WriteLine();
                }

                // The two terminal entries always follow the symbol-ordered modifications.
                var nTerm = new MaxQuantModificationItem()
                {
                    Symbol = "NTERM", Composition = new AtomComposition("H")
                };
                var cTerm = new MaxQuantModificationItem()
                {
                    Symbol = "CTERM", Composition = new AtomComposition("OH")
                };

                var modifications = shortModMap.Values.OrderBy(m => m.Symbol).ToList();
                modifications.Add(nTerm);
                modifications.Add(cTerm);

                foreach (var modification in modifications)
                {
                    modification.WriteToSilacINI(writer);
                }
            }
        }
Beispiel #3
0
        /// <summary>
        /// Parses the proteins from <paramref name="fileName"/>, groups them, builds the identified
        /// result, refines its modifications and saves it next to the input with the
        /// ".noredundant" extension.
        /// </summary>
        /// <param name="fileName">Input file handed to <c>ParseProteins</c>.</param>
        /// <returns>A single-element sequence holding the path of the written file.</returns>
        public override IEnumerable <string> Process(string fileName)
        {
            var parsedProteins = ParseProteins(fileName);

            Progress.SetMessage("Building protein groups ...");
            var groupBuilder  = new IdentifiedProteinGroupBuilder();
            var proteinGroups = groupBuilder.Build(parsedProteins);

            Progress.SetMessage("Building result ...");
            var resultBuilder    = new IdentifiedResultBuilder(null, null);
            var identifiedResult = resultBuilder.Build(proteinGroups);

            var outputFile = FileUtils.ChangeExtension(fileName, ".noredundant");

            RefineModifications(identifiedResult);

            Progress.SetMessage("Saving result ...");
            // The peptide header gets one extra tab-separated "Modification" column.
            var writerFormat = new SequestResultTextFormat(SequestHeader.SEQUEST_PROTEIN_HEADER, SequestHeader.SEQUEST_PEPTIDE_HEADER + "\tModification");
            writerFormat.WriteToFile(outputFile, identifiedResult);

            Progress.SetMessage("Finished.");

            return new string[] { outputFile };
        }
    /// <summary>
    /// Reads the peptides fixture, builds proteins and protein groups from it and checks the
    /// resulting counts and protein names; finally runs the result builder on the groups.
    /// </summary>
    public void TestBuild()
    {
      var peptideFormat = new SequestPeptideTextFormat();
      List<IIdentifiedSpectrum> psms = peptideFormat.ReadFromFile(@"../../../data/TestBuilder.peptides");
      Assert.AreEqual(4, psms.Count);

      IAccessNumberParser accessParser = AccessNumberParserFactory.FindOrCreateParser(@"(IPI\d+)", "IPI");

      var proteinBuilder = new IdentifiedProteinBuilder();
      List<IIdentifiedProtein> proteinList = proteinBuilder.Build(psms);
      Assert.AreEqual(4, proteinList.Count);

      var groupBuilder = new IdentifiedProteinGroupBuilder();
      List<IIdentifiedProteinGroup> proteinGroups = groupBuilder.Build(proteinList);
      Assert.AreEqual(2, proteinGroups.Count);

      // First group: a single protein.
      var firstGroup = proteinGroups[0];
      Assert.AreEqual(1, firstGroup.Count);
      Assert.AreEqual("IPI:IPI00784154.1|SW", firstGroup[0].Name);

      // Second group: two proteins.
      var secondGroup = proteinGroups[1];
      Assert.AreEqual(2, secondGroup.Count);
      Assert.AreEqual("REVERSED_00000001", secondGroup[0].Name);
      Assert.AreEqual("REVERSED_00000002", secondGroup[1].Name);

      // No assertion on the built result; this only exercises the result builder.
      new IdentifiedResultBuilder(accessParser, "").Build(proteinGroups);
    }
    /// <summary>
    /// Checks that the group builder drops proteins whose peptides are all covered by other
    /// proteins: of the four input proteins only protein1 and protein2 are expected to remain.
    /// </summary>
    public void TestBuild()
    {
      var pep1 = new IdentifiedPeptide(new IdentifiedSpectrum(new SequestFilename("A", 1, 1, 1, ".dta"))) { Sequence = "A" };
      var pep2 = new IdentifiedPeptide(new IdentifiedSpectrum(new SequestFilename("B", 1, 1, 1, ".dta"))) { Sequence = "B" };
      var pep3 = new IdentifiedPeptide(new IdentifiedSpectrum(new SequestFilename("C", 1, 1, 1, ".dta"))) { Sequence = "C" };
      var pep4 = new IdentifiedPeptide(new IdentifiedSpectrum(new SequestFilename("D", 1, 1, 1, ".dta"))) { Sequence = "D" };
      var pep5 = new IdentifiedPeptide(new IdentifiedSpectrum(new SequestFilename("E", 1, 1, 1, ".dta"))) { Sequence = "E" };
      var pep6 = new IdentifiedPeptide(new IdentifiedSpectrum(new SequestFilename("F", 1, 1, 1, ".dta"))) { Sequence = "F" };

      var protein1 = new IdentifiedProtein()
      {
        Peptides = new IIdentifiedPeptide[] { pep1, pep3, pep5, pep6 }.ToList()
      };

      var protein2 = new IdentifiedProtein()
      {
        Peptides = new IIdentifiedPeptide[] { pep2, pep3, pep4 }.ToList()
      };

      // Should be removed from the final result: all of its peptides are included in protein1
      // and protein2, even though no single protein contains both peptides.
      var protein3 = new IdentifiedProtein()
      {
        Peptides = new IIdentifiedPeptide[] { pep1, pep2 }.ToList()
      };

      // Should be removed from the final result: all of its peptides are included in protein1.
      var protein4 = new IdentifiedProtein()
      {
        Peptides = new IIdentifiedPeptide[] { pep1, pep5 }.ToList()
      };

      // protein4 was previously built but never handed to the builder, so the case described
      // in its comment was not exercised.
      var actual = new IdentifiedProteinGroupBuilder().Build(new IIdentifiedProtein[] { protein1, protein2, protein3, protein4 }.ToList());
      Assert.AreEqual(2, actual.Count);
      Assert.AreSame(protein1, actual[0][0]);
      Assert.AreSame(protein2, actual[1][0]);
    }
Beispiel #6
0
        /// <summary>
        /// Loads the build options from <paramref name="parameterFile"/>, initializes the datasets
        /// and filters their PSMs by protein FDR in several steps, writing a report to a file
        /// whose name is derived from the parameter file with the ".optimal" extension.
        /// The filtering is repeated for every peptide count from the configured FDR peptide
        /// count (at least 2) down to 2; the iteration with the highest total protein count is
        /// kept as the final result.
        /// </summary>
        /// <param name="parameterFile">Path of the parameter file used to construct the build options.</param>
        /// <returns>
        /// The spectra of the selected iteration (plus retrieved peptides when peptide retrieval is
        /// enabled), the peptide FDR of its unique-peptide step and the configured protein FDR;
        /// or a result with an empty spectrum list when no iteration was selected.
        /// </returns>
        public IdentifiedSpectrumBuilderResult Build(string parameterFile)
        {
            Options = new BuildSummaryOptions(parameterFile);
            Options.DatasetList.RemoveDisabled();

            // May be null; every use below checks for null before combining it with other filters.
            IIdentifiedProteinGroupFilter conFilter = Options.Database.GetNotContaminationDescriptionFilter(this.Progress);

            var fdrCalc = Options.FalseDiscoveryRate.GetFalseDiscoveryRateCalculator();

            BuildResult = new DatasetList();

            // Initialize the datasets from the configured options.
            BuildResult.InitFromOptions(Options.DatasetList, this.Progress, parameterFile);

            // NOTE(review): totalCount is never read in this method.
            var totalCount = BuildResult.GetOptimalSpectrumCount();

            string optimalResultFile = FileUtils.ChangeExtension(parameterFile, ".optimal");

            using (var sw = new StreamWriter(optimalResultFile))
            {
                new OptimalFileTextWriter().WriteToStream(sw, BuildResult);

                UniformProteinFdrOptimalResultCalculator proteinCalc = new UniformProteinFdrOptimalResultCalculator(fdrCalc, Options.GetDecoyGroupFilter())
                {
                    Progress = this.Progress
                };

                sw.WriteLine(OptimalFilteredItem.GetHeader());

                var uniqueFilter = new IdentifiedProteinGroupUniquePeptideCountFilter(2);

                // Best iteration seen so far (highest TotalProteinCount).
                OptimalFilteredItem finalItem = null;

                // Snapshot of all spectra, taken before filtering; only needed (and only non-null)
                // when peptide retrieval is enabled.
                List <IIdentifiedSpectrum> allSpectrum = Options.PeptideRetrieval ? BuildResult.GetSpectra() : null;

                // The loop below always starts at a peptide count of at least 2.
                int    fdrPeptideCount = Options.FalseDiscoveryRate.FdrPeptideCount > 2 ? Options.FalseDiscoveryRate.FdrPeptideCount : 2;
                double firstStepFdr    = Options.FalseDiscoveryRate.MaxPeptideFdr;
                bool   bFirst          = true;
                for (int curPeptideCount = fdrPeptideCount; curPeptideCount >= 2; curPeptideCount--)
                {
                    // Rebuild the SpectrumBin from the retained spectra (not needed on the first pass).
                    if (!bFirst)
                    {
                        BuildResult.BuildSpectrumBin();
                    }
                    bFirst = false;

                    var curItem = new OptimalFilteredItem();

                    IIdentifiedProteinGroupFilter groupFilter;

                    // The combined unique-count + peptide-count step only runs for peptide counts above 2.
                    bool bNeedFirstStep = curPeptideCount > 2;
                    if (bNeedFirstStep)
                    {
                        Progress.SetMessage("Filtering PSMs by protein fdr {0}, unique peptide count >= 2 and peptide count >= {1} using peptide fdr {2}...", Options.FalseDiscoveryRate.FdrValue, curPeptideCount, firstStepFdr);

                        // Step 1: filter by UniquePeptideCount and PeptideCount to obtain the peptide FDR
                        // that corresponds to meeting the protein FDR requirement.
                        var countFilter = new IdentifiedProteinGroupPeptideCountFilter(curPeptideCount);

                        if (conFilter != null)
                        {
                            groupFilter = new AndIdentifiedProteinGroupFilter(new IIdentifiedProteinGroupFilter[] { conFilter, uniqueFilter, countFilter });
                        }
                        else
                        {
                            groupFilter = new AndIdentifiedProteinGroupFilter(new IIdentifiedProteinGroupFilter[] { uniqueFilter, countFilter });
                        }

                        curItem.Unique2CountResult = proteinCalc.GetOptimalResultForGroupFilter(BuildResult, firstStepFdr, Options.FalseDiscoveryRate.FdrValue, groupFilter);
                        // The peptide FDR found here is carried into the following iterations.
                        firstStepFdr = curItem.Unique2CountResult.PeptideFdr;

                        // Keep only the PSMs that are not contained in proteins which passed the filter.
                        BuildResult.KeepOptimalResultInSetOnly(new HashSet <IIdentifiedSpectrum>(curItem.Unique2CountResult.RejectedSpectra));

                        GC.Collect();
                        GC.WaitForPendingFinalizers();
                    }
                    else
                    {
                        // Step 1 skipped: store an empty placeholder item.
                        curItem.Unique2CountResult = new ProteinFdrFilteredItem();
                    }

                    Progress.SetMessage("Filtering PSMs by protein fdr {0}, unique peptide count >= 2 using peptide fdr {1}...", Options.FalseDiscoveryRate.FdrValue, firstStepFdr);

                    // Step 2: filter by UniquePeptideCount and compute the result that satisfies the given protein FDR.
                    // When step 1 ran, this step starts from the configured maximum peptide FDR;
                    // otherwise it uses the current firstStepFdr value.
                    double secondStepFdr = bNeedFirstStep ? Options.FalseDiscoveryRate.MaxPeptideFdr : firstStepFdr;

                    if (conFilter != null)
                    {
                        groupFilter = new AndIdentifiedProteinGroupFilter(new IIdentifiedProteinGroupFilter[] { conFilter, uniqueFilter });
                    }
                    else
                    {
                        groupFilter = uniqueFilter;
                    }

                    curItem.Unique2Result = proteinCalc.GetOptimalResultForGroupFilter(BuildResult, secondStepFdr, Options.FalseDiscoveryRate.FdrValue, groupFilter);

                    // Keep only the PSMs that are not contained in proteins which passed the filter.
                    BuildResult.KeepOptimalResultInSetOnly(new HashSet <IIdentifiedSpectrum>(curItem.Unique2Result.RejectedSpectra));
                    GC.Collect();
                    GC.WaitForPendingFinalizers();

                    // Step 3: one-hit-wonder filtering on whatever is left in BuildResult.
                    curItem.Unique1Result = FilterOneHitWonders(conFilter, proteinCalc);

                    GC.Collect();
                    GC.WaitForPendingFinalizers();

                    sw.WriteLine(curItem.ToString());

                    // Keep the iteration with the strictly highest total protein count; on a tie the earlier one stays.
                    if (finalItem == null || finalItem.TotalProteinCount < curItem.TotalProteinCount)
                    {
                        finalItem = curItem;
                    }

                    // Drop the local reference before the explicit collection below.
                    curItem = null;

                    GC.Collect();
                    GC.WaitForPendingFinalizers();

                    Console.WriteLine(MyConvert.Format("Filtering PSMs by protein fdr {0}, unique peptide count >= 2 and peptide count >= {1} using peptide fdr {2}...cost {3}.", Options.FalseDiscoveryRate.FdrValue, curPeptideCount, firstStepFdr, SystemUtils.CostMemory()));
                }

                Progress.SetMessage("Filtering PSMs by protein fdr {0} finished, free memory...", Options.FalseDiscoveryRate.FdrValue);

                if (finalItem != null)
                {
                    sw.WriteLine();
                    sw.WriteLine("Final result : ");

                    // Report the score maps of all three steps of the selected iteration.
                    WriteScoreMap(sw, BuildResult, finalItem.Unique2CountResult);
                    WriteScoreMap(sw, BuildResult, finalItem.Unique2Result);
                    WriteScoreMap(sw, BuildResult, finalItem.Unique1Result);

                    var finalSpectra = finalItem.GetSpectra();
                    if (Options.PeptideRetrieval)
                    {
                        Progress.SetMessage("Retrivaling peptides passed maximum peptide FDR for proteins passed protein FDR...");
                        var proteinBuilder = new IdentifiedProteinBuilder();
                        var groupBuilder   = new IdentifiedProteinGroupBuilder();
                        List <IIdentifiedProtein>      proteins = proteinBuilder.Build(finalSpectra);
                        List <IIdentifiedProteinGroup> groups   = groupBuilder.Build(proteins);

                        // Map every protein name to the group that contains it.
                        var proteinMap = new Dictionary <string, IIdentifiedProteinGroup>();
                        foreach (var g in groups)
                        {
                            foreach (var p in g)
                            {
                                proteinMap[p.Name] = g;
                            }
                        }

                        // Spectra already part of the final result are skipped below.
                        var savedSpectra = new HashSet <IIdentifiedSpectrum>(finalItem.GetSpectra());
                        foreach (var spectrum in allSpectrum)
                        {
                            if (savedSpectra.Contains(spectrum))
                            {
                                continue;
                            }

                            // Distinct accepted groups this spectrum's proteins belong to.
                            var pgs = new HashSet <IIdentifiedProteinGroup>();
                            foreach (var protein in spectrum.Proteins)
                            {
                                IIdentifiedProteinGroup pg;
                                if (proteinMap.TryGetValue(protein, out pg))
                                {
                                    pgs.Add(pg);
                                }
                            }

                            //if the spectrum doesn't map to protein passed FDR filter, ignore
                            //if the spectrum maps to multiple groups, ignore
                            if (pgs.Count == 0 || pgs.Count > 1)
                            {
                                continue;
                            }

                            //The spectrum should map to all proteins in the group
                            if (pgs.First().All(l => spectrum.Proteins.Contains(l.Name)))
                            {
                                finalSpectra.Add(spectrum);
                            }
                        }
                    }

                    // The selected spectra are held by finalSpectra; release the datasets' own copies.
                    BuildResult.ClearSpectra();
                    GC.Collect();
                    GC.WaitForPendingFinalizers();

                    return(new IdentifiedSpectrumBuilderResult()
                    {
                        Spectra = finalSpectra,
                        PeptideFDR = finalItem.Unique2Result.PeptideFdr,
                        ProteinFDR = Options.FalseDiscoveryRate.FdrValue
                    });
                }
                else
                {
                    // No iteration was selected: return a result with an empty spectrum list.
                    return(new IdentifiedSpectrumBuilderResult()
                    {
                        Spectra = new List <IIdentifiedSpectrum>()
                    });
                }
            }
        }