/// <summary>
/// Loads the build options from <paramref name="parameterFile"/>, initializes the
/// dataset list, and filters PSMs so that the resulting protein groups satisfy the
/// configured protein FDR. A report is written next to the parameter file with the
/// extension ".optimal".
/// </summary>
/// <param name="parameterFile">Path of the parameter file used to construct the options.</param>
/// <returns>
/// The accepted spectra together with the peptide FDR and protein FDR reported by the
/// protein FDR calculator.
/// </returns>
public IdentifiedSpectrumBuilderResult Build(string parameterFile)
        {
            Options = new BuildSummaryOptions(parameterFile);
            Options.DatasetList.RemoveDisabled();

            var fdrCalc = Options.FalseDiscoveryRate.GetFalseDiscoveryRateCalculator();

            BuildResult = new DatasetList();

            // Initialize the datasets from the configuration.
            BuildResult.InitFromOptions(Options.DatasetList, this.Progress, parameterFile);

            string optimalResultFile = FileUtils.ChangeExtension(parameterFile, ".optimal");

            using (var sw = new StreamWriter(optimalResultFile))
            {
                // Dump the state before protein-level filtering.
                new OptimalFileTextWriter().WriteToStream(sw, BuildResult);

                UniformProteinFdrOptimalResultCalculator proteinCalc = new UniformProteinFdrOptimalResultCalculator(fdrCalc, Options.GetDecoyGroupFilter())
                {
                    Progress = this.Progress
                };

                Progress.SetMessage("Filtering PSMs by protein fdr {0}, using peptide fdr {1}...", Options.FalseDiscoveryRate.FdrValue, Options.FalseDiscoveryRate.MaxPeptideFdr);

                // A null group filter means "no additional group-level filter".
                var groupFilter = Options.FalseDiscoveryRate.FilterOneHitWonder ? new IdentifiedProteinGroupSingleWonderPeptideCountFilter(Options.FalseDiscoveryRate.MinOneHitWonderPeptideCount) : null;
                var ret         = proteinCalc.GetOptimalResultForGroupFilter(BuildResult, Options.FalseDiscoveryRate.MaxPeptideFdr, Options.FalseDiscoveryRate.FdrValue, groupFilter);

                // Keep only the PSMs contained in proteins that passed the filter.
                BuildResult.KeepOptimalResultInSetOnly(ret.AcceptedSpectra);

                GC.Collect();
                GC.WaitForPendingFinalizers();

                sw.WriteLine("After SimpleProteinFDR filter {0} with condition {1}, required peptide fdr = {2} ", ret.ProteinFdr, ret.ProteinCondition, ret.PeptideFdr);
                BuildResult.ForEach(ds =>
                {
                    sw.WriteLine("Dataset {0}", ds.Options.Name);
                    OptimalResultConditionUtils.WriteSpectrumBin(sw, ds, f1, f2);
                });

                return(new IdentifiedSpectrumBuilderResult()
                {
                    Spectra = ret.AcceptedSpectra.ToList(),
                    PeptideFDR = ret.PeptideFdr,
                    ProteinFDR = ret.ProteinFdr
                });
            }
        }
        /// <summary>
        /// Chooses the result builder that matches the configured input files.
        /// </summary>
        /// <returns>
        /// An <c>IdentifiedResultMsfBuilder</c> over every path of every dataset when all
        /// paths end with "msf" (case-insensitive); otherwise an
        /// <c>IdentifiedResultBuilder</c> using the database access number parser and location.
        /// </returns>
        public IIdentifiedResultBuilder GetIdentifiedResultBuilder()
        {
            // Ordinal, case-insensitive suffix match: file extensions are not linguistic
            // text, so the comparison must not depend on the current culture, and no
            // lower-cased copy of each path is needed.
            bool allMsf = DatasetList.All(ds => ds.PathNames.All(path => path.EndsWith("msf", StringComparison.OrdinalIgnoreCase)));

            if (allMsf)
            {
                var msfFiles = (from ds in DatasetList
                                from file in ds.PathNames
                                select file).ToArray();

                return(new IdentifiedResultMsfBuilder(msfFiles, Database.GetAccessNumberParser()));
            }

            return(new IdentifiedResultBuilder(Database.GetAccessNumberParser(), Database.Location));
        }
        /// <summary>
        /// Reads the option values from the XML parameter file <paramref name="fileName"/>.
        /// "Version" and "MergeResult" are read unconditionally; "MinimumEngineAgreeCount",
        /// "MergeResultFromSameEngineButDifferentSearchParameters" and "PeptideRetrieval"
        /// are applied only when the element is present. The database, FDR, classification,
        /// peptide filter and dataset sections then load themselves from the same root.
        /// </summary>
        /// <param name="fileName">Path of the XML parameter file.</param>
        /// <exception cref="FileNotFoundException">The file does not exist.</exception>
        public void LoadFromFile(string fileName)
        {
            if (!File.Exists(fileName))
            {
                throw new FileNotFoundException("Parameter file not found", fileName);
            }

            XElement root = XElement.Load(fileName);

            ApplicationTitle = root.Element("Version").Value;
            MergeResult      = Convert.ToBoolean(root.Element("MergeResult").Value);

            // Falls back to the DiscardAll conflict type name when the element is absent.
            var fallbackConflictName = ResolveSearchEngineConflictTypeFactory.DiscardAll.Name;
            ConflictType = ResolveSearchEngineConflictTypeFactory.Find(root.GetChildValue("ConflictType", fallbackConflictName));

            XElement agreeCountNode = root.Element("MinimumEngineAgreeCount");
            if (agreeCountNode != null)
            {
                MinimumEngineAgreeCount = int.Parse(agreeCountNode.Value);
            }

            XElement keepTopNode = root.Element("MergeResultFromSameEngineButDifferentSearchParameters");
            if (keepTopNode != null)
            {
                KeepTopPeptideFromSameEngineButDifferentSearchParameters = bool.Parse(keepTopNode.Value);
            }

            XElement retrievalNode = root.Element("PeptideRetrieval");
            if (retrievalNode != null)
            {
                PeptideRetrieval = bool.Parse(retrievalNode.Value);
            }

            Database.Load(root);
            FalseDiscoveryRate.Load(root);
            Classification.Load(root);
            PeptideFilter.Load(root);

            // Dataset loading is best-effort: a failure is reported to the user and the
            // options loaded so far are kept.
            try
            {
                DatasetList.Load(root);
                DatasetList.ForEach(dataset => dataset.Parent = this);
            }
            catch (Exception ex)
            {
                MessageBox.Show("Load dataset error :" + ex.Message);
            }
        }
예제 #4
0
        /// <summary>
        /// Writes the filtering condition of <paramref name="item"/> followed by the
        /// spectrum bins of every dataset to <paramref name="sw"/>. Nothing is written
        /// when the item contains no proteins. Note that <paramref name="BuildResult"/>
        /// is modified: its spectrum bins are rebuilt and it is narrowed to the item's
        /// accepted spectra.
        /// </summary>
        /// <param name="sw">Destination writer.</param>
        /// <param name="BuildResult">Dataset list to narrow and report on.</param>
        /// <param name="item">Filter result whose condition and accepted spectra are reported.</param>
        private void WriteScoreMap(StreamWriter sw, DatasetList BuildResult, ProteinFdrFilteredItem item)
        {
            if (item.ProteinCount != 0)
            {
                BuildResult.BuildSpectrumBin();
                BuildResult.KeepOptimalResultInSetOnly(item.AcceptedSpectra);

                var header = MyConvert.Format("Filtering condition = {0}, PeptideFdr = {1}", item.ProteinCondition, item.PeptideFdr);
                sw.WriteLine(header);

                foreach (var dataset in BuildResult)
                {
                    sw.WriteLine("Dataset {0}", dataset.Options.Name);
                    OptimalResultConditionUtils.WriteSpectrumBin(sw, dataset, f1, f2);
                }

                sw.WriteLine();
            }
        }
        /// <summary>
        /// Initializes the dataset list from the current options, writes it to a file
        /// next to <paramref name="parameterFile"/> with the extension ".optimal", and
        /// returns all spectra of the dataset list.
        /// </summary>
        /// <param name="parameterFile">Parameter file path; also determines the output file name.</param>
        /// <returns>The spectra of the build result and the configured maximum peptide FDR.</returns>
        protected override IdentifiedSpectrumBuilderResult DoBuild(string parameterFile)
        {
            // The calculator is not used below; the call is kept as in the original flow.
            var calculator = Options.FalseDiscoveryRate.GetFalseDiscoveryRateCalculator();

            // Initialize the datasets from the configuration.
            BuildResult = new DatasetList();
            BuildResult.InitFromOptions(Options.DatasetList, this.Progress, parameterFile);

            string outputPath = FileUtils.ChangeExtension(parameterFile, ".optimal");
            var writer = new OptimalFileTextWriter();
            writer.WriteToFile(outputPath, BuildResult);

            Progress.SetMessage("Peptide fdr filter done ...");

            var builderResult = new IdentifiedSpectrumBuilderResult();
            builderResult.Spectra    = BuildResult.GetSpectra();
            builderResult.PeptideFDR = Options.FalseDiscoveryRate.MaxPeptideFdr;
            return(builderResult);
        }
        /// <summary>
        /// Serializes the options into a "BuildSummaryOption" XML element. The scalar
        /// options are written first, then the database, FDR, classification, peptide
        /// filter and dataset sections append themselves to the same root.
        /// </summary>
        /// <returns>The populated root element.</returns>
        public XElement ToXml()
        {
            var root = new XElement("BuildSummaryOption");

            root.Add(new XElement("Version", ApplicationTitle));
            root.Add(new XElement("MergeResult", MergeResult));
            root.Add(new XElement("ConflictType", ConflictType));
            root.Add(new XElement("MinimumEngineAgreeCount", MinimumEngineAgreeCount));
            root.Add(new XElement("MergeResultFromSameEngineButDifferentSearchParameters", KeepTopPeptideFromSameEngineButDifferentSearchParameters));
            root.Add(new XElement("PeptideRetrieval", PeptideRetrieval));

            Database.Save(root);
            FalseDiscoveryRate.Save(root);
            Classification.Save(root);
            PeptideFilter.Save(root);
            DatasetList.Save(root);

            return(root);
        }
        /// <summary>
        /// Searches for a peptide FDR, starting at <paramref name="initFdr"/> and stepping
        /// downwards, at which the protein groups accepted by <paramref name="groupFilter"/>
        /// have a protein FDR no greater than <paramref name="maxProteinFdr"/>.
        /// </summary>
        /// <param name="dsList">Dataset list that is filtered by peptide FDR on every iteration.</param>
        /// <param name="initFdr">Peptide FDR used for the first iteration.</param>
        /// <param name="maxProteinFdr">Largest acceptable protein group FDR.</param>
        /// <param name="groupFilter">Optional group filter; when null every group is kept.</param>
        /// <returns>
        /// The accepted condition, peptide FDR, protein FDR, protein group count, the
        /// accepted spectra and the rejected spectra.
        /// </returns>
        /// <exception cref="UserTerminatedException">Progress reports a pending cancellation.</exception>
        public ProteinFdrFilteredItem GetOptimalResultForGroupFilter(DatasetList dsList, double initFdr, double maxProteinFdr, IIdentifiedProteinGroupFilter groupFilter)
        {
            double curFdr = initFdr;

            var result = new ProteinFdrFilteredItem();

            result.PeptideBeforeFdr = dsList.GetOptimalSpectrumCount();

            // NOTE(review): the loop only exits by returning an accepted result or by
            // cancellation; termination relies on CalculateStepFdr/CalculateProteinGroupFdr
            // eventually producing an acceptable protein FDR - confirm.
            while (true)
            {
                if (Progress.IsCancellationPending())
                {
                    throw new UserTerminatedException();
                }

                string condition = groupFilter == null ? "[]" : "[" + groupFilter.FilterCondition + "]";
                string task      = MyConvert.Format("Filtering {0} protein ... PeptideFdr={1:0.0000}", condition, curFdr);
                Progress.SetMessage(task);

                GC.Collect();
                GC.WaitForPendingFinalizers();

                // Spectra passing the current peptide FDR.
                var filteredSpectra = dsList.FilterByFdr(curFdr).Spectra;
                filteredSpectra.TrimExcess();
                GC.Collect();
                GC.WaitForPendingFinalizers();

                Progress.SetMessage(task + ", building protein list from peptides...");
                List <IIdentifiedProtein> proteins = proteinBuilder.Build(filteredSpectra);

                Progress.SetMessage(task + ", building protein group from protein list...");
                List <IIdentifiedProteinGroup> groups = groupBuilder.Build(proteins);

                // Without a group filter all groups are used as-is (same list instance).
                List <IIdentifiedProteinGroup> filteredGroups = groupFilter == null ? groups : groups.FindAll(g => groupFilter.Accept(g));

                Progress.SetMessage(task + ", calculating protein fdr...");
                double proteinFdr = CalculateProteinGroupFdr(filteredGroups);

                if (proteinFdr <= maxProteinFdr)
                {
                    // Disabled debug dump of the kept proteins:
                    //using (StreamWriter sw = new StreamWriter(@"e:\temp\protein.txt", true))
                    //{
                    //  sw.WriteLine(task + " kept proteins");
                    //  foreach (var g in filteredGroups)
                    //  {
                    //    foreach (var p in g)
                    //    {
                    //      sw.WriteLine(p.Name);
                    //    }
                    //  }
                    //  sw.WriteLine();
                    //}

                    result.ProteinCondition = condition;
                    result.PeptideFdr       = curFdr;
                    result.ProteinFdr       = proteinFdr;
                    result.ProteinCount     = filteredGroups.Count;

                    Progress.SetMessage(task + ", accepted, processing corresponding PSMs...");
                    // The accepted spectra are taken from the first protein of each group.
                    foreach (IIdentifiedProteinGroup group in filteredGroups)
                    {
                        result.AcceptedSpectra.UnionWith(group[0].GetSpectra());
                    }
                    result.AcceptedSpectra.TrimExcess();


                    // Remove the spectra that are already covered by the groups that passed the filter.
                    filteredSpectra.RemoveAll(m => result.AcceptedSpectra.Contains(m));

                    // Remove the spectra that map to proteins which passed the filter
                    // (but which did not pass the initial peptide filtering).
                    var proteinList = new HashSet <string>((from g in filteredGroups
                                                            from p in g
                                                            select p.Name).Distinct());
                    filteredSpectra.RemoveAll(m =>
                    {
                        foreach (var pep in m.Peptides)
                        {
                            foreach (var p in pep.Proteins)
                            {
                                if (proteinList.Contains(p))
                                {
                                    return(true);
                                }
                            }
                        }
                        return(false);
                    });

                    // Whatever remains is reported as rejected.
                    filteredSpectra.TrimExcess();
                    result.RejectedSpectra = filteredSpectra;

                    // NOTE(review): rejectProteins/rejectGroups are only read by the disabled
                    // debug dump below; confirm the Build calls have no needed side effects
                    // before removing them.
                    List <IIdentifiedProtein>      rejectProteins = proteinBuilder.Build(filteredSpectra);
                    List <IIdentifiedProteinGroup> rejectGroups   = groupBuilder.Build(rejectProteins);
                    //using (StreamWriter sw = new StreamWriter(@"e:\temp\protein.txt", true))
                    //{
                    //  sw.WriteLine(task + " rejected proteins");
                    //  foreach (var g in rejectGroups)
                    //  {
                    //    foreach (var p in g)
                    //    {
                    //      sw.WriteLine(p.Name);
                    //    }
                    //  }
                    //  sw.WriteLine();
                    //}

                    GC.Collect();
                    GC.WaitForPendingFinalizers();

                    Progress.SetMessage(task + " finished.");
                    return(result);
                }
                else
                {
                    // Protein FDR still too high: tighten the peptide FDR and retry.
                    double stepFdr = CalculateStepFdr(curFdr);
                    curFdr -= stepFdr;
                }
            }
        }
예제 #8
0
        /// <summary>
        /// Loads the build options from <paramref name="parameterFile"/> and filters PSMs
        /// by protein FDR in several passes: for each peptide count from the configured
        /// FdrPeptideCount (at least 2) down to 2 it filters by "unique peptide count >= 2
        /// and peptide count >= N" (skipped when N is 2), then by "unique peptide count >= 2",
        /// then calls FilterOneHitWonders. The pass with the largest TotalProteinCount is
        /// kept. A report is written next to the parameter file with the extension ".optimal".
        /// </summary>
        /// <param name="parameterFile">Path of the parameter file used to construct the options.</param>
        /// <returns>
        /// The spectra of the best pass (plus retrieved spectra when PeptideRetrieval is
        /// enabled), the peptide FDR of its Unique2Result and the configured protein FDR;
        /// or a result with an empty spectrum list when no pass produced an item.
        /// </returns>
        public IdentifiedSpectrumBuilderResult Build(string parameterFile)
        {
            Options = new BuildSummaryOptions(parameterFile);
            Options.DatasetList.RemoveDisabled();

            // May be null; every use below checks for that.
            IIdentifiedProteinGroupFilter conFilter = Options.Database.GetNotContaminationDescriptionFilter(this.Progress);

            var fdrCalc = Options.FalseDiscoveryRate.GetFalseDiscoveryRateCalculator();

            BuildResult = new DatasetList();

            // Initialize from the configuration.
            BuildResult.InitFromOptions(Options.DatasetList, this.Progress, parameterFile);

            // NOTE(review): totalCount is never read in this method.
            var totalCount = BuildResult.GetOptimalSpectrumCount();

            string optimalResultFile = FileUtils.ChangeExtension(parameterFile, ".optimal");

            using (var sw = new StreamWriter(optimalResultFile))
            {
                new OptimalFileTextWriter().WriteToStream(sw, BuildResult);

                UniformProteinFdrOptimalResultCalculator proteinCalc = new UniformProteinFdrOptimalResultCalculator(fdrCalc, Options.GetDecoyGroupFilter())
                {
                    Progress = this.Progress
                };

                sw.WriteLine(OptimalFilteredItem.GetHeader());

                var uniqueFilter = new IdentifiedProteinGroupUniquePeptideCountFilter(2);

                // Best pass seen so far (largest TotalProteinCount).
                OptimalFilteredItem finalItem = null;

                // Captured before any filtering; null unless peptide retrieval is enabled,
                // and only dereferenced under the same option further down.
                List <IIdentifiedSpectrum> allSpectrum = Options.PeptideRetrieval ? BuildResult.GetSpectra() : null;

                // The peptide count threshold never starts below 2.
                int    fdrPeptideCount = Options.FalseDiscoveryRate.FdrPeptideCount > 2 ? Options.FalseDiscoveryRate.FdrPeptideCount : 2;
                double firstStepFdr    = Options.FalseDiscoveryRate.MaxPeptideFdr;
                bool   bFirst          = true;
                for (int curPeptideCount = fdrPeptideCount; curPeptideCount >= 2; curPeptideCount--)
                {
                    // Rebuild the SpectrumBin from the spectra that were kept
                    // (not needed on the first pass).
                    if (!bFirst)
                    {
                        BuildResult.BuildSpectrumBin();
                    }
                    bFirst = false;

                    var curItem = new OptimalFilteredItem();

                    IIdentifiedProteinGroupFilter groupFilter;

                    // The peptide-count step only applies while the threshold is above 2.
                    bool bNeedFirstStep = curPeptideCount > 2;
                    if (bNeedFirstStep)
                    {
                        Progress.SetMessage("Filtering PSMs by protein fdr {0}, unique peptide count >= 2 and peptide count >= {1} using peptide fdr {2}...", Options.FalseDiscoveryRate.FdrValue, curPeptideCount, firstStepFdr);

                        // Step 1: filter by UniquePeptideCount and PeptideCount to obtain the
                        // peptide FDR that satisfies the protein FDR requirement.
                        var countFilter = new IdentifiedProteinGroupPeptideCountFilter(curPeptideCount);

                        if (conFilter != null)
                        {
                            groupFilter = new AndIdentifiedProteinGroupFilter(new IIdentifiedProteinGroupFilter[] { conFilter, uniqueFilter, countFilter });
                        }
                        else
                        {
                            groupFilter = new AndIdentifiedProteinGroupFilter(new IIdentifiedProteinGroupFilter[] { uniqueFilter, countFilter });
                        }

                        curItem.Unique2CountResult = proteinCalc.GetOptimalResultForGroupFilter(BuildResult, firstStepFdr, Options.FalseDiscoveryRate.FdrValue, groupFilter);
                        // The accepted peptide FDR becomes the starting point of the next pass's step 1.
                        firstStepFdr = curItem.Unique2CountResult.PeptideFdr;

                        // Keep only the PSMs that are not contained in proteins that passed the filter.
                        BuildResult.KeepOptimalResultInSetOnly(new HashSet <IIdentifiedSpectrum>(curItem.Unique2CountResult.RejectedSpectra));

                        GC.Collect();
                        GC.WaitForPendingFinalizers();
                    }
                    else
                    {
                        // No step 1 for this pass: store an empty placeholder result.
                        curItem.Unique2CountResult = new ProteinFdrFilteredItem();
                    }

                    Progress.SetMessage("Filtering PSMs by protein fdr {0}, unique peptide count >= 2 using peptide fdr {1}...", Options.FalseDiscoveryRate.FdrValue, firstStepFdr);

                    // Step 2: filter by UniquePeptideCount and compute the result that
                    // satisfies the given protein FDR. After a step 1 the search restarts
                    // from the configured maximum peptide FDR; otherwise from firstStepFdr.
                    double secondStepFdr = bNeedFirstStep ? Options.FalseDiscoveryRate.MaxPeptideFdr : firstStepFdr;

                    if (conFilter != null)
                    {
                        groupFilter = new AndIdentifiedProteinGroupFilter(new IIdentifiedProteinGroupFilter[] { conFilter, uniqueFilter });
                    }
                    else
                    {
                        groupFilter = uniqueFilter;
                    }

                    curItem.Unique2Result = proteinCalc.GetOptimalResultForGroupFilter(BuildResult, secondStepFdr, Options.FalseDiscoveryRate.FdrValue, groupFilter);

                    // Keep only the PSMs that are not contained in proteins that passed the filter.
                    BuildResult.KeepOptimalResultInSetOnly(new HashSet <IIdentifiedSpectrum>(curItem.Unique2Result.RejectedSpectra));
                    GC.Collect();
                    GC.WaitForPendingFinalizers();

                    curItem.Unique1Result = FilterOneHitWonders(conFilter, proteinCalc);

                    GC.Collect();
                    GC.WaitForPendingFinalizers();

                    sw.WriteLine(curItem.ToString());

                    // Strictly greater replaces, so on a tie the earlier pass is kept.
                    if (finalItem == null || finalItem.TotalProteinCount < curItem.TotalProteinCount)
                    {
                        finalItem = curItem;
                    }

                    curItem = null;

                    GC.Collect();
                    GC.WaitForPendingFinalizers();

                    Console.WriteLine(MyConvert.Format("Filtering PSMs by protein fdr {0}, unique peptide count >= 2 and peptide count >= {1} using peptide fdr {2}...cost {3}.", Options.FalseDiscoveryRate.FdrValue, curPeptideCount, firstStepFdr, SystemUtils.CostMemory()));
                }

                Progress.SetMessage("Filtering PSMs by protein fdr {0} finished, free memory...", Options.FalseDiscoveryRate.FdrValue);

                if (finalItem != null)
                {
                    sw.WriteLine();
                    sw.WriteLine("Final result : ");

                    WriteScoreMap(sw, BuildResult, finalItem.Unique2CountResult);
                    WriteScoreMap(sw, BuildResult, finalItem.Unique2Result);
                    WriteScoreMap(sw, BuildResult, finalItem.Unique1Result);

                    var finalSpectra = finalItem.GetSpectra();
                    if (Options.PeptideRetrieval)
                    {
                        Progress.SetMessage("Retrivaling peptides passed maximum peptide FDR for proteins passed protein FDR...");
                        var proteinBuilder = new IdentifiedProteinBuilder();
                        var groupBuilder   = new IdentifiedProteinGroupBuilder();
                        List <IIdentifiedProtein>      proteins = proteinBuilder.Build(finalSpectra);
                        List <IIdentifiedProteinGroup> groups   = groupBuilder.Build(proteins);

                        // Protein name -> the group it belongs to.
                        var proteinMap = new Dictionary <string, IIdentifiedProteinGroup>();
                        foreach (var g in groups)
                        {
                            foreach (var p in g)
                            {
                                proteinMap[p.Name] = g;
                            }
                        }

                        // Spectra already part of the final result are skipped below.
                        var savedSpectra = new HashSet <IIdentifiedSpectrum>(finalItem.GetSpectra());
                        foreach (var spectrum in allSpectrum)
                        {
                            if (savedSpectra.Contains(spectrum))
                            {
                                continue;
                            }

                            // Distinct accepted groups this spectrum maps to.
                            var pgs = new HashSet <IIdentifiedProteinGroup>();
                            foreach (var protein in spectrum.Proteins)
                            {
                                IIdentifiedProteinGroup pg;
                                if (proteinMap.TryGetValue(protein, out pg))
                                {
                                    pgs.Add(pg);
                                }
                            }

                            //if the spectrum doesn't map to protein passed FDR filter, ignore
                            //if the spectrum maps to multiple groups, ignore
                            if (pgs.Count == 0 || pgs.Count > 1)
                            {
                                continue;
                            }

                            //The spectrum should map to all proteins in the group
                            if (pgs.First().All(l => spectrum.Proteins.Contains(l.Name)))
                            {
                                finalSpectra.Add(spectrum);
                            }
                        }
                    }

                    BuildResult.ClearSpectra();
                    GC.Collect();
                    GC.WaitForPendingFinalizers();

                    return(new IdentifiedSpectrumBuilderResult()
                    {
                        Spectra = finalSpectra,
                        PeptideFDR = finalItem.Unique2Result.PeptideFdr,
                        ProteinFDR = Options.FalseDiscoveryRate.FdrValue
                    });
                }
                else
                {
                    // No pass produced an item: return an empty spectrum list.
                    return(new IdentifiedSpectrumBuilderResult()
                    {
                        Spectra = new List <IIdentifiedSpectrum>()
                    });
                }
            }
        }