示例#1
0
        /// <summary>
        /// Runs the protein-FDR optimisation for the remaining spectra in
        /// <c>BuildResult</c> while the FDR level is temporarily switched to
        /// <c>FalseDiscoveryRateLevel.UniquePeptide</c>.
        /// </summary>
        /// <param name="conFilter">Optional group filter added to the filter chain; skipped when null.</param>
        /// <param name="proteinCalc">Calculator whose <c>GetOptimalResultForGroupFilter</c> performs the search.</param>
        /// <returns>The item returned by <paramref name="proteinCalc"/>.</returns>
        protected ProteinFdrFilteredItem FilterOneHitWonders(IIdentifiedProteinGroupFilter conFilter, UniformProteinFdrOptimalResultCalculator proteinCalc)
        {
            // Remember the configured level so the finally block can put it back,
            // even when the calculation throws.
            var savedLevel = Options.FalseDiscoveryRate.FdrLevel;

            try
            {
                var activeFilters = new List<IIdentifiedProteinGroupFilter>();

                if (conFilter != null)
                {
                    activeFilters.Add(conFilter);
                }

                // A peptide-count restriction is only added when the option is on
                // and the configured minimum is greater than one.
                bool restrictPeptideCount = Options.FalseDiscoveryRate.FilterOneHitWonder
                                            && Options.FalseDiscoveryRate.MinOneHitWonderPeptideCount > 1;
                if (restrictPeptideCount)
                {
                    var countFilter = new IdentifiedProteinGroupPeptideCountFilter(Options.FalseDiscoveryRate.MinOneHitWonderPeptideCount);
                    activeFilters.Add(countFilter);
                }

                var combinedFilter = new AndIdentifiedProteinGroupFilter(activeFilters);

                Progress.SetMessage("Filtering PSMs by protein fdr {0} using unique peptide fdr {0} ...", Options.FalseDiscoveryRate.FdrValue);

                Options.FalseDiscoveryRate.FdrLevel = FalseDiscoveryRateLevel.UniquePeptide;

                // The same FDR value is used both as the starting peptide FDR and as the protein FDR limit.
                var oneHitResult = proteinCalc.GetOptimalResultForGroupFilter(
                    BuildResult,
                    Options.FalseDiscoveryRate.FdrValue,
                    Options.FalseDiscoveryRate.FdrValue,
                    combinedFilter);

                return oneHitResult;
            }
            finally
            {
                Options.FalseDiscoveryRate.FdrLevel = savedLevel;
            }
        }
        /// <summary>
        /// Creates a calculator that keeps the supplied FDR calculator and decoy filter
        /// and owns a fresh protein builder and protein-group builder.
        /// </summary>
        /// <param name="fdrCalc">FDR calculator stored for later use.</param>
        /// <param name="decoyFilter">Group filter stored as the decoy filter.</param>
        public UniformProteinFdrOptimalResultCalculator(IFalseDiscoveryRateCalculator fdrCalc, IIdentifiedProteinGroupFilter decoyFilter)
        {
            // Collaborators created by this instance.
            this.groupBuilder = new IdentifiedProteinGroupBuilder();
            this.proteinBuilder = new IdentifiedProteinBuilder();

            // Collaborators supplied by the caller.
            this.decoyFilter = decoyFilter;
            this.fdrCalc = fdrCalc;
        }
    /// <summary>
    /// Returns the contamination group filter, building and caching it on first use.
    /// </summary>
    /// <param name="progress">Progress callback forwarded to the contamination access-number lookup.</param>
    /// <returns>
    /// The cached filter, or null when <c>HasContaminationDescriptionFilter()</c> is false.
    /// </returns>
    public IIdentifiedProteinGroupFilter GetContaminationDescriptionFilter(IProgressCallback progress)
    {
      if (!HasContaminationDescriptionFilter())
      {
        return null;
      }

      // Already built on an earlier call: reuse it.
      if (_contaminationGroupFilter != null)
      {
        return _contaminationGroupFilter;
      }

      var parser = GetAccessNumberParser();
      var contaminants = IdentifiedResultUtils.GetContaminationAccessNumbers(parser, Location, ContaminationDescriptionPattern, progress);

      _contaminationGroupFilter = new IdentifiedProteinGroupContaminationMapFilter(parser, contaminants);
      return _contaminationGroupFilter;
    }
        /// <summary>
        /// Splits <paramref name="groups"/> into decoy and target groups using
        /// <paramref name="decoyFilter"/> and passes both counts to <paramref name="calc"/>.
        /// </summary>
        /// <param name="groups">Protein groups to classify.</param>
        /// <param name="decoyFilter">Groups accepted by this filter are counted as decoys; all others as targets.</param>
        /// <param name="calc">Calculator invoked as <c>Calculate(decoyCount, targetCount)</c>.</param>
        /// <param name="targetCount">Receives the number of groups not accepted by the decoy filter.</param>
        /// <returns>The value returned by <paramref name="calc"/>.</returns>
        private static double CalculateProteinFdr(List<IIdentifiedProteinGroup> groups, IIdentifiedProteinGroupFilter decoyFilter, IFalseDiscoveryRateCalculator calc, out int targetCount)
        {
            int decoys = 0;
            int targets = 0;

            foreach (var candidate in groups)
            {
                bool isDecoy = decoyFilter.Accept(candidate);
                if (isDecoy)
                {
                    decoys++;
                }
                else
                {
                    targets++;
                }
            }

            targetCount = targets;
            return calc.Calculate(decoys, targets);
        }
        /// <summary>
        /// Builds an <c>IdentificationSummary</c> for a protein result file and its
        /// sibling <c>.peptides</c> / <c>.param</c> files.
        /// </summary>
        /// <remarks>
        /// When a <c>.param</c> file exists and has FDR filtering enabled, the decoy group
        /// filter and FDR calculator come from it; otherwise the defaults passed by the
        /// caller are used. The selected calculator is used for spectrum, peptide AND
        /// protein-group FDR, so all three levels are computed consistently.
        /// Spectrum-level target detection always uses <paramref name="defaultDecoyPattern"/>.
        /// </remarks>
        /// <param name="proteinFile">Path of the protein result file; sibling file names are derived from it.</param>
        /// <param name="defaultDecoyPattern">Regex pattern used to recognise decoys when no configured filter is available.</param>
        /// <param name="defaultCalc">FDR calculator used when no configured calculator is available.</param>
        /// <returns>The populated summary; sections whose input file is missing are left at their defaults.</returns>
        public static IdentificationSummary Parse(string proteinFile, string defaultDecoyPattern, IFalseDiscoveryRateCalculator defaultCalc)
        {
            IdentificationSummary result = new IdentificationSummary();

            result.FileName = FileUtils.ChangeExtension(new FileInfo(proteinFile).Name, "");

            Regex decoyReg = new Regex(defaultDecoyPattern);

            IIdentifiedProteinGroupFilter decoyFilter = null;
            IFalseDiscoveryRateCalculator curCalc     = null;

            var paramFile = FileUtils.ChangeExtension(proteinFile, ".param");

            if (File.Exists(paramFile))
            {
                BuildSummaryOptions options = BuildSummaryOptionsUtils.LoadFromFile(paramFile);
                if (options.FalseDiscoveryRate.FilterByFdr)
                {
                    decoyFilter = options.GetDecoyGroupFilter();
                    curCalc     = options.FalseDiscoveryRate.GetFalseDiscoveryRateCalculator();
                }
            }

            // No usable configuration: fall back to the caller-supplied defaults as a pair.
            if (decoyFilter == null)
            {
                decoyFilter = new IdentifiedProteinGroupNameRegexFilter(defaultDecoyPattern, false);
                curCalc     = defaultCalc;
            }

            // Guard against a configuration that yields a filter but no calculator.
            if (curCalc == null)
            {
                curCalc = defaultCalc;
            }

            var peptideFile = FileUtils.ChangeExtension(proteinFile, ".peptides");

            if (File.Exists(peptideFile))
            {
                var peptides = new MascotPeptideTextFormat().ReadFromFile(peptideFile);

                var fullSpectra       = GetSpectraByNPT(peptides, 2);
                var fullTargetSpectra = GetTargetSpectra(decoyReg, fullSpectra);
                var semiSpectra       = GetSpectraByNPT(peptides, 1);
                var semiTargetSpectra = GetTargetSpectra(decoyReg, semiSpectra);

                result.FullSpectrumCount       = GetSpectrumCount(fullSpectra);
                result.FullTargetSpectrumCount = GetSpectrumCount(fullTargetSpectra);
                result.SemiSpectrumCount       = GetSpectrumCount(semiSpectra);
                result.SemiTargetSpectrumCount = GetSpectrumCount(semiTargetSpectra);

                result.FullPeptideCount       = IdentifiedSpectrumUtils.GetUniquePeptideCount(fullSpectra);
                result.FullTargetPeptideCount = IdentifiedSpectrumUtils.GetUniquePeptideCount(fullTargetSpectra);
                result.SemiPeptideCount       = IdentifiedSpectrumUtils.GetUniquePeptideCount(semiSpectra);
                result.SemiTargetPeptideCount = IdentifiedSpectrumUtils.GetUniquePeptideCount(semiTargetSpectra);

                // Decoy count is derived as (total - target) for every level.
                result.FullSpectrumFdr = curCalc.Calculate(result.FullSpectrumCount - result.FullTargetSpectrumCount, result.FullTargetSpectrumCount);
                result.SemiSpectrumFdr = curCalc.Calculate(result.SemiSpectrumCount - result.SemiTargetSpectrumCount, result.SemiTargetSpectrumCount);
                result.FullPeptideFdr  = curCalc.Calculate(result.FullPeptideCount - result.FullTargetPeptideCount, result.FullTargetPeptideCount);
                result.SemiPeptideFdr  = curCalc.Calculate(result.SemiPeptideCount - result.SemiTargetPeptideCount, result.SemiTargetPeptideCount);
            }

            if (File.Exists(proteinFile))
            {
                var ir = new MascotResultTextFormat().ReadFromFile(proteinFile);
                ir.InitUniquePeptideCount();

                // Groups are classified by the unique peptide count of their first protein.
                var u2proteins = (from p in ir
                                  where p[0].UniquePeptideCount > 1
                                  select p).ToList();

                var u1proteins = (from p in ir
                                  where p[0].UniquePeptideCount == 1
                                  select p).ToList();

                result.ProteinGroupCount        = ir.Count;
                result.Unique2ProteinGroupCount = u2proteins.Count;

                // Use the selected calculator (curCalc), not defaultCalc, so that protein FDR
                // honours the .param configuration exactly like the spectrum/peptide FDR above.
                int targetCount;
                result.Unique2ProteinFdr = CalculateProteinFdr(u2proteins, decoyFilter, curCalc, out targetCount);
                result.Unique2ProteinGroupTargetCount = targetCount;

                result.Unique1ProteinFdr = CalculateProteinFdr(u1proteins, decoyFilter, curCalc, out targetCount);
                result.Unique1ProteinGroupTargetCount = targetCount;
            }

            return result;
        }
示例#6
0
 /// <summary>
 /// Creates a filter that wraps <paramref name="filter"/>.
 /// </summary>
 /// <remarks>
 /// NOTE(review): judging by the type name the wrapped filter's decision is presumably
 /// negated when evaluating a group; the evaluation code is not visible here - confirm.
 /// </remarks>
 /// <param name="filter">The inner filter stored in the <c>filter</c> field.</param>
 public IdentifiedProteinGroupNotFilter(IIdentifiedProteinGroupFilter filter)
 {
     this.filter = filter;
 }
        /// <summary>
        /// Searches for a peptide FDR at which the protein groups accepted by
        /// <paramref name="groupFilter"/> have a protein-group FDR no greater than
        /// <paramref name="maxProteinFdr"/>.
        /// </summary>
        /// <remarks>
        /// Starting at <paramref name="initFdr"/>, each iteration filters the dataset at the
        /// current peptide FDR, builds proteins and protein groups from the surviving spectra,
        /// applies the group filter and computes the protein-group FDR. If the FDR is too high,
        /// the peptide FDR is lowered by <c>CalculateStepFdr(curFdr)</c> and the loop repeats.
        /// The loop has no other exit than success or cancellation.
        /// </remarks>
        /// <param name="dsList">Dataset list that is filtered by peptide FDR on every iteration.</param>
        /// <param name="initFdr">Peptide FDR used for the first iteration.</param>
        /// <param name="maxProteinFdr">Largest acceptable protein-group FDR.</param>
        /// <param name="groupFilter">Filter applied to the built groups; when null all groups are kept.</param>
        /// <returns>
        /// An item holding the accepted condition, peptide/protein FDR, protein count,
        /// the accepted spectra and the rejected spectra.
        /// </returns>
        /// <exception cref="UserTerminatedException">Thrown when <c>Progress.IsCancellationPending()</c> reports cancellation.</exception>
        public ProteinFdrFilteredItem GetOptimalResultForGroupFilter(DatasetList dsList, double initFdr, double maxProteinFdr, IIdentifiedProteinGroupFilter groupFilter)
        {
            double curFdr = initFdr;

            var result = new ProteinFdrFilteredItem();

            // Recorded once, before any filtering by FDR takes place.
            result.PeptideBeforeFdr = dsList.GetOptimalSpectrumCount();

            while (true)
            {
                // Cancellation is checked once per iteration.
                if (Progress.IsCancellationPending())
                {
                    throw new UserTerminatedException();
                }

                string condition = groupFilter == null ? "[]" : "[" + groupFilter.FilterCondition + "]";
                string task      = MyConvert.Format("Filtering {0} protein ... PeptideFdr={1:0.0000}", condition, curFdr);
                Progress.SetMessage(task);

                // Explicit collections surround the FDR filtering step on every iteration.
                GC.Collect();
                GC.WaitForPendingFinalizers();

                var filteredSpectra = dsList.FilterByFdr(curFdr).Spectra;
                filteredSpectra.TrimExcess();
                GC.Collect();
                GC.WaitForPendingFinalizers();

                Progress.SetMessage(task + ", building protein list from peptides...");
                List <IIdentifiedProtein> proteins = proteinBuilder.Build(filteredSpectra);

                Progress.SetMessage(task + ", building protein group from protein list...");
                List <IIdentifiedProteinGroup> groups = groupBuilder.Build(proteins);

                // Without a group filter every built group is a candidate.
                List <IIdentifiedProteinGroup> filteredGroups = groupFilter == null ? groups : groups.FindAll(g => groupFilter.Accept(g));

                Progress.SetMessage(task + ", calculating protein fdr...");
                double proteinFdr = CalculateProteinGroupFdr(filteredGroups);

                if (proteinFdr <= maxProteinFdr)
                {
                    // Disabled debug dump of the kept proteins:
                    //using (StreamWriter sw = new StreamWriter(@"e:\temp\protein.txt", true))
                    //{
                    //  sw.WriteLine(task + " kept proteins");
                    //  foreach (var g in filteredGroups)
                    //  {
                    //    foreach (var p in g)
                    //    {
                    //      sw.WriteLine(p.Name);
                    //    }
                    //  }
                    //  sw.WriteLine();
                    //}

                    result.ProteinCondition = condition;
                    result.PeptideFdr       = curFdr;
                    result.ProteinFdr       = proteinFdr;
                    result.ProteinCount     = filteredGroups.Count;

                    Progress.SetMessage(task + ", accepted, processing corresponding PSMs...");
                    // Accepted spectra are collected from the first protein of each accepted group.
                    foreach (IIdentifiedProteinGroup group in filteredGroups)
                    {
                        result.AcceptedSpectra.UnionWith(group[0].GetSpectra());
                    }
                    result.AcceptedSpectra.TrimExcess();


                    // Remove the spectra that are already covered by the groups that passed the filter.
                    filteredSpectra.RemoveAll(m => result.AcceptedSpectra.Contains(m));

                    // Remove the spectra that map to proteins that passed the filter
                    // (but did not pass the initial peptide filter).
                    var proteinList = new HashSet <string>((from g in filteredGroups
                                                            from p in g
                                                            select p.Name).Distinct());
                    filteredSpectra.RemoveAll(m =>
                    {
                        // A spectrum is dropped as soon as any of its peptides references an accepted protein name.
                        foreach (var pep in m.Peptides)
                        {
                            foreach (var p in pep.Proteins)
                            {
                                if (proteinList.Contains(p))
                                {
                                    return(true);
                                }
                            }
                        }
                        return(false);
                    });

                    // Whatever is left in the list is reported as rejected.
                    filteredSpectra.TrimExcess();
                    result.RejectedSpectra = filteredSpectra;

                    // NOTE(review): rejectProteins/rejectGroups are only referenced by the disabled
                    // debug dump below; confirm whether these Build calls are still needed.
                    List <IIdentifiedProtein>      rejectProteins = proteinBuilder.Build(filteredSpectra);
                    List <IIdentifiedProteinGroup> rejectGroups   = groupBuilder.Build(rejectProteins);
                    //using (StreamWriter sw = new StreamWriter(@"e:\temp\protein.txt", true))
                    //{
                    //  sw.WriteLine(task + " rejected proteins");
                    //  foreach (var g in rejectGroups)
                    //  {
                    //    foreach (var p in g)
                    //    {
                    //      sw.WriteLine(p.Name);
                    //    }
                    //  }
                    //  sw.WriteLine();
                    //}

                    GC.Collect();
                    GC.WaitForPendingFinalizers();

                    Progress.SetMessage(task + " finished.");
                    return(result);
                }
                else
                {
                    // Protein FDR still too high: lower the peptide FDR and try again.
                    double stepFdr = CalculateStepFdr(curFdr);
                    curFdr -= stepFdr;
                }
            }
        }
示例#8
0
        /// <summary>
        /// Loads the build options from <paramref name="parameterFile"/>, filters the configured
        /// datasets by protein FDR in several passes and returns the spectra of the best pass.
        /// </summary>
        /// <remarks>
        /// For each peptide count from max(FdrPeptideCount, 2) down to 2 the method runs:
        /// an optional first step (unique peptide count filter plus peptide count filter, only
        /// when the peptide count is greater than 2), a second step (unique peptide count filter
        /// only) and a final <c>FilterOneHitWonders</c> step. After the first and second steps
        /// <c>BuildResult</c> is reduced to the spectra rejected by that step. The pass with
        /// the largest <c>TotalProteinCount</c> is kept. A text report is written next to the
        /// parameter file with the extension <c>.optimal</c>.
        /// </remarks>
        /// <param name="parameterFile">Path of the parameter file; also the base name of the <c>.optimal</c> report.</param>
        /// <returns>
        /// A result carrying the final spectra, the peptide FDR of the second step of the best
        /// pass and the configured protein FDR value.
        /// </returns>
        public IdentifiedSpectrumBuilderResult Build(string parameterFile)
        {
            Options = new BuildSummaryOptions(parameterFile);
            Options.DatasetList.RemoveDisabled();

            // May be null; every use below checks for that.
            IIdentifiedProteinGroupFilter conFilter = Options.Database.GetNotContaminationDescriptionFilter(this.Progress);

            var fdrCalc = Options.FalseDiscoveryRate.GetFalseDiscoveryRateCalculator();

            BuildResult = new DatasetList();

            // Initialise from the configuration.
            BuildResult.InitFromOptions(Options.DatasetList, this.Progress, parameterFile);

            // NOTE(review): totalCount is not used anywhere else in this method.
            var totalCount = BuildResult.GetOptimalSpectrumCount();

            string optimalResultFile = FileUtils.ChangeExtension(parameterFile, ".optimal");

            using (var sw = new StreamWriter(optimalResultFile))
            {
                new OptimalFileTextWriter().WriteToStream(sw, BuildResult);

                UniformProteinFdrOptimalResultCalculator proteinCalc = new UniformProteinFdrOptimalResultCalculator(fdrCalc, Options.GetDecoyGroupFilter())
                {
                    Progress = this.Progress
                };

                sw.WriteLine(OptimalFilteredItem.GetHeader());

                // Shared by the first and second filtering steps of every pass.
                var uniqueFilter = new IdentifiedProteinGroupUniquePeptideCountFilter(2);

                OptimalFilteredItem finalItem = null;

                // Snapshot of all spectra taken before filtering; only needed for peptide retrieval.
                List <IIdentifiedSpectrum> allSpectrum = Options.PeptideRetrieval ? BuildResult.GetSpectra() : null;

                // The starting peptide count is never below 2, so the loop below runs at least once.
                int    fdrPeptideCount = Options.FalseDiscoveryRate.FdrPeptideCount > 2 ? Options.FalseDiscoveryRate.FdrPeptideCount : 2;
                double firstStepFdr    = Options.FalseDiscoveryRate.MaxPeptideFdr;
                bool   bFirst          = true;
                for (int curPeptideCount = fdrPeptideCount; curPeptideCount >= 2; curPeptideCount--)
                {
                    // Rebuild the SpectrumBin from the retained spectra (not needed on the first pass).
                    if (!bFirst)
                    {
                        BuildResult.BuildSpectrumBin();
                    }
                    bFirst = false;

                    var curItem = new OptimalFilteredItem();

                    IIdentifiedProteinGroupFilter groupFilter;

                    bool bNeedFirstStep = curPeptideCount > 2;
                    if (bNeedFirstStep)
                    {
                        Progress.SetMessage("Filtering PSMs by protein fdr {0}, unique peptide count >= 2 and peptide count >= {1} using peptide fdr {2}...", Options.FalseDiscoveryRate.FdrValue, curPeptideCount, firstStepFdr);

                        // Step 1: filter by UniquePeptideCount and PeptideCount to find the peptide FDR
                        // that satisfies the protein FDR requirement.
                        var countFilter = new IdentifiedProteinGroupPeptideCountFilter(curPeptideCount);

                        if (conFilter != null)
                        {
                            groupFilter = new AndIdentifiedProteinGroupFilter(new IIdentifiedProteinGroupFilter[] { conFilter, uniqueFilter, countFilter });
                        }
                        else
                        {
                            groupFilter = new AndIdentifiedProteinGroupFilter(new IIdentifiedProteinGroupFilter[] { uniqueFilter, countFilter });
                        }

                        curItem.Unique2CountResult = proteinCalc.GetOptimalResultForGroupFilter(BuildResult, firstStepFdr, Options.FalseDiscoveryRate.FdrValue, groupFilter);
                        // The peptide FDR found here becomes the starting point of the next pass's first step.
                        firstStepFdr = curItem.Unique2CountResult.PeptideFdr;

                        // Keep only the PSMs that are not contained in proteins that passed the filter.
                        BuildResult.KeepOptimalResultInSetOnly(new HashSet <IIdentifiedSpectrum>(curItem.Unique2CountResult.RejectedSpectra));

                        GC.Collect();
                        GC.WaitForPendingFinalizers();
                    }
                    else
                    {
                        // No first step for this pass: store an empty placeholder result.
                        curItem.Unique2CountResult = new ProteinFdrFilteredItem();
                    }

                    Progress.SetMessage("Filtering PSMs by protein fdr {0}, unique peptide count >= 2 using peptide fdr {1}...", Options.FalseDiscoveryRate.FdrValue, firstStepFdr);

                    // Step 2: filter by UniquePeptideCount and compute the result that satisfies
                    // the given protein FDR. After a first step the search restarts from the
                    // maximum peptide FDR; otherwise it continues from firstStepFdr.
                    double secondStepFdr = bNeedFirstStep ? Options.FalseDiscoveryRate.MaxPeptideFdr : firstStepFdr;

                    if (conFilter != null)
                    {
                        groupFilter = new AndIdentifiedProteinGroupFilter(new IIdentifiedProteinGroupFilter[] { conFilter, uniqueFilter });
                    }
                    else
                    {
                        groupFilter = uniqueFilter;
                    }

                    curItem.Unique2Result = proteinCalc.GetOptimalResultForGroupFilter(BuildResult, secondStepFdr, Options.FalseDiscoveryRate.FdrValue, groupFilter);

                    // Keep only the PSMs that are not contained in proteins that passed the filter.
                    BuildResult.KeepOptimalResultInSetOnly(new HashSet <IIdentifiedSpectrum>(curItem.Unique2Result.RejectedSpectra));
                    GC.Collect();
                    GC.WaitForPendingFinalizers();

                    // Step 3: run the one-hit-wonder filtering on what is left.
                    curItem.Unique1Result = FilterOneHitWonders(conFilter, proteinCalc);

                    GC.Collect();
                    GC.WaitForPendingFinalizers();

                    sw.WriteLine(curItem.ToString());

                    // Keep the pass with the strictly largest total protein count (earlier pass wins ties).
                    if (finalItem == null || finalItem.TotalProteinCount < curItem.TotalProteinCount)
                    {
                        finalItem = curItem;
                    }

                    curItem = null;

                    GC.Collect();
                    GC.WaitForPendingFinalizers();

                    Console.WriteLine(MyConvert.Format("Filtering PSMs by protein fdr {0}, unique peptide count >= 2 and peptide count >= {1} using peptide fdr {2}...cost {3}.", Options.FalseDiscoveryRate.FdrValue, curPeptideCount, firstStepFdr, SystemUtils.CostMemory()));
                }

                Progress.SetMessage("Filtering PSMs by protein fdr {0} finished, free memory...", Options.FalseDiscoveryRate.FdrValue);

                if (finalItem != null)
                {
                    sw.WriteLine();
                    sw.WriteLine("Final result : ");

                    WriteScoreMap(sw, BuildResult, finalItem.Unique2CountResult);
                    WriteScoreMap(sw, BuildResult, finalItem.Unique2Result);
                    WriteScoreMap(sw, BuildResult, finalItem.Unique1Result);

                    var finalSpectra = finalItem.GetSpectra();
                    if (Options.PeptideRetrieval)
                    {
                        Progress.SetMessage("Retrivaling peptides passed maximum peptide FDR for proteins passed protein FDR...");
                        var proteinBuilder = new IdentifiedProteinBuilder();
                        var groupBuilder   = new IdentifiedProteinGroupBuilder();
                        List <IIdentifiedProtein>      proteins = proteinBuilder.Build(finalSpectra);
                        List <IIdentifiedProteinGroup> groups   = groupBuilder.Build(proteins);

                        // Map every protein name of the final groups to its group.
                        var proteinMap = new Dictionary <string, IIdentifiedProteinGroup>();
                        foreach (var g in groups)
                        {
                            foreach (var p in g)
                            {
                                proteinMap[p.Name] = g;
                            }
                        }

                        // Spectra that are already part of the final result are skipped below.
                        var savedSpectra = new HashSet <IIdentifiedSpectrum>(finalItem.GetSpectra());
                        foreach (var spectrum in allSpectrum)
                        {
                            if (savedSpectra.Contains(spectrum))
                            {
                                continue;
                            }

                            // Collect the distinct final groups this spectrum's proteins belong to.
                            var pgs = new HashSet <IIdentifiedProteinGroup>();
                            foreach (var protein in spectrum.Proteins)
                            {
                                IIdentifiedProteinGroup pg;
                                if (proteinMap.TryGetValue(protein, out pg))
                                {
                                    pgs.Add(pg);
                                }
                            }

                            //if the spectrum doesn't map to protein passed FDR filter, ignore
                            //if the spectrum maps to multiple groups, ignore
                            if (pgs.Count == 0 || pgs.Count > 1)
                            {
                                continue;
                            }

                            //The spectrum should map to all proteins in the group
                            if (pgs.First().All(l => spectrum.Proteins.Contains(l.Name)))
                            {
                                finalSpectra.Add(spectrum);
                            }
                        }
                    }

                    BuildResult.ClearSpectra();
                    GC.Collect();
                    GC.WaitForPendingFinalizers();

                    return(new IdentifiedSpectrumBuilderResult()
                    {
                        Spectra = finalSpectra,
                        PeptideFDR = finalItem.Unique2Result.PeptideFdr,
                        ProteinFDR = Options.FalseDiscoveryRate.FdrValue
                    });
                }
                else
                {
                    // NOTE(review): the loop above always executes at least once and assigns
                    // finalItem on its first pass, so this branch looks unreachable - confirm.
                    return(new IdentifiedSpectrumBuilderResult()
                    {
                        Spectra = new List <IIdentifiedSpectrum>()
                    });
                }
            }
        }