Beispiel #1
0
        /// <summary>
        /// Builds an LC-MS feature alignment over <c>NdataSet</c> datasets and writes the
        /// resulting cross-tab to a fixed output path.
        /// For each dataset, identifications from two result folders are pooled and merged,
        /// every feature is tagged with the PrSMs whose scan number lies strictly inside the
        /// feature's scan range and whose mass is within the tolerance of the feature mass,
        /// and the tagged features are added to the alignment.
        /// </summary>
        public void TestFeatureAlignment()
        {
            const string outFilePath = @"\\protoapps\UserData\Jungkap\Lewy\aligned\promex_crosstab_temp.tsv";

            //CPTAC_Intact_CR32A_24Aug15_Bane_15-02-06-RZ
            var reader = new ProteinSpectrumMatchReader();
            var massTolerance = new Tolerance(10);
            var aligner = new LcMsFeatureAlignment(new AnalysisCompRef.CompRefFeatureComparer(massTolerance));

            for (var datasetIndex = 0; datasetIndex < NdataSet; datasetIndex++)
            {
                // Resolve the dataset name once and derive every input path from it.
                var datasetName = GetDataSetNames(datasetIndex);
                var pbfFile = string.Format(@"{0}\{1}.pbf", PbfPath, datasetName);
                var primaryIdFile = string.Format(@"{0}\{1}_IcTda.tsv", MsPfFolder, datasetName);
                var secondaryIdFile = string.Format(@"{0}\{1}_IcTda.tsv", MsPfFolder2, datasetName);
                var featureFile = string.Format(@"{0}\{1}.ms1ft", Ms1FtFolder, datasetName);

                Console.WriteLine(pbfFile);
                var lcmsRun = PbfLcMsRun.GetLcMsRun(pbfFile);

                // Pool the identifications of both searches, then merge them.
                var pooledPrsms = reader.LoadIdentificationResult(primaryIdFile, ProteinSpectrumMatch.SearchTool.MsPathFinder);
                pooledPrsms.AddRange(reader.LoadIdentificationResult(secondaryIdFile, ProteinSpectrumMatch.SearchTool.MsPathFinder));

                var mergedPrsms = MergePrsm(pooledPrsms);
                var features = LcMsFeatureAlignment.LoadProMexResult(datasetIndex, featureFile, lcmsRun);

                // Use the protein name as the protein identifier.
                foreach (var prsm in mergedPrsms)
                {
                    prsm.ProteinId = prsm.ProteinName;
                }

                // Tag features by PrSMs.
                foreach (var feature in features)
                {
                    var massWindow = massTolerance.GetToleranceAsTh(feature.Mass);
                    foreach (var prsm in mergedPrsms)
                    {
                        // Scan number must fall strictly between the feature's scan bounds.
                        if (feature.MinScanNum < prsm.ScanNum && prsm.ScanNum < feature.MaxScanNum)
                        {
                            if (Math.Abs(feature.Mass - prsm.Mass) < massWindow)
                            {
                                feature.ProteinSpectrumMatches.Add(prsm);
                            }
                        }
                    }
                }

                aligner.AddDataSet(datasetIndex, features, lcmsRun);
            }

            aligner.AlignFeatures();

            Console.WriteLine("{0} alignments ", aligner.CountAlignedFeatures);

            for (var datasetIndex = 0; datasetIndex < NdataSet; datasetIndex++)
            {
                aligner.FillMissingFeatures(datasetIndex);
                Console.WriteLine("{0} has been processed", GetDataSetNames(datasetIndex));
            }

            OutputCrossTabWithId(outFilePath, aligner);
        }
Beispiel #2
0
        /// <summary>
        /// Loads identifications from <paramref name="idFilePath"/>, groups them with
        /// <c>GroupingByPrsm</c> using a comparer built on the run loaded from
        /// <paramref name="pbfFilePath"/>, and keeps the groups that qualify as training targets.
        /// A group is kept when it holds at least two PrSMs, its first PrSM yields a non-null
        /// sequence, and at least one of its scans is a product spectrum accepted by
        /// <c>IsGoodTarget</c> for that sequence.
        /// </summary>
        /// <param name="pbfFilePath">Path passed to <c>PbfLcMsRun.GetLcMsRun</c>.</param>
        /// <param name="idFilePath">Path passed to <c>LoadIdentificationResult</c>.</param>
        /// <returns>The PrSM groups that passed all checks, in grouping order.</returns>
        public static ICollection<ProteinSpectrumMatchSet> CollectTrainSet(string pbfFilePath, string idFilePath)
        {
            Modification.RegisterAndGetModification(Modification.Cysteinyl.Name, Modification.Cysteinyl.Composition);

            // NOTE(review): 0.01 is presumably a score/FDR cutoff for the reader - confirm.
            var reader = new ProteinSpectrumMatchReader(0.01);

            var identifications = reader.LoadIdentificationResult(idFilePath);
            var lcmsRun = PbfLcMsRun.GetLcMsRun(pbfFilePath);

            var acceptedGroups = new List<ProteinSpectrumMatchSet>();

            foreach (var candidates in GroupingByPrsm(0, identifications, new PrsmComparer(lcmsRun)))
            {
                // Groups with fewer than two PrSMs are not used.
                if (candidates.Count < 2)
                {
                    continue;
                }

                var groupSequence = candidates[0].GetSequence();
                if (groupSequence == null)
                {
                    continue;
                }

                // Lazily walk the group's scans; stop at the first product spectrum that passes.
                var hasGoodSpectrum = candidates
                    .Select(prsm => lcmsRun.GetSpectrum(prsm.ScanNum) as ProductSpectrum)
                    .Any(productSpectrum => productSpectrum != null && IsGoodTarget(productSpectrum, groupSequence));

                if (hasGoodSpectrum)
                {
                    acceptedGroups.Add(candidates);
                }
            }

            return acceptedGroups;
        }
Beispiel #3
0
        /// <summary>
        /// For every dataset in <c>TrainSetFileLists</c> that has an "_IcTda.tsv" identification
        /// file, writes match statistics for the collected training PrSMs to
        /// "&lt;dataset&gt;_target.tsv" and for the decoy matches to "&lt;dataset&gt;_decoy.tsv".
        /// The test is ignored when the identification folder does not exist.
        /// </summary>
        public void TestGenerateFrequencyData()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            const string idFileFolder = @"D:\MassSpecFiles\training\IdScoring\MSPF_trainset";
            const string outFileFolder = @"D:\MassSpecFiles\training\IdScoring";

            if (!Directory.Exists(idFileFolder))
            {
                Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, idFileFolder);
            }

            Modification.RegisterAndGetModification(Modification.Cysteinyl.Name, Modification.Cysteinyl.Composition);
            Modification.RegisterAndGetModification(Modification.Phosphorylation.Name, Modification.Phosphorylation.Composition);
            Modification.RegisterAndGetModification(Modification.Methylation.Name, Modification.Methylation.Composition);
            Modification.RegisterAndGetModification(Modification.DiMethylation.Name, Modification.DiMethylation.Composition);
            Modification.RegisterAndGetModification(Modification.TriMethylation.Name, Modification.TriMethylation.Composition);
            Modification.RegisterAndGetModification("Trioxidation", new Composition(0, 0, 0, 3, 0));

            // NOTE(review): aaSet is never referenced below. It is kept because its constructor
            // may have side effects (e.g. on the mods file) - confirm before removing.
            var aaSet = new AminoAcidSet(@"D:\MassSpecFiles\training\Mods.txt");

            foreach (var dataset in TrainSetFileLists)
            {
                var dataname = Path.GetFileNameWithoutExtension(dataset);
                var idFile = string.Format(@"{0}\{1}_IcTda.tsv", idFileFolder, dataname);
                var decoyFile = string.Format(@"{0}\{1}_IcDecoy.tsv", idFileFolder, dataname);

                // Datasets without an identification file are skipped.
                if (!File.Exists(idFile))
                {
                    continue;
                }

                var prsmReader = new ProteinSpectrumMatchReader(0.01);
                var prsmList = prsmReader.LoadIdentificationResult(idFile);

                // The score of the last loaded identification bounds the decoy score threshold.
                // NOTE(review): Last() throws on an empty result list - confirm that cannot happen.
                var minScore = prsmList.Last().Score;
                var decoyMatches = prsmReader.ReadMsPathFinderResult(decoyFile, int.MaxValue, 1, Math.Max(minScore - 5, 10));
                var run = PbfLcMsRun.GetLcMsRun(dataset);

                var spectrumMatchSet = LcMsFeatureTrain.CollectTrainSet(dataset, idFile);
                Console.WriteLine(spectrumMatchSet.Count);

                // 'using' guarantees the file handle is released even if statistics generation throws.
                using (var writer = new StreamWriter(string.Format(@"{0}\{1}_target.tsv", outFileFolder, dataname)))
                {
                    foreach (var matches in spectrumMatchSet)
                    {
                        foreach (var match in matches)
                        {
                            var spec = run.GetSpectrum(match.ScanNum) as ProductSpectrum;
                            GetMatchStatistics(spec, match.GetSequence(), match.Charge, writer);
                        }
                    }
                }

                using (var writer = new StreamWriter(string.Format(@"{0}\{1}_decoy.tsv", outFileFolder, dataname)))
                {
                    foreach (var match in decoyMatches)
                    {
                        var sequence = match.GetSequence();
                        var spec = run.GetSpectrum(match.ScanNum) as ProductSpectrum;
                        GetMatchStatistics(spec, sequence, match.Charge, writer);
                    }
                }
            }
        }
        /// <summary>
        /// Walks every (1..3, 1..5) index pair accepted by <c>GetDataSetNamesStudy3</c> and, for
        /// each dataset whose ".seqtag.ms1ft" file does not exist yet, collects the PrSMs that no
        /// loaded feature covers and hands them to <c>FeatureFind</c> together with that path.
        /// A PrSM is covered when its scan number lies strictly inside a feature's scan range and
        /// its mass is within the tolerance of the feature mass.
        /// </summary>
        public void FindMissingLcMsFeatures()
        {
            const string idFolder = @"D:\MassSpecFiles\CompRef_Kelleher\Study3";
            const string featureFolder = @"D:\MassSpecFiles\CompRef_Kelleher\Study3";

            const int firstIndexMax = 3;
            const int secondIndexMax = 5;

            for (var first = 1; first <= firstIndexMax; first++)
            {
                for (var second = 1; second <= secondIndexMax; second++)
                {
                    var datasetNames = GetDataSetNamesStudy3(first, second);
                    var datasetCount = datasetNames.Count;
                    var reader = new ProteinSpectrumMatchReader();
                    var massTolerance = new Tolerance(12);

                    for (var idx = 0; idx < datasetCount; idx++)
                    {
                        var name = datasetNames[idx];
                        var seqTagFile = string.Format(@"{0}\{1}.seqtag.ms1ft", featureFolder, name);

                        // Output already present: nothing to do for this dataset.
                        if (File.Exists(seqTagFile))
                        {
                            continue;
                        }

                        var pbfFile = string.Format(@"{0}\{1}.pbf", PbfPath, name);
                        var idFile = string.Format(@"{0}\{1}_IcTda.tsv", idFolder, name);
                        var featureFile = string.Format(@"{0}\{1}.ms1ft", featureFolder, name);

                        var lcmsRun = PbfLcMsRun.GetLcMsRun(pbfFile);
                        var features = LcMsFeatureAlignment.LoadProMexResult(idx, featureFile, lcmsRun);
                        var prsms = reader.LoadIdentificationResult(idFile, ProteinSpectrumMatch.SearchTool.MsPathFinder);

                        // covered[p] becomes true once any feature matches prsms[p].
                        var covered = new bool[prsms.Count];

                        foreach (var feature in features)
                        {
                            var massWindow = massTolerance.GetToleranceAsTh(feature.Mass);
                            for (var p = 0; p < prsms.Count; p++)
                            {
                                var prsm = prsms[p];

                                if (!(feature.MinScanNum < prsm.ScanNum && prsm.ScanNum < feature.MaxScanNum))
                                {
                                    continue;
                                }

                                if (!(Math.Abs(feature.Mass - prsm.Mass) < massWindow))
                                {
                                    continue;
                                }

                                feature.ProteinSpectrumMatches.Add(prsm);
                                covered[p] = true;
                            }
                        }

                        // Everything left uncovered is passed on for feature finding.
                        var uncovered = new List<ProteinSpectrumMatch>();
                        for (var p = 0; p < prsms.Count; p++)
                        {
                            if (covered[p])
                            {
                                continue;
                            }

                            uncovered.Add(prsms[p]);
                        }

                        FeatureFind(uncovered, lcmsRun, seqTagFile);
                        Console.WriteLine(seqTagFile);
                    }
                }
            }
        }
        /// <summary>
        /// Aligns LC-MS features across the given datasets and writes a cross-tab to
        /// <paramref name="outFilePath"/>.
        /// For each dataset, the features of its ".ms1ft" and ".seqtag.ms1ft" files are combined.
        /// When an "_IcTda.tsv" file exists, features are tagged with the PrSMs whose scan number
        /// lies strictly inside the feature's scan range and whose mass is within the tolerance
        /// of the feature mass. The features are then added to the alignment.
        /// </summary>
        /// <param name="datasets">Dataset names used to build the input file paths.</param>
        /// <param name="mspfFolder">Folder holding the "_IcTda.tsv" identification files.</param>
        /// <param name="ms1ftFolder">Folder holding the ".ms1ft" and ".seqtag.ms1ft" feature files.</param>
        /// <param name="outFilePath">Path passed to <c>AnalysisCompRef.OutputCrossTabWithId</c>.</param>
        public void AlignFeatures(List<string> datasets, string mspfFolder, string ms1ftFolder, string outFilePath)
        {
            var datasetCount = datasets.Count;
            var reader = new ProteinSpectrumMatchReader();
            var massTolerance = new Tolerance(12);
            var aligner = new LcMsFeatureAlignment(new AnalysisCompRef.CompRefFeatureComparer(massTolerance));

            for (var idx = 0; idx < datasetCount; idx++)
            {
                var name = datasets[idx];
                var pbfFile = string.Format(@"{0}\{1}.pbf", PbfPath, name);
                var idFile = string.Format(@"{0}\{1}_IcTda.tsv", mspfFolder, name);
                var featureFile = string.Format(@"{0}\{1}.ms1ft", ms1ftFolder, name);
                var seqTagFeatureFile = string.Format(@"{0}\{1}.seqtag.ms1ft", ms1ftFolder, name);

                var lcmsRun = PbfLcMsRun.GetLcMsRun(pbfFile);

                // Combine the regular features with the ones from the ".seqtag.ms1ft" file.
                var features = LcMsFeatureAlignment.LoadProMexResult(idx, featureFile, lcmsRun);
                features.AddRange(LcMsFeatureAlignment.LoadProMexResult(idx, seqTagFeatureFile, lcmsRun));

                // Identification results are optional; without them features stay untagged.
                if (File.Exists(idFile))
                {
                    var prsms = reader.LoadIdentificationResult(idFile, ProteinSpectrumMatch.SearchTool.MsPathFinder);

                    // Use the protein name as the protein identifier.
                    foreach (var prsm in prsms)
                    {
                        prsm.ProteinId = prsm.ProteinName;
                    }

                    // Tag features by PrSMs.
                    foreach (var feature in features)
                    {
                        var massWindow = massTolerance.GetToleranceAsTh(feature.Mass);
                        foreach (var prsm in prsms)
                        {
                            if (feature.MinScanNum < prsm.ScanNum && prsm.ScanNum < feature.MaxScanNum)
                            {
                                if (Math.Abs(feature.Mass - prsm.Mass) < massWindow)
                                {
                                    feature.ProteinSpectrumMatches.Add(prsm);
                                }
                            }
                        }
                    }
                }

                aligner.AddDataSet(idx, features, lcmsRun);
            }

            aligner.AlignFeatures();

            Console.WriteLine("{0} alignments ", aligner.CountAlignedFeatures);

            for (var idx = 0; idx < datasetCount; idx++)
            {
                aligner.FillMissingFeatures(idx);
                Console.WriteLine("{0} has been processed", datasets[idx]);
            }

            AnalysisCompRef.OutputCrossTabWithId(outFilePath, aligner, datasets.ToArray());
        }