Пример #1
0
        public void TestFeatureAlignment()
        {
            const string outFilePath = @"\\protoapps\UserData\Jungkap\Lewy\aligned\promex_crosstab_temp.tsv";


            //CPTAC_Intact_CR32A_24Aug15_Bane_15-02-06-RZ
            var prsmReader = new ProteinSpectrumMatchReader();
            var tolerance  = new Tolerance(10);
            var alignment  = new LcMsFeatureAlignment(new AnalysisCompRef.CompRefFeatureComparer(tolerance));

            for (var i = 0; i < NdataSet; i++)
            {
                var rawFile   = string.Format(@"{0}\{1}.pbf", PbfPath, GetDataSetNames(i));
                var mspFile   = string.Format(@"{0}\{1}_IcTda.tsv", MsPfFolder, GetDataSetNames(i));
                var mspFile2  = string.Format(@"{0}\{1}_IcTda.tsv", MsPfFolder2, GetDataSetNames(i));
                var ms1FtFile = string.Format(@"{0}\{1}.ms1ft", Ms1FtFolder, GetDataSetNames(i));
                Console.WriteLine(rawFile);
                var run       = PbfLcMsRun.GetLcMsRun(rawFile);
                var prsmList1 = prsmReader.LoadIdentificationResult(mspFile, ProteinSpectrumMatch.SearchTool.MsPathFinder);
                var prsmList2 = prsmReader.LoadIdentificationResult(mspFile2, ProteinSpectrumMatch.SearchTool.MsPathFinder);
                prsmList1.AddRange(prsmList2);

                var prsmList = MergePrsm(prsmList1);
                var features = LcMsFeatureAlignment.LoadProMexResult(i, ms1FtFile, run);

                for (var j = 0; j < prsmList.Count; j++)
                {
                    var match = prsmList[j];
                    match.ProteinId = match.ProteinName;
                }

                // tag features by PrSMs
                for (var j = 0; j < features.Count; j++)
                {
                    //features[j].ProteinSpectrumMatches = new ProteinSpectrumMatchSet(i);
                    var massTol = tolerance.GetToleranceAsTh(features[j].Mass);
                    foreach (var match in prsmList)
                    {
                        if (features[j].MinScanNum < match.ScanNum && match.ScanNum < features[j].MaxScanNum && Math.Abs(features[j].Mass - match.Mass) < massTol)
                        {
                            features[j].ProteinSpectrumMatches.Add(match);
                        }
                    }
                }

                alignment.AddDataSet(i, features, run);
            }

            alignment.AlignFeatures();

            Console.WriteLine("{0} alignments ", alignment.CountAlignedFeatures);

            for (var i = 0; i < NdataSet; i++)
            {
                alignment.FillMissingFeatures(i);
                Console.WriteLine("{0} has been processed", GetDataSetNames(i));
            }

            OutputCrossTabWithId(outFilePath, alignment);
        }
Пример #2
0
        public void TestFeatureAlignment()
        {
            const string outFilePath = @"\\protoapps\UserData\Jungkap\CompRef\aligned\promex_crosstab_temp.tsv";

            var runLabels = new[] { "32A", "32B", "32C", "32D", "32E", "32F", "32G", "33A", "33B", "33C", "33D", "33E", "33F", "33G" };
            var nDataset  = runLabels.Length;

            var prsmReader = new ProteinSpectrumMatchReader();
            var tolerance  = new Tolerance(10);
            var alignment  = new LcMsFeatureAlignment(new CompRefFeatureComparer(tolerance));

            for (var i = 0; i < nDataset; i++)
            {
                var rawFile   = string.Format(@"{0}\CPTAC_Intact_CR{1}_24Aug15_Bane_15-02-06-RZ.pbf", RawFolder, runLabels[i]);
                var mspFile   = string.Format(@"{0}\CPTAC_Intact_CR{1}_24Aug15_Bane_15-02-06-RZ_IcTda.tsv", MsPfFolder, runLabels[i]);
                var ms1FtFile = string.Format(@"{0}\CPTAC_Intact_CR{1}_24Aug15_Bane_15-02-06-RZ.ms1ft", Ms1FtFolder, runLabels[i]);

                var run      = PbfLcMsRun.GetLcMsRun(rawFile);
                var features = LcMsFeatureAlignment.LoadProMexResult(i, ms1FtFile, run);

                if (File.Exists(mspFile))
                {
                    var prsmList = prsmReader.LoadIdentificationResult(mspFile, ProteinSpectrumMatch.SearchTool.MsPathFinder);

                    for (var j = 0; j < prsmList.Count; j++)
                    {
                        var match = prsmList[j];
                        match.ProteinId = match.ProteinName;
                    }

                    // tag features by PrSMs
                    for (var j = 0; j < features.Count; j++)
                    {
                        //features[j].ProteinSpectrumMatches = new ProteinSpectrumMatchSet(i);
                        var massTol = tolerance.GetToleranceAsMz(features[j].Mass);
                        foreach (var match in prsmList)
                        {
                            if (features[j].MinScanNum < match.ScanNum && match.ScanNum < features[j].MaxScanNum && Math.Abs(features[j].Mass - match.Mass) < massTol)
                            {
                                features[j].ProteinSpectrumMatches.Add(match);
                            }
                        }
                    }
                }

                alignment.AddDataSet(i, features, run);
            }

            alignment.AlignFeatures();

            Console.WriteLine("{0} alignments ", alignment.CountAlignedFeatures);

            for (var i = 0; i < nDataset; i++)
            {
                alignment.FillMissingFeatures(i);
                Console.WriteLine("{0} has been processed", runLabels[i]);
            }

            OutputCrossTabWithId(outFilePath, alignment, runLabels);
        }
Пример #3
0
        private void AlignFeatures(List <string> datasets, string mspfFolder, string ms1ftFolder, string outFilePath)
        {
            var nDataset   = datasets.Count;
            var prsmReader = new ProteinSpectrumMatchReader();
            var tolerance  = new Tolerance(12);
            var alignment  = new LcMsFeatureAlignment(new AnalysisCompRef.CompRefFeatureComparer(tolerance));

            for (var i = 0; i < nDataset; i++)
            {
                var rawFile    = string.Format(@"{0}\{1}.pbf", PbfPath, datasets[i]);
                var mspFile    = string.Format(@"{0}\{1}_IcTda.tsv", mspfFolder, datasets[i]);
                var ms1FtFile  = string.Format(@"{0}\{1}.ms1ft", ms1ftFolder, datasets[i]);
                var ms1FtFile2 = string.Format(@"{0}\{1}.seqtag.ms1ft", ms1ftFolder, datasets[i]);

                var run       = PbfLcMsRun.GetLcMsRun(rawFile);
                var features  = LcMsFeatureAlignment.LoadProMexResult(i, ms1FtFile, run);
                var features2 = LcMsFeatureAlignment.LoadProMexResult(i, ms1FtFile2, run);
                features.AddRange(features2);

                if (File.Exists(mspFile))
                {
                    var prsmList = prsmReader.LoadIdentificationResult(mspFile, ProteinSpectrumMatch.SearchTool.MsPathFinder);
                    //var prsmFeatureMatch = new bool[prsmList.Count];

                    foreach (var match in prsmList)
                    {
                        match.ProteinId = match.ProteinName;
                    }

                    // tag features by PrSMs
                    foreach (var feature in features)
                    {
                        //features[j].ProteinSpectrumMatches = new ProteinSpectrumMatchSet(i);
                        var massTol = tolerance.GetToleranceAsMz(feature.Mass);
                        foreach (var match in prsmList)
                        {
                            if (feature.MinScanNum < match.ScanNum && match.ScanNum < feature.MaxScanNum && Math.Abs(feature.Mass - match.Mass) < massTol)
                            {
                                feature.ProteinSpectrumMatches.Add(match);
                                //prsmFeatureMatch[k] = true;
                            }
                        }
                    }
                }

                alignment.AddDataSet(i, features, run);
            }

            alignment.AlignFeatures();

            Console.WriteLine("{0} alignments ", alignment.CountAlignedFeatures);

            for (var i = 0; i < nDataset; i++)
            {
                alignment.FillMissingFeatures(i);
                Console.WriteLine("{0} has been processed", datasets[i]);
            }

            AnalysisCompRef.OutputCrossTabWithId(outFilePath, alignment, datasets);
        }
Пример #4
0
        public void TestFeatureAlignment()
        {
            const string outFilePath = @"\\protoapps\UserData\Jungkap\Quant\aligned\promex_crosstab.tsv";
            //const string outFolder = @"\\protoapps\UserData\Jungkap\CompRef\aligned";
            var runLabels = new string[] { "1x1", "1x2", "1x3", "1x4", "1x5", "5x1", "5x2", "5x3", "5x4", "5x5", "10x1", "10x2", "10x3", "10x4", "10x5", };
            var nDataset  = runLabels.Length;

            var prsmReader = new ProteinSpectrumMatchReader();
            var tolerance  = new Tolerance(10);
            var alignment  = new LcMsFeatureAlignment(new SpikeInFeatureComparer(tolerance));

            for (var i = 0; i < nDataset; i++)
            {
                var rawFile   = string.Format(@"{0}\{1}.pbf", RawFolder, datasets[i]);
                var mspFile   = string.Format(@"{0}\{1}_IcTda.tsv", MsPfFolder, datasets[i]);
                var ms1FtFile = string.Format(@"{0}\{1}.ms1ft", Ms1FtFolder, datasets[i]);

                var run      = PbfLcMsRun.GetLcMsRun(rawFile);
                var prsmList = prsmReader.LoadIdentificationResult(mspFile, ProteinSpectrumMatch.SearchTool.MsPathFinder);
                var features = LcMsFeatureAlignment.LoadProMexResult(i, ms1FtFile, run);

                for (var j = 0; j < prsmList.Count; j++)
                {
                    var match = prsmList[j];
                    match.ProteinId = match.ProteinName;
                }

                // tag features by PrSMs
                for (var j = 0; j < features.Count; j++)
                {
                    //features[j].ProteinSpectrumMatches = new ProteinSpectrumMatchSet(i);
                    var massTol = tolerance.GetToleranceAsTh(features[j].Mass);
                    foreach (var match in prsmList)
                    {
                        if (features[j].MinScanNum < match.ScanNum && match.ScanNum < features[j].MaxScanNum && Math.Abs(features[j].Mass - match.Mass) < massTol)
                        {
                            features[j].ProteinSpectrumMatches.Add(match);
                        }
                    }
                }

                alignment.AddDataSet(i, features, run);
            }

            alignment.AlignFeatures();

            Console.WriteLine("{0} alignments ", alignment.CountAlignedFeatures);

            /*
             * for (var i = 0; i < nDataset; i++)
             * {
             *  alignment.FillMissingFeatures(i);
             *  Console.WriteLine("{0} has been processed", runLabels[i]);
             * }
             */
            OutputCrossTabWithId(outFilePath, alignment, runLabels);
        }
Пример #5
0
        public static ICollection <ProteinSpectrumMatchSet> CollectTrainSet(string pbfFilePath, string idFilePath)
        {
            Modification.RegisterAndGetModification(Modification.Cysteinyl.Name, Modification.Cysteinyl.Composition);

            var prsmReader = new ProteinSpectrumMatchReader(0.01);

            var prsmList = prsmReader.LoadIdentificationResult(idFilePath);
            var run      = PbfLcMsRun.GetLcMsRun(pbfFilePath);

            var groupedPrsmList = GroupingByPrsm(0, prsmList, new PrsmComparer(run));

            var finalPrsmGroups = new List <ProteinSpectrumMatchSet>();

            foreach (var prsmSet in groupedPrsmList)
            {
                if (prsmSet.Count < 2)
                {
                    continue;
                }

                var isGood   = false;
                var sequence = prsmSet[0].GetSequence();
                if (sequence == null)
                {
                    continue;
                }

                foreach (var scan in prsmSet.Select(prsm => prsm.ScanNum))
                {
                    var spectrum = run.GetSpectrum(scan) as ProductSpectrum;
                    if (spectrum == null)
                    {
                        continue;
                    }
                    if (IsGoodTarget(spectrum, sequence))
                    {
                        isGood = true;
                        break;
                    }
                }

                if (isGood)
                {
                    finalPrsmGroups.Add(prsmSet);
                }
            }
            return(finalPrsmGroups);
        }
Пример #6
0
        public void TestQuantifyIdedProteoforms()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            Utils.ShowStarting(methodName);

            const string rawFolder           = @"\\proto-11\MSXML_Cache\PBF_Gen_1_193\2015_2";
            const string promexOutFolder     = @"D:\MassSpecFiles\UTEX\MSAlign";
            const string msAlignResultFolder = @"D:\MassSpecFiles\UTEX\MSAlign";

            if (!Directory.Exists(rawFolder))
            {
                Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, rawFolder);
            }

            var nDataset = 32;
            var dataset  = new string[nDataset];

            for (var i = 0; i < nDataset; i++)
            {
                dataset[i] = string.Format("Syn_utex2973_Top_{0,2:D2}_TopDown_7May15_Bane_14-09-01RZ", i + 1);
                //var rawFile = string.Format(@"{0}\{1}.pbf", rawFolder, dataset[i]);
            }

            var prsmReader = new ProteinSpectrumMatchReader(0.01);

            var filesProcessed = 0;

            var tolerance = new Tolerance(10);

            for (var i = 0; i < dataset.Length; i++)
            {
                var rawFile = string.Format(@"{0}\{1}.pbf", rawFolder, dataset[i]);
                if (!File.Exists(rawFile))
                {
                    Console.WriteLine(@"Warning: Skipping file not found: {0}", rawFile);
                    continue;
                }
                var run = PbfLcMsRun.GetLcMsRun(rawFile);

                var path = string.Format(@"{0}\{1}_MSAlign_ResultTable.txt", msAlignResultFolder, dataset[i]);
                if (!File.Exists(path))
                {
                    Console.WriteLine(@"Warning: Skipping file not found: {0}", path);
                    continue;
                }

                var prsmList = prsmReader.LoadIdentificationResult(path, ProteinSpectrumMatch.SearchTool.MsAlign);

                filesProcessed++;

                for (var j = 0; j < prsmList.Count; j++)
                {
                    var match = prsmList[j];
                    match.ProteinId = match.ProteinName.Substring(match.ProteinName.IndexOf(ProteinNamePrefix) + ProteinNamePrefix.Length, 5);
                }

                // PrSM To Feature
                var prsmToFeatureIdMap = new int[prsmList.Count];
                for (var k = 0; k < prsmToFeatureIdMap.Length; k++)
                {
                    prsmToFeatureIdMap[k] = -1;
                }

                // Feature To PrSM
                var featureToPrsm = new List <ProteinSpectrumMatchSet>();

                var featureFinder = new LcMsPeakMatrix(run, new LcMsFeatureLikelihood());
                var featureList   = new List <LcMsPeakCluster>();
                var featureId     = 0;
                for (var j = 0; j < prsmList.Count; j++)
                {
                    if (prsmToFeatureIdMap[j] >= 0)
                    {
                        continue;
                    }

                    var match      = prsmList[j];
                    var minScanNum = match.ScanNum;
                    var maxScanNum = match.ScanNum;
                    var mass       = match.Mass;
                    var charge     = match.Charge;
                    var massTh     = tolerance.GetToleranceAsMz(mass);
                    var id1        = match.ProteinId;

                    var feature = featureFinder.GetLcMsPeakCluster(mass, charge, minScanNum, maxScanNum);
                    var prsmSet = new ProteinSpectrumMatchSet(i)
                    {
                        match
                    };
                    if (feature == null)
                    {
                        feature = featureFinder.GetLcMsPeaksFromNoisePeaks(mass, charge, minScanNum, maxScanNum, charge, charge);
                        prsmToFeatureIdMap[j] = featureId;
                    }
                    else
                    {
                        prsmToFeatureIdMap[j] = featureId;
                        var etTol = Math.Max(run.GetElutionTime(run.MaxLcScan) * 0.005, feature.ElutionLength * 0.2);

                        for (var k = j + 1; k < prsmList.Count; k++)
                        {
                            var otherMatch = prsmList[k];
                            var id2        = otherMatch.ProteinId;
                            var et2        = run.GetElutionTime(otherMatch.ScanNum);

                            if (id1.Equals(id2) &&
                                feature.MinElutionTime - etTol < et2 && et2 < feature.MaxElutionTime - etTol &&
                                Math.Abs(otherMatch.Mass - mass) < massTh)
                            {
                                prsmToFeatureIdMap[k] = featureId;
                                prsmSet.Add(otherMatch);
                            }
                        }
                    }
                    featureId++;

                    feature.Flag = 1;
                    featureList.Add(feature);
                    featureToPrsm.Add(prsmSet);
                }

                // Overlap between features???
                for (var j = 0; j < featureList.Count; j++)
                {
                    var f1 = featureList[j];
                    if (f1.Flag < 1)
                    {
                        continue;
                    }
                    var prsm1 = featureToPrsm[j];

                    for (var k = j + 1; k < featureList.Count; k++)
                    {
                        var f2 = featureList[k];
                        if (f2.Flag < 1)
                        {
                            continue;
                        }

                        var prsm2 = featureToPrsm[k];
                        if (Math.Abs(f1.Mass - f2.Mass) > tolerance.GetToleranceAsMz(f1.Mass))
                        {
                            continue;
                        }
                        if (!f1.CoElutedByNet(f2, 0.005))
                        {
                            continue;
                        }
                        if (!prsm1.ShareProteinId(prsm2))
                        {
                            continue;
                        }

                        // let us merge!!
                        if (f1.ScanLength > f2.ScanLength)
                        {
                            prsm1.AddRange(prsm2);
                            prsm2.Clear();
                            f2.Flag = 0;
                        }
                        else
                        {
                            prsm2.AddRange(prsm1);
                            prsm1.Clear();
                            f1.Flag = 0;
                        }
                    }
                }

                // now output results!!
                var ms1ftFilePath = string.Format(@"{0}\{1}.ms1ft", promexOutFolder, dataset[i]);
                var writer        = new StreamWriter(ms1ftFilePath);
                writer.WriteLine(LcMsFeatureFinderLauncher.GetHeaderString());

                for (var j = 0; j < featureList.Count; j++)
                {
                    var f1 = featureList[j];
                    if (f1.Flag < 1)
                    {
                        continue;
                    }
                    var prsm1 = featureToPrsm[j];

                    var minScanNum = run.GetPrevScanNum(prsm1.MinScanNum, 1);
                    var maxScanNum = run.GetNextScanNum(prsm1.MaxScanNum, 1);
                    f1.ExpandScanRange(minScanNum, maxScanNum);

                    writer.Write("{0}\t", j + 1);
                    writer.WriteLine(LcMsFeatureFinderLauncher.GetString(f1));
                }
                writer.Close();

                Console.WriteLine(ms1ftFilePath);
            }

            if (filesProcessed == 0)
            {
                Assert.Ignore("Skipped since data files not found");
            }
        }
Пример #7
0
        public void TestAlignFeatures()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            Utils.ShowStarting(methodName);

            const string rawFolder           = @"\\proto-11\MSXML_Cache\PBF_Gen_1_193\2015_2";
            const string promexOutFolder     = @"D:\MassSpecFiles\UTEX\MSAlign";
            const string msAlignResultFolder = @"D:\MassSpecFiles\UTEX\MSAlign";

            if (!Directory.Exists(rawFolder))
            {
                Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, rawFolder);
            }

            var nDataset = 32;
            var dataset  = new string[nDataset];

            for (var i = 0; i < nDataset; i++)
            {
                dataset[i] = string.Format("Syn_utex2973_Top_{0,2:D2}_TopDown_7May15_Bane_14-09-01RZ", i + 1);
                //var rawFile = string.Format(@"{0}\{1}.pbf", rawFolder, dataset[i]);
            }

            var tolerance      = new Tolerance(10);
            var ftComparer     = new UtexFeatureComparer(tolerance);
            var align          = new LcMsFeatureAlignment(ftComparer);
            var prsmReader     = new ProteinSpectrumMatchReader(0.01);
            var filesProcessed = 0;

            for (var i = 0; i < dataset.Length; i++)
            {
                var rawFile = string.Format(@"{0}\{1}.pbf", rawFolder, dataset[i]);
                if (!File.Exists(rawFile))
                {
                    Console.WriteLine(@"Warning: Skipping file not found: {0}", rawFile);
                    continue;
                }
                var run = PbfLcMsRun.GetLcMsRun(rawFile);

                var path = string.Format(@"{0}\{1}_MSAlign_ResultTable.txt", msAlignResultFolder, dataset[i]);
                if (!File.Exists(path))
                {
                    Console.WriteLine(@"Warning: Skipping file not found: {0}", path);
                    continue;
                }

                var ms1ftPath = string.Format(@"{0}\{1}.ms1ft", promexOutFolder, dataset[i]);
                if (!File.Exists(ms1ftPath))
                {
                    Console.WriteLine(@"Warning: Skipping file not found: {0}", ms1ftPath);
                    continue;
                }

                filesProcessed++;

                //var map = new ProteinSpectrumMathMap(run, i, dataset[i]);
                //map.LoadIdentificationResult(path, ProteinSpectrumMatch.SearchTool.MsAlign);
                var prsmList = prsmReader.LoadIdentificationResult(path, ProteinSpectrumMatch.SearchTool.MsAlign);

                for (var j = 0; j < prsmList.Count; j++)
                {
                    var match = prsmList[j];
                    match.ProteinId =
                        match.ProteinName.Substring(
                            match.ProteinName.IndexOf(ProteinNamePrefix) + ProteinNamePrefix.Length, 5);
                }

                var features = LcMsFeatureAlignment.LoadProMexResult(i, ms1ftPath, run);

                // tag features by PrSMs
                for (var j = 0; j < features.Count; j++)
                {
                    //features[j].ProteinSpectrumMatches = new ProteinSpectrumMatchSet(i);
                    var massTol = tolerance.GetToleranceAsMz(features[j].Mass);
                    foreach (var match in prsmList)
                    {
                        if (features[j].MinScanNum < match.ScanNum && match.ScanNum < features[j].MaxScanNum && Math.Abs(features[j].Mass - match.Mass) < massTol)
                        {
                            features[j].ProteinSpectrumMatches.Add(match);
                        }
                    }
                }

                align.AddDataSet(i, features, run);
            }

            if (filesProcessed == 0)
            {
                Assert.Ignore("Skipped since input files not found");
            }

            align.AlignFeatures();
            Console.WriteLine("{0} alignments ", align.CountAlignedFeatures);
            align.RefineAbundance();

            var alignedFeatureList = align.GetAlignedFeatures();

            for (var i = 0; i < nDataset; i++)
            {
                var ms1ftPath = string.Format(@"{0}\{1}_aligned.ms1ft", promexOutFolder, dataset[i]);
                var writer    = new StreamWriter(ms1ftPath);
                writer.Write(LcMsFeatureFinderLauncher.GetHeaderString());
                writer.WriteLine("\tIdedMs2ScanNums");

                for (var j = 0; j < alignedFeatureList.Count; j++)
                {
                    writer.Write(j + 1);
                    writer.Write("\t");

                    if (alignedFeatureList[j][i] == null)
                    {
                        for (var k = 0; k < 14; k++)
                        {
                            writer.Write("0\t");
                        }
                        writer.Write("0\n");
                    }
                    else
                    {
                        writer.Write(LcMsFeatureFinderLauncher.GetString(alignedFeatureList[j][i]));
                        writer.Write("\t");

                        if (alignedFeatureList[j][i].ProteinSpectrumMatches == null)
                        {
                            writer.Write("");
                        }
                        else
                        {
                            var scanNums = string.Join(";", alignedFeatureList[j][i].ProteinSpectrumMatches.Select(prsm => prsm.ScanNum));
                            writer.Write(scanNums);
                        }

                        writer.Write("\n");
                    }
                }
                writer.Close();
            }
        }
Пример #8
0
        public void FindMissingLcMsFeatures()
        {
            var mspfFolder  = @"D:\MassSpecFiles\CompRef_Kelleher\Study3";
            var ms1ftFolder = @"D:\MassSpecFiles\CompRef_Kelleher\Study3";

            const int Nfraction1 = 3;
            const int Nfraction2 = 5;

            for (var frac1 = 1; frac1 <= Nfraction1; frac1++)
            {
                for (var frac2 = 1; frac2 <= Nfraction2; frac2++)
                {
                    var datasets = GetDataSetNamesStudy3(frac1, frac2);
                    //var outFilePath = string.Format(@"D:\MassSpecFiles\CompRef_Kelleher\study3_GFrep{0}_Gfrac{1}.tsv", frac1.ToString("D2"), frac2.ToString("D2"));
                    var nDataset   = datasets.Count;
                    var prsmReader = new ProteinSpectrumMatchReader();
                    var tolerance  = new Tolerance(12);

                    for (var i = 0; i < nDataset; i++)
                    {
                        var rawFile   = string.Format(@"{0}\{1}.pbf", PbfPath, datasets[i]);
                        var mspFile   = string.Format(@"{0}\{1}_IcTda.tsv", mspfFolder, datasets[i]);
                        var ms1FtFile = string.Format(@"{0}\{1}.ms1ft", ms1ftFolder, datasets[i]);
                        var outPath   = string.Format(@"{0}\{1}.seqtag.ms1ft", ms1ftFolder, datasets[i]);

                        if (File.Exists(outPath))
                        {
                            continue;
                        }

                        var run              = PbfLcMsRun.GetLcMsRun(rawFile);
                        var features         = LcMsFeatureAlignment.LoadProMexResult(i, ms1FtFile, run);
                        var prsmList         = prsmReader.LoadIdentificationResult(mspFile, ProteinSpectrumMatch.SearchTool.MsPathFinder);
                        var prsmFeatureMatch = new bool[prsmList.Count];

                        for (var j = 0; j < features.Count; j++)
                        {
                            //features[j].ProteinSpectrumMatches = new ProteinSpectrumMatchSet(i);
                            var massTol = tolerance.GetToleranceAsTh(features[j].Mass);
                            for (var k = 0; k < prsmList.Count; k++)
                            {
                                var match = prsmList[k];
                                if (features[j].MinScanNum < match.ScanNum && match.ScanNum < features[j].MaxScanNum && Math.Abs(features[j].Mass - match.Mass) < massTol)
                                {
                                    features[j].ProteinSpectrumMatches.Add(match);
                                    prsmFeatureMatch[k] = true;
                                }
                            }
                        }

                        var missingPrsm = new List <ProteinSpectrumMatch>();
                        for (var k = 0; k < prsmList.Count; k++)
                        {
                            if (!prsmFeatureMatch[k])
                            {
                                missingPrsm.Add(prsmList[k]);
                            }
                        }

                        FeatureFind(missingPrsm, run, outPath);
                        Console.WriteLine(outPath);
                    }
                }
            }
        }
Пример #9
0
        public void TestIMERFeatureAlignment()
        {
            const string outFilePath = @"D:\MassSpecFiles\IMER\promex_crosstab.tsv";
            const string rawFolder   = @"D:\MassSpecFiles\IMER";
            var          runLabels   = new string[] { "1", "2", "3", "4", "5", "6" };

            var nDataset = runLabels.Length;
            //CPTAC_Intact_CR32A_24Aug15_Bane_15-02-06-RZ
            var prsmReader = new ProteinSpectrumMatchReader();
            var tolerance  = new Tolerance(10);
            var alignment  = new LcMsFeatureAlignment(new CompRefFeatureComparer(tolerance));

            for (var i = 0; i < nDataset; i++)
            {
                var k         = runLabels[i].Equals("2") || runLabels[i].Equals("3") ? 14 : 13;
                var rawFile   = string.Format(@"{0}\Diabetes_iPSC_Beta_{1}_IMER_{2}May14_Alder_14-01-33.pbf", rawFolder, runLabels[i], k);
                var mspFile   = string.Format(@"{0}\Diabetes_iPSC_Beta_{1}_IMER_{2}May14_Alder_14-01-33_msgfdb_syn.txt", rawFolder, runLabels[i], k);
                var ms1FtFile = string.Format(@"{0}\Diabetes_iPSC_Beta_{1}_IMER_{2}May14_Alder_14-01-33.ms1ft", rawFolder, runLabels[i], k);

                Console.WriteLine(rawFile);
                Console.WriteLine(File.Exists(rawFile));

                var run      = PbfLcMsRun.GetLcMsRun(rawFile);
                var features = LcMsFeatureAlignment.LoadProMexResult(i, ms1FtFile, run, 500, 15000);

                if (File.Exists(mspFile))
                {
                    var prsmList = prsmReader.LoadIdentificationResult(mspFile, ProteinSpectrumMatch.SearchTool.MsGfPlus);

                    for (var j = 0; j < prsmList.Count; j++)
                    {
                        var match = prsmList[j];
                        match.ProteinId = match.ProteinName;
                    }

                    // tag features by PrSMs
                    for (var j = 0; j < features.Count; j++)
                    {
                        //features[j].ProteinSpectrumMatches = new ProteinSpectrumMatchSet(i);
                        var massTol = tolerance.GetToleranceAsTh(features[j].Mass);
                        foreach (var match in prsmList)
                        {
                            if (features[j].MinScanNum < match.ScanNum && match.ScanNum < features[j].MaxScanNum && Math.Abs(features[j].Mass - match.Mass) < massTol)
                            {
                                features[j].ProteinSpectrumMatches.Add(match);
                            }
                        }
                    }
                }


                alignment.AddDataSet(i, features, run);
            }

            alignment.AlignFeatures();

            Console.WriteLine("{0} alignments ", alignment.CountAlignedFeatures);

            for (var i = 0; i < nDataset; i++)
            {
                alignment.FillMissingFeatures(i);
                Console.WriteLine("{0} has been processed", runLabels[i]);
            }

            OutputCrossTabWithId(outFilePath, alignment, runLabels);
        }
Пример #10
0
        public static void Main(string[] args)
        {
            // Parse file
            var inputFilePath = args[0];
            var datasets      = DatasetInfo.ParseDatasetInfoFile(inputFilePath);

            var fileName       = Path.GetFileNameWithoutExtension(inputFilePath);
            var directory      = Path.GetDirectoryName(inputFilePath);
            var outputfilePath = Path.Combine(directory, string.Format("{0}_crosstab.tsv", fileName));

            int nDataset   = datasets.Count;
            var prsmReader = new ProteinSpectrumMatchReader();
            var tolerance  = new Tolerance(100);
            var alignment  = new LcMsFeatureAlignment(new CompRefFeatureComparer(tolerance));

            int dataId = 0;

            foreach (var dataset in datasets)
            {
                var run      = PbfLcMsRun.GetLcMsRun(dataset.RawFilePath, 0, 0);
                var features = LcMsFeatureAlignment.LoadProMexResult(dataId++, dataset.Ms1FtFilePath, run);

                if (File.Exists(dataset.MsPfIdFilePath))
                {
                    var prsmList = prsmReader.LoadIdentificationResult(dataset.MsPfIdFilePath, ProteinSpectrumMatch.SearchTool.MsPathFinder);

                    foreach (var match in prsmList)
                    {
                        match.ProteinId = match.ProteinName;
                    }

                    // tag features by PrSMs
                    foreach (LcMsFeature feature in features)
                    {
                        //features[j].ProteinSpectrumMatches = new ProteinSpectrumMatchSet(i);
                        var massTol = tolerance.GetToleranceAsMz(feature.Mass);
                        foreach (var match in prsmList)
                        {
                            if (feature.MinScanNum < match.ScanNum && match.ScanNum < feature.MaxScanNum && Math.Abs(feature.Mass - match.Mass) < massTol)
                            {
                                feature.ProteinSpectrumMatches.Add(match);
                            }
                        }
                    }
                }

                alignment.AddDataSet(dataId, features, run);
            }

            alignment.AlignFeatures();

            Console.WriteLine("{0} alignments ", alignment.CountAlignedFeatures);

            for (var i = 0; i < nDataset; i++)
            {
                alignment.FillMissingFeatures(i);
                Console.WriteLine("{0} has been processed", datasets[i].Label);
            }

            OutputCrossTabWithId(outputfilePath, alignment, datasets.Select(ds => ds.Label).ToArray());
        }
Пример #11
0
        public static void Main(string[] args)
        {
            if (args.Length == 0)
            {
                ShowSyntax();
                return;
            }

            // Parse file
            var inputFilePath = args[0];

            if (!File.Exists(inputFilePath))
            {
                ConsoleMsgUtils.ShowError("File not found: " + inputFilePath);
                return;
            }

            var datasets = DatasetInfo.ParseDatasetInfoFile(inputFilePath);

            if (datasets.Count == 0)
            {
                ConsoleMsgUtils.ShowError("No valid data found in the dataset info file");
                ShowSyntax();
                return;
            }

            var fileName  = Path.GetFileNameWithoutExtension(inputFilePath);
            var directory = Path.GetDirectoryName(inputFilePath);

            var crosstabFilename = string.Format("{0}_crosstab.tsv", fileName);

            string outputfilePath;

            if (string.IsNullOrWhiteSpace(directory))
            {
                outputfilePath = crosstabFilename;
            }
            else
            {
                outputfilePath = Path.Combine(directory, crosstabFilename);
            }

            var nDataset   = datasets.Count;
            var prsmReader = new ProteinSpectrumMatchReader();
            var tolerance  = new Tolerance(100);
            var alignment  = new LcMsFeatureAlignment(new CompRefFeatureComparer(tolerance));

            var dataId = 0;

            foreach (var dataset in datasets)
            {
                if (!File.Exists(dataset.RawFilePath))
                {
                    ConsoleMsgUtils.ShowError("Instrument file not found: " + dataset.RawFilePath);
                    continue;
                }

                if (!File.Exists(dataset.Ms1FtFilePath))
                {
                    ConsoleMsgUtils.ShowError("ProMex results file not found: " + dataset.Ms1FtFilePath);
                    continue;
                }

                Console.WriteLine("Opening " + dataset.RawFilePath);
                var run = PbfLcMsRun.GetLcMsRun(dataset.RawFilePath, 0, 0);

                Console.WriteLine("Opening " + dataset.Ms1FtFilePath);
                var features = LcMsFeatureAlignment.LoadProMexResult(dataId++, dataset.Ms1FtFilePath, run);

                if (!string.IsNullOrWhiteSpace(dataset.MsPfIdFilePath) && File.Exists(dataset.MsPfIdFilePath))
                {
                    Console.WriteLine("Opening " + dataset.MsPfIdFilePath);
                    var prsmList = prsmReader.LoadIdentificationResult(dataset.MsPfIdFilePath, ProteinSpectrumMatch.SearchTool.MsPathFinder);

                    foreach (var match in prsmList)
                    {
                        match.ProteinId = match.ProteinName;
                    }

                    // tag features by PrSMs
                    foreach (var feature in features)
                    {
                        //features[j].ProteinSpectrumMatches = new ProteinSpectrumMatchSet(i);
                        var massTol = tolerance.GetToleranceAsMz(feature.Mass);
                        foreach (var match in prsmList)
                        {
                            if (feature.MinScanNum < match.ScanNum && match.ScanNum < feature.MaxScanNum && Math.Abs(feature.Mass - match.Mass) < massTol)
                            {
                                feature.ProteinSpectrumMatches.Add(match);
                            }
                        }
                    }
                }

                alignment.AddDataSet(dataId, features, run);
            }

            alignment.AlignFeatures();

            Console.WriteLine("{0} alignments ", alignment.CountAlignedFeatures);

            var validResults = 0;

            for (var datasetIndex = 0; datasetIndex < nDataset; datasetIndex++)
            {
                if (datasetIndex >= alignment.CountDatasets)
                {
                    ConsoleMsgUtils.ShowWarning(string.Format("Could not align {0}; features not found", datasets[datasetIndex].Label));
                    continue;
                }

                alignment.FillMissingFeatures(datasetIndex);
                Console.WriteLine("{0} has been processed", datasets[datasetIndex].Label);
                validResults++;
            }

            if (validResults > 0)
            {
                OutputCrossTabWithId(outputfilePath, alignment, datasets.Select(ds => ds.Label).ToArray());
            }
        }
Пример #12
0
        public void TestGenerateFrequencyData()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            Utils.ShowStarting(methodName);

            const string idFileFolder  = @"D:\MassSpecFiles\training\IdScoring\MSPF_trainset";
            const string outFileFolder = @"D:\MassSpecFiles\training\IdScoring";

            if (!Directory.Exists(idFileFolder))
            {
                Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, idFileFolder);
            }

            Modification.RegisterAndGetModification(Modification.Cysteinyl.Name, Modification.Cysteinyl.Composition);
            Modification.RegisterAndGetModification(Modification.Phosphorylation.Name, Modification.Phosphorylation.Composition);
            Modification.RegisterAndGetModification(Modification.Methylation.Name, Modification.Methylation.Composition);
            Modification.RegisterAndGetModification(Modification.DiMethylation.Name, Modification.DiMethylation.Composition);
            Modification.RegisterAndGetModification(Modification.TriMethylation.Name, Modification.TriMethylation.Composition);
            Modification.RegisterAndGetModification("Trioxidation", new Composition(0, 0, 0, 3, 0));
            // var aaSet = new AminoAcidSet(@"D:\MassSpecFiles\training\Mods.txt");

            foreach (var dataset in TrainSetFileLists)
            {
                var dataname  = Path.GetFileNameWithoutExtension(dataset);
                var idFile    = string.Format(@"{0}\{1}_IcTda.tsv", idFileFolder, dataname);
                var decoyFile = string.Format(@"{0}\{1}_IcDecoy.tsv", idFileFolder, dataname);
                //  var targetFile = string.Format(@"{0}\{1}_IcTarget.tsv", idFileFolder, dataname);

                if (!File.Exists(idFile))
                {
                    continue;
                }

                var prsmReader = new ProteinSpectrumMatchReader(0.01);
                var prsmList   = prsmReader.LoadIdentificationResult(idFile);

                var minScore     = prsmList.Last().Score;
                var decoyMatches = prsmReader.ReadMsPathFinderResult(decoyFile, int.MaxValue, 1, Math.Max(minScore - 5, 10));
                var run          = PbfLcMsRun.GetLcMsRun(dataset);

                var spectrumMatchSet = LcMsFeatureTrain.CollectTrainSet(dataset, idFile);
                Console.WriteLine(spectrumMatchSet.Count);
                var writer = new StreamWriter(string.Format(@"{0}\{1}_target.tsv", outFileFolder, dataname));

                foreach (var matches in spectrumMatchSet)
                {
                    foreach (var match in matches)
                    {
                        var spec = run.GetSpectrum(match.ScanNum) as ProductSpectrum;
                        GetMatchStatistics(spec, match.GetSequence(), match.Charge, writer);
                    }
                }
                writer.Close();

                writer = new StreamWriter(string.Format(@"{0}\{1}_decoy.tsv", outFileFolder, dataname));
                foreach (var match in decoyMatches)
                {
                    var sequence = match.GetSequence();
                    var spec     = run.GetSpectrum(match.ScanNum) as ProductSpectrum;
                    GetMatchStatistics(spec, sequence, match.Charge, writer);
                }
                writer.Close();
            }
        }