Beispiel #1
0
        /// <summary>
        /// Re-scores every identification listed in a tab-separated result file and writes each row,
        /// followed by its scores, to a new tab-separated file.
        /// </summary>
        /// <remarks>
        /// The value in the "Modifications" column is replaced by the modifications reported by the
        /// scorer, wrapped in square brackets. Rows for which the scorer returns no result are skipped.
        /// </remarks>
        /// <param name="icResultFilePath">Input result file; it is read for the Sequence, ScanNum, Charge and Composition columns.</param>
        /// <param name="outputFilePath">File to write; it is created by <see cref="StreamWriter"/>.</param>
        private void Rescore(string icResultFilePath, string outputFilePath)
        {
            var parser       = new TsvFileParser(icResultFilePath);
            var sequences    = parser.GetData("Sequence");
            var scanNums     = parser.GetData("ScanNum").Select(s => Convert.ToInt32(s)).ToArray();
            var charges      = parser.GetData("Charge").Select(c => Convert.ToInt32(c)).ToArray();
            var compositions = parser.GetData("Composition").Select(Composition.Parse).ToArray();

            var headers  = parser.GetHeaders();
            var rows     = parser.GetRows();
            // -1 when the file has no such column; in that case no cell is replaced below.
            var modIndex = headers.IndexOf("Modifications");

            using (var writer = new StreamWriter(outputFilePath))
            {
                writer.WriteLine("{0}\t{1}", string.Join("\t", headers), IcScores.GetScoreNames());
                for (var i = 0; i < parser.NumData; i++)
                {
                    var scores = _topDownScorer.GetScores(AminoAcid.ProteinNTerm, sequences[i], AminoAcid.ProteinCTerm, compositions[i], charges[i], scanNums[i]);
                    if (scores == null)
                    {
                        // Nothing to report for this row; skipping avoids dereferencing a null result.
                        continue;
                    }

                    var tokens = rows[i].Split('\t');
                    for (var j = 0; j < tokens.Length; j++)
                    {
                        var cell = j == modIndex ? "[" + scores.Modifications + "]" : tokens[j];
                        // Every cell is followed by a tab, so the scores start in the next column.
                        writer.Write(cell + "\t");
                    }
                    writer.WriteLine(scores);
                }
            }
        }
Beispiel #2
0
        /// <summary>
        /// For every identification in <paramref name="resultFile"/> whose QValue is at most 0.01,
        /// runs <c>SequenceTagIndexFinder.GetLongestSequence</c> against the matching spectrum and
        /// collects the outcome as comma-separated text.
        /// </summary>
        /// <param name="rawFile">Spectrum file opened through <c>PbfLcMsRun.GetLcMsRun</c>.</param>
        /// <param name="resultFile">Tab-separated result file with the Scan, QValue, Sequence, Modifications and #MatchedFragments columns.</param>
        /// <param name="methodName">Name of the calling test; used only in the warning messages.</param>
        /// <returns>
        /// One line per accepted row, each terminated by "\n" and holding the scan number, matched
        /// fragments, sequence and the six result items; "\n" alone when either input file is missing.
        /// </returns>
        public string ProcessFile(string rawFile, string resultFile, string methodName)
        {
            if (!File.Exists(rawFile))
            {
                Console.WriteLine(@"Warning: Skipping test {0} since file not found: {1}", methodName, rawFile);
                return("\n");
            }

            if (!File.Exists(resultFile))
            {
                Console.WriteLine(@"Warning: Skipping test {0} since file not found: {1}", methodName, resultFile);
                return("\n");
            }

            var tsvParser      = new TsvFileParser(resultFile);
            var tsvData        = tsvParser.GetAllData();
            var ms2ScanNumbers = tsvData["Scan"];

            var run = PbfLcMsRun.GetLcMsRun(rawFile, 0, 0);

            // Parse the scan column once and remember the first row of each scan number, instead of
            // re-parsing the whole column for every row. A scan number that occurs on several rows
            // always resolves to the first such row.
            var scanNums       = new int[ms2ScanNumbers.Count];
            var firstRowOfScan = new System.Collections.Generic.Dictionary<int, int>();
            for (var i = 0; i < ms2ScanNumbers.Count; i++)
            {
                scanNums[i] = Int32.Parse(ms2ScanNumbers[i]);
                if (!firstRowOfScan.ContainsKey(scanNums[i]))
                {
                    firstRowOfScan.Add(scanNums[i], i);
                }
            }

            var resultText = new System.Text.StringBuilder();

            for (var i = 0; i < scanNums.Length; i++)
            {
                var scanNum  = scanNums[i];
                var tsvIndex = firstRowOfScan[scanNum];

                var qValue = Double.Parse(tsvData["QValue"][tsvIndex]);
                if (qValue > 0.01)
                {
                    continue;
                }

                // Only load the spectrum for rows that pass the q-value filter.
                var spectrum = run.GetSpectrum(scanNum) as ProductSpectrum;

                var seqStr         = tsvData["Sequence"][tsvIndex].Trim();
                var seqMod         = tsvData["Modifications"][tsvIndex].Trim();
                var matchedFrags   = tsvData["#MatchedFragments"][tsvIndex].Trim();
                var aaSet          = new AminoAcidSet();
                var sequence       = Sequence.CreateSequence(seqStr, seqMod, aaSet);
                var tol            = new Tolerance(10);
                var sequenceFinder = new SequenceTagIndexFinder(tol, 1, 10);
                var results        = sequenceFinder.GetLongestSequence(spectrum, sequence);
                resultText.AppendFormat("{0},{1},{2},{3},{4},{5},{6},{7},{8},\n", scanNum, matchedFrags, seqStr, results.Item1, results.Item2, results.Item3, results.Item4, results.Item5, results.Item6);
            }
            return(resultText.ToString());
        }
Beispiel #3
0
        /// <summary>
        /// Re-scores the identifications of a tab-separated result file that has "Peptide",
        /// "Scan(s)" and "Charge" columns, and writes every scored row plus its scores to
        /// <paramref name="outputFilePath"/>.
        /// </summary>
        /// <remarks>
        /// Rows are dropped when no sequence can be extracted from the peptide string, when the
        /// sequence contains "(" (modified identifications are not handled yet), or when the
        /// scorer returns null.
        /// </remarks>
        /// <param name="msAlignFilePath">Path of the input result file.</param>
        /// <param name="outputFilePath">Path of the tab-separated file to write.</param>
        private void Rescore(string msAlignFilePath, string outputFilePath)
        {
            var tsv           = new TsvFileParser(msAlignFilePath);
            var peptideColumn = tsv.GetData("Peptide");
            var scanColumn    = tsv.GetData("Scan(s)").Select(text => Convert.ToInt32(text)).ToArray();
            var chargeColumn  = tsv.GetData("Charge").Select(text => Convert.ToInt32(text)).ToArray();

            var dataRows    = tsv.GetRows();
            var columnNames = tsv.GetHeaders();

            using (var output = new StreamWriter(outputFilePath))
            {
                // Header: the input columns followed by the score column names.
                output.WriteLine("{0}\t{1}", string.Join("\t", columnNames), IcScores.GetScoreNames());

                for (var rowIndex = 0; rowIndex < tsv.NumData; rowIndex++)
                {
                    var inputRow     = dataRows[rowIndex];
                    var bareSequence = SimpleStringProcessing.GetStringBetweenDots(peptideColumn[rowIndex]);

                    // TODO: identifications with modifications are currently ignored.
                    var isUsable = bareSequence != null && !bareSequence.Contains("(");
                    if (!isUsable)
                    {
                        continue;
                    }

                    var scoreSet = _topDownScorer.GetScores(
                        AminoAcid.ProteinNTerm,
                        bareSequence,
                        AminoAcid.ProteinCTerm,
                        AASet.GetComposition(bareSequence),
                        chargeColumn[rowIndex],
                        scanColumn[rowIndex]);

                    if (scoreSet != null)
                    {
                        output.WriteLine("{0}\t{1}", inputRow, scoreSet);
                    }
                }
            }
        }
Beispiel #4
0
        /// <summary>
        /// Assigns identifications (rows of the per-dataset "*_IcTda.tsv" files with QValue at most 0.01)
        /// to the features listed in "aligned_features.tsv" and writes the matches to "aligned_ids.tsv".
        /// </summary>
        /// <remarks>
        /// An identification matches a feature when its mass is within the 12-unit <c>Tolerance</c> of the
        /// feature's MonoMass and its scan lies strictly between the feature's min and max scan for that
        /// dataset. Each feature keeps the first identification that matched it. The test is ignored when
        /// the input folders or the aligned-feature file are missing.
        /// </remarks>
        public void TestTagAlignedFeatures()
        {
            // Columns named "0" .. "31" of the aligned-feature table are copied to the output.
            const int numCopiedColumns = 32;

            var methodName = MethodBase.GetCurrentMethod().Name;

            Utils.ShowStarting(methodName);

            var featureDir = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, "Output");
            var mspDir     = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"Output\MSP");
            var outFile    = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"Output\aligned_features.tsv");
            // No leading separator here: a rooted second argument makes Path.Combine drop the test folder.
            var resultFile = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"Output\aligned_ids.tsv");

            if (!Directory.Exists(featureDir))
            {
                Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, featureDir);
            }

            if (!Directory.Exists(mspDir))
            {
                Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, mspDir);
            }

            if (!File.Exists(outFile))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, outFile);
            }

            var dataset = GetDataList(featureDir);

            var tsvParser  = new TsvFileParser(outFile);
            var monoMasses = tsvParser.GetData("MonoMass");
            var massList   = new List<double>();

            // NOTE(review): the BinarySearch below requires massList to be ascending, i.e. the
            // aligned-feature file must be sorted by MonoMass - confirm against the file's producer.
            for (var i = 0; i < tsvParser.NumData; i++)
            {
                massList.Add(Double.Parse(monoMasses[i]));
            }

            var featureIdMap = new Dictionary<int, string>();
            var tolerance    = new Tolerance(12);
            var headers      = new List<string>();

            for (var d = 0; d < dataset.Count; d++)
            {
                var data     = dataset[d];
                var minScans = tsvParser.GetData(string.Format("{0}_minScan", d));
                var maxScans = tsvParser.GetData(string.Format("{0}_maxScan", d));

                var fname    = string.Format(@"{0}\{1}_IcTda.tsv", mspDir, data);
                var idParser = new TsvFileParser(fname);
                var idRows   = idParser.GetRows();
                if (headers.Count < 1)
                {
                    // All identification files are expected to share one header; take it from the first.
                    headers.AddRange(idParser.GetHeaders());
                }

                var idScans   = idParser.GetData("Scan");
                var idMasses  = idParser.GetData("Mass");
                var idQValues = idParser.GetData("QValue");

                for (var i = 0; i < idParser.NumData; i++)
                {
                    var scan   = Int32.Parse(idScans[i]);
                    var mass   = Double.Parse(idMasses[i]);
                    var qvalue = Double.Parse(idQValues[i]);

                    if (qvalue > 0.01)
                    {
                        // Stops at the first row above the threshold - presumably rows are sorted by QValue.
                        break;
                    }

                    var massTol = tolerance.GetToleranceAsMz(mass);

                    var idx = massList.BinarySearch(mass);
                    if (idx < 0)
                    {
                        idx = ~idx;
                    }

                    // Pass 0 walks down from the insertion point, pass 1 walks up from the element after it.
                    // The insertion point can equal massList.Count, so the downward start is clamped.
                    var found = false;
                    for (var pass = 0; pass < 2 && !found; pass++)
                    {
                        var step  = pass == 0 ? -1 : 1;
                        var start = pass == 0 ? Math.Min(idx, massList.Count - 1) : idx + 1;

                        for (var j = start; j >= 0 && j < massList.Count; j += step)
                        {
                            if (Math.Abs(mass - massList[j]) > massTol)
                            {
                                break;
                            }

                            if (minScans[j].Length < 1)
                            {
                                // Empty cell: the feature has no scan range in this dataset.
                                continue;
                            }

                            if (Int32.Parse(minScans[j]) < scan && scan < Int32.Parse(maxScans[j]))
                            {
                                found = true;
                                if (!featureIdMap.ContainsKey(j))
                                {
                                    featureIdMap.Add(j, idRows[i]);
                                }
                                break;
                            }
                        }
                    }
                }
            }

            // The using block closes the file even if a write throws.
            using (var writer = new StreamWriter(resultFile))
            {
                writer.Write("AlignedFeatureID"); writer.Write("\t");
                writer.Write(string.Join("\t", headers));
                for (var i = 0; i < numCopiedColumns; i++)
                {
                    writer.Write("\t"); writer.Write("{0}", i);
                }
                writer.Write("\n");

                var id = 1;

                foreach (var entry in featureIdMap)
                {
                    writer.Write(id); writer.Write("\t");
                    writer.Write(entry.Value);
                    for (var i = 0; i < numCopiedColumns; i++)
                    {
                        writer.Write("\t"); writer.Write("{0}", tsvParser.GetData(string.Format("{0}", i))[entry.Key]);
                    }
                    writer.Write("\n");
                    id++;
                }
            }
        }
Beispiel #5
0
        /// <summary>
        /// Re-scores the "IcTarget" and "IcDecoy" result files of one dataset: for every row it computes
        /// the scorer's score and a spectral E-value from a generating function, and writes the first
        /// 15 input columns plus "Score" and "EValue" to a "*_Rescored.tsv" file.
        /// </summary>
        /// <remarks>
        /// Rows are processed in parallel and written in input order. The test is ignored when the
        /// hard-coded spectrum file does not exist.
        /// </remarks>
        public void TestCompositeScoring()
        {
            // Number of leading input columns copied to the output; every row needs at least this many.
            const int numKeptColumns = 15;

            var methodName = MethodBase.GetCurrentMethod().Name;

            TestUtils.ShowStarting(methodName);

            const string rawFilePath = @"D:\MassSpecFiles\training\raw\QC_Shew_Intact_26Sep14_Bane_C2Column3.pbf";

            if (!File.Exists(rawFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath);
            }

            // Configure amino acid set
            var oxM      = new SearchModification(Modification.Oxidation, 'M', SequenceLocation.Everywhere, false);
            var dehydroC = new SearchModification(Modification.Dehydro, 'C', SequenceLocation.Everywhere, false);
            var acetylN  = new SearchModification(Modification.Acetylation, '*', SequenceLocation.ProteinNTerm, false);

            const int numMaxModsPerProtein = 4;
            var       searchModifications  = new List<SearchModification>
            {
                dehydroC,
                oxM,
                acetylN
            };
            var aaSet    = new AminoAcidSet(searchModifications, numMaxModsPerProtein);
            var comparer = new FilteredProteinMassBinning(aaSet, 50000, 28);

            var          run = PbfLcMsRun.GetLcMsRun(rawFilePath);
            const double filteringWindowSize    = 1.1;
            const int    isotopeOffsetTolerance = 2;
            var          tolerance    = new Tolerance(10);
            const int    minCharge    = 1;
            const int    maxCharge    = 20;
            var          graphFactory = new ProteinScoringGraphFactory(comparer, aaSet);
            var          aminoAcidSet = new AminoAcidSet();
            var          scorer       = new InformedTopDownScorer(run, aminoAcidSet, minCharge, maxCharge, tolerance);

            var fileExt = new[] { "IcTarget", "IcDecoy" };

            foreach (var ext in fileExt)
            {
                var resultFileName = string.Format(@"D:\MassSpecFiles\training\Rescoring\QC_Shew_Intact_26Sep14_Bane_C2Column3_{0}.tsv", ext);
                var parser         = new TsvFileParser(resultFileName);
                var scans          = parser.GetData("Scan").Select(s => Convert.ToInt32(s)).ToArray();
                var charges        = parser.GetData("Charge").Select(s => Convert.ToInt32(s)).ToArray();
                var protSequences  = parser.GetData("Sequence").ToArray();
                var modStrs        = parser.GetData("Modifications").ToArray();
                var compositions   = parser.GetData("Composition").Select(Composition.Parse).ToArray();
                var protMass       = parser.GetData("Mass").Select(s => Convert.ToDouble(s)).ToArray();
                // Fetched once here rather than once per row inside the parallel loop.
                var rows           = parser.GetRows();
                var outputFileName = string.Format(@"D:\MassSpecFiles\training\Rescoring\QC_Shew_Intact_26Sep14_Bane_C2Column3_{0}_Rescored.tsv", ext);

                using (var writer = new StreamWriter(outputFileName))
                {
                    writer.WriteLine(string.Join("\t", parser.GetHeaders().ToArray(), 0, numKeptColumns) + "\tScore\tEValue");

                    // Results are buffered per row so the file can be written in input order afterwards.
                    var lines = new string[parser.NumData];

                    Parallel.For(0, parser.NumData, i =>
                    {
                        var scan     = scans[i];
                        var charge   = charges[i];
                        var sequence = Sequence.CreateSequence(protSequences[i], modStrs[i], aminoAcidSet);
                        Assert.True(sequence.Composition.Equals(compositions[i] - Composition.H2O));
                        var ms2Spec = run.GetSpectrum(scan) as ProductSpectrum;
                        Assert.True(ms2Spec != null);
                        var scores = scorer.GetScores(sequence, charge, scan);

                        var deconvSpec = Deconvoluter.GetDeconvolutedSpectrum(ms2Spec, minCharge, maxCharge,
                                                                              isotopeOffsetTolerance, filteringWindowSize, tolerance, 0.7);

                        var deconvScorer = new CompositeScorerBasedOnDeconvolutedSpectrum(deconvSpec, ms2Spec, tolerance,
                                                                                          comparer);
                        var graph = graphFactory.CreateScoringGraph(deconvScorer, protMass[i]);

                        var gf = new GeneratingFunction(graph);
                        gf.ComputeGeneratingFunction();

                        var specEvalue = gf.GetSpectralEValue(scores.Score);

                        var items     = rows[i].Split('\t');
                        var newRowStr = string.Join("\t", items, 0, numKeptColumns);

                        // Each iteration writes only its own slot, so no lock is required.
                        lines[i] = string.Format("{0}\t{1}\t{2}", newRowStr, scores.Score, specEvalue);
                    });

                    foreach (var line in lines)
                    {
                        writer.WriteLine(line);
                    }
                }
                Console.WriteLine("Done");
            }
        }
Beispiel #6
0
        /// <summary>
        /// Joins a Skyline transition-results CSV with the probability and q-value of the identifications
        /// found in a folder of per-sample TSV result files, and writes the combined table as TSV.
        /// </summary>
        /// <remarks>
        /// Identifications are keyed by "sample:peptide:nominalMass:charge", where the sample is the first
        /// two characters of the result file name; per key the entry with the highest probability wins.
        /// Skyline rows whose "peptide:nominalMass" never occurs in the result files are dropped, and rows
        /// without a matching identification get "NA" in both score columns. The test is ignored when the
        /// result folder or the Skyline file is missing.
        /// </remarks>
        public void CreatePeptideAbundanceTableWithSkyline()
        {
            const string henryResultPath = @"H:\Research\IPRG2015\Henry_results\tsv";
            const string skylineFilePath = @"H:\Research\IPRG2015\MySkyline\TransitionResults.csv";
            const string outputFilePath  = @"H:\Research\IPRG2015\MySkyline\SkylineTransitionResultsWithScores3.tsv";

            var methodName = MethodBase.GetCurrentMethod().Name;

            Utils.ShowStarting(methodName);

            if (!Directory.Exists(henryResultPath))
            {
                Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, henryResultPath);
            }

            // Checked up front so that a missing Skyline file does not cost a full pass over the result files.
            if (!File.Exists(skylineFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, skylineFilePath);
            }

            // Reading Henry's results
            var pepKeySet = new HashSet<string>();
            var resultDic = new Dictionary<string, Tuple<double, double>>();
            var aaSet     = new AminoAcidSet();

            foreach (var resultFile in Directory.GetFiles(henryResultPath, "*.tsv"))
            {
                var fileName = Path.GetFileName(resultFile);
                if (fileName == null)
                {
                    continue;
                }
                var sample = fileName.Substring(0, 2);
                Console.WriteLine("Processing {0}", sample);
                var tsvReader   = new TsvFileParser(resultFile);
                var idPeptides  = tsvReader.GetData("Peptide").ToArray();
                var idCharges   = tsvReader.GetData("Charge").Select(c => Convert.ToInt32(c)).ToArray();
                var idProbs     = tsvReader.GetData("Prob").Select(Convert.ToDouble).ToArray();
                var idQValues   = tsvReader.GetData("QValue").Select(Convert.ToDouble).ToArray();
                for (var i = 0; i < tsvReader.NumData; i++)
                {
                    var nominalMass = GetNominalMass(aaSet, idPeptides[i]);
                    var pepKey      = GetPeptide(idPeptides[i]) + ":" + nominalMass;
                    var key         = sample + ":" + pepKey + ":" + idCharges[i];
                    pepKeySet.Add(pepKey);

                    // Keep only the most probable identification per key.
                    Tuple<double, double> existingScores;
                    if (!resultDic.TryGetValue(key, out existingScores) || idProbs[i] > existingScores.Item1)
                    {
                        resultDic[key] = new Tuple<double, double>(idProbs[i], idQValues[i]);
                    }
                }
            }

            var skylineTable   = new TsvFileParser(skylineFilePath, ',');
            var skylineHeaders = skylineTable.GetHeaders();
            var skylineRows    = skylineTable.GetRows();

            using (var writer = new StreamWriter(outputFilePath))
            {
                var peptides     = skylineTable.GetData("Peptide Sequence").ToArray();
                // Characters 0 and 2 of the replicate name form the sample id used in the keys above.
                var samples      = skylineTable.GetData("Replicate Name").Select(s => "" + s[0] + s[2]).ToArray();
                var charges      = skylineTable.GetData("Precursor Charge").Select(c => Convert.ToInt32(c)).ToArray();
                var precursorMzs = skylineTable.GetData("Precursor Mz").Select(Convert.ToDouble).ToArray();

                // The last two Skyline columns are replaced by the two score columns.
                writer.WriteLine("{0}\tProbability\tQValue", string.Join("\t", skylineHeaders.Take(skylineHeaders.Count - 2)));
                for (var i = 0; i < skylineTable.NumData; i++)
                {
                    var precursorMz = precursorMzs[i];
                    var charge      = charges[i];
                    var nominalMass = (int)Math.Round(((precursorMz - Constants.Proton) * charge - Composition.H2O.Mass) *
                                                      Constants.RescalingConstant);
                    var pepKey = peptides[i] + ":" + nominalMass;
                    if (!pepKeySet.Contains(pepKey))
                    {
                        continue;
                    }
                    var     key  = samples[i] + ":" + pepKey + ":" + charge;
                    double? prob = null, qValue = null;
                    Tuple<double, double> scores;
                    if (resultDic.TryGetValue(key, out scores))
                    {
                        prob   = scores.Item1;
                        qValue = scores.Item2;
                    }
                    var skylineData = skylineRows[i].Split(',');
                    for (var j = 0; j < skylineData.Length - 2; j++)
                    {
                        if (j != 2)
                        {
                            writer.Write(skylineData[j] + "\t");
                        }
                        else
                        {
                            // Column 2 is shortened the same way as the sample id - presumably it is the replicate name.
                            writer.Write("" + skylineData[j][0] + skylineData[j][2] + "\t");
                        }
                    }
                    writer.WriteLine("{0}\t{1}",
                                     prob != null ? prob.ToString() : "NA",
                                     qValue != null ? qValue.ToString() : "NA");
                }
            }
            Console.WriteLine("Done");
        }
Beispiel #7
0
        public void TestCompositeScoring()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            Utils.ShowStarting(methodName);

            var pbfFilePath = Utils.GetPbfTestFilePath(false);
            var pbfFile     = Utils.GetTestFile(methodName, pbfFilePath);

            // Configure amino acid set
            var oxM      = new SearchModification(Modification.Oxidation, 'M', SequenceLocation.Everywhere, false);
            var dehydroC = new SearchModification(Modification.Dehydro, 'C', SequenceLocation.Everywhere, false);
            var acetylN  = new SearchModification(Modification.Acetylation, '*', SequenceLocation.ProteinNTerm, false);

            const int numMaxModsPerProtein = 4;
            var       searchModifications  = new List <SearchModification>
            {
                dehydroC,
                oxM,
                acetylN
            };
            var aaSet    = new AminoAcidSet(searchModifications, numMaxModsPerProtein);
            var comparer = new FilteredProteinMassBinning(aaSet, 50000, 28);

            var          run = PbfLcMsRun.GetLcMsRun(pbfFile.FullName);
            const double filteringWindowSize    = 1.1;
            const int    isotopeOffsetTolerance = 2;
            var          tolerance    = new Tolerance(10);
            const int    minCharge    = 1;
            const int    maxCharge    = 20;
            var          graphFactory = new ProteinScoringGraphFactory(comparer, aaSet);
            var          aminoAcidSet = new AminoAcidSet();
            //var scorer = new MatchedPeakPostScorer(tolerance, minCharge, maxCharge);
            var scorer = new InformedTopDownScorer(run, aminoAcidSet, minCharge, maxCharge, tolerance);

            if (pbfFile.DirectoryName == null)
            {
                Assert.Ignore("Ignoring test since cannot determine the parent directory of " + pbfFile.FullName);
            }

            var fileExt = new[] { "IcTarget", "IcDecoy" };

            foreach (var ext in fileExt)
            {
                var resultFileName = Path.Combine(pbfFile.DirectoryName, Path.GetFileNameWithoutExtension(pbfFile.Name)) + string.Format("_{0}.tsv", ext);
                var parser         = new TsvFileParser(resultFileName);
                var scans          = parser.GetData("Scan").Select(s => Convert.ToInt32(s)).ToArray();
                var charges        = parser.GetData("Charge").Select(s => Convert.ToInt32(s)).ToArray();
                var protSequences  = parser.GetData("Sequence").ToArray();
                var modStrs        = parser.GetData("Modifications").ToArray();
                var compositions   = parser.GetData("Composition").Select(Composition.Parse).ToArray();
                var protMass       = parser.GetData("Mass").Select(s => Convert.ToDouble(s)).ToArray();

                var outputFileName = Path.Combine(pbfFile.DirectoryName, Path.GetFileNameWithoutExtension(pbfFile.Name)) + string.Format("_{0}_Rescored.tsv", ext);

                using (var writer = new StreamWriter(outputFileName))
                {
                    writer.WriteLine(string.Join("\t", parser.GetHeaders().ToArray(), 0, 15) + "\tScore\tEValue");

                    var lines = new string[parser.NumData];

                    //for (var i = 0; i < parser.NumData; i++)
                    Parallel.For(0, 30, i =>
                    {
                        var scan         = scans[i];
                        var charge       = charges[i];
                        var protSequence = protSequences[i];
                        var modStr       = modStrs[i];
                        var sequence     = Sequence.CreateSequence(protSequence, modStr, aminoAcidSet);
                        // Assert.True(sequence.Composition.Equals(compositions[i] - Composition.H2O));

                        if (!(run.GetSpectrum(scan) is ProductSpectrum ms2Spec))
                        {
                            Console.WriteLine("Could not get the spectrum datafor scan {0}", scan);
                        }
Beispiel #8
0
        /// <summary>
        /// Appends a "PrecursorIntensity" column to an IcTda result file: for each row, the highest
        /// intensity among the isotope peaks of the precursor ion found in the precursor spectrum.
        /// </summary>
        /// <remarks>
        /// Rows for which no isotope peaks are returned keep an intensity of 0. The test is ignored
        /// when the raw file is missing.
        /// </remarks>
        public void AddMostAbundantIsotopePeakIntensity()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            Utils.ShowStarting(methodName);

            var rawFilePath = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"TestYufengData\QC_ShewIntact_40K_LongSeparation_1_141016155143.raw");

            if (!File.Exists(rawFilePath))
            {
                Assert.Ignore(@"Skipping test " + methodName + @" since file not found: " + rawFilePath);
            }

            var lcmsRun = PbfLcMsRun.GetLcMsRun(rawFilePath);

            var resultFilePath = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"TestYufengData\QC_ShewIntact_40K_LongSeparation_1_141016155143_IcTda.tsv");

            var parser            = new TsvFileParser(resultFilePath);
            var compositionColumn = parser.GetData("Composition").Select(Composition.Parse).ToArray();
            var scanColumn        = parser.GetData("Scan").Select(text => Convert.ToInt32(text)).ToArray();
            var chargeColumn      = parser.GetData("Charge").Select(text => Convert.ToInt32(text)).ToArray();
            var maxIntensities    = new double[parser.NumData];
            var ppmTolerance      = new Tolerance(10);

            for (var rowIndex = 0; rowIndex < parser.NumData; rowIndex++)
            {
                var precursorIon  = new Ion(compositionColumn[rowIndex], chargeColumn[rowIndex]);
                var ms1ScanNum    = lcmsRun.GetPrecursorScanNum(scanColumn[rowIndex]);
                var ms1Spectrum   = lcmsRun.GetSpectrum(ms1ScanNum);
                var matchedPeaks  = ms1Spectrum.GetAllIsotopePeaks(precursorIon, ppmTolerance, 0.1);
                if (matchedPeaks == null)
                {
                    // No peaks reported: the entry keeps its default value of 0.
                    continue;
                }

                var highest = 0.0;
                foreach (var peak in matchedPeaks)
                {
                    // Individual entries may be null and are skipped.
                    if (peak != null && peak.Intensity > highest)
                    {
                        highest = peak.Intensity;
                    }
                }
                maxIntensities[rowIndex] = highest;
            }

            // Writing
            var newResultFilePath = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"TestYufengData\QC_ShewIntact_40K_LongSeparation_1_141016155143_IcTdaWithIntensities.tsv");

            using (var output = new StreamWriter(newResultFilePath))
            {
                var headerLine = string.Join("\t", parser.GetHeaders()) + "\t" + "PrecursorIntensity";
                output.WriteLine(headerLine);

                for (var rowIndex = 0; rowIndex < parser.NumData; rowIndex++)
                {
                    var outputLine = parser.GetRows()[rowIndex] + "\t" + maxIntensities[rowIndex];
                    output.WriteLine(outputLine);
                }
            }
            Console.WriteLine("Done");
        }
Beispiel #9
0
        /// <summary>
        /// Re-scores identifications from the "_IcTarget.tsv" and "_IcDecoy.tsv" result files that sit
        /// next to the PBF test file. For each processed row the sequence is scored with
        /// <c>InformedTopDownScorer</c>, a scoring graph is built from the deconvoluted MS2 spectrum,
        /// and a spectral E-value is derived from the generating function of that graph.
        /// </summary>
        /// <remarks>
        /// Only the first <c>maxRowsToRescore</c> rows of each file are processed (fewer when the file
        /// is shorter), and only the first <c>numColumnsToKeep</c> columns of each row are echoed.
        /// NOTE(review): the "_Rescored.tsv" file only receives the header line; the rescored rows are
        /// printed to the console (first <c>maxLinesToPrint</c>) and never written to the file.
        /// Confirm whether that is intended.
        /// </remarks>
        public void TestCompositeScoring()
        {
            // Upper bounds that used to be scattered through the method as magic numbers.
            const int maxRowsToRescore = 30;
            const int numColumnsToKeep = 15;
            const int maxLinesToPrint  = 20;

            var methodName = MethodBase.GetCurrentMethod().Name;

            Utils.ShowStarting(methodName);

            var pbfFilePath = Utils.GetPbfTestFilePath(false);
            var pbfFile     = Utils.GetTestFile(methodName, pbfFilePath);

            // Bail out before opening the run: every result path below is built from this directory.
            if (pbfFile.DirectoryName == null)
            {
                Assert.Ignore("Ignoring test since cannot determine the parent directory of " + pbfFile.FullName);
            }

            // Configure amino acid set
            var oxM      = new SearchModification(Modification.Oxidation, 'M', SequenceLocation.Everywhere, false);
            var dehydroC = new SearchModification(Modification.Dehydro, 'C', SequenceLocation.Everywhere, false);
            var acetylN  = new SearchModification(Modification.Acetylation, '*', SequenceLocation.ProteinNTerm, false);

            const int numMaxModsPerProtein = 4;
            var       searchModifications  = new List<SearchModification>
            {
                dehydroC,
                oxM,
                acetylN
            };
            var aaSet    = new AminoAcidSet(searchModifications, numMaxModsPerProtein);
            var comparer = new FilteredProteinMassBinning(aaSet, 50000, 28);

            var          run = PbfLcMsRun.GetLcMsRun(pbfFile.FullName);
            const double filteringWindowSize    = 1.1;
            const int    isotopeOffsetTolerance = 2;
            var          tolerance    = new Tolerance(10);
            const int    minCharge    = 1;
            const int    maxCharge    = 20;
            var          graphFactory = new ProteinScoringGraphFactory(comparer, aaSet);
            var          aminoAcidSet = new AminoAcidSet();
            var          scorer       = new InformedTopDownScorer(run, aminoAcidSet, minCharge, maxCharge, tolerance);

            // Common prefix of every input/output file: <directory>\<pbf name without extension>
            var resultPathPrefix = Path.Combine(pbfFile.DirectoryName, Path.GetFileNameWithoutExtension(pbfFile.Name));

            var fileExt = new[] { "IcTarget", "IcDecoy" };

            foreach (var ext in fileExt)
            {
                var resultFileName = resultPathPrefix + string.Format("_{0}.tsv", ext);
                var parser         = new TsvFileParser(resultFileName);
                var scans          = parser.GetData("Scan").Select(s => Convert.ToInt32((string)s)).ToArray();
                var charges        = parser.GetData("Charge").Select(s => Convert.ToInt32(s)).ToArray();
                var protSequences  = parser.GetData("Sequence").ToArray();
                var modStrs        = parser.GetData("Modifications").ToArray();
                var protMass       = parser.GetData("Mass").Select(s => Convert.ToDouble(s)).ToArray();

                // Fetched once here instead of once per parallel iteration.
                var rows = parser.GetRows();

                // Never index past the end of a result file that has fewer rows than the cap.
                var rowCount = Math.Min(maxRowsToRescore, parser.NumData);

                var outputFileName = resultPathPrefix + string.Format("_{0}_Rescored.tsv", ext);

                using (var writer = new StreamWriter(outputFileName))
                {
                    // Take() tolerates tables with fewer than numColumnsToKeep columns.
                    writer.WriteLine(string.Join("\t", parser.GetHeaders().Take(numColumnsToKeep)) + "\tScore\tEValue");

                    // One slot per row so that parallel iterations never write to a shared sink.
                    var lines = new string[parser.NumData];

                    Parallel.For(0, rowCount, i =>
                    {
                        var scan     = scans[i];
                        var charge   = charges[i];
                        var sequence = Sequence.CreateSequence(protSequences[i], modStrs[i], aminoAcidSet);
                        var ms2Spec  = run.GetSpectrum(scan) as ProductSpectrum;

                        if (ms2Spec == null)
                        {
                            Console.WriteLine("Could not get the spectrum datafor scan {0}", scan);
                            return;
                        }

                        var scores = scorer.GetScores(sequence, charge, scan);

                        var deconvSpec = Deconvoluter.GetDeconvolutedSpectrum(ms2Spec, minCharge, maxCharge,
                                                                              isotopeOffsetTolerance, filteringWindowSize, tolerance, 0.7);

                        var deconvScorer = new CompositeScorerBasedOnDeconvolutedSpectrum(deconvSpec, ms2Spec, tolerance,
                                                                                          comparer);
                        var graph = graphFactory.CreateScoringGraph(deconvScorer, protMass[i]);

                        var gf = new GeneratingFunction(graph);
                        gf.ComputeGeneratingFunction();

                        var specEvalue = gf.GetSpectralEValue(scores.Score);

                        var newRowStr = string.Join("\t", rows[i].Split('\t').Take(numColumnsToKeep));

                        lines[i] = string.Format("{0}\t{1}\t{2}", newRowStr, scores.Score, specEvalue);
                    });

                    // Rows without a spectrum (and rows beyond rowCount) were left null and are skipped.
                    foreach (var line in lines.Where(item => !string.IsNullOrWhiteSpace(item)).Take(maxLinesToPrint))
                    {
                        Console.WriteLine(line);
                    }
                }
                Console.WriteLine("Done");
            }
        }
Beispiel #10
0
        /// <summary>
        /// Builds a tab-separated abundance table (AMT_Peptides.tsv) from AMT_Peptides_NA.tsv.
        /// For every job listed in Jobs.tsv the matching abundance column is located by searching
        /// the result headers for the job number; each output row carries the peptide, the protein,
        /// the protein length looked up in the FASTA database, one abundance per job, and a flag
        /// that is 1 when the protein name starts with "STANDARD".
        /// </summary>
        /// <remarks>
        /// The test is ignored when the FASTA database or the job file is missing.
        /// Rows whose protein name starts with "XXX" or "Contaminant" are left out of the output.
        /// </remarks>
        public void ProcessIprg2015PreStudy()
        {
            const string dir              = @"H:\Research\IPRG2015";
            const string databaseFilePath = dir + @"\database\yeast6proteaprotein.fasta";
            const string jobFilePath      = dir + @"\Jobs.tsv";
            const string resultFilePath   = dir + @"\AMT_Peptides_NA.tsv";
            const string outputFilePath   = dir + @"\AMT_Peptides.tsv";

            var testName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(testName);

            if (!File.Exists(databaseFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", testName, databaseFilePath);
            }

            var fastaDb = new FastaDatabase(databaseFilePath);
            fastaDb.Read();

            if (!File.Exists(jobFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", testName, jobFilePath);
            }

            // Job numbers, plus the third '_'-separated token of each experiment name.
            var jobTable    = new TsvFileParser(jobFilePath);
            var jobs        = jobTable.GetData("Jobs").Select(text => Convert.ToInt32(text)).ToArray();
            var experiments = jobTable.GetData("Experiments").Select(name => name.Split('_')[2]).ToArray();

            var resultTable = new TsvFileParser(resultFilePath);
            var headers     = resultTable.GetHeaders();

            // For each job: index of the first header containing the job number.
            // An entry stays 0 when no header matches.
            var jobColNum = new int[jobs.Length];
            for (var jobIdx = 0; jobIdx < jobs.Length; jobIdx++)
            {
                var jobText = jobs[jobIdx].ToString();
                var col     = 0;
                while (col < headers.Count && !headers[col].Contains(jobText))
                {
                    col++;
                }

                if (col < headers.Count)
                {
                    jobColNum[jobIdx] = col;
                }
            }

            // Echo the job -> column -> experiment mapping.
            for (var jobIdx = 0; jobIdx < jobs.Length; jobIdx++)
            {
                Console.WriteLine("{0}\t{1}\t{2}", jobs[jobIdx], jobColNum[jobIdx], experiments[jobIdx]);
            }

            using (var output = new StreamWriter(outputFilePath))
            {
                var peptideColumn    = resultTable.GetData("Peptide");   // Peptides
                var proteinColumn    = resultTable.GetData("Reference"); // Proteins
                var abundanceColumns = new string[jobs.Length][];
                for (var jobIdx = 0; jobIdx < jobs.Length; jobIdx++)
                {
                    var columnName = headers[jobColNum[jobIdx]];
                    abundanceColumns[jobIdx] = resultTable.GetData(columnName).ToArray();
                }

                var hasPeptides = peptideColumn != null;

                // Header line
                if (hasPeptides)
                {
                    output.Write("Peptide\t");
                }
                output.Write("Protein\tLength");
                for (var jobIdx = 0; jobIdx < jobs.Length; jobIdx++)
                {
                    output.Write("\t" + experiments[jobIdx]);
                }
                output.WriteLine("\tSpikeIn");

                // Data lines
                for (var row = 0; row < proteinColumn.Count; row++)
                {
                    var proteinName = proteinColumn[row];
                    var isExcluded  = proteinName.StartsWith("XXX") || proteinName.StartsWith("Contaminant");
                    if (!isExcluded)
                    {
                        var proteinLength = fastaDb.GetProteinLength(proteinName);

                        if (hasPeptides)
                        {
                            output.Write(peptideColumn[row] + "\t");
                        }
                        output.Write(proteinName + "\t" + proteinLength);

                        for (var jobIdx = 0; jobIdx < jobs.Length; jobIdx++)
                        {
                            output.Write("\t" + abundanceColumns[jobIdx][row]);
                        }

                        if (proteinName.StartsWith("STANDARD"))
                        {
                            output.WriteLine("\t1");
                        }
                        else
                        {
                            output.WriteLine("\t0");
                        }
                    }
                }
            }
        }
Beispiel #11
0
        /// <summary>
        /// For each identification in the IcTda result file, extracts sequence tags from the
        /// corresponding MS2 spectrum and counts how many of the tag strings occur in the identified
        /// sequence. Prints "hit tags / total tags" per scan, and finally
        /// "spectra with at least one hit / spectra examined".
        /// </summary>
        /// <remarks>
        /// The test is ignored when the raw or result file is missing. Processing stops at the first
        /// row whose QValue exceeds 0.01 (presumably the result file is sorted by q-value; verify
        /// against the file producer). Scans for which no product spectrum can be loaded are reported
        /// and skipped, and are not counted as examined spectra.
        /// </remarks>
        public void TestSequenceTag()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            Utils.ShowStarting(methodName);

            //const string TestRawFile = @"D:\\Vlad_TopDown\\raw\\yufeng_column_test2.raw";
            //const string TestResultFile = @"D:\\Vlad_TopDown\\results\\yufeng_column_test2_IcTda.tsv";
            const string TestRawFile    = @"D:\MassSpecFiles\training\raw\QC_Shew_Intact_26Sep14_Bane_C2Column3.pbf";
            const string TestResultFile = @"D:\MassSpecFiles\training\IdResult\QC_Shew_Intact_26Sep14_Bane_C2Column3_IcTda.tsv";

            //const string TestRawFile = @"D:\MassSpecFiles\Lewy\Lewy_intact_01.raw";
            //const string TestResultFile = @"D:\MassSpecFiles\Lewy\Lewy_intact_01_IcTda.tsv";

            if (!File.Exists(TestRawFile))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, TestRawFile);
            }

            if (!File.Exists(TestResultFile))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, TestResultFile);
            }

            // Configure amino acid set: every standard residue, plus its acetylated and oxidized forms.
            var aminoAcidList = new List<AminoAcid>();

            foreach (var aa in AminoAcid.StandardAminoAcidArr)
            {
                aminoAcidList.Add(aa);
                aminoAcidList.Add(new ModifiedAminoAcid(aa, Modification.Acetylation));
                aminoAcidList.Add(new ModifiedAminoAcid(aa, Modification.Oxidation));
            }

            // Loop-invariant inputs of the tag finder, built once instead of once per scan.
            var aminoAcids = aminoAcidList.ToArray();
            var tolerance  = new Tolerance(5);

            var tsvParser      = new TsvFileParser(TestResultFile);
            var tsvData        = tsvParser.GetAllData();
            var ms2ScanNumbers = tsvData["Scan"];
            var qValues        = tsvData["QValue"];
            var sequences      = tsvData["Sequence"];

            var run      = PbfLcMsRun.GetLcMsRun(TestRawFile);
            var nSpec    = 0;
            var nHitSpec = 0;

            for (var i = 0; i < ms2ScanNumbers.Count; i++)
            {
                var scanNum = Int32.Parse(ms2ScanNumbers[i]);

                // Row i is the row being processed, so its other columns are read at index i directly.
                // Searching the scan column again for scanNum was quadratic and picked the wrong row
                // whenever a scan number occurred more than once.
                var qValue = double.Parse(qValues[i]);
                if (qValue > 0.01)
                {
                    break;
                }

                var spectrum = run.GetSpectrum(scanNum) as ProductSpectrum;
                if (spectrum == null)
                {
                    // Not a product spectrum (or not available): nothing to extract tags from.
                    Console.WriteLine("Could not get the spectrum data for scan {0}", scanNum);
                    continue;
                }

                var seqStr    = sequences[i].Trim();
                var tagFinder = new SequenceTagFinder(spectrum, tolerance, 5, 8, aminoAcids);
                var nTags     = 0;
                var nHit      = 0;

                foreach (var seqTagStr in tagFinder.GetAllSequenceTagString())
                {
                    if (seqStr.Contains(seqTagStr.Sequence))
                    {
                        nHit++;
                    }
                    nTags++;
                }

                nSpec++;
                if (nHit > 0)
                {
                    nHitSpec++;
                }

                Console.WriteLine(@"[{0}]seqLen = {1}: {2}/{3}", scanNum, seqStr.Length, nHit, nTags);
            }

            Console.Write("{0}/{1}", nHitSpec, nSpec);
        }