Exemple #1
0
        /// <summary>
        /// Reads the result file named by <c>_fileName</c> and creates one <see cref="SpectrumMatch"/> for the
        /// first occurrence of every peptide whose q-value does not exceed <c>QValueThreshold</c>.
        /// </summary>
        /// <returns>
        /// The accepted matches in file order. The list is empty when the file has no
        /// <c>TopDownPeptideHeader</c> column.
        /// </returns>
        /// <exception cref="FormatException">
        /// The peptide column is present but the scan, precursor charge or q-value column is not.
        /// </exception>
        public IList <SpectrumMatch> Read()
        {
            var specMatches = new List <SpectrumMatch>();
            var tsvFile     = new TsvFileParser(_fileName);
            var peptides    = tsvFile.GetData(TopDownPeptideHeader);

            // No peptide column: nothing to read (GetData is treated as returning null for an absent column).
            if (peptides == null)
            {
                return(specMatches);
            }

            var precursorCharges = tsvFile.GetData(PrecursorChargeHeader);
            var scans            = tsvFile.GetData(ScanHeader);
            var filterValues     = tsvFile.GetData(QValueHeader);

            // Fail with a descriptive error instead of a NullReferenceException inside the loop.
            if (scans == null || precursorCharges == null || filterValues == null)
            {
                throw new FormatException(
                    string.Format("Result file is missing the scan, precursor charge or q-value column: {0}", _fileName));
            }

            var peptideSet = new HashSet <string>();
            var aset       = new AminoAcidSet();

            for (var i = 0; i < peptides.Count; i++)
            {
                if (Convert.ToDouble(filterValues[i]) > QValueThreshold)
                {
                    continue;
                }

                // Add returns false for a peptide that was already accepted, so only the first
                // passing row of each peptide produces a match.
                if (!peptideSet.Add(peptides[i]))
                {
                    continue;
                }

                var scanNum         = Convert.ToInt32(scans[i]);
                var precursorCharge = Convert.ToInt32(precursorCharges[i]);
                specMatches.Add(new SpectrumMatch(new Sequence(peptides[i], aset), _lcms, scanNum, precursorCharge, _decoy));
            }

            return(specMatches);
        }
        /// <summary>
        /// Parses the "Peptide" and "Formula" columns of MSGFPlusResultTMT10.tsv and checks, row by row,
        /// that the composition of the parsed peptide plus H2O equals the parsed formula.
        /// The test is ignored (Assert.Ignore) when the file cannot be found.
        /// </summary>
        public void TestReadingTmtResultFile()
        {
            var testName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(testName);

            const string resultPath = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSGFPlusResultTMT10.tsv";
            if (!File.Exists(resultPath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", testName, resultPath);
            }

            var tsv           = new TsvFileParser(resultPath);
            var peptideColumn = tsv.GetData("Peptide");
            var formulaColumn = tsv.GetData("Formula");
            Assert.True(peptideColumn.Count == formulaColumn.Count);

            var parsedPeptides = peptideColumn.Select(Sequence.GetSequenceFromMsGfPlusPeptideStr).ToList();
            var parsedFormulae = formulaColumn.Select(Composition.Parse).ToList();
            Assert.True(parsedPeptides.Count == parsedFormulae.Count);

            // Walk both lists in lock-step; the counts were asserted equal above.
            var row = 0;
            foreach (var peptide in parsedPeptides)
            {
                var withWater = peptide.Composition + Composition.H2O;
                Assert.True(withWater.Equals(parsedFormulae[row]));
                row++;
            }
        }
Exemple #3
0
        /// <summary>
        /// Loads MSGFPlusResultTMT10.tsv from <c>Utils.DEFAULT_TEST_FILE_FOLDER</c> and verifies that, for every
        /// row, the parsed peptide's composition plus H2O equals the parsed "Formula" value.
        /// The test is ignored (Assert.Ignore) when the file does not exist.
        /// </summary>
        public void TestReadingTmtResultFile()
        {
            var currentTest = MethodBase.GetCurrentMethod().Name;
            Utils.ShowStarting(currentTest);

            var tsvPath = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, "MSGFPlusResultTMT10.tsv");
            if (!File.Exists(tsvPath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", currentTest, tsvPath);
            }

            var reader          = new TsvFileParser(tsvPath);
            var rawPeptides     = reader.GetData("Peptide");
            var rawCompositions = reader.GetData("Formula");
            Assert.True(rawPeptides.Count == rawCompositions.Count);

            var sequenceList    = rawPeptides.Select(Sequence.GetSequenceFromMsGfPlusPeptideStr).ToList();
            var compositionList = rawCompositions.Select(Composition.Parse).ToList();
            Assert.True(sequenceList.Count == compositionList.Count);

            // Compare each peptide with the formula on the same row.
            for (var row = 0; row < sequenceList.Count; row++)
            {
                var fromSequence = sequenceList[row].Composition + Composition.H2O;
                var fromFile     = compositionList[row];
                Assert.True(fromSequence.Equals(fromFile));
            }
        }
Exemple #4
0
        /// <summary>
        /// Loads sequence tags from a tab-separated tag file into <c>_scanToTags</c>, keyed by scan number.
        /// Tags shorter than <c>_minTagLength</c> are skipped. For a scan that already has an entry, further
        /// tags are appended only while its list holds fewer than <c>_numTagsPerScan</c> tags.
        /// </summary>
        /// <param name="tagFilePath">Path of the tag file, read through <see cref="TsvFileParser"/>.</param>
        private void Parse(string tagFilePath)
        {
            var parser         = new TsvFileParser(tagFilePath);
            var scanNums       = parser.GetData("ScanNum").Select(s => Convert.ToInt32(s)).ToArray();
            var tagStrings     = parser.GetData("SequenceTag").ToArray();
            var prefixFlags    = parser.GetData("IsPrefix").Select(s => s.Equals("1")).ToArray();
            var flankingMasses = parser.GetData("FlankingMass").Select(Convert.ToDouble).ToArray();

            for (var row = 0; row < parser.NumData; row++)
            {
                var tagString = tagStrings[row];
                if (tagString.Length < _minTagLength)
                {
                    continue;
                }

                var scanNum = scanNums[row];
                var newTag  = new SequenceTag.SequenceTag(scanNum, tagString, prefixFlags[row], flankingMasses[row]);

                IList<SequenceTag.SequenceTag> tagsForScan;
                if (!_scanToTags.TryGetValue(scanNum, out tagsForScan))
                {
                    // First tag seen for this scan: start a new list with it.
                    _scanToTags.Add(scanNum, new List<SequenceTag.SequenceTag> { newTag });
                    continue;
                }

                if (tagsForScan.Count < _numTagsPerScan)
                {
                    tagsForScan.Add(newTag);
                }
            }
        }
Exemple #5
0
        /// <summary>
        /// Exploratory test: for every row of the IcTda result file with QValue at most 0.01 and a non-empty
        /// "Modifications" field, prints the scan number, the mass of the sequence composition plus H2O, the summed
        /// mass of the listed modifications, the "#MatchedFragments" value, the sequence and the raw
        /// modification string. The same scan/mass values are collected in a local list; nothing is asserted.
        /// The test is ignored (Assert.Ignore) when the result file cannot be found.
        /// </summary>
        public void TestPredictPTMfromMs1ft()
        {
            var testName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(testName);

            const string icResultPath = @"\\protoapps\UserData\Jungkap\FeatureFinding\ProMex_v1.1\test\QC_Shew_Intact_26Sep14_Bane_C2Column3_IcTda.tsv";
            if (!File.Exists(icResultPath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", testName, icResultPath);
            }

            // NOTE: the companion .ms1ft feature file (MinScan/MaxScan/MonoMass columns) is not read;
            // the code that loaded it was disabled in the original test.

            var results          = new TsvFileParser(icResultPath);
            var sequenceColumn   = results.GetData("Sequence");
            var modColumn        = results.GetData("Modifications");
            var scanColumn       = results.GetData("Scan").Select(s => Convert.ToInt32(s)).ToArray();
            var qValueColumn     = results.GetData("QValue").Select(s => Convert.ToDouble(s)).ToArray();
            var matchedFragments = results.GetData("#MatchedFragments");
            var aminoAcids       = new AminoAcidSet();
            var ptmList          = new List<Tuple<int, double, double>>();

            for (var row = 0; row < results.NumData; row++)
            {
                if (qValueColumn[row] > 0.01)
                {
                    continue;
                }

                var parsedSequence = new Sequence(sequenceColumn[row], aminoAcids);
                var sequenceComp   = parsedSequence.Composition + Composition.H2O;

                var modComposition = Composition.Zero;
                var modsText       = modColumn[row];
                if (modsText.Length == 0)
                {
                    continue;
                }

                foreach (var entry in modsText.Split(','))
                {
                    if (entry.Length == 0)
                    {
                        continue;
                    }

                    // The modification name is the first whitespace-delimited token of the entry.
                    var modification = Modification.Get(entry.Split()[0]);
                    modComposition += modification.Composition;
                }

                Console.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}", scanColumn[row], sequenceComp.Mass, modComposition.Mass, matchedFragments[row], sequenceColumn[row], modsText);

                ptmList.Add(new Tuple<int, double, double>(scanColumn[row], sequenceComp.Mass, modComposition.Mass));
            }
        }
Exemple #6
0
        /// <summary>
        /// Scores every peptide of the result .tsv against the raw file with a <see cref="DiaRankScore"/> scorer and
        /// writes a two-column "Target\tDecoy" table to a text file. For each peptide a decoy is built by
        /// reversing the parsed sequence, mutating the resulting string and parsing it again.
        /// The test is ignored (Assert.Ignore) when the raw file or the .tsv file cannot be found.
        /// </summary>
        public void DiaRankScore()
        {
            var testName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(testName);

            const string rawPath =
                @"\\protoapps\UserData\Wilkins\BottomUp\HCD_QCShew\raw\QC_Shew_13_04_A_17Feb14_Samwise_13-07-28.raw";
            const string idPath =
                @"\\protoapps\UserData\Wilkins\BottomUp\HCD_QCShew\tsv\QC_Shew_13_04_A_17Feb14_Samwise_13-07-28.tsv";

            if (!File.Exists(rawPath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", testName, rawPath);
            }

            if (!File.Exists(idPath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", testName, idPath);
            }

            var idParser      = new TsvFileParser(idPath);
            var peptideColumn = idParser.GetData("Peptide");
            var chargeColumn  = idParser.GetData("Charge");
            var scanColumn    = idParser.GetData("ScanNum");

            var run    = InMemoryLcMsRun.GetLcMsRun(rawPath, 0, 0);
            var scorer = new DiaRankScore(@"C:\Users\wilk011\Documents\DataFiles\TestFolder\HCD_QExactive_Tryp.txt");

            using (var scoreWriter = new StreamWriter(@"C:\Users\wilk011\Documents\DataFiles\TestFolder\HCD_QCShew_Score_2.txt"))
            {
                scoreWriter.WriteLine("Target\tDecoy");

                for (var row = 0; row < peptideColumn.Count; row++)
                {
                    var peptideText = peptideColumn[row];
                    var charge      = Convert.ToInt32(chargeColumn[row]);
                    var scanNum     = Convert.ToInt32(scanColumn[row]);

                    var target = Sequence.GetSequenceFromMsGfPlusPeptideStr(peptideText);

                    // Decoy: parse the same peptide again, reverse it, flatten to text,
                    // mutate half the target length and parse the mutated text.
                    var decoy = Sequence.GetSequenceFromMsGfPlusPeptideStr(peptideText);
                    decoy.Reverse();
                    var decoyText = decoy.Aggregate("", (soFar, residue) => soFar + residue);
                    decoyText = SimpleStringProcessing.Mutate(decoyText, target.Count / 2);
                    decoy     = Sequence.GetSequenceFromMsGfPlusPeptideStr(decoyText);

                    var targetScore = scorer.GetScore(target, charge, scanNum, run);
                    var decoyScore  = scorer.GetScore(decoy, charge, scanNum, run);
                    scoreWriter.WriteLine("{0}\t{1}", targetScore, decoyScore);
                }
            }
        }
        /// <summary>
        /// Prints, for every MS-GF+ match returned by <c>GetMatchesAtPsmFdr(0.01)</c>, the scan number, peptide,
        /// SpecEValue and the score found for the same "scan:peptide" key in the IC target result file
        /// (-1 when the IC file has no such entry). Nothing is asserted.
        /// The test is ignored (Assert.Ignore) when either input file cannot be found.
        /// </summary>
        public void TestInitialScoring()
        {
            var testName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(testName);

            const string icTargetPath = @"C:\cygwin\home\kims336\Data\QCShewQE\Ic_NTT2_03_NoMod_NoRescoring\QC_Shew_13_04_A_17Feb14_Samwise_13-07-28_IcTarget.tsv";
            if (!File.Exists(icTargetPath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", testName, icTargetPath);
            }

            var icResults    = new TsvFileParser(icTargetPath);
            var icScanNums   = icResults.GetData("Scan").Select(s => Convert.ToInt32(s)).ToArray();
            var icSequences  = icResults.GetData("Sequence");
            var icScores     = icResults.GetData("Score").Select(s => Convert.ToInt32(s)).ToArray();
            var scoreByPsm   = new Dictionary<string, int>();

            // Key each IC score by "scan:sequence"; Add throws if the same key occurs twice.
            for (var row = 0; row < icResults.NumData; row++)
            {
                var key = icScanNums[row] + ":" + icSequences[row];
                scoreByPsm.Add(key, icScores[row]);
            }

            const string msgfPath = @"C:\cygwin\home\kims336\Data\QCShewQE\NoMod.tsv";
            if (!File.Exists(msgfPath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", testName, msgfPath);
            }

            var msgfResults = new MsGfResults(msgfPath);
            var psms        = msgfResults.GetMatchesAtPsmFdr(0.01);

            Console.WriteLine("ScanNum\tPeptide\tSpecEValue\tIcScore");
            foreach (var psm in psms)
            {
                var scanNum    = psm.ScanNum;
                var peptide    = psm.Peptide;
                var specEValue = psm.SpecEValue;

                int icScore;
                var found = scoreByPsm.TryGetValue(scanNum + ":" + peptide, out icScore);
                if (!found)
                {
                    icScore = -1;
                }

                Console.WriteLine("{0}\t{1}\t{2}\t{3}", scanNum, peptide, specEValue, icScore);
            }
        }
Exemple #8
0
        /// <summary>
        /// For the Fusion DDA and DIA summary files, prints every peptide that both files report at a peptide
        /// q-value threshold of 0.01, together with the scan number of its first row in each file and the value
        /// returned by <c>RtFromScanNum</c> for that scan in the corresponding raw file. Nothing is asserted.
        /// </summary>
        public void CompareRtFusion()
        {
            var testName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(testName);

            // Fusion
            const string ddaSummaryPath = @"D:\Research\Data\UW\Fusion\DDA_Summary.tsv";
            const string diaSummaryPath = @"D:\Research\Data\UW\Fusion\DIA_Summary.tsv";
            const string ddaRawPath     = @"D:\Research\Data\UW\Fusion\WT_D_DDA_130412065618.raw";
            const string diaRawPath     = @"D:\Research\Data\UW\Fusion\WT_D_DIA_130412091220.raw";
            const double pepQValueThreshold = 0.01;

            var ddaRaw = new XCaliburReader(ddaRawPath);
            var diaRaw = new XCaliburReader(diaRawPath);

            var ddaSummary = new TsvFileParser(ddaSummaryPath);
            var diaSummary = new TsvFileParser(diaSummaryPath);

            var overlap = new VennDiagram<string>(
                ddaSummary.GetPeptides(pepQValueThreshold),
                diaSummary.GetPeptides(pepQValueThreshold));
            var sharedPeptides = overlap.Intersection;

            var ddaPeptides = ddaSummary.GetData("Peptide");
            var ddaScanNums = ddaSummary.GetData("ScanNum");
            var diaPeptides = diaSummary.GetData("Peptide");
            var diaScanNums = diaSummary.GetData("ScanNum");

            Console.WriteLine("Peptide\tScanNum1\tScanNum2\tRt1\tRt2");
            foreach (var peptide in sharedPeptides)
            {
                // IndexOf picks the first row listing the peptide in each summary.
                var ddaScan = Convert.ToInt32(ddaScanNums[ddaPeptides.IndexOf(peptide)]);
                var diaScan = Convert.ToInt32(diaScanNums[diaPeptides.IndexOf(peptide)]);

                Console.WriteLine(
                    "{0}\t{1}\t{2}\t{3}\t{4}",
                    peptide.Replace("C+57.021", "C"),
                    ddaScan,
                    diaScan,
                    ddaRaw.RtFromScanNum(ddaScan),
                    diaRaw.RtFromScanNum(diaScan));
            }
        }
Exemple #9
0
        /// <summary>
        /// Loads features from an ms1ft file into <c>_lcMsChargeMap</c>. Rows whose "LikelihoodRatio" is below
        /// <c>_minLikelihoodRatio</c> are skipped; every other row is registered through <c>SetMatches</c> with its
        /// mass, scan range, representative scan and charge range. Afterwards the loaded/total counts are
        /// written to the console and <c>CreateMassToScanNumMap</c> is called.
        /// </summary>
        /// <param name="ms1FtFileName">Path of the feature file, read through <see cref="TsvFileParser"/>.</param>
        private void Read(string ms1FtFileName)
        {
            var features = new TsvFileParser(ms1FtFileName);

            var masses     = features.GetData("MonoMass").Select(Convert.ToDouble).ToArray();
            var firstScans = features.GetData("MinScan").Select(s => Convert.ToInt32(s)).ToArray();
            var lastScans  = features.GetData("MaxScan").Select(s => Convert.ToInt32(s)).ToArray();
            var repScans   = features.GetData("RepScan").Select(s => Convert.ToInt32(s)).ToArray();
            var lowCharges = features.GetData("MinCharge").Select(s => Convert.ToInt32(s)).ToArray();
            var topCharges = features.GetData("MaxCharge").Select(s => Convert.ToInt32(s)).ToArray();
            var ratios     = features.GetData("LikelihoodRatio").Select(Convert.ToDouble).ToArray();

            var totalCount  = masses.Length;
            var loadedCount = 0;

            for (var row = 0; row < totalCount; row++)
            {
                if (ratios[row] < _minLikelihoodRatio)
                {
                    continue;
                }

                loadedCount++;
                _lcMsChargeMap.SetMatches(masses[row], firstScans[row], lastScans[row], repScans[row], lowCharges[row], topCharges[row]);
            }

            // NOTE: The DMS Analysis Manager looks for this statistic; do not change it
            Console.Write(@"{0}/{1} features loaded...", loadedCount, totalCount);
            _lcMsChargeMap.CreateMassToScanNumMap();
        }
Exemple #10
0
        /// <summary>
        /// Groups the MS2 scan numbers of the IcTda result file by composition string and prints the number of
        /// distinct compositions. Rows are consumed in file order and processing stops at the first row whose
        /// QValue exceeds 0.01. Nothing is asserted.
        /// The test is ignored (Assert.Ignore) when the result file cannot be found.
        /// </summary>
        public void TestClusterCentricSearch()
        {
            var testName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(testName);

            const string idFilePath = @"H:\Research\QCShew_TopDown\Production\M1_V4_JP_Len500\QC_Shew_Intact_26Sep14_Bane_C2Column3_IcTda.tsv";
            if (!File.Exists(idFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", testName, idFilePath);
            }

            var idFile = new TsvFileParser(idFilePath);

            var scanColumn        = idFile.GetData("Scan").Select(s => Convert.ToInt32(s)).ToArray();
            var compositionColumn = idFile.GetData("Composition").ToArray();
            var qValueColumn      = idFile.GetData("QValue").Select(Convert.ToDouble).ToArray();

            var scansByComposition = new Dictionary<string, IList<int>>();

            for (var row = 0; row < qValueColumn.Length; row++)
            {
                // Stop (not skip) at the first row above the threshold.
                if (qValueColumn[row] > 0.01)
                {
                    break;
                }

                var composition = compositionColumn[row];
                var scan        = scanColumn[row];

                IList<int> scansSoFar;
                if (!scansByComposition.TryGetValue(composition, out scansSoFar))
                {
                    scansByComposition.Add(composition, new List<int> { scan });
                }
                else
                {
                    scansSoFar.Add(scan);
                }
            }

            Console.Write("NumCompositions: {0}", scansByComposition.Keys.Count);
        }
Exemple #11
0
        /// <summary>
        /// Loads features from a comma-separated isos file into <c>_lcMsMatchMap</c>.
        /// A row is skipped when its charge is 1 or less, or when a "fit" column exists and the row's fit value
        /// exceeds <c>_fitScoreThreshold</c>. Each remaining row is registered through <c>SetMatches</c> with the
        /// scan range returned by <c>_run.GetPrevScanNum(scan, 1)</c> and <c>_run.GetNextScanNum(scan, 1)</c>.
        /// Finally the loaded/total counts are written to the console and
        /// <c>CreateSequenceMassToMs2ScansMap</c> is called with the smallest and largest accepted mass.
        /// </summary>
        /// <param name="isosFileName">Path of the isos file, parsed with ',' as the delimiter.</param>
        private void Read(string isosFileName)
        {
            var icrToolsparser = new TsvFileParser(isosFileName, ',');
            var monoMassArr    = icrToolsparser.GetData("monoisotopic_mw").Select(Convert.ToDouble).ToArray();
            var scanArray      = icrToolsparser.GetData("scan_num").Select(s => Convert.ToInt32(s)).ToArray();
            var chargeArray    = icrToolsparser.GetData("charge").Select(s => Convert.ToInt32(s)).ToArray();

            // The fit column is optional; reuse the list already fetched instead of looking it up again.
            var fitStringArr = icrToolsparser.GetData("fit");
            var fitArray     = fitStringArr == null ? null : fitStringArr.Select(Convert.ToDouble).ToArray();

            var featureCountFiltered = 0;

            // If no row passes the filter these keep their initial values
            // (double.MaxValue and 0.0) when handed to CreateSequenceMassToMs2ScansMap.
            var minMass = double.MaxValue;
            var maxMass = 0.0;

            for (var i = 0; i < monoMassArr.Length; i++)
            {
                var fitTooHigh = (fitArray != null) && (fitArray[i] > _fitScoreThreshold);
                if (fitTooHigh || chargeArray[i] <= 1)
                {
                    continue;
                }

                featureCountFiltered++;

                var scan     = scanArray[i];
                var monoMass = monoMassArr[i];
                if (minMass > monoMass)
                {
                    minMass = monoMass;
                }
                if (maxMass < monoMass)
                {
                    maxMass = monoMass;
                }

                var minScan = _run.GetPrevScanNum(scan, 1);
                var maxScan = _run.GetNextScanNum(scan, 1);
                _lcMsMatchMap.SetMatches(monoMass, minScan, maxScan);
            }

            Console.Write(@"{0}/{1} features loaded...", featureCountFiltered, monoMassArr.Length);

            _lcMsMatchMap.CreateSequenceMassToMs2ScansMap(_run, _massTolerance, minMass, maxMass);
        }
Exemple #12
0
        /// <summary>
        /// For every row of the rescored result file, checks that the composition of the sequence plus H2O plus
        /// the compositions of all listed modifications equals the value in the "Composition" column.
        /// The test is ignored (Assert.Ignore) when the result file cannot be found.
        /// </summary>
        public void ValidateIcResultsWithModifications()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            Utils.ShowStarting(methodName);

            const string resultFilePath = @"C:\cygwin\home\kims336\Data\TopDownJia\raw\Synocho_D1_1_Rescored.tsv";

            if (!File.Exists(resultFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, resultFilePath);
            }

            var parser        = new TsvFileParser(resultFilePath);
            var sequences     = parser.GetData("Sequence");
            var modifications = parser.GetData("Modifications");
            var compositions  = parser.GetData("Composition").Select(Composition.Parse).ToArray();
            var scanNums      = parser.GetData("ScanNum").Select(s => Convert.ToInt32(s)).ToArray();
            var aaSet         = new AminoAcidSet();

            for (var i = 0; i < parser.NumData; i++)
            {
                var sequenceComp = aaSet.GetComposition(sequences[i]) + Composition.H2O;

                // The field's first and last characters are dropped (presumably enclosing brackets such as
                // "[...]" - confirm against the file writer). A field shorter than two characters is treated
                // as "no modifications" instead of making Substring throw ArgumentOutOfRangeException.
                var modsField = modifications[i];
                var modsStr   = modsField.Length >= 2
                    ? modsField.Substring(1, modsField.Length - 2)
                    : string.Empty;

                var modComposition = Composition.Zero;
                foreach (var modStr in modsStr.Split(','))
                {
                    if (modStr.Length == 0)
                    {
                        continue;
                    }

                    // The modification name is the first whitespace-delimited token of the entry.
                    var modName = modStr.Split()[0];
                    var mod     = Modification.Get(modName);
                    modComposition += mod.Composition;
                }

                var compFromSeqAndMods = sequenceComp + modComposition;
                Assert.True(
                    compFromSeqAndMods.Equals(compositions[i]),
                    "Composition mismatch at scan {0}: sequence + modifications = {1}, file = {2}",
                    scanNums[i], compFromSeqAndMods, compositions[i]);
            }
        }
Exemple #13
0
        /// <summary>
        /// Rescores every row of an IC result file with <c>_topDownScorer</c> and writes the rows to
        /// <paramref name="outputFilePath"/>. The output header is the input header followed by
        /// <c>IcScores.GetScoreNames()</c>. Each output row repeats the input cells, except that the
        /// "Modifications" cell is replaced by the rescored modifications wrapped in square brackets, and ends
        /// with the new scores.
        /// </summary>
        /// <param name="icResultFilePath">Input result file, read through <see cref="TsvFileParser"/>.</param>
        /// <param name="outputFilePath">File that receives the rescored table.</param>
        private void Rescore(string icResultFilePath, string outputFilePath)
        {
            var input          = new TsvFileParser(icResultFilePath);
            var sequenceColumn = input.GetData("Sequence");
            var scanColumn     = input.GetData("ScanNum").Select(s => Convert.ToInt32(s)).ToArray();
            var chargeColumn   = input.GetData("Charge").Select(c => Convert.ToInt32(c)).ToArray();
            var compColumn     = input.GetData("Composition").Select(Composition.Parse).ToArray();
            var modColumnIndex = input.GetHeaders().IndexOf("Modifications");

            var inputRows   = input.GetRows();
            var headerCells = input.GetHeaders();

            using (var output = new StreamWriter(outputFilePath))
            {
                output.WriteLine("{0}\t{1}", string.Join("\t", headers: headerCells), IcScores.GetScoreNames());

                for (var rowIndex = 0; rowIndex < input.NumData; rowIndex++)
                {
                    var newScores = _topDownScorer.GetScores(
                        AminoAcid.ProteinNTerm,
                        sequenceColumn[rowIndex],
                        AminoAcid.ProteinCTerm,
                        compColumn[rowIndex],
                        chargeColumn[rowIndex],
                        scanColumn[rowIndex]);

                    var cells = inputRows[rowIndex].Split('\t');
                    for (var column = 0; column < cells.Length; column++)
                    {
                        // Every cell is followed by a tab; only the modifications cell is rewritten.
                        if (column == modColumnIndex)
                        {
                            output.Write("[" + newScores.Modifications + "]" + "\t");
                        }
                        else
                        {
                            output.Write(cells[column] + "\t");
                        }
                    }

                    output.WriteLine(newScores);
                }
            }
        }
Exemple #14
0
        /// <summary>
        /// Rescores the identifications of an MS-Align result file with <c>_topDownScorer</c> and writes each
        /// scored row, followed by its scores, to <paramref name="outputFilePath"/>. The output header is the
        /// input header followed by <c>IcScores.GetScoreNames()</c>. Rows are left out when no sequence can be
        /// extracted between the dots of the "Peptide" value, when that sequence contains '(' , or when the
        /// scorer returns null.
        /// </summary>
        /// <param name="msAlignFilePath">Input result file, read through <see cref="TsvFileParser"/>.</param>
        /// <param name="outputFilePath">File that receives the rescored table.</param>
        private void Rescore(string msAlignFilePath, string outputFilePath)
        {
            var input         = new TsvFileParser(msAlignFilePath);
            var peptideColumn = input.GetData("Peptide");
            var scanColumn    = input.GetData("Scan(s)").Select(s => Convert.ToInt32(s)).ToArray();
            var chargeColumn  = input.GetData("Charge").Select(c => Convert.ToInt32(c)).ToArray();

            var inputRows   = input.GetRows();
            var headerCells = input.GetHeaders();

            using (var output = new StreamWriter(outputFilePath))
            {
                output.WriteLine("{0}\t{1}", string.Join("\t", headerCells), IcScores.GetScoreNames());

                for (var rowIndex = 0; rowIndex < input.NumData; rowIndex++)
                {
                    var inputRow = inputRows[rowIndex];
                    var bareSeq  = SimpleStringProcessing.GetStringBetweenDots(peptideColumn[rowIndex]);

                    //TODO: currently ignore ids with modifications
                    var usable = bareSeq != null && !bareSeq.Contains("(");
                    if (!usable)
                    {
                        continue;
                    }

                    var seqComposition = AASet.GetComposition(bareSeq);
                    var newScores = _topDownScorer.GetScores(
                        AminoAcid.ProteinNTerm,
                        bareSeq,
                        AminoAcid.ProteinCTerm,
                        seqComposition,
                        chargeColumn[rowIndex],
                        scanColumn[rowIndex]);

                    if (newScores != null)
                    {
                        output.WriteLine("{0}\t{1}", inputRow, newScores);
                    }
                }
            }
        }
Exemple #15
0
        /// <summary>
        /// Writes one tab-separated, '\n'-terminated line per row of <paramref name="parser"/> to
        /// <paramref name="writer"/>, with the fields: scan, sequence, modifications, mass, protein name,
        /// protein description, start, end, score and q-value.
        /// </summary>
        /// <param name="writer">Destination for the merged rows.</param>
        /// <param name="parser">
        /// Parsed result file. The score is taken from "#MatchedFragments" when that column exists, otherwise
        /// from "Score". "QValue" is optional; when it is absent "0" is written.
        /// </param>
        /// <param name="fastaDb">Used to look up the description of each row's "ProteinName".</param>
        private void OutputMergedResult(TextWriter writer, TsvFileParser parser, FastaDatabase fastaDb)
        {
            var scoreColumn = parser.GetData("#MatchedFragments") ?? parser.GetData("Score");
            var qValColumn  = parser.GetData("QValue");

            // Column lookups do not depend on the row, so fetch each column once instead of once per row.
            // The columns are only dereferenced inside the loop, exactly as before.
            var sequenceColumn = parser.GetData("Sequence");
            var scanColumn     = parser.GetData("Scan");
            var massColumn     = parser.GetData("Mass");
            var protNameColumn = parser.GetData("ProteinName");
            var startColumn    = parser.GetData("Start");
            var endColumn      = parser.GetData("End");
            var modColumn      = parser.GetData("Modifications");

            for (var i = 0; i < parser.NumData; i++)
            {
                var sequence = sequenceColumn[i];
                var scanNum  = int.Parse(scanColumn[i]);
                var mass     = double.Parse(massColumn[i]);
                var protName = protNameColumn[i];
                var protDesc = fastaDb.GetProteinDescription(protName);

                var firstResId = int.Parse(startColumn[i]);
                var lastResId  = int.Parse(endColumn[i]);
                var score      = double.Parse(scoreColumn[i]);
                var mod        = modColumn[i];
                var qvalue     = (qValColumn != null) ? qValColumn[i] : "0";

                writer.Write(scanNum);
                writer.Write("\t");
                writer.Write(sequence);
                writer.Write("\t");
                writer.Write(mod);
                writer.Write("\t");
                writer.Write(mass);
                writer.Write("\t");
                writer.Write(protName);
                writer.Write("\t");
                writer.Write(protDesc);
                writer.Write("\t");
                writer.Write(firstResId);
                writer.Write("\t");
                writer.Write(lastResId);
                writer.Write("\t");
                writer.Write(score);
                writer.Write("\t");
                writer.Write(qvalue);
                writer.Write("\n");
            }
        }
Exemple #16
0
        /// <summary>
        /// Groups the MS2 scan numbers of the excerpt IcTda result file by composition string, prints the number
        /// of distinct compositions and asserts that it equals <paramref name="expectedNumCompositions"/>.
        /// Rows are consumed in file order and processing stops at the first row whose QValue exceeds
        /// <paramref name="qValueThreshold"/>.
        /// </summary>
        /// <param name="qValueThreshold">Largest QValue a row may have before processing stops.</param>
        /// <param name="expectedNumCompositions">Expected number of distinct compositions.</param>
        public void TestClusterCentricSearch(double qValueThreshold, int expectedNumCompositions)
        {
            var testName = MethodBase.GetCurrentMethod().Name;
            Utils.ShowStarting(testName);

            var idFilePath = Path.Combine(Utils.DEFAULT_SPEC_FILES_FOLDER, "QC_Shew_Intact_26Sep14_Bane_C2Column3_Excerpt_IcTda.tsv");
            var idFileInfo = Utils.GetTestFile(testName, idFilePath);

            var idFile = new TsvFileParser(idFileInfo.FullName);

            var scanColumn        = idFile.GetData("Scan").Select(s => Convert.ToInt32((string)s)).ToArray();
            var compositionColumn = idFile.GetData("Composition").ToArray();
            var qValueColumn      = idFile.GetData("QValue").Select(Convert.ToDouble).ToArray();

            var scansByComposition = new Dictionary<string, IList<int>>();

            for (var row = 0; row < qValueColumn.Length; row++)
            {
                // Stop (not skip) at the first row above the threshold.
                if (qValueColumn[row] > qValueThreshold)
                {
                    break;
                }

                var composition = compositionColumn[row];
                var scan        = scanColumn[row];

                IList<int> scansSoFar;
                if (!scansByComposition.TryGetValue(composition, out scansSoFar))
                {
                    scansByComposition.Add(composition, new List<int> { scan });
                }
                else
                {
                    scansSoFar.Add(scan);
                }
            }

            var numCompositions = scansByComposition.Keys.Count;
            Console.Write("NumCompositions: {0}", numCompositions);

            Assert.AreEqual(expectedNumCompositions, scansByComposition.Keys.Count);
        }
Exemple #17
0
        /// <summary>
        /// Compares the peptides reported in the IPA summary file (peptide threshold 0.005, with "C+57.021"
        /// rewritten to "C") against the set of sequences found in all "*DIA*IcTarget.tsv" files of the result
        /// folder. Peptides missing from the IC results are printed, followed by the counts of shared and
        /// IPA-only peptides. Nothing is asserted.
        /// The test is ignored (Assert.Ignore) when the result folder or the IPA summary file cannot be found.
        /// </summary>
        public void CompareIpaIc()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            Utils.ShowStarting(methodName);

            const string resultDir = @"D:\Research\Data\UW\QExactive\Ic_NTT2_03";

            if (!Directory.Exists(resultDir))
            {
                Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, resultDir);
            }

            // Union of the "Sequence" column over every matching IC target file.
            var targetPeptides = new HashSet <string>();

            foreach (var icResultFilePath in Directory.GetFiles(resultDir, "*DIA*IcTarget.tsv"))
            {
                var icParser = new TsvFileParser(icResultFilePath);
                foreach (var peptide in icParser.GetData("Sequence"))
                {
                    targetPeptides.Add(peptide);
                }
            }

            const string ipaResultPath = @"D:\Research\Data\UW\QExactive\DIA_All_Summary.tsv";

            if (!File.Exists(ipaResultPath))
            {
                // Report the missing file's path (the second argument used to repeat the method name).
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, ipaResultPath);
            }

            var parser      = new TsvFileParser(ipaResultPath);
            var ipaPeptides = parser.GetPeptides(0.005).Select(p => p.Replace("C+57.021", "C"));
            var ipaOnly     = 0;
            var both        = 0;

            foreach (var ipaPeptide in ipaPeptides)
            {
                if (targetPeptides.Contains(ipaPeptide))
                {
                    ++both;
                }
                else
                {
                    ++ipaOnly;
                    Console.WriteLine(ipaPeptide);
                }
            }

            Console.WriteLine("Both: {0}, IpaOnly: {1}, Sum: {2}", both, ipaOnly, both + ipaOnly);
        }
Exemple #18
0
        /// <summary>
        /// Reads the result file named by <c>_fileName</c> and creates one <see cref="SpectrumMatch"/> for the
        /// first occurrence of every peptide whose peptide q-value does not exceed <c>PepQValueThreshold</c>.
        /// When the file has a <c>FormulaHeader</c> column and the row's entry is not null, the formula is
        /// passed on to the match.
        /// </summary>
        /// <returns>The accepted matches in file order.</returns>
        /// <exception cref="FormatException">
        /// The scan, peptide, precursor charge or peptide q-value column is missing.
        /// </exception>
        public IList <SpectrumMatch> Read()
        {
            var specMatches      = new List <SpectrumMatch>();
            var tsvFile          = new TsvFileParser(_fileName);
            var precursorCharges = tsvFile.GetData(PrecursorChargeHeader);
            var scans            = tsvFile.GetData(ScanHeader);
            var peptides         = tsvFile.GetData(BottomUpPeptideHeader);
            var pepQValues       = tsvFile.GetData(PepQValueHeader);
            var formulas         = tsvFile.GetData(FormulaHeader); // optional column

            // Every column dereferenced below is required; report a missing one as a format error
            // instead of letting the loop fail with a NullReferenceException.
            if (scans == null || peptides == null || precursorCharges == null || pepQValues == null)
            {
                throw new FormatException(
                    string.Format("Result file is missing the scan, peptide, precursor charge or peptide q-value column: {0}", _fileName));
            }

            var peptideSet = new HashSet <string>();

            for (var i = 0; i < peptides.Count; i++)
            {
                if (Convert.ToDouble(pepQValues[i]) > PepQValueThreshold)
                {
                    continue;
                }

                // Add returns false for a peptide that was already accepted, so only the first
                // passing row of each peptide produces a match.
                if (!peptideSet.Add(peptides[i]))
                {
                    continue;
                }

                var scanNum         = Convert.ToInt32(scans[i]);
                var precursorCharge = Convert.ToInt32(precursorCharges[i]);

                if (formulas != null && formulas[i] != null)
                {
                    specMatches.Add(new SpectrumMatch(peptides[i], DataFileFormat.IcBottomUp, _lcms, scanNum, precursorCharge, _decoy, formulas[i]));
                }
                else
                {
                    specMatches.Add(new SpectrumMatch(peptides[i], DataFileFormat.IcBottomUp, _lcms, scanNum, precursorCharge, _decoy));
                }
            }

            return(specMatches);
        }
Exemple #19
0
        /// <summary>
        /// Prints, for every row of the Lewy_intact_01 IcTda result file, the scan, mass, charge,
        /// q-value and the MS1 evidence score computed from the matching pbf file.
        /// The test is ignored when either input file is absent.
        /// </summary>
        public void TestMs1EvidenceScore()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            Utils.ShowStarting(methodName);

            var rawPath    = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"TopDown\Lewy_ManyMods\Lewy_intact_01.pbf");
            var resultPath = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"TopDown\Lewy_ManyMods\TestOutput\Lewy_intact_01_IcTda.tsv");

            // Both inputs are required; the raw file is checked first.
            foreach (var requiredFile in new[] { rawPath, resultPath })
            {
                if (!File.Exists(requiredFile))
                {
                    Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, requiredFile);
                }
            }

            var run           = PbfLcMsRun.GetLcMsRun(rawPath);
            var tsvParser     = new TsvFileParser(resultPath);
            var featureFinder = new LcMsPeakMatrix(run);

            // Columns are resolved once up front.
            // NOTE(review): assumes TsvFileParser.GetData is a side-effect-free lookup - confirm.
            var scanColumn   = tsvParser.GetData("Scan");
            var chargeColumn = tsvParser.GetData("Charge");
            var massColumn   = tsvParser.GetData("Mass");
            var qValueColumn = tsvParser.GetData("QValue");

            for (var row = 0; row < tsvParser.NumData; row++)
            {
                var scan   = int.Parse(scanColumn[row]);
                var charge = int.Parse(chargeColumn[row]);
                var mass   = double.Parse(massColumn[row]);
                var qvalue = double.Parse(qValueColumn[row]);

                var score = featureFinder.GetMs1EvidenceScore(scan, mass, charge);
                Console.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}", scan, mass, charge, qvalue, score);
            }
        }
Exemple #20
0
        /// <summary>
        /// Prints, for every row of the Lewy_intact_01 IcTda result file on the protoapps share,
        /// the scan, mass, charge, q-value and the MS1 evidence score computed from the pbf file.
        /// The test is ignored when either input file is absent.
        /// </summary>
        public void TestMs1EvidenceScore()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            TestUtils.ShowStarting(methodName);

            const string rawFilePath    = @"\\protoapps\UserData\Jungkap\Lewy\Lewy_intact_01.pbf";
            const string resultFilePath = @"\\protoapps\UserData\Jungkap\Lewy\Lewy_intact_01_IcTda.tsv";

            // Both inputs are required; the raw file is checked first.
            foreach (var requiredFile in new[] { rawFilePath, resultFilePath })
            {
                if (!File.Exists(requiredFile))
                {
                    Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, requiredFile);
                }
            }

            var run           = PbfLcMsRun.GetLcMsRun(rawFilePath);
            var tsvParser     = new TsvFileParser(resultFilePath);
            var featureFinder = new LcMsPeakMatrix(run);

            // Columns are resolved once up front.
            // NOTE(review): assumes TsvFileParser.GetData is a side-effect-free lookup - confirm.
            var scanColumn   = tsvParser.GetData("Scan");
            var chargeColumn = tsvParser.GetData("Charge");
            var massColumn   = tsvParser.GetData("Mass");
            var qValueColumn = tsvParser.GetData("QValue");

            var row = 0;
            while (row < tsvParser.NumData)
            {
                var scan   = int.Parse(scanColumn[row]);
                var charge = int.Parse(chargeColumn[row]);
                var mass   = double.Parse(massColumn[row]);
                var qvalue = double.Parse(qValueColumn[row]);

                var score = featureFinder.GetMs1EvidenceScore(scan, mass, charge);
                Console.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}", scan, mass, charge, qvalue, score);

                row++;
            }
        }
        /// <summary>
        /// Reads an MS-GF+ tsv result file and converts its rows into protein-spectrum matches.
        /// Within a run of consecutive rows that share a scan number only the first row is
        /// considered, and a row is kept only when its QValue does not exceed <c>FdrCutoff</c>.
        /// </summary>
        /// <param name="msgfResultPath">Path of the MS-GF+ tsv result file.</param>
        /// <param name="maxPrsm">Reading stops as soon as the result holds this many matches.</param>
        /// <returns>The accepted matches in file order.</returns>
        public List <ProteinSpectrumMatch> ReadMsGfPlusResult(string msgfResultPath, int maxPrsm)
        {
            var parser   = new TsvFileParser(msgfResultPath);
            var prsmList = new List <ProteinSpectrumMatch>();

            // Resolve each column once instead of once per row.
            // NOTE(review): assumes TsvFileParser.GetData is a side-effect-free lookup - confirm.
            var peptideColumn = parser.GetData("Peptide");
            var scanColumn    = parser.GetData("Scan");
            var mzColumn      = parser.GetData("PrecursorMZ");
            var proteinColumn = parser.GetData("Protein");
            var scoreColumn   = parser.GetData("MSGFScore");
            var chargeColumn  = parser.GetData("Charge");
            var qValueColumn  = parser.GetData("QValue");

            // MS-GF+ rows carry no description or residue positions, so fixed placeholders are used.
            const string protDesc   = "";
            const int    firstResId = 0;
            const int    lastResId  = 0;

            var prevScanNum = -1;

            for (var i = 0; i < parser.NumData; i++)
            {
                var sequence = peptideColumn[i];
                var scanNum  = int.Parse(scanColumn[i]);

                // The scan is marked as seen before the FDR filter runs, so a rejected first row
                // still suppresses the remaining rows of the same scan.
                if (prevScanNum == scanNum)
                {
                    continue;
                }
                prevScanNum = scanNum;

                // Filter before any further parsing: rejected rows need no sequence construction.
                var fdr = double.Parse(qValueColumn[i]);
                if (fdr > FdrCutoff)
                {
                    continue;
                }

                var mz       = double.Parse(mzColumn[i]);
                var protName = proteinColumn[i];
                var score    = double.Parse(scoreColumn[i]);
                var charge   = int.Parse(chargeColumn[i]);

                var seq          = Sequence.GetSequenceFromMsGfPlusPeptideStr(sequence);
                var sequenceText = GetSequenceText(seq);
                var mass         = (mz - Constants.Proton) * charge;

                var prsm = new ProteinSpectrumMatch(sequence, scanNum, mass, charge, protName, protDesc, firstResId, lastResId, score, ProteinSpectrumMatch.SearchTool.MsGfPlus)
                {
                    SequenceText = sequenceText,
                };

                prsmList.Add(prsm);

                if (prsmList.Count >= maxPrsm)
                {
                    break;
                }
            }

            return prsmList;
        }
Exemple #22
0
        /// <summary>
        /// Compares the peptides of two result files at a PepQValue threshold of 0.01 and prints
        /// the Venn-diagram counts, followed by every peptide found only in the second file.
        /// </summary>
        public void GenerateVennDiagramsPeMmr()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            TestUtils.ShowStarting(methodName);

            // No PE-MMR
            //const string noPeMmr = @"D:\Research\Data\PEMMR\iTRAQ_N33T34_10ug_100cm_300min_C2_061213.tsv";

            // PE-MMR Scan based FDR
            //const string scanBasedPeMmr = @"D:\Research\Data\PEMMR\NewSpectra\iTRAQ_N33T34_10ug_100cm_300min_C2_061213_MX_PEMMR_UMCID_ScanFDR.tsv";

            // UMC based FDR
            const string umcBasedPeMmr = @"D:\Research\Data\PEMMR\NewSpectra\iTRAQ_N33T34_10ug_100cm_300min_C2_061213_MX_PEMMR_UMCID_UMCFDR.tsv";

            // IPA
            const string ipa = @"D:\Research\Data\PEMMR\Ox\IPA_Summary_TargetOnly.tsv";

            // Select which two result files are compared.
            const string resultPath1 = umcBasedPeMmr;
            const string resultPath2 = ipa;

            var result1 = new TsvFileParser(resultPath1);
            var result2 = new TsvFileParser(resultPath2);

            const double pepQValueThreshold = 0.01;
            var          vennDiagram        = new VennDiagram <string>(result1.GetPeptides(pepQValueThreshold),
                                                                       result2.GetPeptides(pepQValueThreshold));

            // First line: total of set 1, intersection, total of set 2.
            Console.WriteLine("{0}\t{1}\t{2}",
                              vennDiagram.Set1Only.Count + vennDiagram.Intersection.Count,
                              vennDiagram.Intersection.Count,
                              vennDiagram.Set2Only.Count + vennDiagram.Intersection.Count);

            // Second line: exclusive to set 1, intersection, exclusive to set 2.
            Console.WriteLine("{0}\t{1}\t{2}",
                              vennDiagram.Set1Only.Count,
                              vennDiagram.Intersection.Count,
                              vennDiagram.Set2Only.Count);

            // The original re-read the "Peptide" column into an unused local on every iteration;
            // that dead lookup has been removed.
            foreach (var peptide in vennDiagram.Set2Only)
            {
                Console.WriteLine(peptide);
            }
        }
        /// <summary>
        /// Loads a score table from a tab-separated file. Columns are looked up by the names
        /// "0" .. "NumberOfBins - 1"; row <c>i</c> of each column fills <c>table[i]</c>, with one
        /// row per entry of <c>_massBins</c>.
        /// </summary>
        /// <param name="fname">Path of the score data file.</param>
        /// <returns>A <c>_massBins.Length</c> by <c>NumberOfBins</c> table of parsed values.</returns>
        /// <exception cref="FileNotFoundException">The file does not exist.</exception>
        /// <exception cref="InvalidDataException">
        /// A bin column is missing or has fewer rows than <c>_massBins</c> has entries.
        /// </exception>
        private double[][] LoadTable(string fname)
        {
            if (!File.Exists(fname))
            {
                throw new FileNotFoundException("Missing score datafile: " + fname);
            }

            var parser   = new TsvFileParser(fname);
            var rowCount = _massBins.Length;
            var table    = new double[rowCount][];

            for (var i = 0; i < rowCount; i++)
            {
                table[i] = new double[NumberOfBins];
            }

            // Walk column by column so each column is looked up once rather than once per cell.
            for (var k = 0; k < NumberOfBins; k++)
            {
                var colData        = parser.GetData(string.Format("{0}", k));
                var availableCount = (colData == null) ? 0 : colData.Count;

                if (availableCount < rowCount)
                {
                    throw new InvalidDataException(
                        string.Format("Score datafile {0}: column {1} has {2} rows but {3} are required",
                                      fname, k, availableCount, rowCount));
                }

                for (var i = 0; i < rowCount; i++)
                {
                    table[i][k] = double.Parse(colData[i]);
                }
            }

            return table;
        }
Exemple #24
0
        /// <summary>
        /// Walks a sequence-tag file and, for every tag of at least three residues, looks up the
        /// database proteins containing it. Prints the average number of tag matches per scan, a
        /// histogram of distinct matched proteins per scan, the number of identified scans
        /// (q-value cut at 0.01) and how many of those had their identified protein hit by a tag.
        /// The test is ignored when any input file is absent.
        /// </summary>
        public void CountMatchedProteins()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            Utils.ShowStarting(methodName);

            const int minTagLength = 3;

            const string resultFilePath = @"H:\Research\ProMex\QC_Shew_Intact_26Sep14_Bane_C2Column3_IcTda.tsv";

            if (!File.Exists(resultFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, resultFilePath);
            }

            var resultParser = new TsvFileParser(resultFilePath);
            var idScans      = resultParser.GetData("Scan").Select(s => Convert.ToInt32(s)).ToArray();
            var idProteins   = resultParser.GetData("ProteinName").ToArray();
            var idQValues    = resultParser.GetData("QValue").Select(q => Convert.ToDouble(q)).ToArray();

            // Identified scan -> protein name, and identified scan -> "a tag hit that protein".
            var proteinByScan = new Dictionary <int, string>();
            var hitByTag      = new Dictionary <int, bool>();

            // Stops at the first row above the q-value cut; rows after it are not examined.
            for (var row = 0; row < idQValues.Length && !(idQValues[row] > 0.01); row++)
            {
                proteinByScan.Add(idScans[row], idProteins[row]);
                hitByTag.Add(idScans[row], false);
            }

            const string rawFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3.raw";

            if (!File.Exists(rawFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath);
            }

            // The run is opened here but not referenced again in this method.
            var run = PbfLcMsRun.GetLcMsRun(rawFilePath);

            const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";

            if (!File.Exists(fastaFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
            }

            var fastaDb      = new FastaDatabase(fastaFilePath);
            var searchableDb = new SearchableDatabase(fastaDb);

            Console.WriteLine("Sequence length: {0}", fastaDb.GetSequence().Length);

            const string tagFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3.seqtag";

            if (!File.Exists(tagFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFilePath);
            }

            // Number of distinct matched proteins -> number of scans with that many.
            var matchCountHistogram = new Dictionary <int, int>();

            var scansWithTags = new HashSet <int>();
            HashSet <string> currentProteins = null;
            var currentScan     = -1;
            var totalTagMatches = 0L;

            // The first line of the tag file is a header.
            foreach (var line in File.ReadAllLines(tagFilePath).Skip(1))
            {
                var fields = line.Split('\t');
                if (fields.Length < 3)
                {
                    continue;
                }

                var scan = Convert.ToInt32(fields[0]);

                string identifiedProtein;
                if (!proteinByScan.TryGetValue(scan, out identifiedProtein))
                {
                    identifiedProtein = null;
                }

                if (scan != currentScan)
                {
                    // A new scan begins: record the finished scan's protein count first.
                    if (currentProteins != null)
                    {
                        int occurrences;
                        matchCountHistogram.TryGetValue(currentProteins.Count, out occurrences);
                        matchCountHistogram[currentProteins.Count] = occurrences + 1;
                    }

                    currentScan     = scan;
                    currentProteins = new HashSet <string>();
                }

                // The scan counts towards the average even when its tag is too short to search.
                scansWithTags.Add(scan);

                var tag = fields[1];
                if (tag.Length < minTagLength || currentProteins == null)
                {
                    continue;
                }

                var matchesForTag = 0;
                foreach (var index in searchableDb.FindAllMatchedSequenceIndices(tag))
                {
                    var matchedProtein = fastaDb.GetProteinName(index);

                    currentProteins.Add(matchedProtein);
                    ++matchesForTag;

                    if (identifiedProtein != null && matchedProtein.Equals(identifiedProtein))
                    {
                        hitByTag[scan] = true;
                    }
                }
                totalTagMatches += matchesForTag;
            }

            // Record the last scan, which no later line closed.
            if (currentProteins != null)
            {
                int occurrences;
                matchCountHistogram.TryGetValue(currentProteins.Count, out occurrences);
                matchCountHistogram[currentProteins.Count] = occurrences + 1;
            }

            Console.WriteLine("AvgNumMatches: {0}", totalTagMatches / (float)scansWithTags.Count);
            Console.WriteLine("Histogram:");
            foreach (var bucket in matchCountHistogram.OrderBy(b => b.Key))
            {
                Console.WriteLine("{0}\t{1}", bucket.Key, bucket.Value);
            }

            Console.WriteLine("NumId: {0}", hitByTag.Count);
            Console.WriteLine("NumIdByTag: {0}", hitByTag.Values.Count(wasHit => wasHit));
        }
Exemple #25
0
        /// <summary>
        /// Assigns identifications from the per-dataset IcTda result files to rows of the aligned
        /// feature table: an identification (q-value at most 0.01) is attached to the first feature
        /// whose mass lies within 12 ppm and whose scan range for that dataset strictly contains
        /// the identification's scan. The assignments are written to <c>Output\aligned_ids.tsv</c>
        /// under the default test folder. The test is ignored when an input is absent.
        /// </summary>
        public void TestTagAlignedFeatures()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            Utils.ShowStarting(methodName);

            var featureDir = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, "Output");
            var mspDir     = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"Output\MSP");
            var outFile    = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"Output\aligned_features.tsv");

            // The relative part must not begin with a backslash: Path.Combine returns a rooted
            // second argument unchanged, which would silently drop the test folder.
            var resultFile = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"Output\aligned_ids.tsv");

            if (!Directory.Exists(featureDir))
            {
                Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, featureDir);
            }

            if (!Directory.Exists(mspDir))
            {
                Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, mspDir);
            }

            if (!File.Exists(outFile))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, outFile);
            }

            var dataset = GetDataList(featureDir);

            var tsvParser      = new TsvFileParser(outFile);
            var monoMassColumn = tsvParser.GetData("MonoMass");
            var massList       = new List <double>();

            for (var i = 0; i < tsvParser.NumData; i++)
            {
                massList.Add(Double.Parse(monoMassColumn[i]));
            }

            // Feature row index -> full text of the identification row assigned to it.
            var featureIdMap = new Dictionary <int, string>();
            var tolerance    = new Tolerance(12);
            var headers      = new List <string>();

            for (var d = 0; d < dataset.Count; d++)
            {
                var data = dataset[d];

                // Resolve the per-dataset columns once instead of once per candidate feature.
                var minScanColumn = tsvParser.GetData(string.Format("{0}_minScan", d));
                var maxScanColumn = tsvParser.GetData(string.Format("{0}_maxScan", d));

                var fname    = string.Format(@"{0}\{1}_IcTda.tsv", mspDir, data);
                var idParser = new TsvFileParser(fname);
                var idRows   = idParser.GetRows();
                if (headers.Count < 1)
                {
                    headers.AddRange(idParser.GetHeaders());
                }

                var idScanColumn   = idParser.GetData("Scan");
                var idMassColumn   = idParser.GetData("Mass");
                var idQValueColumn = idParser.GetData("QValue");

                for (var i = 0; i < idParser.NumData; i++)
                {
                    var scan   = Int32.Parse(idScanColumn[i]);
                    var mass   = Double.Parse(idMassColumn[i]);
                    var qvalue = Double.Parse(idQValueColumn[i]);

                    // Stops at the first row above the q-value cut; later rows are not examined.
                    if (qvalue > 0.01)
                    {
                        break;
                    }

                    var massTol = tolerance.GetToleranceAsMz(mass);

                    // NOTE(review): BinarySearch requires massList to be sorted ascending, i.e. the
                    // aligned feature file must be ordered by MonoMass - confirm.
                    var idx = massList.BinarySearch(mass);
                    if (idx < 0)
                    {
                        idx = ~idx;
                    }

                    // ~idx is the insertion point and equals massList.Count when the mass exceeds
                    // every entry (or the list is empty), so clamp before indexing.
                    var found = false;
                    for (var j = Math.Min(idx, massList.Count - 1); j >= 0; j--)
                    {
                        if (Math.Abs(mass - massList[j]) > massTol)
                        {
                            break;
                        }

                        // An empty cell means the feature was not observed in this dataset.
                        if (minScanColumn[j].Length < 1)
                        {
                            continue;
                        }

                        if (Int32.Parse(minScanColumn[j]) < scan && scan < Int32.Parse(maxScanColumn[j]))
                        {
                            found = true;
                            if (!featureIdMap.ContainsKey(j))
                            {
                                featureIdMap.Add(j, idRows[i]);
                            }
                            break;
                        }
                    }

                    if (found)
                    {
                        continue;
                    }

                    // Nothing at or below the insertion point matched; scan upwards.
                    for (var j = idx + 1; j < massList.Count; j++)
                    {
                        if (Math.Abs(mass - massList[j]) > massTol)
                        {
                            break;
                        }

                        if (minScanColumn[j].Length < 1)
                        {
                            continue;
                        }

                        if (Int32.Parse(minScanColumn[j]) < scan && scan < Int32.Parse(maxScanColumn[j]))
                        {
                            if (!featureIdMap.ContainsKey(j))
                            {
                                featureIdMap.Add(j, idRows[i]);
                            }
                            break;
                        }
                    }
                }
            }

            // Columns named "0" .. "31" of the aligned feature file are copied to the output.
            const int numberedColumnCount = 32;

            // The using block closes the writer even when a lookup below throws.
            using (var writer = new StreamWriter(resultFile))
            {
                writer.Write("AlignedFeatureID");
                writer.Write("\t");
                writer.Write(string.Join("\t", headers));
                for (var i = 0; i < numberedColumnCount; i++)
                {
                    writer.Write("\t");
                    writer.Write("{0}", i);
                }
                writer.Write("\n");

                var id = 1;

                foreach (var entry in featureIdMap)
                {
                    writer.Write(id);
                    writer.Write("\t");
                    writer.Write(entry.Value);
                    for (var i = 0; i < numberedColumnCount; i++)
                    {
                        writer.Write("\t");
                        writer.Write("{0}", tsvParser.GetData(string.Format("{0}", i))[entry.Key]);
                    }
                    writer.Write("\n");
                    id++;
                }
            }
        }
Exemple #26
0
        /// <summary>
        /// Rescores the target and decoy result files of the QC_Shew training dataset. For every
        /// row the sequence is rebuilt, scored with <c>InformedTopDownScorer</c>, and a spectral
        /// E-value is derived from a generating function over the deconvoluted spectrum. The first
        /// 15 columns of each row plus the new score and E-value are written to a "_Rescored" file.
        /// The test is ignored when the pbf file is absent.
        /// </summary>
        public void TestCompositeScoring()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            TestUtils.ShowStarting(methodName);

            const string rawFilePath = @"D:\MassSpecFiles\training\raw\QC_Shew_Intact_26Sep14_Bane_C2Column3.pbf";

            if (!File.Exists(rawFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath);
            }

            // Amino acid set with the searched modifications; used for mass binning and the graph.
            const int numMaxModsPerProtein = 4;
            var searchModifications = new List <SearchModification>
            {
                new SearchModification(Modification.Dehydro, 'C', SequenceLocation.Everywhere, false),
                new SearchModification(Modification.Oxidation, 'M', SequenceLocation.Everywhere, false),
                new SearchModification(Modification.Acetylation, '*', SequenceLocation.ProteinNTerm, false)
            };
            var modifiedAaSet = new AminoAcidSet(searchModifications, numMaxModsPerProtein);
            var massBinning   = new FilteredProteinMassBinning(modifiedAaSet, 50000, 28);

            var run = PbfLcMsRun.GetLcMsRun(rawFilePath);

            const double filteringWindowSize    = 1.1;
            const int    isotopeOffsetTolerance = 2;
            const int    minCharge              = 1;
            const int    maxCharge              = 20;

            var tolerance    = new Tolerance(10);
            var graphFactory = new ProteinScoringGraphFactory(massBinning, modifiedAaSet);

            // Sequences and the scorer use a default amino acid set, not the modified one above.
            var plainAaSet = new AminoAcidSet();
            var scorer     = new InformedTopDownScorer(run, plainAaSet, minCharge, maxCharge, tolerance);

            foreach (var ext in new[] { "IcTarget", "IcDecoy" })
            {
                var resultFileName = string.Format(@"D:\MassSpecFiles\training\Rescoring\QC_Shew_Intact_26Sep14_Bane_C2Column3_{0}.tsv", ext);
                var parser         = new TsvFileParser(resultFileName);

                var scans         = parser.GetData("Scan").Select(s => Convert.ToInt32(s)).ToArray();
                var charges       = parser.GetData("Charge").Select(s => Convert.ToInt32(s)).ToArray();
                var protSequences = parser.GetData("Sequence").ToArray();
                var modStrs       = parser.GetData("Modifications").ToArray();
                var compositions  = parser.GetData("Composition").Select(Composition.Parse).ToArray();
                var protMass      = parser.GetData("Mass").Select(s => Convert.ToDouble(s)).ToArray();

                var outputFileName = string.Format(@"D:\MassSpecFiles\training\Rescoring\QC_Shew_Intact_26Sep14_Bane_C2Column3_{0}_Rescored.tsv", ext);

                using (var writer = new StreamWriter(outputFileName))
                {
                    var headerText = string.Join("\t", parser.GetHeaders().ToArray(), 0, 15);
                    writer.WriteLine(headerText + "\tScore\tEValue");

                    // One slot per input row so the output keeps the input order although rows
                    // are processed in parallel.
                    var rescoredRows = new string[parser.NumData];

                    Parallel.For(0, parser.NumData, row =>
                    {
                        var scan     = scans[row];
                        var charge   = charges[row];
                        var sequence = Sequence.CreateSequence(protSequences[row], modStrs[row], plainAaSet);

                        // The rebuilt sequence must equal the stored composition minus one water.
                        Assert.True(sequence.Composition.Equals(compositions[row] - Composition.H2O));

                        var ms2Spec = run.GetSpectrum(scan) as ProductSpectrum;
                        Assert.True(ms2Spec != null);

                        var scores = scorer.GetScores(sequence, charge, scan);

                        var deconvSpec = Deconvoluter.GetDeconvolutedSpectrum(
                            ms2Spec, minCharge, maxCharge, isotopeOffsetTolerance, filteringWindowSize, tolerance, 0.7);

                        var deconvScorer = new CompositeScorerBasedOnDeconvolutedSpectrum(
                            deconvSpec, ms2Spec, tolerance, massBinning);

                        var graph = graphFactory.CreateScoringGraph(deconvScorer, protMass[row]);

                        var generatingFunction = new GeneratingFunction(graph);
                        generatingFunction.ComputeGeneratingFunction();

                        var specEvalue = generatingFunction.GetSpectralEValue(scores.Score);

                        // Keep the first 15 columns of the input row.
                        var columns  = parser.GetRows()[row].Split('\t');
                        var keptText = string.Join("\t", columns, 0, 15);

                        lock (rescoredRows)
                        {
                            rescoredRows[row] = string.Format("{0}\t{1}\t{2}", keptText, scores.Score, specEvalue);
                        }
                    });

                    for (var row = 0; row < rescoredRows.Length; row++)
                    {
                        writer.WriteLine(rescoredRows[row]);
                    }
                }

                Console.WriteLine("Done");
            }
        }
        /// <summary>
        /// Reads an MS-Align result table and converts every row whose FDR does not exceed
        /// <c>FdrCutoff</c> into a protein-spectrum match.
        /// </summary>
        /// <param name="msAlignResultTablePath">Path of the MS-Align result table.</param>
        /// <param name="maxPrsm">Reading stops as soon as the result holds this many matches.</param>
        /// <returns>The accepted matches in file order.</returns>
        public List <ProteinSpectrumMatch> ReadMsAlignResult(string msAlignResultTablePath, int maxPrsm)
        {
            var parser = new TsvFileParser(msAlignResultTablePath);

            // Columns are resolved once up front and indexed per row below.
            // NOTE(review): assumes TsvFileParser.GetData is a side-effect-free lookup - confirm.
            var peptideColumn  = parser.GetData("Peptide");
            var scanColumn     = parser.GetData("Scan(s)");
            var massColumn     = parser.GetData("Precursor_mass");
            var proteinColumn  = parser.GetData("Protein_name");
            var firstResColumn = parser.GetData("First_residue");
            var lastResColumn  = parser.GetData("Last_residue");
            var matchedColumn  = parser.GetData("#matched_fragment_ions");
            var chargeColumn   = parser.GetData("Charge");
            var evalueColumn   = parser.GetData("E-value");
            var fdrColumn      = parser.GetData("FDR");

            var matches = new List <ProteinSpectrumMatch>();

            for (var row = 0; row < parser.NumData; row++)
            {
                var sequence     = peptideColumn[row];
                var scanNum      = int.Parse(scanColumn[row]);
                var mass         = double.Parse(massColumn[row]);
                var protNameDesc = proteinColumn[row];

                // The protein column holds "name description"; without a space the whole text
                // serves as both name and description.
                string protName;
                string protDesc;
                var separatorIndex = protNameDesc.IndexOf(' ');
                if (separatorIndex < 0)
                {
                    protName = protNameDesc;
                    protDesc = protNameDesc;
                }
                else
                {
                    protName = protNameDesc.Substring(0, separatorIndex);
                    protDesc = protNameDesc.Substring(separatorIndex + 1);
                }

                var firstResId   = int.Parse(firstResColumn[row]);
                var lastResId    = int.Parse(lastResColumn[row]);
                var score        = double.Parse(matchedColumn[row]);
                var sequenceText = peptideColumn[row];
                var charge       = int.Parse(chargeColumn[row]);
                var evalue       = double.Parse(evalueColumn[row]);

                var fdr = Double.Parse(fdrColumn[row]);
                if (fdr > FdrCutoff)
                {
                    continue;
                }

                matches.Add(new ProteinSpectrumMatch(sequence, scanNum, mass, charge, protName, protDesc, firstResId, lastResId, score, ProteinSpectrumMatch.SearchTool.MsAlign)
                {
                    SequenceText   = sequenceText,
                    SpectralEvalue = evalue,
                });

                if (matches.Count >= maxPrsm)
                {
                    break;
                }
            }

            return matches;
        }
        /// <summary>
        /// Reads an MSPathFinder result file and converts its rows into protein-spectrum matches.
        /// A row is kept when its score lies within [<paramref name="minScore"/>,
        /// <paramref name="maxScore"/>] and, if the file has a QValue column, its q-value does not
        /// exceed <c>FdrCutoff</c>.
        /// </summary>
        /// <param name="msPathFinderResultPath">Path of the MSPathFinder tsv result file.</param>
        /// <param name="maxPrsm">Reading stops as soon as the result holds this many matches.</param>
        /// <param name="minScore">Rows scoring below this value are skipped.</param>
        /// <param name="maxScore">Rows scoring above this value are skipped.</param>
        /// <returns>The accepted matches in file order.</returns>
        public List <ProteinSpectrumMatch> ReadMsPathFinderResult(string msPathFinderResultPath, int maxPrsm, double minScore = 3, double maxScore = int.MaxValue)
        {
            var parser = new TsvFileParser(msPathFinderResultPath);

            // The score comes from "#MatchedFragments" when present, otherwise from "Score".
            var scoreColumn = parser.GetData("#MatchedFragments") ?? parser.GetData("Score");

            // Optional columns: without QValue no FDR filter is applied, and without SpecEValue
            // the E-value is reported as 0.
            var qValColumn   = parser.GetData("QValue");
            var evalueColumn = parser.GetData("SpecEValue");

            // Remaining columns are resolved once up front and indexed per row below.
            // NOTE(review): assumes TsvFileParser.GetData is a side-effect-free lookup - confirm.
            var sequenceColumn   = parser.GetData("Sequence");
            var scanColumn       = parser.GetData("Scan");
            var massColumn       = parser.GetData("Mass");
            var protNameColumn   = parser.GetData("ProteinName");
            var protDescColumn   = parser.GetData("ProteinDesc");
            var chargeColumn     = parser.GetData("Charge");
            var startColumn      = parser.GetData("Start");
            var endColumn        = parser.GetData("End");
            var modColumn        = parser.GetData("Modifications");
            var preColumn        = parser.GetData("Pre");
            var postColumn       = parser.GetData("Post");
            var protLengthColumn = parser.GetData("ProteinLength");

            var matches = new List <ProteinSpectrumMatch>();

            for (var row = 0; row < parser.NumData; row++)
            {
                var sequence = sequenceColumn[row];
                var scanNum  = int.Parse(scanColumn[row]);
                var mass     = double.Parse(massColumn[row]);
                var protName = protNameColumn[row];
                var protDesc = protDescColumn[row];
                var charge   = int.Parse(chargeColumn[row]);

                var firstResId = int.Parse(startColumn[row]);
                var lastResId  = int.Parse(endColumn[row]);
                var score      = double.Parse(scoreColumn[row]);
                var mod        = modColumn[row];

                var evalue = 0.0;
                if (evalueColumn != null)
                {
                    evalue = double.Parse(evalueColumn[row]);
                }

                var pre        = preColumn[row];
                var post       = postColumn[row];
                var proteinLen = int.Parse(protLengthColumn[row]);

                if (score < minScore || score > maxScore)
                {
                    continue;
                }

                if (qValColumn != null && double.Parse(qValColumn[row]) > FdrCutoff)
                {
                    continue;
                }

                var sequenceText = GetSequenceText(sequence, mod);

                matches.Add(new ProteinSpectrumMatch(sequence, scanNum, mass, charge, protName, protDesc, firstResId, lastResId, score, ProteinSpectrumMatch.SearchTool.MsPathFinder)
                {
                    SequenceText   = sequenceText,
                    Modifications  = mod,
                    Pre            = pre,
                    Post           = post,
                    ProteinLength  = proteinLen,
                    SpectralEvalue = evalue,
                });

                if (matches.Count >= maxPrsm)
                {
                    break;
                }
            }

            return matches;
        }
Exemple #29
0
        /// <summary>
        /// Walks the rows of a decoy .icresult file and, for every identification that passes the
        /// Q-value cutoff and whose precursor falls in the MS2 isolation window, computes a set of
        /// MS1 correlation values (precursor scan, next scan, +1 isotope XIC, charge-1 XIC, charge+1 XIC).
        /// </summary>
        /// <remarks>
        /// Assert.Ignore is invoked when the result file or the raw file cannot be found.
        /// Only the tab-separated column header is written to the console; the per-row output
        /// statement is currently disabled, so the computed values are not reported.
        /// </remarks>
        public void TestMs1Filtering()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            Utils.ShowStarting(methodName);

            // Alternative input: @"C:\cygwin\home\kims336\Data\TopDown\raw\CorrMatches_N30\SBEP_STM_001_02272012_Aragon.tsv"
            const string resultFilePath =
                @"C:\cygwin\home\kims336\Data\TopDown\raw\CorrMatches_N30\SBEP_STM_001_02272012_Aragon.decoy.icresult";
            if (!File.Exists(resultFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, resultFilePath);
            }

            const string rawFilePath = @"C:\cygwin\home\kims336\Data\TopDown\raw\DataFiles\SBEP_STM_001_02272012_Aragon.raw";
            if (!File.Exists(rawFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath);
            }

            var lcmsRun      = InMemoryLcMsRun.GetLcMsRun(rawFilePath, 1.4826, 1.4826);
            var ppmTolerance = new Tolerance(15);

            // Load the columns consumed by the loop below.
            var resultParser      = new TsvFileParser(resultFilePath);
            var compositionColumn = resultParser.GetData("Composition");
            var scanNumColumn     = resultParser.GetData("ScanNum");
            var chargeColumn      = resultParser.GetData("Charge");
            var qValueColumn      = resultParser.GetData("QValue");
            var scoreColumn       = resultParser.GetData("Score");

            Console.WriteLine("ScanNum\tScore\tPrecursor\tNext\tSum\tNextIsotope\tLessCharge\tMoreCharge\tMax\tNumXicPeaks");

            for (var row = 0; row < compositionColumn.Count; row++)
            {
                // The Q-value filter is applied only when the file provides a QValue column.
                if (qValueColumn != null && Convert.ToDouble(qValueColumn[row]) > 0.01)
                {
                    continue;
                }

                var ms2ScanNum      = Convert.ToInt32(scanNumColumn[row]);
                var rowComposition  = Composition.Parse(compositionColumn[row]);
                var precursorCharge = Convert.ToInt32(chargeColumn[row]);
                var precursorIon    = new Ion(rowComposition, precursorCharge);

                // Keep only product spectra whose isolation window contains the most abundant isotope m/z.
                var productSpec = lcmsRun.GetSpectrum(ms2ScanNum) as ProductSpectrum;
                if (productSpec == null || !productSpec.IsolationWindow.Contains(precursorIon.GetMostAbundantIsotopeMz()))
                {
                    continue;
                }

                var rowScore = Convert.ToDouble(scoreColumn[row]);

                // Correlation scores in the precursor spectrum and in the spectrum returned by GetNextScanNum(scan, 1).
                var precursorSpectrum = lcmsRun.GetSpectrum(lcmsRun.GetPrecursorScanNum(ms2ScanNum));
                var precursorCorr     = precursorSpectrum.GetCorrScore(precursorIon, ppmTolerance, 0.1);

                var followingSpectrum = lcmsRun.GetSpectrum(lcmsRun.GetNextScanNum(ms2ScanNum, 1));
                var followingCorr     = followingSpectrum.GetCorrScore(precursorIon, ppmTolerance, 0.1);

                var mostAbundantXic = lcmsRun.GetPrecursorExtractedIonChromatogram(
                    precursorIon.GetMostAbundantIsotopeMz(), ppmTolerance, ms2ScanNum);

                // Fall back to the MS2 scan when the XIC apex lies below the run's first LC scan.
                var apexScan = mostAbundantXic.GetApexScanNum();
                if (apexScan < lcmsRun.MinLcScan)
                {
                    apexScan = ms2ScanNum;
                }

                // XIC of the neighbouring isotope (one C13-C12 spacing up at this charge).
                var nextIsotopeMz   = precursorIon.GetMostAbundantIsotopeMz() + Constants.C13MinusC12 / precursorCharge;
                var nextIsotopeXic  = lcmsRun.GetPrecursorExtractedIonChromatogram(nextIsotopeMz, ppmTolerance, ms2ScanNum);
                var nextIsotopeCorr = mostAbundantXic.GetCorrelation(nextIsotopeXic);

                // XICs of the same composition at the adjacent charge states.
                var lowerChargeIon  = new Ion(rowComposition, precursorCharge - 1);
                var lowerChargeXic  = lcmsRun.GetPrecursorExtractedIonChromatogram(
                    lowerChargeIon.GetMostAbundantIsotopeMz(), ppmTolerance, ms2ScanNum);
                var lowerChargeCorr = mostAbundantXic.GetCorrelation(lowerChargeXic);

                var higherChargeIon  = new Ion(rowComposition, precursorCharge + 1);
                var higherChargeXic  = lcmsRun.GetPrecursorExtractedIonChromatogram(
                    higherChargeIon.GetMostAbundantIsotopeMz(), ppmTolerance, ms2ScanNum);
                var higherChargeCorr = mostAbundantXic.GetCorrelation(higherChargeXic);

                // Row output is disabled; it would print the scan number, score, each correlation
                // above (plus a summed-spectrum correlation at apexScan), their maximum, and the XIC peak count.
            }
        }
Exemple #30
0
        /// <summary>
        /// For every dataset in <c>TrainSetFileLists</c>, reads the filtered identification table
        /// (<c>{dataset}.trainset.tsv</c>), looks up the best matching LC-MS feature for each row and
        /// writes the result to <c>{dataset}.ms1ft</c>, together with target and decoy envelope
        /// statistics files.
        /// </summary>
        /// <remarks>
        /// Assert.Ignore is invoked when the identification folder does not exist. Datasets whose
        /// data file or trainset file is missing are skipped with a console warning.
        /// The output writers are wrapped in <c>using</c> so the files are closed even when a row
        /// fails to parse or feature finding throws.
        /// </remarks>
        public void ExtractLcMsFeaturesForTrainingSet()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            TestUtils.ShowStarting(methodName);

            const string idFileFolder = @"D:\MassSpecFiles\training\FilteredIdResult";

            if (!Directory.Exists(idFileFolder))
            {
                Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, idFileFolder);
            }

            var tolerance  = new Tolerance(10);
            var tolerance2 = new Tolerance(20);
            var id         = 1; // running feature id shared across all datasets

            for (var d = 0; d < TrainSetFileLists.Length; d++)
            {
                var dataset            = TrainSetFileLists[d];
                var dataname           = Path.GetFileNameWithoutExtension(dataset);
                var filtedIdResultFile = string.Format(@"{0}\{1}.trainset.tsv", idFileFolder, dataname);
                var featureResult      = string.Format(@"{0}\{1}.ms1ft", idFileFolder, dataname);

                if (!File.Exists(dataset))
                {
                    Console.WriteLine(@"Warning: Skipping since file not found: {0}", dataset);
                    continue;
                }
                if (!File.Exists(filtedIdResultFile))
                {
                    Console.WriteLine(@"Warning: Skipping since file not found: {0}", filtedIdResultFile);
                    continue;
                }

                var run = PbfLcMsRun.GetLcMsRun(dataset);

                using (var targetStatWriter = new StreamWriter(string.Format(@"D:\MassSpecFiles\training\statistics\{0}.tsv", dataname)))
                using (var decoyStatWriter = new StreamWriter(string.Format(@"D:\MassSpecFiles\training\statistics\{0}_decoy.tsv", dataname)))
                using (var writer = new StreamWriter(featureResult))
                {
                    writer.Write("Ms2MinScan\tMs2MaxScan\tMs2MinCharge\tMs2MaxCharge\tMs2Mass\t");
                    writer.Write("Mass\tMinScan\tMaxScan\tMinCharge\tMaxCharge\tMinTime\tMaxTime\tElution\tGood\n");
                    var tsvParser = new TsvFileParser(filtedIdResultFile);

                    var featureFinder = new LcMsPeakMatrix(run);

                    // Fetch each column once instead of once per row.
                    var minScanColumn   = tsvParser.GetData("MinScan");
                    var maxScanColumn   = tsvParser.GetData("MaxScan");
                    var minChargeColumn = tsvParser.GetData("MinCharge");
                    var maxChargeColumn = tsvParser.GetData("MaxCharge");
                    var massColumn      = tsvParser.GetData("Mass");

                    for (var i = 0; i < tsvParser.NumData; i++)
                    {
                        var minScan   = int.Parse(minScanColumn[i]);
                        var maxScan   = int.Parse(maxScanColumn[i]);
                        var minCharge = int.Parse(minChargeColumn[i]);
                        var maxCharge = int.Parse(maxChargeColumn[i]);
                        var mass      = double.Parse(massColumn[i]);

                        writer.Write(minScan);
                        writer.Write("\t");
                        writer.Write(maxScan);
                        writer.Write("\t");
                        writer.Write(minCharge);
                        writer.Write("\t");
                        writer.Write(maxCharge);
                        writer.Write("\t");
                        writer.Write(mass);
                        writer.Write("\t");

                        var binNum  = featureFinder.Comparer.GetBinNumber(mass);
                        var binMass = featureFinder.Comparer.GetMzAverage(binNum);

                        // Search the mass's own bin first, then the neighbour on the side the mass lies, then the other.
                        var binNumList = (mass < binMass)
                            ? new int[] { binNum, binNum - 1, binNum + 1 }
                            : new int[] { binNum, binNum + 1, binNum - 1 };

                        // Both values depend only on the current row, so compute them once.
                        var massTh  = (mass < 2000) ? tolerance2.GetToleranceAsTh(mass) : tolerance.GetToleranceAsTh(mass);
                        var midScan = 0.5 * (minScan + maxScan);

                        LcMsPeakCluster refinedFeature = null;

                        foreach (var bi in binNumList)
                        {
                            // Among features within the mass tolerance whose scan range strictly contains
                            // the midpoint of the identified scan range, keep the most abundant one.
                            var highestAbu = 0d;
                            foreach (var feature in featureFinder.FindFeatures(bi))
                            {
                                if (Math.Abs(mass - feature.Mass) >= massTh)
                                {
                                    continue;
                                }

                                if (feature.MinScanNum < midScan && midScan < feature.MaxScanNum &&
                                    feature.Abundance > highestAbu)
                                {
                                    refinedFeature = feature;
                                    highestAbu     = feature.Abundance;
                                }
                            }

                            // Stop at the first bin that yields a match.
                            if (refinedFeature != null)
                            {
                                break;
                            }
                        }

                        if (refinedFeature != null)
                        {
                            writer.Write(refinedFeature.Mass);
                            writer.Write("\t");
                            writer.Write(refinedFeature.MinScanNum);
                            writer.Write("\t");
                            writer.Write(refinedFeature.MaxScanNum);
                            writer.Write("\t");
                            writer.Write(refinedFeature.MinCharge);
                            writer.Write("\t");
                            writer.Write(refinedFeature.MaxCharge);
                            writer.Write("\t");
                            writer.Write(refinedFeature.MinElutionTime);
                            writer.Write("\t");
                            writer.Write(refinedFeature.MaxElutionTime);
                            writer.Write("\t");
                            writer.Write(refinedFeature.MaxElutionTime - refinedFeature.MinElutionTime);
                            writer.Write("\t");

                            // "Good" = the feature's scan range covers the whole identified scan range.
                            var good = (refinedFeature.MinScanNum <= minScan && refinedFeature.MaxScanNum >= maxScan);
                            writer.Write(good ? 1 : 0);
                            writer.Write("\n");

                            OutputEnvelopPeakStat(id, refinedFeature, targetStatWriter);

                            // Re-score the same feature as a decoy and record it under the same id.
                            var chargeRange = featureFinder.GetDetectableMinMaxCharge(refinedFeature.RepresentativeMass, run.MinMs1Mz, run.MaxMs1Mz);
                            refinedFeature.UpdateWithDecoyScore(featureFinder.Ms1Spectra, chargeRange.Item1, chargeRange.Item2);
                            OutputEnvelopPeakStat(id, refinedFeature, decoyStatWriter);
                            id++;
                        }
                        else
                        {
                            // No matching feature: fill the nine feature columns with zeros.
                            writer.Write("0\t0\t0\t0\t0\t0\t0\t0\t0\n");
                        }
                    }
                }

                Console.WriteLine(dataname);
            }
        }