Ejemplo n.º 1
0
        public void FindProteinDeltaMass()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            const string folderPath = @"D:\MassSpecFiles\Glyco\";
            if (!Directory.Exists(folderPath))
            {
                Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, folderPath);
            }

            var fileSet = new string[]
            {
                "User_sample_test_02252015", "User_sample_test_MWCO_02262016", "User_sample_test_SEC_F3_03022105",
                "User_sample_test_SEC_F1_02272015", "User_sample_test_SEC_F2_02282015"
            };
            const string fastaFilePath = folderPath + "ID_003836_DA9CC1E4.fasta";

            for (var i = 0; i < fileSet.Length; i++)
            {
                var datasetName = fileSet[i];
                var tagFilePath = folderPath + datasetName + ".seqtag";
                //var outputFilePath = folderPath + datasetName + ".matchedtag";
                var outputFilePath = folderPath + datasetName + ".dmass";
                var fastaDb = new FastaDatabase(fastaFilePath);
                var searchableDb = new SearchableDatabase(fastaDb);

                using (var writer = new StreamWriter(outputFilePath))
                {
                    var isHeader = true;
                    var nReadSeqTag = 0;

                    Console.WriteLine(@"Reading {0} file", tagFilePath);

                    var nColumn = 0;
                    foreach (var line in File.ReadAllLines(tagFilePath))
                    {
                        if (isHeader)
                        {
                            isHeader = false;
                            nColumn = line.Split('\t').Length;
                            writer.WriteLine(line + "\t" + "Protein" + "\t" + "DetectedFlankingMass" + "\t" + "ExpectedFlankingMass" + "\t" + "DeltaMass");
                            continue;
                        }

                        var token = line.Split('\t');
                        if (token.Length != nColumn) continue;
                        var tag = token[1];
                        //var scan = Convert.ToInt32(token[0]);

                        if (tag.Length < 6) continue;

                        var nTerminal = token[2].Equals("1");
                        var detectedFlankingMass = Double.Parse(token[3]);

                        if (!nTerminal) detectedFlankingMass -= Composition.H2O.Mass;

                        nReadSeqTag++;

                        var matchedProteins =
                            searchableDb.FindAllMatchedSequenceIndices(tag)
                                .Select(index => fastaDb.GetProteinName(index))
                                .Distinct().ToArray();

                        if (matchedProteins.Length < 1) continue;

                        foreach (var protName in matchedProteins)
                        {
                            var seqStr = fastaDb.GetProteinSequence(protName);
                            var oriSeq = new Sequence(seqStr, AminoAcidSet.GetStandardAminoAcidSet());
                            
                            var startIdx = 0;
                            while (true)
                            {
                                var idx = seqStr.IndexOf(tag, startIdx);

                                if (idx < 0) break; //no matching

                                //var nClv = (nTerminal) ? idx : seqStr.Length - idx - tag.Length;
                                var nClv = (nTerminal) ? 2 : 1; 

                                for (var j = 0; j < nClv; j++)
                                {
                                    var flankComposition = (nTerminal)
                                        ? oriSeq.GetComposition(j, idx)
                                        : oriSeq.GetComposition(idx + tag.Length, oriSeq.Count - j);

                                    var massDiff = (detectedFlankingMass - flankComposition.Mass);
                                    if (massDiff > -500 && massDiff < 2000)
                                    {
                                        //writer.WriteLine(massDiff);
                                        writer.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}", line, protName, detectedFlankingMass, flankComposition.Mass, massDiff);
                                    }

                                    if (massDiff > 2000) break;
                                }
                                
                                startIdx = idx + tag.Length;
                            }
                        }
                        //var matchedProteinStr = string.Join(",", matchedProteins);
                        //var massDiffStr = string.Join(",", massDiffList);
                        //writer.WriteLine("{0}\t{1}\t{2}\t{3}", line, matchedProteins.Length, matchedProteinStr, massDiffStr);
                    }

                    Console.WriteLine(@"{0} seq tags are processed", nReadSeqTag);
                }
                Console.WriteLine(@"Done");
            }
        }
Ejemplo n.º 2
0
        private void WriteResultsToFile(SortedSet<DatabaseSequenceSpectrumMatch>[] matches, string outputFilePath, FastaDatabase database)
        {
            using (var writer = new StreamWriter(outputFilePath))
            {
                writer.WriteLine("Scan\tPre\tSequence\tPost\tModifications\tComposition\tProteinName\tProteinDesc" +
                             "\tProteinLength\tStart\tEnd\tCharge\tMostAbundantIsotopeMz\tMass\t#MatchedFragments\tIcScore"
                             );
                for (var scanNum = _run.MinLcScan; scanNum <= _run.MaxLcScan; scanNum++)
                {
                    if (matches[scanNum] == null) continue;
                    foreach (var match in matches[scanNum].Reverse())
                    {
                        var sequence = match.Sequence;
                        var offset = match.Offset;
                        var start = database.GetOneBasedPositionInProtein(offset) + 1 + match.NumNTermCleavages;
                        var end = start + sequence.Length - 1;
                        var proteinName = database.GetProteinName(match.Offset);
                        var protLength = database.GetProteinLength(proteinName);
                        var ion = match.Ion;

                        var scores = _bottomUpScorer.GetScores(match, ion.Composition, ion.Charge, scanNum);

                        if (ion == null)
                        {
                            Console.WriteLine(@"Null ion!");
                        }
                        if (scores == null)
                        {
                            Console.WriteLine(@"Null scores");
                        }

                        // Note for DblToString(value, 9, true), by having "9" and "true",
                        // values between 100 and 999 Da will have 7 digits after the decimal place, and
                        // values between 1000 and 9999 will have 6 digits after the decimal place

                        writer.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14}\t{15}",
                            scanNum,
                            match.Pre,
                            sequence, // Sequence
                            match.Post,
                            scores.Modifications, // Modifications
                            ion.Composition, // Composition
                            proteinName, // ProteinName
                            database.GetProteinDescription(match.Offset), // ProteinDescription
                            protLength, // ProteinLength
                            start, // Start
                            end, // End
                            ion.Charge, // precursorCharge
                            StringUtilities.DblToString(ion.GetMostAbundantIsotopeMz(), 9, true), // MostAbundantIsotopeMz
                            StringUtilities.DblToString(ion.Composition.Mass, 9, true),           // Mass
                            match.Score,
                            scores.Score    // Score (re-scored)
                            );

                    }
                }
            }
        }
Ejemplo n.º 3
0
        public void CountMatchedProteins()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            const int minTagLength = 3;

            var scanToProtein = new Dictionary<int, string>();
            var idTag = new Dictionary<int, bool>();
            const string resultFilePath = @"H:\Research\ProMex\QC_Shew_Intact_26Sep14_Bane_C2Column3_IcTda.tsv";
            if (!File.Exists(resultFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, resultFilePath);
            }

            var parser = new TsvFileParser(resultFilePath);
            var scans = parser.GetData("Scan").Select(s => Convert.ToInt32(s)).ToArray();
            var proteinNames = parser.GetData("ProteinName").ToArray();
            var qValues = parser.GetData("QValue").Select(Convert.ToDouble).ToArray();
            for (var i = 0; i < qValues.Length; i++)
            {
                if (qValues[i] > 0.01) break;
                scanToProtein.Add(scans[i], proteinNames[i]);
                idTag.Add(scans[i], false);
            }

            const string rawFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3.raw";
            if (!File.Exists(rawFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath);
            }

            var run = PbfLcMsRun.GetLcMsRun(rawFilePath);

            const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";
            if (!File.Exists(fastaFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
            }

//            const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.icsfldecoy.fasta";
//            const string fastaFilePath =
//                @"D:\Research\Data\CommonContaminants\H_sapiens_Uniprot_SPROT_2013-05-01_withContam.fasta";
            var fastaDb = new FastaDatabase(fastaFilePath);
            var searchableDb = new SearchableDatabase(fastaDb);
            Console.WriteLine("Sequence length: {0}", fastaDb.GetSequence().Length);

            const string tagFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3.seqtag";
            if (!File.Exists(tagFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFilePath);
            }

            var hist = new Dictionary<int, int>();
            
            var scanSet = new HashSet<int>();
            HashSet<string> proteinSetForThisScan = null;
            var prevScan = -1;
            var totalNumMatches = 0L;
            var isHeader = true;
            foreach (var line in File.ReadAllLines(tagFilePath))
            {
                if (isHeader)
                {
                    isHeader = false;
                    continue;
                }

                var token = line.Split('\t');
                if (token.Length < 3) continue;
                var scan = Convert.ToInt32(token[0]);
                var proteinId = scanToProtein.ContainsKey(scan) ? scanToProtein[scan] : null;

                if (scan != prevScan)
                {
                    if (proteinSetForThisScan != null)
                    {
                        var numMatches = proteinSetForThisScan.Count;
                        int numOcc;
                        if (hist.TryGetValue(numMatches, out numOcc)) hist[numMatches] = numOcc + 1;
                        else hist.Add(numMatches, 1);
                    }

                    prevScan = scan;
                    proteinSetForThisScan = new HashSet<string>();
                }

                scanSet.Add(scan);
                var tag = token[1];
                if (tag.Length < minTagLength) continue;

                if (proteinSetForThisScan == null) continue;

                var numMatchesForThisTag = 0;
                foreach (var matchedProtein in searchableDb.FindAllMatchedSequenceIndices(tag)
                    .Select(index => fastaDb.GetProteinName(index)))
                {
                    proteinSetForThisScan.Add(matchedProtein);
                    ++numMatchesForThisTag;

                    if (proteinId != null && matchedProtein.Equals(proteinId))
                    {
                        idTag[scan] = true;
                    }
                }
                totalNumMatches += numMatchesForThisTag;
//                if (numMatchesForThisTag > 10)
//                {
//                    Console.WriteLine("{0}\t{1}", tag, numMatchesForThisTag);
//                }
            }

            if (proteinSetForThisScan != null)
            {
                var numMatches = proteinSetForThisScan.Count;
                int numOcc;
                if (hist.TryGetValue(numMatches, out numOcc)) hist[numMatches] = numOcc + 1;
                else hist.Add(numMatches, 1);
            }
            
            Console.WriteLine("AvgNumMatches: {0}", totalNumMatches/(float)scanSet.Count);
            Console.WriteLine("Histogram:");
            foreach (var entry in hist.OrderBy(e => e.Key))
            {
                Console.WriteLine("{0}\t{1}", entry.Key, entry.Value);
            }

            Console.WriteLine("NumId: {0}", idTag.Count);
            Console.WriteLine("NumIdByTag: {0}", idTag.Select(e => e.Value).Count(v => v));
        }
Ejemplo n.º 4
0
        public void CountMatchedScansPerProtein()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            const int minTagLength = 6;

            var proteinToScan = new Dictionary<string, HashSet<int>>();
            const string fastaFilePath = @"D:\Research\Data\CommonContaminants\H_sapiens_Uniprot_SPROT_2013-05-01_withContam.fasta";
            if (!File.Exists(fastaFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
            }

            var fastaDb = new FastaDatabase(fastaFilePath);
            var searchableDb = new SearchableDatabase(fastaDb);
            Console.WriteLine(@"Sequence length: {0}", fastaDb.GetSequence().Length);

            //const string tagFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3_seqtag.tsv";
            //const string tagFilePath = @"\\protoapps\UserData\Jungkap\Co_culture\23B_pellet_TD_3Feb14_Bane_PL011402.seqtag";
            const string tagFilePath = @"D:\MassSpecFiles\co_culture\23A_pellet_TD_3Feb14_Bane_PL011402.seqtag";
            if (!File.Exists(tagFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFilePath);
            }

            var isHeader = true;
            var numMatchedPairs = 0;
            foreach (var line in File.ReadAllLines(tagFilePath))
            {
                if (isHeader)
                {
                    isHeader = false;
                    continue;
                }

                var token = line.Split('\t');
                if (token.Length != 3) continue;
                var scan = Convert.ToInt32(token[0]);

                var tag = token[1];
                if (tag.Length < minTagLength) continue;

                foreach (var matchedProtein in searchableDb.FindAllMatchedSequenceIndices(tag)
                    .Select(index => fastaDb.GetProteinName(index)))
                {
                    ++numMatchedPairs;
                    HashSet<int> matchedScans;
                    if (proteinToScan.TryGetValue(matchedProtein, out matchedScans))
                    {
                        matchedScans.Add(scan);
                    }
                    else
                    {
                        matchedScans = new HashSet<int> {scan};
                        proteinToScan.Add(matchedProtein, matchedScans);
                    }
                }
            }

            var numMatchedProteins = proteinToScan.Keys.Count;
            var numAllProteins = fastaDb.GetNumEntries();
            Console.WriteLine("NumAllProteins: {0}", numAllProteins);
            Console.WriteLine("NumMatchedProteins: {0}", numMatchedProteins);
            Console.WriteLine("AvgMatchedScansPerProtein: {0}", numMatchedPairs / (float)numAllProteins);
        }
Ejemplo n.º 5
0
        private void WriteResultsToFile(DatabaseSequenceSpectrumMatch[] matches, string outputFilePath, FastaDatabase database)
        {
            using (var writer = new StreamWriter(outputFilePath))
            {
                writer.WriteLine("Scan\tPre\tSequence\tPost\tModifications\tComposition\tProteinName\tProteinDesc" +
                                 "\tProteinLength\tStart\tEnd\tCharge\tMostAbundantIsotopeMz\tMass\t#MatchedFragments\tProbability\tSpecEValue\tEValue");
                
                foreach(var scanNum in _ms2ScanNums)
                {
                    var match = matches[scanNum];
                    if (match == null) continue;

                    var sequence = match.Sequence;
                    var offset = match.Offset;
                    var start = database.GetOneBasedPositionInProtein(offset) + 1 + match.NumNTermCleavages;
                    var end = start + sequence.Length - 1;
                    var proteinName = database.GetProteinName(match.Offset);
                    var protLength = database.GetProteinLength(proteinName);
                    var ion = match.Ion;
                    var proteinDescription = database.GetProteinDescription(match.Offset);
                    var probability = CompositeScorer.GetProbability(match.Score);

                    // Note for DblToString(value, 9, true), by having "9" and "true",
                    // values between 100 and 999 Da will have 7 digits after the decimal place, and
                    // values between 1000 and 9999 will have 6 digits after the decimal place
                    writer.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14}\t{15}\t{16}\t{17}",
                        scanNum,
                        match.Pre,                 // Pre
                        sequence,                  // Sequence
                        match.Post,                // Post
                        match.ModificationText,    // Modifications
                        ion.Composition,           // Composition
                        proteinName,               // ProteinName
                        proteinDescription,        // ProteinDescription
                        protLength,                // ProteinLength
                        start,                     // Start position in protein
                        end,                       // End position in protein
                        ion.Charge,                // precursorCharge
                        StringUtilities.DblToString(ion.GetMostAbundantIsotopeMz(), 9, true), // MostAbundantIsotopeMz
                        StringUtilities.DblToString(ion.Composition.Mass, 9, true),           // Mass
                        match.NumMatchedFragments,                                          // (Number of matched fragments)
                        StringUtilities.DblToString(probability, 4),                        // Probability
                        StringUtilities.DblToString(ExcelMinValue(match.SpecEvalue), 6, true, 0.001),                             // EValue; will be displayed using scientific notation if the value is less than 0.001
                        StringUtilities.DblToString(ExcelMinValue(match.SpecEvalue * database.GetNumEntries()), 6, true, 0.001)   // SpecEValue; will be displayed using scientific notation if the value is less than 0.001
                        );

                }
            }
        }
Ejemplo n.º 6
0
        public void FindProteins()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";
            if (!File.Exists(fastaFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
            }

            var fastaDb = new FastaDatabase(fastaFilePath);
            var searchableDb = new SearchableDatabase(fastaDb);

            const string tagFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3_seqtag.tsv";
            if (!File.Exists(tagFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFilePath);
            }

            const string outputFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3_matchedtag.tsv";
            using (var writer = new StreamWriter(outputFilePath))
            {
                var isHeader = true;
                foreach (var line in File.ReadAllLines(tagFilePath))
                {
                    if (isHeader)
                    {
                        isHeader = false;
                        writer.WriteLine(line+"\t"+"Proteins");
                        continue;
                    }

                    var token = line.Split('\t');
                    if (token.Length != 3) continue;
                    var tag = token[1];

                    var matchedProteins =
                        searchableDb.FindAllMatchedSequenceIndices(tag)
                            .Select(index => fastaDb.GetProteinName(index))
                            .Distinct().ToArray();
                    var matchedProteinStr = string.Join(",", matchedProteins);
                    writer.WriteLine("{0}\t{1}\t{2}", line, matchedProteins.Length, matchedProteinStr);
                }
            }
            Console.WriteLine(@"Done");
        }
Ejemplo n.º 7
0
        public void TestGettingProteinLengthAndPosition()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\Short.fasta";
            if (!File.Exists(dbFile))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, dbFile);
            }

            var db = new FastaDatabase(dbFile);
            db.Read();
            var indexedDb = new IndexedDatabase(db);
            foreach (var peptideAnnotationAndOffset in indexedDb.AnnotationsAndOffsets(6, 20, 2, 0, Enzyme.Trypsin))
            {
                var annotation = peptideAnnotationAndOffset.Annotation;
                var offset = peptideAnnotationAndOffset.Offset;
                Console.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}",
                    annotation, 
                    offset, 
                    db.GetProteinName(offset), 
                    db.GetProteinLength(db.GetProteinName(offset)), 
                    db.GetOneBasedPositionInProtein(offset)+1);
            }
        }
Ejemplo n.º 8
0
        public void TestId()
        {
            const string rawFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3.raw";
//            const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.icsfldecoy.fasta";
            //const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\Decoy_SO4280.fasta";
            const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\SO2312.fasta";
            const string modFilePath = @"H:\Research\QCShew_TopDown\Production\Mods.txt";
            const int numBits = 29; // max error: 4ppm
            const int minCharge = 2;
            const int maxCharge = 20;
            var tolerance = new Tolerance(10);
            const double corrThreshold = 0.7;

            var comparer = new MzComparerWithBinning(numBits);
            const double minFragmentMass = 200.0;
            const double maxFragmentMass = 50000.0;
            var minFragMassBin = comparer.GetBinNumber(minFragmentMass);
            var maxFragMassBin = comparer.GetBinNumber(maxFragmentMass);

            var aminoAcidSet = new AminoAcidSet(modFilePath);

            var run = PbfLcMsRun.GetLcMsRun(rawFilePath);
//            var ms2ScanNumArr = run.GetScanNumbers(2).ToArray();
            //var ms2ScanNumArr = new[] {4130};
            var ms2ScanNumArr = new[] { 5189 };

            var sw = new Stopwatch();

            sw.Start();
            Console.Write("Building Spectrum Arrays...");
            var massVectors = new BitArray[maxFragMassBin - minFragMassBin + 1];
            for (var i = minFragMassBin; i <= maxFragMassBin; i++)
            {
                massVectors[i - minFragMassBin] = new BitArray(run.MaxLcScan + 1);
            }

            foreach (var ms2ScanNum in ms2ScanNumArr)
            {
                var productSpec = run.GetSpectrum(ms2ScanNum) as ProductSpectrum;
                if (productSpec == null) continue;

                productSpec.FilterNoise();
                var deconvolutedPeaks = Deconvoluter.GetDeconvolutedPeaks(productSpec, minCharge, maxCharge, 2, 1.1,
                    tolerance, corrThreshold);

                if (deconvolutedPeaks == null) continue;

                foreach (var p in deconvolutedPeaks)
                {
                    var mass = p.Mass;
                    var deltaMass = tolerance.GetToleranceAsDa(mass, 1);
                    var minMass = mass - deltaMass;
                    var maxMass = mass + deltaMass;

                    var minBinNum = comparer.GetBinNumber(minMass);
                    var maxBinNum = comparer.GetBinNumber(maxMass);
                    for (var binNum = minBinNum; binNum <= maxBinNum; binNum++)
                    {
                        if (binNum >= minFragMassBin && binNum <= maxFragMassBin) massVectors[binNum - minFragMassBin][ms2ScanNum] = true;
                    }
                }
            }
            sw.Stop();
            Console.WriteLine(@"{0:f4} sec.", sw.Elapsed.TotalSeconds);

            sw.Reset();
            sw.Start();
            var fastaDb = new FastaDatabase(fastaFilePath);
            fastaDb.Read();
            var indexedDb = new IndexedDatabase(fastaDb);
            var numProteins = 0;
            var intactProteinAnnotationAndOffsets =
                indexedDb.IntactSequenceAnnotationsAndOffsets(0, int.MaxValue);

            var bestProtein = new string[run.MaxLcScan + 1];
            var bestScore = new int[run.MaxLcScan + 1];
            foreach (var annotationAndOffset in intactProteinAnnotationAndOffsets)
            {
                if (++numProteins % 10 == 0)
                {
                    Console.WriteLine("Processing {0}{1} proteins...", numProteins,
                        numProteins == 1 ? "st" : numProteins == 2 ? "nd" : numProteins == 3 ? "rd" : "th");
                    if (numProteins != 0)
                    {
                        sw.Stop();

                        Console.WriteLine("Elapsed Time: {0:f4} sec", sw.Elapsed.TotalSeconds);
                        sw.Reset();
                        sw.Start();
                    }
                }
                var annotation = annotationAndOffset.Annotation;
                var offset = annotationAndOffset.Offset;

                var protSequence = annotation.Substring(2, annotation.Length - 4);

                // suffix
                var seqGraph = SequenceGraph.CreateGraph(aminoAcidSet, AminoAcid.ProteinNTerm, protSequence,
                    AminoAcid.ProteinCTerm);
                if (seqGraph == null) continue;

                for (var numNTermCleavage = 0; numNTermCleavage <= 0; numNTermCleavage++)
                {
                    if (numNTermCleavage > 0) seqGraph.CleaveNTerm();
                    var allCompositions = seqGraph.GetAllFragmentNodeCompositions().ToArray();

                    var scoreArr = new int[run.MaxLcScan + 1];
                    foreach (var fragComp in allCompositions)
                    {
                        var suffixMass = fragComp.Mass + BaseIonType.Y.OffsetComposition.Mass;
                        var binNum = comparer.GetBinNumber(suffixMass);
                        if (binNum < minFragMassBin || binNum > maxFragMassBin) continue;

                        var vector = massVectors[binNum - minFragMassBin];
                        foreach (var ms2ScanNum in ms2ScanNumArr)
                        {
                            if (vector[ms2ScanNum])
                            {
                                ++scoreArr[ms2ScanNum];
                                Console.WriteLine(suffixMass);
                            }
                        }
                    }
                    foreach (var ms2ScanNum in ms2ScanNumArr)
                    {
                        if (scoreArr[ms2ScanNum] > bestScore[ms2ScanNum])
                        {
                            bestScore[ms2ScanNum] = scoreArr[ms2ScanNum];
                            var proteinName = fastaDb.GetProteinName(offset);
                            bestProtein[ms2ScanNum] = proteinName + (numNTermCleavage == 1 ? "'" : "");
                        }
                    }
                }
                //// prefix
                //var seqGraphPrefix = SequenceGraph.CreateGraph(aminoAcidSet, AminoAcid.ProteinNTerm, protSequence,
                //    AminoAcid.ProteinCTerm);
                //if (seqGraphPrefix == null) continue;

                //{
                //    if (numNTermCleavage > 0) seqGraph.CleaveNTerm();
                //    var allCompositions = seqGraph.GetAllFragmentNodeCompositions();

                //    var scoreArr = new int[run.MaxLcScan + 1];
                //    foreach (var fragComp in allCompositions)
                //    {
                //        var suffixMass = fragComp.Mass + BaseIonType.Y.OffsetComposition.Mass;
                //        var binNum = comparer.GetBinNumber(suffixMass);
                //        if (binNum < minFragMassBin || binNum > maxFragMassBin) continue;

                //        var vector = massVectors[binNum - minFragMassBin];
                //        foreach (var ms2ScanNum in ms2ScanNumArr)
                //        {
                //            if (vector[ms2ScanNum]) ++scoreArr[ms2ScanNum];
                //        }
                //    }
                //    foreach (var ms2ScanNum in ms2ScanNumArr)
                //    {
                //        if (scoreArr[ms2ScanNum] > bestScore[ms2ScanNum])
                //        {
                //            bestScore[ms2ScanNum] = scoreArr[ms2ScanNum];
                //            var proteinName = fastaDb.GetProteinName(offset);
                //            bestProtein[ms2ScanNum] = proteinName + (numNTermCleavage == 1 ? "'" : "");
                //        }
                //    }
                //}
            }

            Console.WriteLine("ScanNum\tBestProtein\tScore");
            foreach (var ms2ScanNum in ms2ScanNumArr)
            {
                Console.WriteLine("{0}\t{1}\t{2}", ms2ScanNum, bestProtein[ms2ScanNum] ?? "", bestScore[ms2ScanNum]);
            }
            //sw.Stop();
            //Console.WriteLine(@"Scoring: {0:f4} sec.", sw.Elapsed.TotalSeconds);
        }
Ejemplo n.º 9
0
        public void TestFeatureIdMatching()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            const string resultFilePath = @"H:\Research\QCShew_TopDown\Production\M1_V092\QC_Shew_Intact_26Sep14_Bane_C2Column3_IcTda.tsv";

            if (!File.Exists(resultFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, resultFilePath);
            }

            var resultParser = new MsPathFinderParser(resultFilePath);
            const double qValueThreshold = 0.01;
            const double tolerancePpm = 13;

            const string dataSet = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3";
            var rawFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".raw");

            if (!File.Exists(rawFileName))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFileName);
            }

            var run = PbfLcMsRun.GetLcMsRun(rawFileName);

            var idList =
                resultParser.GetIdList().TakeWhile(id => id.QValue <= qValueThreshold).OrderBy(id => id.Mass).ToList();
            var idMassList = idList.Select(id => id.Mass).ToList();
            var idFlag = new bool[idList.Count];


            // Parse sequence tags
            var tagFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".seqtag");
            const int minTagLength = 6;
            const int numProtMatches = 4;
//            const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";
            const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.icsfldecoy.fasta";

            if (!File.Exists(tagFileName))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFileName);
            }

            if (!File.Exists(fastaFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
            }

            var fastaDb = new FastaDatabase(fastaFilePath);
            var searchableDb = new SearchableDatabase(fastaDb);

            var tagParser = new SequenceTagParser(tagFileName, minTagLength);

            var featureFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".ms1ft");
            var featureParser = new TsvFileParser(featureFileName);

            var minScan = featureParser.GetData("MinScan").Select(s => Convert.ToInt32(s)).ToArray();
            var maxScan = featureParser.GetData("MaxScan").Select(s => Convert.ToInt32(s)).ToArray();
            var minCharge = featureParser.GetData("MinCharge").Select(s => Convert.ToInt32(s)).ToArray();
            var maxCharge = featureParser.GetData("MaxCharge").Select(s => Convert.ToInt32(s)).ToArray();
            var monoMass = featureParser.GetData("MonoMass").Select(Convert.ToDouble).ToArray();

            var numFeaturesWithId = 0;
            var numFeaturesWithMs2 = 0;
            var numFeaturesWithTags = 0;
            var numFeaturesWithMatchingTags = 0;
            var numFeaturesWithTwoOrMoreMatchingTags = 0;
            var numFeaturesWithNoIdAndMatchingTags = 0;

            for (var i = 0; i < featureParser.NumData; i++)
            {
                var mass = monoMass[i];

                // Find Id
                var tolDa = new Tolerance(tolerancePpm).GetToleranceAsDa(mass, 1);
                var minMass = mass - tolDa;
                var maxMass = mass + tolDa;
                var index = idMassList.BinarySearch(mass);
                if (index < 0) index = ~index;

                var matchedId = new List<MsPathFinderId>();
                // go down
                var curIndex = index - 1;
                while (curIndex >= 0)
                {
                    var curId = idList[curIndex];
                    if (curId.Mass < minMass) break;
                    if (curId.Scan > minScan[i] && curId.Scan < maxScan[i]
                        && curId.Charge >= minCharge[i] && curId.Charge <= maxCharge[i])
                    {
                        matchedId.Add(curId);
                        idFlag[curIndex] = true;
                    }
                    --curIndex;
                }

                // go up
                curIndex = index;
                while (curIndex < idList.Count)
                {
                    var curId = idList[curIndex];
                    if (curId.Mass > maxMass) break;
                    if (curId.Scan >= minScan[i] && curId.Scan <= maxScan[i]
                        && curId.Charge >= minCharge[i] && curId.Charge <= maxCharge[i])
                    {
                        matchedId.Add(curId);
                        idFlag[curIndex] = true;
                    }
                    ++curIndex;
                }

                var hasId = false;
                if (matchedId.Any())
                {
                    ++numFeaturesWithId;
                    hasId = true;
                }

                // Find MS2 scans
//                var numMs2Scans = 0;
                var tags = new List<SequenceTag>();
                var hasMs2 = false;
                for (var scanNum = minScan[i]; scanNum <= maxScan[i]; scanNum++)
                {
                    var isolationWindow = run.GetIsolationWindow(scanNum);
                    if (isolationWindow == null) continue;
                    var isolationWindowTargetMz = isolationWindow.IsolationWindowTargetMz;
                    var charge = (int)Math.Round(mass / isolationWindowTargetMz);
                    if (charge < minCharge[i] || charge > maxCharge[i]) continue;
                    var mz = Ion.GetIsotopeMz(mass, charge,
                        Averagine.GetIsotopomerEnvelope(mass).MostAbundantIsotopeIndex);
                    if (isolationWindow.Contains(mz))
                    {
//                        ++numMs2Scans;
                        tags.AddRange(tagParser.GetSequenceTags(scanNum));
                        hasMs2 = true;
                    }
                }
                if (hasMs2) ++numFeaturesWithMs2;
                if (tags.Any()) ++numFeaturesWithTags;
                var protHist = new Dictionary<string, int>();
                var hasMatchedTag = false;
                foreach (var tag in tags)
                {
                    var matchedProteins = searchableDb.FindAllMatchedSequenceIndices(tag.Sequence).Select(idx => fastaDb.GetProteinName(idx)).ToArray();
                    if (matchedProteins.Any())
                    {
                        hasMatchedTag = true;
                        foreach (var protein in matchedProteins)
                        {
                            int num;
                            if (protHist.TryGetValue(protein, out num)) protHist[protein] = num + 1;
                            else protHist[protein] = 1;
                        }
                    }
                }
                if (hasMatchedTag)
                {
                    ++numFeaturesWithMatchingTags;
                    if (!hasId) ++numFeaturesWithNoIdAndMatchingTags;
                }
                if (protHist.Any())
                {
                    var maxOcc = protHist.Values.Max();
                    if (maxOcc >= numProtMatches) ++numFeaturesWithTwoOrMoreMatchingTags;
                }
            }

            Console.WriteLine("NumFeatures: {0}", featureParser.NumData);
            Console.WriteLine("NumId: {0}", idList.Count);
            Console.WriteLine("NumFeaturesWithId: {0} ({1})", numFeaturesWithId, numFeaturesWithId / (float)featureParser.NumData);
            Console.WriteLine("NumFeaturesWithMs2: {0} ({1})", numFeaturesWithMs2, numFeaturesWithMs2 / (float)featureParser.NumData);
            Console.WriteLine("NumFeaturesWithTag: {0} ({1})", numFeaturesWithTags, numFeaturesWithTags / (float)featureParser.NumData);
            Console.WriteLine("NumFeaturesWithMatchedTag: {0} ({1})", numFeaturesWithMatchingTags, numFeaturesWithMatchingTags / (float)featureParser.NumData);
            Console.WriteLine("NumFeaturesWithMoreThanOneMatchedTag: {0} ({1})", numFeaturesWithTwoOrMoreMatchingTags, numFeaturesWithTwoOrMoreMatchingTags / (float)featureParser.NumData);
            Console.WriteLine("NumFeaturesWithNoIdAndMatchedTag: {0} ({1})", numFeaturesWithNoIdAndMatchingTags, numFeaturesWithNoIdAndMatchingTags / (float)featureParser.NumData);

            for (var i = 0; i < idFlag.Length; i++)
            {
                if (!idFlag[i])
                {
                    Console.WriteLine(idList[i].Scan);
                }
            }


//            Console.WriteLine(string.Join(",", filter.GetMatchingMs2ScanNums(8115.973001)));
//
//            Console.WriteLine(featureFileName);
        }
Ejemplo n.º 10
0
        public void TestFeatureId()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            const string dataSet = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3";

            if (!File.Exists(dataSet))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, dataSet);
            }

            // Feature: 5236-5286	6-12	8480.3681	5
            const int minScanNum = 5236;
            const int maxScanNum = 5286;
            const double featureMass = 8480.3681;

            //const int minScanNum = 7251;
            //const int maxScanNum = 7326;
            //const double featureMass = 32347.18;

//            const int minScanNum = 4451;
//            const int maxScanNum = 4541;
//            const double featureMass = 31267.95;

            var tolerance = new Tolerance(10);
            var relaxedTolerance = new Tolerance(20);

            const int minTagLength = 5;
            const int minMergedTagLength = 7;
            const int minNumTagMatches = 1;

            var rawFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".raw");
            var run = PbfLcMsRun.GetLcMsRun(rawFileName);

            var aminoAcidSet = AminoAcidSet.GetStandardAminoAcidSet();
            var featureFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".ms1ft");
            var filter = new Ms1FtFilter(run, tolerance, featureFileName);
            var ms2ScanNums =
                filter.GetMatchingMs2ScanNums(featureMass)
                    .Where(scanNum => scanNum > minScanNum && scanNum < maxScanNum)
                    .ToArray();

            const string tagFileName = dataSet + ".seqtag"; //"_MinLength3.seqtag"; //Path.ChangeExtension(dataSet, ".seqtag");
            const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";

            if (!File.Exists(fastaFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
            }

            var fastaDb = new FastaDatabase(fastaFilePath);
            var searchableDb = new SearchableDatabase(fastaDb);
            var tagParser = new SequenceTagParser(tagFileName, minTagLength);

            var proteinsToTags = new Dictionary<string, IList<MatchedTag>>();
            foreach (var ms2ScanNum in ms2ScanNums)
            {
                var tags = tagParser.GetSequenceTags(ms2ScanNum);
                foreach (var tag in tags)
                {
                    var matchedIndices = searchableDb.FindAllMatchedSequenceIndices(tag.Sequence).ToArray();
                    foreach (var index in matchedIndices)
                    {
                        var protein = fastaDb.GetProteinName(index);
                        var startIndex = fastaDb.GetZeroBasedPositionInProtein(index);
                        var matchedTag = new MatchedTag(tag, startIndex, featureMass);
                        IList<MatchedTag> existingTags;
                        if (proteinsToTags.TryGetValue(protein, out existingTags))
                        {
                            existingTags.Add(matchedTag);
                        }
                        else
                        {
                            proteinsToTags.Add(protein, new List<MatchedTag> { matchedTag });
                        }
                    }
                }
            }

            foreach (var entry in proteinsToTags.OrderByDescending(e => e.Value.Count))
            {
                if (entry.Value.Count < minNumTagMatches) break;
                var proteinName = entry.Key;
                var proteinSequence = fastaDb.GetProteinSequence(proteinName);
                var protein = new Sequence(proteinSequence, aminoAcidSet);
                Console.WriteLine(proteinName + "\t" + entry.Value.Count);

                var matchedTagSet = new MatchedTagSet(proteinSequence, aminoAcidSet,
                    tolerance, relaxedTolerance);

                Console.WriteLine("********** Before merging");
                foreach (var matchedTag in entry.Value)
                {
                    var seq = proteinSequence.Substring(matchedTag.StartIndex,
                        matchedTag.EndIndex - matchedTag.StartIndex);
                    var nTermMass = protein.GetMass(0, matchedTag.StartIndex);
                    var cTermMass = protein.GetMass(matchedTag.EndIndex, protein.Count);
                    Console.WriteLine("\t{0}\t{1}\t{2}\t{3}\t{4}\t{5}",
                        (matchedTag.NTermFlankingMass - nTermMass), seq, (matchedTag.CTermFlankingMass - cTermMass), matchedTag.StartIndex,
                        matchedTag.IsNTermFlankingMassReliable, matchedTag.IsCTermFlankingMassReliable);

                    matchedTagSet.Add(matchedTag);
                }

                Console.WriteLine("********** After merging");
                foreach (var matchedTag in matchedTagSet.Tags)
                {
                    if (matchedTag.Length < minMergedTagLength) continue;
                    var seq = proteinSequence.Substring(matchedTag.StartIndex,
                        matchedTag.EndIndex - matchedTag.StartIndex);
                    var nTermMass = protein.GetMass(0, matchedTag.StartIndex);
                    var cTermMass = protein.GetMass(matchedTag.EndIndex, protein.Count);
                    Console.WriteLine("\t{0}\t{1}\t{2}\t{3}\t{4}\t{5}",
                        (matchedTag.NTermFlankingMass-nTermMass), seq, (matchedTag.CTermFlankingMass - cTermMass), matchedTag.StartIndex,
                        matchedTag.IsNTermFlankingMassReliable, matchedTag.IsCTermFlankingMassReliable);
                }

                break;
            }
        }
Ejemplo n.º 11
0
        public void TestTagMatchingSingleSpec()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            const string dataSet = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3";
            const int scanNum = 4533;

            // Parse sequence tags
            var tagFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".seqtag");
            const int minTagLength = 8;
            const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";

            if (!File.Exists(tagFileName))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFileName);
            }

            if (!File.Exists(fastaFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
            }

            var fastaDb = new FastaDatabase(fastaFilePath);
            var searchableDb = new SearchableDatabase(fastaDb);
            var tagParser = new SequenceTagParser(tagFileName, minTagLength);

            var tags = tagParser.GetSequenceTags(scanNum);
            foreach (var tag in tags)
            {
                var matchedProteins = searchableDb.FindAllMatchedSequenceIndices(tag.Sequence)
                    .Select(index => fastaDb.GetProteinName(index)).ToArray();
                if (matchedProteins.Any())
                {
                    Console.WriteLine("{0}\t{1}\t{2}\t{3}", tag.Sequence, tag.IsPrefix, tag.FlankingMass, string.Join("\t", matchedProteins));
                }
            }
        }
Ejemplo n.º 12
0
        public void TestGetProteinsWithTagMatchingSingleSpec()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            const string dataSet = @"H:\Research\Lewy\raw\Lewy_intact_07";
            //            const int scanNum = 5158;
            const int minTagLength = 7;
            const int minNumTagMatches = 1;
            var aminoAcidSet = AminoAcidSet.GetStandardAminoAcidSet();

            const int scanNum = 2;
            // Parse sequence tags
            //const string tagFileName = dataSet + ".seqtag"; //"_MinLength3.seqtag"; //Path.ChangeExtension(dataSet, ".seqtag");

            const string rawFilePath = "";

            const string fastaFilePath = @"H:\Research\Lewy\ID_004858_0EE8CF61.fasta";

            if (!File.Exists(rawFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath);
            }

            if (!File.Exists(fastaFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
            }

            var fastaDb = new FastaDatabase(fastaFilePath);
            var searchableDb = new SearchableDatabase(fastaDb);
            //var tagParser = new SequenceTagParser(tagFileName, minTagLength);
            //var tags = tagParser.GetSequenceTags(scanNum);
            var run = PbfLcMsRun.GetLcMsRun(rawFilePath);
            var spec = run.GetSpectrum(scanNum) as ProductSpectrum;
            var tagFinder = new SequenceTagFinder(spec, new Tolerance(5));
            var tags = tagFinder.GetAllSequenceTagString();

            var proteinsToTags = new Dictionary<string, IList<MatchedTag>>();

            foreach (var tag in tags)
            {
                var matchedIndices = searchableDb.FindAllMatchedSequenceIndices(tag.Sequence).ToArray();
                foreach (var index in matchedIndices)
                {
                    var protein = fastaDb.GetProteinName(index);
                    var startIndex = fastaDb.GetOneBasedPositionInProtein(index);
                    var matchedTag = new MatchedTag(tag, startIndex, 0.0);
                    IList<MatchedTag> existingTags;
                    if (proteinsToTags.TryGetValue(protein, out existingTags))
                    {
                        existingTags.Add(matchedTag);
                    }
                    else
                    {
                        proteinsToTags.Add(protein, new List<MatchedTag> { matchedTag });
                    }
                }
            }

            foreach (var entry in proteinsToTags.OrderByDescending(e => e.Value.Count))
            {
                if (entry.Value.Count < minNumTagMatches) break;
                var proteinName = entry.Key;
                var proteinSequence = fastaDb.GetProteinSequence(proteinName);
                var protein = new Sequence(proteinSequence, aminoAcidSet);
                Console.WriteLine(proteinName + "\t" + entry.Value.Count);
                foreach (var matchedTag in entry.Value)
                {
                    var seq = proteinSequence.Substring(matchedTag.StartIndex,
                        matchedTag.EndIndex - matchedTag.StartIndex);
                    var nTermMass = protein.GetMass(0, matchedTag.StartIndex);
                    var cTermMass = protein.GetMass(matchedTag.EndIndex, protein.Count);
                    Console.WriteLine("\t{0} ({1})\t{2}\t{3} ({4})\t{5}\t{6}\t{7}",
                        matchedTag.NTermFlankingMass, (matchedTag.NTermFlankingMass - nTermMass), 
                        seq, 
                        matchedTag.CTermFlankingMass, (matchedTag.CTermFlankingMass - cTermMass), 
                        matchedTag.StartIndex,
                        matchedTag.IsNTermFlankingMassReliable, matchedTag.IsCTermFlankingMassReliable);

                }
            }
        }
Ejemplo n.º 13
0
        public void CreateTargetList()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            const string databaseFilePath = @"D:\Research\Data\IPRG2014\database\SpikedInPeptides.fasta";
            if (!File.Exists(databaseFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, databaseFilePath);
            }

            var database = new FastaDatabase(databaseFilePath);
            database.Read();
            var indexedDatabase = new IndexedDatabase(database);
            var numTargets = 0;

            var aaSet = new AminoAcidSet(Modification.Carbamidomethylation);

            Console.WriteLine("Peptide\tFormula\tProtein");
            foreach (var annotationAndOffset in indexedDatabase.AnnotationsAndOffsets(6, 30, 1, 1, Enzyme.Trypsin))
            {
                var annotation = annotationAndOffset.Annotation;
                var peptide = annotation.Substring(2, annotation.Length - 4);
                var offset = annotationAndOffset.Offset;

                Console.WriteLine("{0}\t{1}\t{2}", peptide, (aaSet.GetComposition(peptide) + Composition.H2O).ToPlainString(), database.GetProteinName(offset));
                numTargets++;
            }
            Console.WriteLine("NumTargets: {0}", numTargets);
        }