Ejemplo n.º 1
0
        /// <summary>
        /// Reads sequence tags from a tab-separated tag file, looks each sufficiently long tag up in a FASTA database,
        /// and prints the total protein count, the number of proteins hit by at least one tag, and the number of
        /// (tag, protein) hits divided by the total protein count.
        /// </summary>
        /// <remarks>
        /// The test is ignored (via Assert.Ignore) when either hard-coded input file does not exist.
        /// </remarks>
        public void CountMatchedScansPerProtein()
        {
            const int    minTagLength  = 6;
            const string fastaFilePath = @"D:\Research\Data\CommonContaminants\H_sapiens_Uniprot_SPROT_2013-05-01_withContam.fasta";

            // Alternate tag files used previously:
            //const string tagFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3_seqtag.tsv";
            //const string tagFilePath = @"\\protoapps\UserData\Jungkap\Co_culture\23B_pellet_TD_3Feb14_Bane_PL011402.seqtag";
            const string tagFilePath = @"D:\MassSpecFiles\co_culture\23A_pellet_TD_3Feb14_Bane_PL011402.seqtag";

            var methodName = MethodBase.GetCurrentMethod().Name;
            Utils.ShowStarting(methodName);

            if (!File.Exists(fastaFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
            }

            // The database is loaded (and its length reported) before the tag file is checked
            var fastaDb      = new FastaDatabase(fastaFilePath);
            var searchableDb = new SearchableDatabase(fastaDb);
            Console.WriteLine(@"Sequence length: {0}", fastaDb.GetSequence().Length);

            if (!File.Exists(tagFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFilePath);
            }

            // Protein name -> distinct scan numbers that had a tag matching the protein
            var scansByProtein = new Dictionary<string, HashSet<int>>();
            var hitCount       = 0;

            // The first line of the file is always treated as a header and skipped
            foreach (var row in File.ReadAllLines(tagFilePath).Skip(1))
            {
                var columns = row.Split('\t');
                if (columns.Length != 3)
                {
                    continue;
                }

                // The scan number is parsed before the tag length is checked
                var scanNumber  = Convert.ToInt32(columns[0]);
                var sequenceTag = columns[1];
                if (sequenceTag.Length < minTagLength)
                {
                    continue;
                }

                foreach (var sequenceIndex in searchableDb.FindAllMatchedSequenceIndices(sequenceTag))
                {
                    var proteinName = fastaDb.GetProteinName(sequenceIndex);
                    ++hitCount;

                    HashSet<int> scansForProtein;
                    if (!scansByProtein.TryGetValue(proteinName, out scansForProtein))
                    {
                        scansForProtein = new HashSet<int>();
                        scansByProtein.Add(proteinName, scansForProtein);
                    }

                    scansForProtein.Add(scanNumber);
                }
            }

            var totalProteins   = fastaDb.GetNumEntries();
            var matchedProteins = scansByProtein.Count;

            Console.WriteLine("NumAllProteins: {0}", totalProteins);
            Console.WriteLine("NumMatchedProteins: {0}", matchedProteins);
            Console.WriteLine("AvgMatchedScansPerProtein: {0}", hitCount / (float)totalProteins);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Opens a FASTA database at a hard-coded network path and prints its number of entries to the console.
        /// </summary>
        public void TestFasta()
        {
            const string fastaFilePath = @"\\protoapps\UserData\Jungkap\Lewy\db\ID_005140_7A170668.fasta";

            var fastaDb    = new FastaDatabase(fastaFilePath);
            var numEntries = fastaDb.GetNumEntries();

            Console.WriteLine(numEntries);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Writes the given search results to an mzIdentML file at <paramref name="outputFilePath"/>.
        /// </summary>
        /// <remarks>
        /// Uses <c>database</c>, <c>options</c> and <c>lcmsRun</c>, which are declared outside this method.
        /// One spectrum identification (with a peptide, a peptide evidence and score CV params) is added per match.
        /// </remarks>
        /// <param name="matches">Search results to write</param>
        /// <param name="outputFilePath">Output file path; its file name without extension is used as the dataset name</param>
        /// <exception cref="System.Exception">
        /// Two search modifications share a name but have different compositions
        /// </exception>
        /// <exception cref="KeyNotFoundException">
        /// A match refers to a modification name that is not among the search modifications
        /// </exception>
        public void WriteResultsToMzid(IEnumerable <DatabaseSearchResultData> matches, string outputFilePath)
        {
            var datasetName = Path.GetFileNameWithoutExtension(outputFilePath);
            var creator     = new IdentDataCreator("MSPathFinder_" + datasetName, "MSPathFinder_" + datasetName);
            var soft        = creator.AddAnalysisSoftware("Software_1", "MSPathFinder", System.Reflection.Assembly.GetCallingAssembly().GetName().Version.ToString(), CV.CVID.MS_MSPathFinder, "MSPathFinder");
            var settings    = creator.AddAnalysisSettings(soft, "Settings_1", CV.CVID.MS_ms_ms_search);
            var searchDb    = creator.AddSearchDatabase(database.GetFastaFilePath(), database.GetNumEntries(), Path.GetFileNameWithoutExtension(database.GetFastaFilePath()), CV.CVID.CVID_Unknown,
                                                        CV.CVID.MS_FASTA_format);

            if (options.TargetDecoySearchMode.HasFlag(DatabaseSearchMode.Decoy))
            {
                // Describe the decoy portion of the database; decoy accessions are those starting with "XXX"
                searchDb.CVParams.AddRange(new CVParamObj[]
                {
                    new CVParamObj()
                    {
                        Cvid = CV.CVID.MS_DB_composition_target_decoy,
                    },
                    new CVParamObj()
                    {
                        Cvid = CV.CVID.MS_decoy_DB_accession_regexp, Value = "^XXX",
                    },
                    //new CVParamObj() { Cvid = CV.CVID.MS_decoy_DB_type_reverse, },
                    new CVParamObj()
                    {
                        Cvid = CV.CVID.MS_decoy_DB_type_randomized,
                    },
                });
            }

            // store the settings...
            CreateMzidSettings(settings);

            // Prefer the raw file path recorded in the PBF run (when available) over the spec file path from the options
            var path = options.SpecFilePath;
            var run  = lcmsRun as PbfLcMsRun;

            if (run != null)
            {
                var rawPath = run.RawFilePath;
                if (!string.IsNullOrWhiteSpace(rawPath))
                {
                    path = rawPath;
                }
            }
            // TODO: fix this to match correctly to the original file - May need to modify the PBF format to add an input format specifier
            // TODO: Should probably? request a CV Term for the PBF format?
            var nativeIdFormat = lcmsRun.NativeIdFormat;

            if (nativeIdFormat == CV.CVID.CVID_Unknown)
            {
                nativeIdFormat = CV.CVID.MS_scan_number_only_nativeID_format;
            }
            var specData = creator.AddSpectraData(path, datasetName, nativeIdFormat, lcmsRun.NativeFormat);

            // Get the search modifications as they were passed into the AminoAcidSet constructor, so we can retrieve masses from them
            var modDict = new Dictionary<string, Modification>();

            foreach (var mod in options.AminoAcidSet.SearchModifications)
            {
                var modification = mod.Modification;

                Modification existing;
                if (!modDict.TryGetValue(modification.Name, out existing))
                {
                    modDict.Add(modification.Name, modification);
                }
                else if (!existing.Composition.Equals(modification.Composition))
                {
                    throw new System.Exception(
                              "ERROR: Cannot have modifications with the same name and different composition/mass! Fix input modifications! Duplicated modification name: " +
                              modification.Name);
                }
            }

            foreach (var match in matches)
            {
                var scanNum  = match.ScanNum;
                var spec     = lcmsRun.GetSpectrum(scanNum, false);
                var matchIon = new Ion(Composition.Parse(match.Composition), match.Charge);

                // Fall back to a "scan=N" identifier when the spectrum has no native ID
                var nativeId = spec.NativeId;
                if (string.IsNullOrWhiteSpace(spec.NativeId))
                {
                    nativeId = "scan=" + spec.ScanNum;
                }
                var specIdent = creator.AddSpectrumIdentification(specData, nativeId, spec.ElutionTime, match.MostAbundantIsotopeMz,
                                                                  match.Charge, 1, double.NaN);
                specIdent.CalculatedMassToCharge = matchIon.GetMonoIsotopicMz();
                var pep = new PeptideObj(match.Sequence);

                // The modification text is a comma-separated list of "<name> <position>" entries
                var modText = match.Modifications;
                if (!string.IsNullOrWhiteSpace(modText))
                {
                    var mods = modText.Split(',');
                    foreach (var mod in mods)
                    {
                        var tokens = mod.Split(' ');

                        Modification modInfo;
                        if (!modDict.TryGetValue(tokens[0], out modInfo))
                        {
                            // Same exception type the dictionary indexer would throw, but with enough context to diagnose the input
                            throw new KeyNotFoundException(
                                      "Modification '" + tokens[0] + "' (scan " + scanNum + ") is not one of the search modifications");
                        }

                        // The position is machine-written text, so parse it culture-independently (as the scores below are written)
                        var modPosition = int.Parse(tokens[1], CultureInfo.InvariantCulture);
                        var modObj      = new ModificationObj(CV.CVID.MS_unknown_modification, modInfo.Name, modPosition, modInfo.Mass);
                        pep.Modifications.Add(modObj);
                    }
                }
                specIdent.Peptide = pep;

                var proteinName        = match.ProteinName;
                var protLength         = match.ProteinLength;
                var proteinDescription = match.ProteinDescription;
                var dbSeq = new DbSequenceObj(searchDb, protLength, proteinName, proteinDescription);

                var start = match.Start;
                var end   = match.End;

                // Ordinal comparison: the decoy prefix is an identifier (see the "^XXX" accession regexp above), not linguistic text
                var isDecoy = match.ProteinName.StartsWith("XXX", System.StringComparison.Ordinal);
                var pepEv   = new PeptideEvidenceObj(dbSeq, pep, start, end, match.Pre, match.Post, isDecoy);
                specIdent.AddPeptideEvidence(pepEv);

                var probability = match.Probability;

                specIdent.CVParams.Add(new CVParamObj()
                {
                    Cvid = CV.CVID.MS_chemical_compound_formula, Value = match.Composition,
                });
                //specIdent.CVParams.Add(new CVParamObj() { Cvid = CV.CVID.MS_number_of_matched_peaks, Value = match.NumMatchedFragments.ToString(), });
                specIdent.CVParams.Add(new CVParamObj()
                {
                    Cvid = CV.CVID.MS_MSPathFinder_RawScore, Value = probability.ToString(CultureInfo.InvariantCulture),
                });
                specIdent.CVParams.Add(new CVParamObj()
                {
                    Cvid = CV.CVID.MS_MSPathFinder_SpecEValue, Value = match.SpecEValue.ToString(CultureInfo.InvariantCulture),
                });
                specIdent.CVParams.Add(new CVParamObj()
                {
                    Cvid = CV.CVID.MS_MSPathFinder_EValue, Value = match.EValue.ToString(CultureInfo.InvariantCulture),
                });
                if (match.HasTdaScores)
                {
                    // Q-values are only written when the match carries target-decoy scores
                    specIdent.CVParams.Add(new CVParamObj()
                    {
                        Cvid = CV.CVID.MS_MSPathFinder_QValue, Value = match.QValue.ToString(CultureInfo.InvariantCulture),
                    });
                    specIdent.CVParams.Add(new CVParamObj()
                    {
                        Cvid = CV.CVID.MS_MSPathFinder_PepQValue, Value = match.PepQValue.ToString(CultureInfo.InvariantCulture),
                    });
                }
                // MS-GF+ similarity: find/add isotope error?
                // MS-GF+ similarity: find/add assumed dissociation method?
                //specIdent.UserParams.Add(new UserParamObj() {Name = "Assumed Dissociation Method", Value = match.});
            }

            var identData = creator.GetIdentData();

            MzIdentMlReaderWriter.Write(new MzIdentMLType(identData), outputFilePath);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Writes a tab-separated results file: a header row followed by one row for each scan number
        /// in <c>_ms2ScanNums</c> that has a non-null entry in <paramref name="matches"/>.
        /// </summary>
        /// <param name="matches">Matches, indexed by scan number; null entries are skipped</param>
        /// <param name="outputFilePath">Path of the file to create</param>
        /// <param name="database">Database used to look up protein name, description, length and position</param>
        private void WriteResultsToFile(DatabaseSequenceSpectrumMatch[] matches, string outputFilePath, FastaDatabase database)
        {
            const string headerLine = "Scan\tPre\tSequence\tPost\tModifications\tComposition\tProteinName\tProteinDesc" +
                                      "\tProteinLength\tStart\tEnd\tCharge\tMostAbundantIsotopeMz\tMass\t#MatchedFragments\tProbability\tSpecEValue\tEValue";

            const string rowFormat = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14}\t{15}\t{16}\t{17}";

            using (var writer = new StreamWriter(outputFilePath))
            {
                writer.WriteLine(headerLine);

                foreach (var scanNum in _ms2ScanNums)
                {
                    var psm = matches[scanNum];
                    if (psm == null)
                    {
                        continue;
                    }

                    var peptide     = psm.Sequence;
                    var dbOffset    = psm.Offset;
                    var firstResidue = database.GetOneBasedPositionInProtein(dbOffset) + 1 + psm.NumNTermCleavages;
                    var lastResidue  = firstResidue + peptide.Length - 1;
                    var protein      = database.GetProteinName(dbOffset);
                    var proteinLen   = database.GetProteinLength(protein);
                    var precursor    = psm.Ion;
                    var description  = database.GetProteinDescription(dbOffset);
                    var probability  = CompositeScorer.GetProbability(psm.Score);

                    // Note for DblToString(value, 9, true), by having "9" and "true",
                    // values between 100 and 999 Da will have 7 digits after the decimal place, and
                    // values between 1000 and 9999 will have 6 digits after the decimal place
                    var mostAbundantMzText = StringUtilities.DblToString(precursor.GetMostAbundantIsotopeMz(), 9, true);
                    var massText           = StringUtilities.DblToString(precursor.Composition.Mass, 9, true);
                    var probabilityText    = StringUtilities.DblToString(probability, 4);

                    // Both E-value columns are passed a threshold of 0.001 (per the original note, values below it use scientific notation).
                    // The EValue column is the spectral E-value multiplied by the number of database entries.
                    var specEValueText = StringUtilities.DblToString(ExcelMinValue(psm.SpecEvalue), 6, true, 0.001);
                    var eValueText     = StringUtilities.DblToString(ExcelMinValue(psm.SpecEvalue * database.GetNumEntries()), 6, true, 0.001);

                    writer.WriteLine(rowFormat,
                                     scanNum,                 // Scan
                                     psm.Pre,                 // Pre
                                     peptide,                 // Sequence
                                     psm.Post,                // Post
                                     psm.ModificationText,    // Modifications
                                     precursor.Composition,   // Composition
                                     protein,                 // ProteinName
                                     description,             // ProteinDesc
                                     proteinLen,              // ProteinLength
                                     firstResidue,            // Start position in protein
                                     lastResidue,             // End position in protein
                                     precursor.Charge,        // Charge
                                     mostAbundantMzText,      // MostAbundantIsotopeMz
                                     massText,                // Mass
                                     psm.NumMatchedFragments, // #MatchedFragments
                                     probabilityText,         // Probability
                                     specEValueText,          // SpecEValue
                                     eValueText               // EValue
                                     );
                }
            }
        }