Example #1
0
        /// <summary>
        /// Writes a tab-separated results file with one header line followed by one row per match.
        /// Scans are visited from _run.MinLcScan through _run.MaxLcScan (inclusive); scans whose
        /// entry in <paramref name="matches"/> is null are skipped. Within a scan, matches are
        /// written in the reverse of the set's sort order.
        /// </summary>
        /// <param name="matches">Array indexed by scan number; each entry is the set of matches for that scan, or null.</param>
        /// <param name="outputFilePath">Path of the file to create (or overwrite).</param>
        /// <param name="database">Database used to look up protein name, description, length and position by offset.</param>
        private void WriteResultsToFile(SortedSet <DatabaseSequenceSpectrumMatch>[] matches, string outputFilePath, FastaDatabase database)
        {
            using (var writer = new StreamWriter(outputFilePath))
            {
                writer.WriteLine("Scan\tPre\tSequence\tPost\tModifications\tComposition\tProteinName\tProteinDesc" +
                                 "\tProteinLength\tStart\tEnd\tCharge\tMostAbundantIsotopeMz\tMass\t#MatchedFragments\tIcScore"
                                 );
                for (var scanNum = _run.MinLcScan; scanNum <= _run.MaxLcScan; scanNum++)
                {
                    var scanMatches = matches[scanNum];
                    if (scanMatches == null)
                    {
                        continue;
                    }

                    foreach (var match in scanMatches.Reverse())
                    {
                        // The ion must be validated before it is used for re-scoring; a match
                        // without an ion cannot be scored or reported, so it is skipped.
                        var ion = match.Ion;
                        if (ion == null)
                        {
                            Console.WriteLine(@"Null ion!");
                            continue;
                        }

                        var scores = _bottomUpScorer.GetScores(match, ion.Composition, ion.Charge, scanNum);
                        if (scores == null)
                        {
                            // Without scores there is nothing to put in the Modifications / IcScore columns.
                            Console.WriteLine(@"Null scores");
                            continue;
                        }

                        var sequence    = match.Sequence;
                        var offset      = match.Offset;
                        var start       = database.GetOneBasedPositionInProtein(offset) + 1 + match.NumNTermCleavages;
                        var end         = start + sequence.Length - 1;
                        var proteinName = database.GetProteinName(offset);
                        var protLength  = database.GetProteinLength(proteinName);

                        // Note for DblToString(value, 9, true), by having "9" and "true",
                        // values between 100 and 999 Da will have 7 digits after the decimal place, and
                        // values between 1000 and 9999 will have 6 digits after the decimal place

                        writer.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14}\t{15}",
                                         scanNum,                                                              // Scan
                                         match.Pre,                                                            // Pre
                                         sequence,                                                             // Sequence
                                         match.Post,                                                           // Post
                                         scores.Modifications,                                                 // Modifications
                                         ion.Composition,                                                      // Composition
                                         proteinName,                                                          // ProteinName
                                         database.GetProteinDescription(offset),                               // ProteinDesc
                                         protLength,                                                           // ProteinLength
                                         start,                                                                // Start
                                         end,                                                                  // End
                                         ion.Charge,                                                           // Charge (precursor charge)
                                         StringUtilities.DblToString(ion.GetMostAbundantIsotopeMz(), 9, true), // MostAbundantIsotopeMz
                                         StringUtilities.DblToString(ion.Composition.Mass, 9, true),           // Mass
                                         match.Score,                                                          // written under the "#MatchedFragments" header
                                         scores.Score                                                          // IcScore (re-scored)
                                         );
                    }
                }
            }
        }
Example #2
0
        // Reads a small FASTA file, enumerates peptide annotations from its indexed form, and prints
        // for each one: the annotation, its offset, the protein name, the protein length, and the
        // value of GetOneBasedPositionInProtein(offset) + 1.
        // The test is ignored when the FASTA file is not reachable.
        public void TestGettingProteinLengthAndPosition()
        {
            var testName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(testName);

            const string fastaPath = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\Short.fasta";
            if (!File.Exists(fastaPath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", testName, fastaPath);
            }

            var fastaDb = new FastaDatabase(fastaPath);
            fastaDb.Read();

            // Arguments are passed through unchanged: (6, 20, 2, 0, Enzyme.Trypsin).
            var peptideEntries = new IndexedDatabase(fastaDb).AnnotationsAndOffsets(6, 20, 2, 0, Enzyme.Trypsin);

            foreach (var entry in peptideEntries)
            {
                var peptideOffset = entry.Offset;
                var proteinName   = fastaDb.GetProteinName(peptideOffset);
                var proteinLength = fastaDb.GetProteinLength(proteinName);
                var position      = fastaDb.GetOneBasedPositionInProtein(peptideOffset) + 1;

                Console.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}", entry.Annotation, peptideOffset, proteinName, proteinLength, position);
            }
        }
Example #3
0
        // Loads the Short.fasta test file (located through Utils.GetTestFile), enumerates peptide
        // annotations from its indexed form, and prints for each one: the annotation, its offset,
        // the protein name, the protein length, and GetOneBasedPositionInProtein(offset) + 1.
        public void TestGettingProteinLengthAndPosition()
        {
            var testName = MethodBase.GetCurrentMethod().Name;
            Utils.ShowStarting(testName);

            var relativeFastaPath = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"MSPathFinderT\Short.fasta");
            var fastaInfo         = Utils.GetTestFile(testName, relativeFastaPath);

            var fastaDb = new FastaDatabase(fastaInfo.FullName);
            fastaDb.Read();

            // Arguments are passed through unchanged: (6, 20, 2, 0, Enzyme.Trypsin).
            var peptideEntries = new IndexedDatabase(fastaDb).AnnotationsAndOffsets(6, 20, 2, 0, Enzyme.Trypsin);

            foreach (var entry in peptideEntries)
            {
                var peptideOffset = entry.Offset;
                var proteinName   = fastaDb.GetProteinName(peptideOffset);
                var proteinLength = fastaDb.GetProteinLength(proteinName);
                var position      = fastaDb.GetOneBasedPositionInProtein(peptideOffset) + 1;

                Console.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}", entry.Annotation, peptideOffset, proteinName, proteinLength, position);
            }
        }
Example #4
0
        /// <summary>
        /// Writes a tab-separated results file: one header line, then one row for every scan number
        /// in _ms2ScanNums whose entry in <paramref name="matches"/> is not null.
        /// </summary>
        /// <param name="matches">Array indexed by scan number; an entry is the match for that scan, or null.</param>
        /// <param name="outputFilePath">Path of the file to create (or overwrite).</param>
        /// <param name="database">Database used to look up protein name, description, length, position and entry count.</param>
        private void WriteResultsToFile(DatabaseSequenceSpectrumMatch[] matches, string outputFilePath, FastaDatabase database)
        {
            using (var writer = new StreamWriter(outputFilePath))
            {
                writer.WriteLine("Scan\tPre\tSequence\tPost\tModifications\tComposition\tProteinName\tProteinDesc" +
                                 "\tProteinLength\tStart\tEnd\tCharge\tMostAbundantIsotopeMz\tMass\t#MatchedFragments\tProbability\tSpecEValue\tEValue");

                // The entry count does not depend on the match, so look it up once rather than per row.
                var numEntries = database.GetNumEntries();

                foreach (var scanNum in _ms2ScanNums)
                {
                    var match = matches[scanNum];
                    if (match == null)
                    {
                        continue;
                    }

                    var sequence           = match.Sequence;
                    var offset             = match.Offset;
                    var start              = database.GetOneBasedPositionInProtein(offset) + 1 + match.NumNTermCleavages;
                    var end                = start + sequence.Length - 1;
                    var proteinName        = database.GetProteinName(offset);
                    var protLength         = database.GetProteinLength(proteinName);
                    var ion                = match.Ion;
                    var proteinDescription = database.GetProteinDescription(offset);
                    var probability        = CompositeScorer.GetProbability(match.Score);

                    // Note for DblToString(value, 9, true), by having "9" and "true",
                    // values between 100 and 999 Da will have 7 digits after the decimal place, and
                    // values between 1000 and 9999 will have 6 digits after the decimal place
                    writer.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14}\t{15}\t{16}\t{17}",
                                     scanNum,                                                                              // Scan
                                     match.Pre,                                                                            // Pre
                                     sequence,                                                                             // Sequence
                                     match.Post,                                                                           // Post
                                     match.ModificationText,                                                               // Modifications
                                     ion.Composition,                                                                      // Composition
                                     proteinName,                                                                          // ProteinName
                                     proteinDescription,                                                                   // ProteinDesc
                                     protLength,                                                                           // ProteinLength
                                     start,                                                                                // Start position in protein
                                     end,                                                                                  // End position in protein
                                     ion.Charge,                                                                           // Charge (precursor charge)
                                     StringUtilities.DblToString(ion.GetMostAbundantIsotopeMz(), 9, true),                 // MostAbundantIsotopeMz
                                     StringUtilities.DblToString(ion.Composition.Mass, 9, true),                           // Mass
                                     match.NumMatchedFragments,                                                            // #MatchedFragments
                                     StringUtilities.DblToString(probability, 4),                                          // Probability
                                     StringUtilities.DblToString(ExcelMinValue(match.SpecEvalue), 6, true, 0.001),         // SpecEValue; will be displayed using scientific notation if the value is less than 0.001
                                     StringUtilities.DblToString(ExcelMinValue(match.SpecEvalue * numEntries), 6, true, 0.001) // EValue (SpecEValue times the number of database entries); will be displayed using scientific notation if the value is less than 0.001
                                     );
                }
            }
        }
Example #5
0
        // Copies AMT_Peptides_NA.tsv to AMT_Peptides.tsv, inserting a "Length" column after the
        // protein column. Lines are split on whitespace and only lines with exactly 14 fields are
        // kept. The line whose first field is "Peptide" is treated as the header; for every other
        // kept line the length is looked up in the FASTA database by the second field.
        // The test is ignored when either input file is missing.
        public void AddProteinLengths()
        {
            var testName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(testName);

            const string databaseFilePath = @"H:\Research\IPRG2015\database\yeast6proteaprotein.fasta";
            if (!File.Exists(databaseFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", testName, databaseFilePath);
            }

            var fastaDb = new FastaDatabase(databaseFilePath);
            fastaDb.Read();

            const string resultPath = @"H:\Research\IPRG2015\AMT_Peptides_NA.tsv";
            if (!File.Exists(resultPath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", testName, resultPath);
            }

            const string outputFilePath = @"H:\Research\IPRG2015\AMT_Peptides.tsv";

            using (var output = new StreamWriter(outputFilePath))
            {
                foreach (var row in File.ReadLines(resultPath))
                {
                    // A null separator array makes Split break on whitespace.
                    var fields = row.Split((char[])null);
                    if (fields.Length != 14)
                    {
                        continue;
                    }

                    var peptideField = fields[0];
                    var remainder    = string.Join("\t", fields.Skip(2));

                    if (peptideField.Equals("Peptide"))
                    {
                        // Header row: emit the new column titles followed by the untouched trailing titles.
                        output.WriteLine("Peptide\tProtein\tLength\t{0}", remainder);
                    }
                    else
                    {
                        var proteinField = fields[1];
                        output.WriteLine("{0}\t{1}\t{2}\t{3}", peptideField, proteinField, fastaDb.GetProteinLength(proteinField), remainder);
                    }
                }
            }
        }
Example #6
0
        // Builds a per-protein spectral count table from the *.tsv result files in the NTT1 folder
        // and writes it to SpecCountAllProteins.tsv, one row per protein and one count column per
        // result file, plus protein length and a SpikeIn flag (1 when the protein id starts with "sp|").
        //
        // For each result file, only the first line seen for a scan number is considered, and a
        // match is counted only if it is valid, is not a decoy, and has QValue <= 0.01.
        // The test is ignored when the input folders or the FASTA database are missing.
        public void GenerateAbrfSpecCountAllProteins()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            TestUtils.ShowStarting(methodName);

            const string dir = @"H:\Research\IPRG2015";

            if (!Directory.Exists(dir))
            {
                Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, dir);
            }

            const double qValueThreshold = 0.01;
            //var names = new[] { "ENO1_YEAST", "ADH1_YEAST", "CYC_BOVIN", "ALBU_BOVIN" };
            //var accessions = new[] { "P00924", "P00330", "P62894", "P02769" };

            const string resultDir = dir + @"\NTT1";

            if (!Directory.Exists(resultDir))
            {
                Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, resultDir);
            }

            // Check the database up front so a missing file skips the test before any result
            // file is parsed, consistent with how the other inputs are handled.
            const string databaseFilePath = dir + @"\database\iPRG2015.fasta";

            if (!File.Exists(databaseFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, databaseFilePath);
            }

            var msgfResultFiles = Directory.GetFiles(resultDir, "*.tsv");

            var specCount = new Dictionary <string, int[]>();  // protein name => array of counts (one slot per result file)

            for (var i = 0; i < msgfResultFiles.Length; i++)
            {
                var msgfResultFile = msgfResultFiles[i];

                // Set from the most recent line starting with "#"; stays null if no such line precedes the data.
                MsGfPlusHeaderInformation headerInfo = null;

                var prevScanNum = -1;
                foreach (var line in File.ReadLines(msgfResultFile))
                {
                    if (line.StartsWith("#", StringComparison.Ordinal))
                    {
                        headerInfo = new MsGfPlusHeaderInformation(line);
                        continue;
                    }

                    var match = new MsGfMatch(line, headerInfo);

                    // Only the first line for each scan number is considered.
                    if (match.ScanNum == prevScanNum)
                    {
                        continue;
                    }
                    prevScanNum = match.ScanNum;

                    if (!match.IsValid || match.Protein.StartsWith(FastaDatabase.DecoyProteinPrefix, StringComparison.Ordinal))
                    {
                        continue;
                    }
                    if (match.QValue > qValueThreshold)
                    {
                        continue;
                    }

                    foreach (var protein in match.Protein.Split(';'))
                    {
                        // Strip the trailing "(pre=..." annotation when present; if it is absent,
                        // use the entry as-is instead of throwing from Substring(0, -1).
                        var annotationStart = protein.LastIndexOf("(pre=", StringComparison.Ordinal);
                        var proteinName     = annotationStart >= 0 ? protein.Substring(0, annotationStart) : protein;

                        int[] countArr;
                        if (!specCount.TryGetValue(proteinName, out countArr))
                        {
                            countArr               = new int[msgfResultFiles.Length];
                            specCount[proteinName] = countArr;
                        }
                        countArr[i]++;
                    }
                }
            }

            // Writing
            var database = new FastaDatabase(databaseFilePath);

            database.Read();

            //            var spikeInAccessions = new[] { "STANDARD_Alpha-Casein", "STANDARD_Beta-Lactoglobulin", "STANDARD_Carbonic-Anhydrase", "P02769"};

            const string outputFilePath = dir + @"\SpecCountAllProteins.tsv";

            using (var writer = new StreamWriter(outputFilePath))
            {
                // Column title per file: the path text from just after the '_' of "_sample"
                // up to (not including) the last '.'.
                var fileIds = msgfResultFiles.Select(f =>
                {
                    var sampleIndex = f.IndexOf("_sample", StringComparison.Ordinal);
                    return f.Substring(sampleIndex + 1, f.LastIndexOf('.') - sampleIndex - 1);
                });
                writer.WriteLine("Protein\tLength\t" + string.Join("\t", fileIds) + "\tSpikeIn");
                foreach (var entry in specCount)
                {
                    var proteinId = entry.Key;
                    var length    = database.GetProteinLength(proteinId);
                    Assert.True(length > 0);
                    var counts = entry.Value;
                    Assert.True(counts.Length == msgfResultFiles.Length);
                    //if (spikeInAccessions.Any(spikeInAccession => proteinId.StartsWith("sp|" + spikeInAccession)))
                    var spikeIn = proteinId.StartsWith("sp|", StringComparison.Ordinal) ? 1 : 0;
                    writer.WriteLine("{0}\t{1}\t{2}\t{3}", proteinId, length, string.Join("\t", counts), spikeIn);
                }
            }
        }
Example #7
0
        // Rewrites AMT_Peptides_NA.tsv as AMT_Peptides.tsv:
        //   - Jobs.tsv supplies job numbers ("Jobs" column) and experiment names (third
        //     '_'-separated token of each "Experiments" value).
        //   - For each job, the first result-file header containing the job number selects the
        //     abundance column for that job.
        //   - Each output row holds the peptide (when a "Peptide" column exists), the protein from the
        //     "Reference" column, its length from the FASTA database, one abundance per job, and
        //     a SpikeIn flag (1 when the protein name starts with "STANDARD").
        //   - Proteins starting with "XXX" or "Contaminant" are skipped.
        // The test is ignored when any of the three input files is missing.
        public void ProcessIprg2015PreStudy()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            TestUtils.ShowStarting(methodName);

            const string dir = @"H:\Research\IPRG2015";

            const string databaseFilePath = dir + @"\database\yeast6proteaprotein.fasta";

            if (!File.Exists(databaseFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, databaseFilePath);
            }

            var database = new FastaDatabase(databaseFilePath);

            database.Read();

            const string jobFilePath = dir + @"\Jobs.tsv";

            if (!File.Exists(jobFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, jobFilePath);
            }

            var jobParser   = new TsvFileParser(jobFilePath);
            var jobs        = jobParser.GetData("Jobs").Select(j => Convert.ToInt32(j)).ToArray();
            var experiments = jobParser.GetData("Experiments").Select(e => e.Split('_')[2]).ToArray();

            //const string resultFilePath = dir + @"\AMT_Proteins_NA.tsv";
            //const string outputFilePath = dir + @"\AMT_Proteins.tsv";

            const string resultFilePath = dir + @"\AMT_Peptides_NA.tsv";
            const string outputFilePath = dir + @"\AMT_Peptides.tsv";

            // Same skip-if-missing treatment as the other two input files.
            if (!File.Exists(resultFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, resultFilePath);
            }

            var parser    = new TsvFileParser(resultFilePath);
            var headers   = parser.GetHeaders();
            var jobColNum = new int[jobs.Length];

            // Map each job to the first header that contains its job number.
            // NOTE(review): when no header matches, jobColNum[i] keeps its default of 0, so the
            // first column is silently used as that job's abundance column - confirm this is intended.
            for (var i = 0; i < jobs.Length; i++)
            {
                var jobText = jobs[i].ToString();
                for (var j = 0; j < headers.Count; j++)
                {
                    if (headers[j].Contains(jobText))
                    {
                        jobColNum[i] = j;
                        break;
                    }
                }
            }

            for (var i = 0; i < jobs.Length; i++)
            {
                Console.WriteLine("{0}\t{1}\t{2}", jobs[i], jobColNum[i], experiments[i]);
            }

            using (var writer = new StreamWriter(outputFilePath))
            {
                var peptides   = parser.GetData("Peptide");   // Peptides; the null checks below skip this column when absent
                var proteins   = parser.GetData("Reference"); // Proteins
                var abundances = new string[jobs.Length][];
                for (var i = 0; i < jobs.Length; i++)
                {
                    abundances[i] = parser.GetData(headers[jobColNum[i]]).ToArray();
                }

                // Header row
                if (peptides != null)
                {
                    writer.Write("Peptide\t");
                }
                writer.Write("Protein\tLength");
                for (var i = 0; i < jobs.Length; i++)
                {
                    writer.Write("\t" + experiments[i]);
                }
                writer.WriteLine("\tSpikeIn");

                // Data rows
                for (var i = 0; i < proteins.Count; i++)
                {
                    var protein = proteins[i];
                    if (protein.StartsWith("XXX", StringComparison.Ordinal) ||
                        protein.StartsWith("Contaminant", StringComparison.Ordinal))
                    {
                        continue;
                    }

                    // The length is written as returned; no check is made that the lookup succeeded.
                    var length = database.GetProteinLength(protein);

                    if (peptides != null)
                    {
                        writer.Write(peptides[i] + "\t");
                    }
                    writer.Write(protein + "\t" + length);
                    for (var j = 0; j < jobs.Length; j++)
                    {
                        writer.Write("\t" + abundances[j][i]);
                    }
                    writer.WriteLine("\t" + (protein.StartsWith("STANDARD", StringComparison.Ordinal) ? 1 : 0));
                }
            }
        }