/// <summary>
/// Runs <c>TestTagBasedSearch</c> on every .pbf file found in the CompRef data folder.
/// The test is ignored when the data folder, the modification file or the FASTA file is missing.
/// </summary>
public void TestTagBasedSearchCompRef()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    const string dataSetPath = @"D:\MassSpecFiles\CompRef";
    const string fastaFilePath = @"D:\MassSpecFiles\CompRef\ID_003278_4B4B3CB1.fasta";
    const string modsFilePath = @"D:\MassSpecFiles\CompRef\Mods.txt";

    if (!Directory.Exists(dataSetPath))
    {
        Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, dataSetPath);
    }

    if (!File.Exists(modsFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, modsFilePath);
    }

    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    // Match the real ".pbf" extension with an ordinal comparison. The path is rebuilt below as
    // "<name>.pbf", so a file that merely ends in the letters "pbf" must not be selected.
    var datasetNames = Directory.GetFiles(dataSetPath)
        .Where(path => path.EndsWith(".pbf", StringComparison.Ordinal))
        .Select(path => Path.GetFileNameWithoutExtension(path))
        .ToList();
    datasetNames.Sort();

    var fastaDb = new FastaDatabase(fastaFilePath);
    var tolerance = new Tolerance(10);
    var aaSet = new AminoAcidSet(modsFilePath);

    foreach (var datasetName in datasetNames)
    {
        var rawFile = Path.Combine(dataSetPath, datasetName + ".pbf");
        var run = PbfLcMsRun.GetLcMsRun(rawFile);

        Console.WriteLine("-----------------{0}--------------------", rawFile);
        TestTagBasedSearch(run, fastaDb, tolerance, aaSet);
        Console.WriteLine("-----------------------------------------------------------------------");
    }
}
/// <summary>
/// Enumerates the sequence annotations of the Short.fasta test database, prints each
/// annotation with its offset, then prints the number of entries and the elapsed time.
/// The test is ignored when the FASTA file cannot be found.
/// </summary>
public void TestCountingPeptides()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    // The timer also covers the file check and the database construction.
    var timer = System.Diagnostics.Stopwatch.StartNew();

    const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\Short.fasta";
    if (!File.Exists(dbFile))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, dbFile);
    }

    var indexedDb = new IndexedDatabase(new FastaDatabase(dbFile));
    var annotations = indexedDb.SequenceAnnotationsAndOffsetsWithNtermOrCtermCleavageNoLargerThan(100, 300, 1, 0);

    var numPeptides = 0;
    foreach (var entry in annotations)
    {
        Console.WriteLine("{0}\t{1}", entry.Annotation, entry.Offset);
        ++numPeptides;
    }

    Console.WriteLine("NumPeptides: {0}", numPeptides);

    timer.Stop();
    Console.WriteLine(@"{0:f4} sec", timer.Elapsed.TotalSeconds);
}
/// <summary>
/// Copies a 14-column, whitespace-separated peptide result file to a new TSV file, inserting a
/// "Length" column (looked up in the FASTA database by protein name) after the protein column.
/// Lines that do not split into exactly 14 fields are dropped.
/// The test is ignored when the FASTA file or the input result file is missing.
/// </summary>
public void AddProteinLengths()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string databaseFilePath = @"H:\Research\IPRG2015\database\yeast6proteaprotein.fasta";
    if (!File.Exists(databaseFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, databaseFilePath);
    }

    var database = new FastaDatabase(databaseFilePath);
    database.Read();

    const string resultPath = @"H:\Research\IPRG2015\AMT_Peptides_NA.tsv";
    if (!File.Exists(resultPath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, resultPath);
    }

    const string outputFilePath = @"H:\Research\IPRG2015\AMT_Peptides.tsv";
    using (var writer = new StreamWriter(outputFilePath))
    {
        foreach (var line in File.ReadLines(resultPath))
        {
            // Split(null) splits on any whitespace character.
            var fields = line.Split(null);
            if (fields.Length != 14)
            {
                continue;
            }

            var peptide = fields[0];
            if (peptide.Equals("Peptide"))
            {
                // Header row: emit the new column titles followed by the remaining original titles.
                writer.WriteLine("Peptide\tProtein\tLength\t{0}", string.Join("\t", fields.Skip(2)));
            }
            else
            {
                var protein = fields[1];
                var length = database.GetProteinLength(protein);
                var remainingColumns = string.Join("\t", fields.Skip(2));
                writer.WriteLine("{0}\t{1}\t{2}\t{3}", peptide, protein, length, remainingColumns);
            }
        }
    }
}
/// <summary>
/// Verifies that <c>SumDefault</c> and <c>SumAsParallel</c> return the same value for the
/// characters of a FASTA database, then runs each 100 times and prints the average time per call.
/// </summary>
public void TestSumParallel()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    var fastaFile = Utils.GetTestFile(methodName, Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"MSPathFinderT\ID_002216_235ACCEA.fasta"));

    var db = new FastaDatabase(fastaFile.FullName);
    db.Read();

    // Materialized once so that both implementations work on the same in-memory values.
    var charArray = db.Characters().Select(c => (int)c).ToList();

    // The reference value has to come from SumDefault. Computing both values with
    // SumAsParallel (as was done before) makes the assertion below compare a result with itself.
    var defaultSum = SumDefault(charArray);
    var parallelSum = SumAsParallel(charArray);

    Console.WriteLine("Default sum {0}", defaultSum);
    Console.WriteLine("Parallel sum {0}", parallelSum);

    // Expected value first, actual value second.
    Assert.AreEqual(defaultSum, parallelSum);

    const int m = 100;

    var s1 = Stopwatch.StartNew();
    for (var i = 0; i < m; i++)
    {
        SumDefault(charArray);
    }
    s1.Stop();

    var s2 = Stopwatch.StartNew();
    for (var i = 0; i < m; i++)
    {
        SumAsParallel(charArray);
    }
    s2.Stop();

    Console.WriteLine("{0:F2} msec/sum, on average for default", s1.Elapsed.TotalMilliseconds / m);
    Console.WriteLine("{0:F2} msec/sum, on average for parallel", s2.Elapsed.TotalMilliseconds / m);
}
/// <summary>
/// Builds an indexed view of <paramref name="database"/> and returns its annotations and offsets.
/// </summary>
/// <param name="database">FASTA database to enumerate.</param>
/// <returns>
/// The no-enzyme enumeration when <c>NumTolerableTermini</c> is 0; otherwise the
/// enumeration that uses <c>NumTolerableTermini</c> and <c>Enzyme</c>.
/// </returns>
private IEnumerable<AnnotationAndOffset> GetAnnotationsAndOffsets(FastaDatabase database)
{
    var indexedDbTarget = new IndexedDatabase(database);

    if (NumTolerableTermini == 0)
    {
        return indexedDbTarget.AnnotationsAndOffsetsNoEnzyme(MinSequenceLength, MaxSequenceLength);
    }

    return indexedDbTarget.AnnotationsAndOffsets(MinSequenceLength, MaxSequenceLength, NumTolerableTermini, 2, Enzyme);
}
/// <summary>
/// Timing experiment over the characters of a FASTA database: for each of (at most) the first
/// 1000 characters it sums the integers below <c>character * 10000</c>, then prints the
/// accumulated sum and the elapsed time.
/// </summary>
public void TestSequenceEnumerationParallel2()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    var fastaFile = Utils.GetTestFile(methodName, Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"MSPathFinderT\ID_002216_235ACCEA.fasta"));

    var db = new FastaDatabase(fastaFile.FullName);
    db.Read();

    // Not used below; constructed exactly as in the original test.
    var indexedDb = new IndexedDatabase(db);
    var arr = db.Characters().ToArray();

    // Only the summation loop is timed.
    var timer = System.Diagnostics.Stopwatch.StartNew();

    var sum = 0L;
    var processed = 0;
    foreach (var residue in arr)
    {
        var limit = residue * 10000;
        for (var i = 0; i < limit; i++)
        {
            sum += i;
        }

        // Stop once 1000 characters have been processed.
        if (++processed == 1000)
        {
            break;
        }
    }

    Console.WriteLine("NumPeptides: {0}", sum);

    timer.Stop();
    Console.WriteLine(@"{0:f4} sec", timer.Elapsed.TotalSeconds);
}
/// <summary>
/// Runs a scan-based tag search for scan 4672 of a QC_Shew dataset and prints, for every match,
/// the sequence followed by the start index, end index and mass of the tag match.
/// The test is ignored when the spectrum file, the FASTA file or the modification file is missing.
/// </summary>
public void TestSearchWithTagGeneration()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    const string rawFilePath = @"D:\MassSpecFiles\training\raw\QC_Shew_Intact_26Sep14_Bane_C2Column3.pbf";
    if (!File.Exists(rawFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath);
    }

    var run = PbfLcMsRun.GetLcMsRun(rawFilePath);

    const string fastaFilePath = @"D:\MSPathFinder\Fasta\ID_002216_235ACCEA.fasta";
    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    var fastaDb = new FastaDatabase(fastaFilePath);
    var tolerance = new Tolerance(10);

    var modsFilePath = @"D:\MSPathFinder\Fasta\Mods.txt";
    if (!File.Exists(modsFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, modsFilePath);
    }

    var aaSet = new AminoAcidSet(modsFilePath);

    // The tag generator gets its own tolerance (8); the search engine uses the one created above (10).
    var tagGenerator = new SequenceTagGenerator(run, new Tolerance(8));
    var peakMatrix = new LcMsPeakMatrix(run);
    var tagSearchEngine = new ScanBasedTagSearchEngine(run, tagGenerator, peakMatrix, fastaDb, tolerance, aaSet);

    foreach (var match in tagSearchEngine.RunSearch(4672))
    {
        var tagMatch = match.TagMatch;
        Console.Write(match.Sequence);
        Console.WriteLine("\t{0}\t{1}\t{2}", tagMatch.StartIndex, tagMatch.EndIndex, tagMatch.Mass);
    }
}
/// <summary>
/// Searches every annotation/offset of <paramref name="db"/> for matches in parallel and
/// collects the results in <paramref name="matches"/>.
/// </summary>
/// <param name="matches">Collection that <c>SearchForMatches</c> receives for storing results.</param>
/// <param name="db">Database to search; its <c>IsDecoy</c> flag is forwarded to the search.</param>
/// <param name="sequenceFilter">Filter forwarded to <c>SearchForMatches</c>.</param>
/// <param name="cancellationToken">Optional token; when cancellation is requested, remaining items return immediately.</param>
/// <param name="progress">Optional progress sink wrapped in a <c>ProgressData</c> instance.</param>
private void RunSearch(SortedSet<DatabaseSequenceSpectrumMatch>[] matches, FastaDatabase db, ISequenceFilter sequenceFilter, CancellationToken? cancellationToken = null, IProgress<ProgressData> progress = null)
{
    var progData = new ProgressData(progress)
    {
        Status = "Searching for matches"
    };

    long estimatedProteins;
    var annotationsAndOffsets = GetAnnotationsAndOffsets(db, out estimatedProteins, cancellationToken);
    Console.WriteLine(@"Estimated proteins: " + estimatedProteins);

    var numProteins = 0;
    var lastUpdate = DateTime.MinValue; // Force original update of 0%

    // Timing starts only after the annotation enumeration has been set up.
    var sw = Stopwatch.StartNew();

    var pfeOptions = new ParallelOptions
    {
        MaxDegreeOfParallelism = MaxNumThreads,
        CancellationToken = cancellationToken.GetValueOrDefault(CancellationToken.None)
    };

    // N-terminal cleavages are only passed on when no internal cleavage is searched.
    var maxNumNTermCleavages = 0;
    if (SearchMode == InternalCleavageType.NoInternalCleavage)
    {
        maxNumNTermCleavages = MaxNumNTermCleavages;
    }

    Parallel.ForEach(annotationsAndOffsets, pfeOptions, annotationAndOffset =>
    {
        var cancelRequested = cancellationToken.HasValue && cancellationToken.Value.IsCancellationRequested;
        if (cancelRequested)
        {
            return;
        }

        SearchProgressReport(ref numProteins, ref lastUpdate, estimatedProteins, sw, progData);
        SearchForMatches(annotationAndOffset, sequenceFilter, matches, maxNumNTermCleavages, db.IsDecoy, cancellationToken);
    });

    Console.WriteLine(@"Collected candidate matches: {0}", GetNumberOfMatches(matches));

    progData.StatusInternal = string.Empty;
    progData.Report(100.0);
}
/// <summary>
/// Enumerates the sequence annotations of the Short.fasta test database, prints the first 20
/// annotations with their offsets, then prints the total number of entries and the elapsed time.
/// </summary>
public void TestCountingPeptides()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    // The timer also covers locating the test file and building the database objects.
    var timer = System.Diagnostics.Stopwatch.StartNew();

    var fastaFile = Utils.GetTestFile(methodName, Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"MSPathFinderT\Short.fasta"));

    var indexedDb = new IndexedDatabase(new FastaDatabase(fastaFile.FullName));
    var annotations = indexedDb.SequenceAnnotationsAndOffsetsWithNtermOrCtermCleavageNoLargerThan(100, 300, 1, 0);

    const int maxLinesToPrint = 20;
    var numPeptides = 0;
    foreach (var entry in annotations)
    {
        // Every entry is counted, but only the first few are printed.
        if (numPeptides < maxLinesToPrint)
        {
            Console.WriteLine("{0}\t{1}", entry.Annotation, entry.Offset);
        }

        ++numPeptides;
    }

    Console.WriteLine("NumPeptides: {0}", numPeptides);

    timer.Stop();
    Console.WriteLine(@"{0:f4} sec", timer.Elapsed.TotalSeconds);
}
/// <summary>
/// Creates a match container for the given database and MS2 scans.
/// </summary>
/// <param name="database">Database stored in <c>Database</c>.</param>
/// <param name="ms2ScanVector">
/// MS2 scan numbers. Each scan number is mapped to its position in this array.
/// The array may be empty and does not have to be in ascending order.
/// </param>
/// <param name="maxModifications">Highest modification count; one row of match sets is allocated for each count from 0 to this value.</param>
/// <param name="maxNumMatchesPerSpectrum">Value stored in <c>NumMatchesPerSpectrum</c>.</param>
/// <param name="minScore">Value stored as the score cutoff.</param>
/// <exception cref="System.ArgumentNullException"><paramref name="ms2ScanVector"/> is null.</exception>
public ProteoformSpectrumMatchContainer(FastaDatabase database, int[] ms2ScanVector, int maxModifications, int maxNumMatchesPerSpectrum, int minScore = 4)
{
    if (ms2ScanVector == null)
    {
        throw new System.ArgumentNullException("ms2ScanVector");
    }

    Database = database;
    NumMatchesPerSpectrum = maxNumMatchesPerSpectrum;
    _scoreCutoff = minScore;
    Ms2ScanVector = ms2ScanVector;

    // Size the lookup table from the largest scan number rather than the last element, so that
    // an empty vector yields an empty table and an unsorted vector cannot overrun it.
    var mapSize = ms2ScanVector.Length == 0 ? 0 : ms2ScanVector.Max() + 1;
    _ms2ScanToIndexMap = new int[mapSize];
    for (var i = 0; i < ms2ScanVector.Length; i++)
    {
        _ms2ScanToIndexMap[ms2ScanVector[i]] = i;
    }

    // One array of (initially null) match sets per modification count, indexed by scan position.
    _matchedSet = new SortedSet<DatabaseSequenceSpectrumMatch>[maxModifications + 1][];
    for (var i = 0; i <= maxModifications; i++)
    {
        _matchedSet[i] = new SortedSet<DatabaseSequenceSpectrumMatch>[ms2ScanVector.Length];
    }

    _checkedOutScanNumbers = new List<int>();
}
/// <summary>
/// Writes one tab-separated line per parsed result row: scan, sequence, modifications, mass,
/// protein name, protein description, start, end, score and q-value.
/// </summary>
/// <param name="writer">Destination of the merged rows.</param>
/// <param name="parser">Parsed result file. The score is taken from "#MatchedFragments" or, if that lookup yields null, from "Score".</param>
/// <param name="fastaDb">Database used to look up the protein description by protein name.</param>
private void OutputMergedResult(TextWriter writer, TsvFileParser parser, FastaDatabase fastaDb)
{
    // Column lookups do not depend on the row, so they are done once instead of once per row.
    var scoreColumn = parser.GetData("#MatchedFragments") ?? parser.GetData("Score");
    var qValColumn = parser.GetData("QValue");
    var sequenceColumn = parser.GetData("Sequence");
    var scanColumn = parser.GetData("Scan");
    var massColumn = parser.GetData("Mass");
    var proteinNameColumn = parser.GetData("ProteinName");
    var startColumn = parser.GetData("Start");
    var endColumn = parser.GetData("End");
    var modificationColumn = parser.GetData("Modifications");

    for (var i = 0; i < parser.NumData; i++)
    {
        var sequence = sequenceColumn[i];
        var scanNum = int.Parse(scanColumn[i]);
        var mass = double.Parse(massColumn[i]);
        var protName = proteinNameColumn[i];
        var protDesc = fastaDb.GetProteinDescription(protName);
        var firstResId = int.Parse(startColumn[i]);
        var lastResId = int.Parse(endColumn[i]);
        var score = double.Parse(scoreColumn[i]);
        var mod = modificationColumn[i];

        // A result file without a QValue column reports "0" for every row.
        var qvalue = (qValColumn != null) ? qValColumn[i] : "0";

        // Rows end with "\n" regardless of the writer's NewLine setting.
        writer.Write(
            "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\n",
            scanNum, sequence, mod, mass, protName, protDesc, firstResId, lastResId, score, qvalue);
    }
}
/// <summary>
/// Prints the results of <c>SumAsParallel</c> and <c>SumDefault</c> for the characters of a
/// FASTA database, then runs each 100 times and prints the average time per call in nanoseconds.
/// The test is ignored when the FASTA file cannot be found.
/// </summary>
public void TestSumParallel()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_002216_235ACCEA.fasta";
    if (!System.IO.File.Exists(dbFile))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, dbFile);
    }

    var db = new FastaDatabase(dbFile);
    db.Read();

    // Materialize the projection once. A deferred query would be re-evaluated by every call
    // below, so the timings would include enumerating the database and the int conversion.
    var charArray = db.Characters().Select(c => (int)c).ToList();

    // Test methods.
    Console.WriteLine(SumAsParallel(charArray));
    Console.WriteLine(SumDefault(charArray));

    const int m = 100;

    var s1 = Stopwatch.StartNew();
    for (var i = 0; i < m; i++)
    {
        SumDefault(charArray);
    }
    s1.Stop();

    var s2 = Stopwatch.StartNew();
    for (var i = 0; i < m; i++)
    {
        SumAsParallel(charArray);
    }
    s2.Stop();

    // Milliseconds * 1,000,000 = nanoseconds.
    Console.WriteLine((s1.Elapsed.TotalMilliseconds * 1000000 / m).ToString("0.00 ns"));
    Console.WriteLine((s2.Elapsed.TotalMilliseconds * 1000000 / m).ToString("0.00 ns"));

    // No Console.Read() here: waiting for console input stalls an unattended test run.
}
/// <summary>
/// Reads the sequence tags of scan 4533 from a .seqtag file and, for every tag that occurs in
/// the FASTA database, prints the tag, its prefix flag, its flanking mass and the matched proteins.
/// The test is ignored when the tag file or the FASTA file is missing.
/// </summary>
public void TestTagMatchingSingleSpec()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string dataSet = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3";
    const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";
    const int scanNum = 4533;
    const int minTagLength = 8;

    var tagFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".seqtag");

    if (!File.Exists(tagFileName))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFileName);
    }

    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    var fastaDb = new FastaDatabase(fastaFilePath);
    var searchableDb = new SearchableDatabase(fastaDb);
    var tagParser = new SequenceTagParser(tagFileName, minTagLength);

    foreach (var tag in tagParser.GetSequenceTags(scanNum))
    {
        var matchedProteins = searchableDb
            .FindAllMatchedSequenceIndices(tag.Sequence)
            .Select(index => fastaDb.GetProteinName(index))
            .ToArray();

        // Tags without any database hit are not reported.
        if (matchedProteins.Length == 0)
        {
            continue;
        }

        Console.WriteLine("{0}\t{1}\t{2}\t{3}", tag.Sequence, tag.IsPrefix, tag.FlankingMass, string.Join("\t", matchedProteins));
    }
}
/// <summary>
/// Runs <c>TestTagBasedSearch</c> on the Lewy dataset.
/// The test is ignored when the spectrum file, the FASTA file or the modification file is missing.
/// </summary>
public void TestTagBasedSearchForLewy()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    const string rawFilePath = @"D:\MassSpecFiles\Lewy\Lewy_AT_AD1_21May15_Bane_14-09-01RZ.pbf";
    if (!File.Exists(rawFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath);
    }

    // The previously computed .seqtag path was never used (its only consumer is commented out),
    // so it is no longer derived here.
    var run = PbfLcMsRun.GetLcMsRun(rawFilePath);

    const string fastaFilePath = @"D:\MassSpecFiles\Lewy\a4_human.fasta";
    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    var fastaDb = new FastaDatabase(fastaFilePath);
    var tolerance = new Tolerance(10);

    const string modsFilePath = @"D:\MassSpecFiles\Lewy\Mods.txt";
    if (!File.Exists(modsFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, modsFilePath);
    }

    var aaSet = new AminoAcidSet(modsFilePath);

    TestTagBasedSearch(run, fastaDb, tolerance, aaSet);
}
/// <summary>
/// Convenience constructor: forwards every argument to the constructor overload that takes one
/// additional argument after <paramref name="featureParser"/>, passing <c>null</c> for it.
/// </summary>
/// <param name="run">Forwarded unchanged.</param>
/// <param name="featureParser">Forwarded unchanged.</param>
/// <param name="tagParser">Forwarded unchanged.</param>
/// <param name="fastaDb">Forwarded unchanged.</param>
/// <param name="tolerance">Forwarded unchanged.</param>
/// <param name="aaSet">Forwarded unchanged.</param>
/// <param name="maxSequenceMass">Forwarded unchanged; defaults to 50000.0.</param>
/// <param name="minProductIonCharge">Forwarded unchanged; defaults to 1.</param>
/// <param name="maxProductIonCharge">Forwarded unchanged; defaults to 20.</param>
public FeatureBasedTagSearchEngine(
    LcMsRun run,
    Ms1FtParser featureParser,
    SequenceTagParser tagParser,
    FastaDatabase fastaDb,
    Tolerance tolerance,
    AminoAcidSet aaSet,
    double maxSequenceMass = 50000.0,
    int minProductIonCharge = 1,
    int maxProductIonCharge = 20)
    : this(run, featureParser, null, tagParser, fastaDb, tolerance, aaSet, maxSequenceMass, minProductIonCharge, maxProductIonCharge)
{
}
/// <summary>
/// Timing experiment: enumerates the annotations of an indexed FASTA database with the
/// arguments (7, 40, 2, 2, Enzyme.Trypsin), builds a sequence graph for every annotation
/// in parallel, and prints the total elapsed time in seconds.
/// </summary>
private static void TestCountingPeptides()
{
    var aaSet = new AminoAcidSet();

    // The timer covers database loading as well as graph construction.
    var sw = new Stopwatch();
    sw.Start();

    const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_003456_9B916A8B.fasta";

    var db = new FastaDatabase(dbFile);
    var indexedDb = new IndexedDatabase(db);
    indexedDb.Read();

    var peptides = indexedDb.AnnotationsAndOffsets(7, 40, 2, 2, Enzyme.Trypsin);

    Parallel.ForEach(peptides, annotationAndOffset =>
    {
        // The graph itself is not used afterwards, so neither it nor the offset is kept in a local.
        SequenceGraph.CreateGraph(aaSet, annotationAndOffset.Annotation);
    });

    sw.Stop();
    var sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
    Console.WriteLine(@"{0:f4} sec", sec);
}
/// <summary>
/// For every MS1 feature of a QC_Shew dataset, looks for identifications (q-value filtered) whose
/// mass, scan and charge fall inside the feature, collects the sequence tags of the MS2 scans
/// that isolate the feature, and matches those tags against the FASTA database.
/// Prints summary counts and the scan numbers of identifications not matched to any feature.
/// The test is ignored when any of the required input files is missing.
/// </summary>
public void TestFeatureIdMatching()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string resultFilePath = @"H:\Research\QCShew_TopDown\Production\M1_V092\QC_Shew_Intact_26Sep14_Bane_C2Column3_IcTda.tsv";
    if (!File.Exists(resultFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, resultFilePath);
    }

    var resultParser = new MsPathFinderParser(resultFilePath);
    const double qValueThreshold = 0.01;
    const double tolerancePpm = 13;

    const string dataSet = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3";
    var rawFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".raw");
    if (!File.Exists(rawFileName))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFileName);
    }

    var run = PbfLcMsRun.GetLcMsRun(rawFileName);

    // Ids are taken up to the first one above the q-value threshold, then sorted by mass so
    // that the binary search below can be used.
    var idList = resultParser.GetIdList().TakeWhile(id => id.QValue <= qValueThreshold).OrderBy(id => id.Mass).ToList();
    var idMassList = idList.Select(id => id.Mass).ToList();
    var idFlag = new bool[idList.Count];

    // Parse sequence tags
    var tagFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".seqtag");
    const int minTagLength = 6;
    const int numProtMatches = 4;
    const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.icsfldecoy.fasta";

    if (!File.Exists(tagFileName))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFileName);
    }

    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    var fastaDb = new FastaDatabase(fastaFilePath);
    var searchableDb = new SearchableDatabase(fastaDb);
    var tagParser = new SequenceTagParser(tagFileName, minTagLength);

    var featureFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".ms1ft");
    if (!File.Exists(featureFileName))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, featureFileName);
    }

    var featureParser = new TsvFileParser(featureFileName);
    var minScan = featureParser.GetData("MinScan").Select(s => Convert.ToInt32(s)).ToArray();
    var maxScan = featureParser.GetData("MaxScan").Select(s => Convert.ToInt32(s)).ToArray();
    var minCharge = featureParser.GetData("MinCharge").Select(s => Convert.ToInt32(s)).ToArray();
    var maxCharge = featureParser.GetData("MaxCharge").Select(s => Convert.ToInt32(s)).ToArray();
    var monoMass = featureParser.GetData("MonoMass").Select(Convert.ToDouble).ToArray();

    var numFeaturesWithId = 0;
    var numFeaturesWithMs2 = 0;
    var numFeaturesWithTags = 0;
    var numFeaturesWithMatchingTags = 0;
    var numFeaturesWithTwoOrMoreMatchingTags = 0;
    var numFeaturesWithNoIdAndMatchingTags = 0;

    // The ppm tolerance is the same for every feature, so the object is created once.
    var tolerance = new Tolerance(tolerancePpm);

    for (var i = 0; i < featureParser.NumData; i++)
    {
        var mass = monoMass[i];

        // Find Id
        var tolDa = tolerance.GetToleranceAsDa(mass, 1);
        var minMass = mass - tolDa;
        var maxMass = mass + tolDa;

        var index = idMassList.BinarySearch(mass);
        if (index < 0)
        {
            // Not found: BinarySearch returns the bitwise complement of the insertion point.
            index = ~index;
        }

        var matchedId = new List<MsPathFinderId>();

        // Go down from the insertion point while the id mass is still within tolerance.
        // Scan bounds are inclusive here, the same as in the upward pass and the MS2 scan loop
        // below. The downward pass previously used exclusive bounds and therefore missed
        // ids located exactly on the first or last scan of a feature.
        var curIndex = index - 1;
        while (curIndex >= 0)
        {
            var curId = idList[curIndex];
            if (curId.Mass < minMass)
            {
                break;
            }

            if (curId.Scan >= minScan[i] && curId.Scan <= maxScan[i]
                && curId.Charge >= minCharge[i] && curId.Charge <= maxCharge[i])
            {
                matchedId.Add(curId);
                idFlag[curIndex] = true;
            }

            --curIndex;
        }

        // Go up from the insertion point while the id mass is still within tolerance.
        curIndex = index;
        while (curIndex < idList.Count)
        {
            var curId = idList[curIndex];
            if (curId.Mass > maxMass)
            {
                break;
            }

            if (curId.Scan >= minScan[i] && curId.Scan <= maxScan[i]
                && curId.Charge >= minCharge[i] && curId.Charge <= maxCharge[i])
            {
                matchedId.Add(curId);
                idFlag[curIndex] = true;
            }

            ++curIndex;
        }

        var hasId = matchedId.Any();
        if (hasId)
        {
            ++numFeaturesWithId;
        }

        // Find MS2 scans whose isolation window contains the feature
        var tags = new List<SequenceTag>();
        var hasMs2 = false;
        for (var scanNum = minScan[i]; scanNum <= maxScan[i]; scanNum++)
        {
            var isolationWindow = run.GetIsolationWindow(scanNum);
            if (isolationWindow == null)
            {
                continue;
            }

            // Charge is estimated from the feature mass and the isolation window target m/z.
            var isolationWindowTargetMz = isolationWindow.IsolationWindowTargetMz;
            var charge = (int)Math.Round(mass / isolationWindowTargetMz);
            if (charge < minCharge[i] || charge > maxCharge[i])
            {
                continue;
            }

            var mz = Ion.GetIsotopeMz(mass, charge, Averagine.GetIsotopomerEnvelope(mass).MostAbundantIsotopeIndex);
            if (isolationWindow.Contains(mz))
            {
                tags.AddRange(tagParser.GetSequenceTags(scanNum));
                hasMs2 = true;
            }
        }

        if (hasMs2)
        {
            ++numFeaturesWithMs2;
        }

        if (tags.Any())
        {
            ++numFeaturesWithTags;
        }

        // Number of tags of this feature that hit each protein.
        var protHist = new Dictionary<string, int>();
        var hasMatchedTag = false;
        foreach (var tag in tags)
        {
            var matchedProteins = searchableDb.FindAllMatchedSequenceIndices(tag.Sequence)
                .Select(idx => fastaDb.GetProteinName(idx))
                .ToArray();
            if (matchedProteins.Length == 0)
            {
                continue;
            }

            hasMatchedTag = true;
            foreach (var protein in matchedProteins)
            {
                int num;
                protHist.TryGetValue(protein, out num); // num stays 0 when the protein is new
                protHist[protein] = num + 1;
            }
        }

        if (hasMatchedTag)
        {
            ++numFeaturesWithMatchingTags;
            if (!hasId)
            {
                ++numFeaturesWithNoIdAndMatchingTags;
            }
        }

        // Counted when at least numProtMatches tags hit the same protein.
        if (protHist.Any() && protHist.Values.Max() >= numProtMatches)
        {
            ++numFeaturesWithTwoOrMoreMatchingTags;
        }
    }

    Console.WriteLine("NumFeatures: {0}", featureParser.NumData);
    Console.WriteLine("NumId: {0}", idList.Count);
    Console.WriteLine("NumFeaturesWithId: {0} ({1})", numFeaturesWithId, numFeaturesWithId / (float)featureParser.NumData);
    Console.WriteLine("NumFeaturesWithMs2: {0} ({1})", numFeaturesWithMs2, numFeaturesWithMs2 / (float)featureParser.NumData);
    Console.WriteLine("NumFeaturesWithTag: {0} ({1})", numFeaturesWithTags, numFeaturesWithTags / (float)featureParser.NumData);
    Console.WriteLine("NumFeaturesWithMatchedTag: {0} ({1})", numFeaturesWithMatchingTags, numFeaturesWithMatchingTags / (float)featureParser.NumData);
    Console.WriteLine("NumFeaturesWithMoreThanOneMatchedTag: {0} ({1})", numFeaturesWithTwoOrMoreMatchingTags, numFeaturesWithTwoOrMoreMatchingTags / (float)featureParser.NumData);
    Console.WriteLine("NumFeaturesWithNoIdAndMatchedTag: {0} ({1})", numFeaturesWithNoIdAndMatchingTags, numFeaturesWithNoIdAndMatchingTags / (float)featureParser.NumData);

    // Report the scans of identifications that were not assigned to any feature.
    for (var i = 0; i < idFlag.Length; i++)
    {
        if (!idFlag[i])
        {
            Console.WriteLine(idList[i].Scan);
        }
    }
}
/// <summary>
/// For each dataset in the Glyco folder, reads its .seqtag file, looks up every tag of length
/// six or more in the FASTA database, and writes to a .dmass file the difference between the
/// detected flanking mass and the flanking mass computed from the matched protein sequence.
/// The test is ignored when the folder, the FASTA file or a tag file is missing.
/// </summary>
public void FindProteinDeltaMass()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    const string folderPath = @"D:\MassSpecFiles\Glyco\";
    if (!Directory.Exists(folderPath))
    {
        Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, folderPath);
    }

    var fileSet = new string[]
    {
        "User_sample_test_02252015",
        "User_sample_test_MWCO_02262016",
        "User_sample_test_SEC_F3_03022105",
        "User_sample_test_SEC_F1_02272015",
        "User_sample_test_SEC_F2_02282015"
    };

    const string fastaFilePath = folderPath + "ID_003836_DA9CC1E4.fasta";
    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    // The same FASTA file is used for every dataset, so the databases are built once.
    var fastaDb = new FastaDatabase(fastaFilePath);
    var searchableDb = new SearchableDatabase(fastaDb);
    var standardAminoAcidSet = AminoAcidSet.GetStandardAminoAcidSet();

    foreach (var datasetName in fileSet)
    {
        var tagFilePath = folderPath + datasetName + ".seqtag";
        var outputFilePath = folderPath + datasetName + ".dmass";

        // Checked before the writer is opened so that a missing input does not leave an empty output file behind.
        if (!File.Exists(tagFilePath))
        {
            Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFilePath);
        }

        using (var writer = new StreamWriter(outputFilePath))
        {
            var isHeader = true;
            var nReadSeqTag = 0;
            var nColumn = 0;

            Console.WriteLine(@"Reading {0} file", tagFilePath);

            foreach (var line in File.ReadLines(tagFilePath))
            {
                if (isHeader)
                {
                    // The header fixes the expected column count and gains four output columns.
                    isHeader = false;
                    nColumn = line.Split('\t').Length;
                    writer.WriteLine(line + "\tProtein\tDetectedFlankingMass\tExpectedFlankingMass\tDeltaMass");
                    continue;
                }

                var token = line.Split('\t');
                if (token.Length != nColumn)
                {
                    continue;
                }

                var tag = token[1];
                if (tag.Length < 6)
                {
                    continue;
                }

                var nTerminal = token[2].Equals("1");
                var detectedFlankingMass = Double.Parse(token[3]);
                if (!nTerminal)
                {
                    detectedFlankingMass -= Composition.H2O.Mass;
                }

                nReadSeqTag++;

                var matchedProteins = searchableDb.FindAllMatchedSequenceIndices(tag)
                    .Select(index => fastaDb.GetProteinName(index))
                    .Distinct()
                    .ToArray();

                foreach (var protName in matchedProteins)
                {
                    var seqStr = fastaDb.GetProteinSequence(protName);
                    var oriSeq = new Sequence(seqStr, standardAminoAcidSet);

                    // Visit every non-overlapping occurrence of the tag in the protein sequence.
                    var startIdx = 0;
                    while (true)
                    {
                        // Ordinal search: residue strings need no culture-aware comparison.
                        var idx = seqStr.IndexOf(tag, startIdx, StringComparison.Ordinal);
                        if (idx < 0)
                        {
                            break; // no (further) occurrence
                        }

                        // j is used as the flank start (N-terminal tags) or is subtracted from
                        // the flank end (other tags); two values are tried in the first case, one in the second.
                        var nClv = nTerminal ? 2 : 1;
                        for (var j = 0; j < nClv; j++)
                        {
                            var flankComposition = nTerminal
                                ? oriSeq.GetComposition(j, idx)
                                : oriSeq.GetComposition(idx + tag.Length, oriSeq.Count - j);

                            var massDiff = detectedFlankingMass - flankComposition.Mass;
                            if (massDiff > -500 && massDiff < 2000)
                            {
                                writer.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}", line, protName, detectedFlankingMass, flankComposition.Mass, massDiff);
                            }

                            if (massDiff > 2000)
                            {
                                break;
                            }
                        }

                        startIdx = idx + tag.Length;
                    }
                }
            }

            Console.WriteLine(@"{0} seq tags are processed", nReadSeqTag);
        }

        Console.WriteLine(@"Done");
    }
}
/// <summary>
/// Matches every sequence tag (length six or more) of a .seqtag file against a FASTA database,
/// records which scans hit which protein, and prints the number of database entries, the number
/// of proteins with at least one hit, and the number of tag/protein hits per database entry.
/// The test is ignored when the FASTA file or the tag file is missing.
/// </summary>
public void CountMatchedScansPerProtein()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    const int minTagLength = 6;

    var proteinToScan = new Dictionary<string, HashSet<int>>();

    const string fastaFilePath = @"D:\Research\Data\CommonContaminants\H_sapiens_Uniprot_SPROT_2013-05-01_withContam.fasta";
    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    var fastaDb = new FastaDatabase(fastaFilePath);
    var searchableDb = new SearchableDatabase(fastaDb);
    Console.WriteLine(@"Sequence length: {0}", fastaDb.GetSequence().Length);

    const string tagFilePath = @"D:\MassSpecFiles\co_culture\23A_pellet_TD_3Feb14_Bane_PL011402.seqtag";
    if (!File.Exists(tagFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFilePath);
    }

    var isHeader = true;
    var numMatchedPairs = 0;
    foreach (var line in File.ReadLines(tagFilePath))
    {
        if (isHeader)
        {
            isHeader = false;
            continue;
        }

        // Only the first two columns (scan, tag) are read, so rows with extra trailing columns
        // are accepted. Requiring exactly three columns dropped every row of such a file.
        var token = line.Split('\t');
        if (token.Length < 3)
        {
            continue;
        }

        var scan = Convert.ToInt32(token[0]);
        var tag = token[1];
        if (tag.Length < minTagLength)
        {
            continue;
        }

        var matchedProteins = searchableDb.FindAllMatchedSequenceIndices(tag)
            .Select(index => fastaDb.GetProteinName(index));

        foreach (var matchedProtein in matchedProteins)
        {
            // Counts every tag/protein hit, including repeated hits from the same scan.
            ++numMatchedPairs;

            HashSet<int> matchedScans;
            if (!proteinToScan.TryGetValue(matchedProtein, out matchedScans))
            {
                matchedScans = new HashSet<int>();
                proteinToScan.Add(matchedProtein, matchedScans);
            }

            matchedScans.Add(scan);
        }
    }

    var numMatchedProteins = proteinToScan.Keys.Count;
    var numAllProteins = fastaDb.GetNumEntries();

    Console.WriteLine("NumAllProteins: {0}", numAllProteins);
    Console.WriteLine("NumMatchedProteins: {0}", numMatchedProteins);
    Console.WriteLine("AvgMatchedScansPerProtein: {0}", numMatchedPairs / (float)numAllProteins);
}
/// <summary>
/// For each scan in a sequence-tag file, counts how many distinct database proteins contain at least
/// one of the scan's tags, prints a histogram of those counts, and reports how many of the scans
/// identified in the result file (QValue no larger than 0.01) have a tag matching their identified protein.
/// The test is skipped when any input file is missing.
/// </summary>
public void CountMatchedProteins()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    const int minTagLength = 3;

    var scanToProtein = new Dictionary<int, string>();

    // Scan -> whether any tag of that scan matched the identified protein.
    var idTag = new Dictionary<int, bool>();

    const string resultFilePath = @"H:\Research\ProMex\QC_Shew_Intact_26Sep14_Bane_C2Column3_IcTda.tsv";
    if (!File.Exists(resultFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, resultFilePath);
    }

    var parser = new TsvFileParser(resultFilePath);
    var scans = parser.GetData("Scan").Select(s => Convert.ToInt32(s)).ToArray();
    var proteinNames = parser.GetData("ProteinName").ToArray();
    var qValues = parser.GetData("QValue").Select(Convert.ToDouble).ToArray();
    for (var i = 0; i < qValues.Length; i++)
    {
        // Stops at the first row above the threshold.
        // NOTE(review): this presumes rows are sorted by ascending QValue - confirm.
        if (qValues[i] > 0.01)
        {
            break;
        }

        // Keep the first row for a scan; Dictionary.Add would throw on a repeated scan number.
        if (scanToProtein.ContainsKey(scans[i]))
        {
            continue;
        }

        scanToProtein.Add(scans[i], proteinNames[i]);
        idTag.Add(scans[i], false);
    }

    const string rawFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3.raw";
    if (!File.Exists(rawFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath);
    }

    // NOTE(review): the run is loaded but never read below; kept in case loading has side effects
    // that matter to other tests - confirm and remove if not.
    var run = PbfLcMsRun.GetLcMsRun(rawFilePath);

    const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";
    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    // const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.icsfldecoy.fasta";
    // const string fastaFilePath =
    //    @"D:\Research\Data\CommonContaminants\H_sapiens_Uniprot_SPROT_2013-05-01_withContam.fasta";

    var fastaDb = new FastaDatabase(fastaFilePath);
    var searchableDb = new SearchableDatabase(fastaDb);
    Console.WriteLine("Sequence length: {0}", fastaDb.GetSequence().Length);

    const string tagFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3.seqtag";
    if (!File.Exists(tagFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFilePath);
    }

    // Number of matched proteins -> number of scans with that many matched proteins.
    var hist = new Dictionary<int, int>();
    var scanSet = new HashSet<int>();
    HashSet<string> proteinSetForThisScan = null;
    var prevScan = -1;
    var totalNumMatches = 0L;

    // Stream the file instead of loading it whole; Skip(1) drops the header row.
    foreach (var line in File.ReadLines(tagFilePath).Skip(1))
    {
        var token = line.Split('\t');
        if (token.Length < 3)
        {
            continue;
        }

        var scan = Convert.ToInt32(token[0]);

        // TryGetValue leaves proteinId null when the scan was not identified.
        string proteinId;
        scanToProtein.TryGetValue(scan, out proteinId);

        if (scan != prevScan)
        {
            // A new scan starts: record the protein count of the scan just finished.
            if (proteinSetForThisScan != null)
            {
                IncrementHistogramBin(hist, proteinSetForThisScan.Count);
            }

            prevScan = scan;
            proteinSetForThisScan = new HashSet<string>();
        }

        scanSet.Add(scan);

        var tag = token[1];
        if (tag.Length < minTagLength)
        {
            continue;
        }

        if (proteinSetForThisScan == null)
        {
            continue;
        }

        var numMatchesForThisTag = 0;
        foreach (var matchedProtein in searchableDb.FindAllMatchedSequenceIndices(tag)
            .Select(index => fastaDb.GetProteinName(index)))
        {
            proteinSetForThisScan.Add(matchedProtein);
            ++numMatchesForThisTag;

            if (proteinId != null && matchedProtein.Equals(proteinId))
            {
                idTag[scan] = true;
            }
        }

        totalNumMatches += numMatchesForThisTag;
    }

    // Record the last scan, which has no following scan to trigger the update above.
    if (proteinSetForThisScan != null)
    {
        IncrementHistogramBin(hist, proteinSetForThisScan.Count);
    }

    Console.WriteLine("AvgNumMatches: {0}", totalNumMatches / (float)scanSet.Count);
    Console.WriteLine("Histogram:");
    foreach (var entry in hist.OrderBy(e => e.Key))
    {
        Console.WriteLine("{0}\t{1}", entry.Key, entry.Value);
    }

    Console.WriteLine("NumId: {0}", idTag.Count);
    Console.WriteLine("NumIdByTag: {0}", idTag.Select(e => e.Value).Count(v => v));
}

/// <summary>
/// Adds one occurrence to the histogram bin <paramref name="bin"/>, creating the bin if needed.
/// </summary>
private static void IncrementHistogramBin(Dictionary<int, int> histogram, int bin)
{
    int occurrences;
    histogram.TryGetValue(bin, out occurrences);
    histogram[bin] = occurrences + 1;
}
/// <summary>
/// Enumerates the no-enzyme sequences of length 30 to 250 in a FASTA database, prints timing for
/// each stage, and asserts the number of enumerated sequences.
/// </summary>
/// <param name="size">Nominal database size in MB as given in the test case; not used by the test body.</param>
/// <param name="dbFile">Database path in which "TEST_FOLDER" is replaced by the default test file folder.</param>
/// <param name="expected">Expected number of enumerated sequences.</param>
[TestCase(3, @"TEST_FOLDER\MSPathFinderT\ID_005133_8491EFA2.fasta", 323719193)]  // 3MB
//[TestCase(6, @"TEST_FOLDER\MSPathFinderT\ID_004530_B63BD900.fasta", 595227563)]  // 6MB
//[TestCase(15, @"TEST_FOLDER\MSPathFinderT\ID_004208_295531A4.fasta", 1882434687)]  // 15MB
public void TestSequenceEnumeration(double size, string dbFile, int expected)
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName, dbFile);

    var fastaFile = Utils.GetTestFile(methodName, dbFile.Replace("TEST_FOLDER", Utils.DEFAULT_TEST_FILE_FOLDER));

    var sw = new System.Diagnostics.Stopwatch();
    sw.Start();

    var db = new FastaDatabase(fastaFile.FullName);
    var indexedDb = new IndexedDatabase(db);

    var timeDB = sw.Elapsed;
    Console.WriteLine("Read DB in " + timeDB.TotalSeconds + " Seconds");

    var estimatedAnnOff = indexedDb.EstimateTotalPeptides(0, 30, 250);
    var timeEstimate = sw.Elapsed;
    Console.WriteLine("Read Estimate in " + (timeEstimate - timeDB).TotalSeconds + " Seconds");
    Console.WriteLine("Estimated results: " + estimatedAnnOff);

    var annotationsAndOffsets = indexedDb.AnnotationsAndOffsetsNoEnzymeParallel(30, 250);
    var timeGetAnn = sw.Elapsed;
    Console.WriteLine("Read Annotations in " + (timeGetAnn - timeEstimate).TotalSeconds + " Seconds");

    // Historical timings of the counting step:
    //   original: 110, 109 (total) seconds
    //   parallelizing AnnotationsAndOffsetsNoEnzyme: 86 seconds
    //   parallelizing AnnotationsAndOffsetsNoEnzyme, yield returns: 79.6, 94, 60, 60 seconds
    //   3MB, Parallel2: 107

    // LongCount rather than Count: Count() throws OverflowException beyond int.MaxValue items,
    // and the 15MB database already yields about 1.88 billion sequences.
    var numSequences = annotationsAndOffsets.LongCount();
    var timeParForEach = sw.Elapsed;
    Console.WriteLine("Parallel ForEach in " + (timeParForEach - timeGetAnn).TotalSeconds + " Seconds");

    Console.WriteLine("NumPeptides: {0}", numSequences);
    sw.Stop();

    Console.WriteLine(@"{0:f4} sec", sw.Elapsed.TotalSeconds);
    //Assert.AreEqual(188961836, numSequences);
    Assert.AreEqual(expected, numSequences);
}
/// <summary>
/// Creates a results writer that keeps references to the supplied database, run and options.
/// </summary>
/// <param name="db">FASTA database stored in the <c>database</c> member.</param>
/// <param name="run">LC-MS run stored in the <c>lcmsRun</c> member.</param>
/// <param name="options">Search parameters stored in the <c>options</c> member.</param>
public MzidResultsWriter(FastaDatabase db, LcMsRun run, MsPfParameters options)
{
    database = db;
    lcmsRun = run;

    // "this." is required here because the parameter shadows the member of the same name.
    this.options = options;
}
/// <summary>
/// Enumerates sequences of length 21 to 300 with at most one N-terminal and zero C-terminal
/// cleavages from a FASTA database, prints timing for each stage, and asserts the number of
/// enumerated sequences.
/// </summary>
/// <param name="size">Nominal database size in MB as given in the test case; not used by the test body.</param>
/// <param name="dbFile">Database path in which "TEST_FOLDER" is replaced by the default test file folder.</param>
/// <param name="expected">Expected number of enumerated sequences.</param>
[TestCase(15, @"TEST_FOLDER\MSPathFinderT\ID_004208_295531A4.fasta", 14862126)]  // 15MB
public void TestSequenceEnumerationNCTerm(double size, string dbFile, int expected)
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName, dbFile);

    var fastaFile = Utils.GetTestFile(methodName, dbFile.Replace("TEST_FOLDER", Utils.DEFAULT_TEST_FILE_FOLDER));

    var sw = new System.Diagnostics.Stopwatch();
    sw.Start();

    const int numNTermCleavages = 1;
    const int numCTermCleavages = 0;
    var db = new FastaDatabase(fastaFile.FullName);
    var indexedDb = new IndexedDatabase(db);

    var timeDB = sw.Elapsed;
    Console.WriteLine("Read DB in " + timeDB.TotalSeconds + " Seconds");

    var estimatedAnnOff = indexedDb.EstimateTotalPeptides(1, 21, 300, numNTermCleavages, numCTermCleavages);
    var timeEstimate = sw.Elapsed;
    Console.WriteLine("Read Estimate in " + (timeEstimate - timeDB).TotalSeconds + " Seconds");
    Console.WriteLine("Estimated results: " + estimatedAnnOff);

    var annotationsAndOffsets = indexedDb.SequenceAnnotationsAndOffsetsWithNtermOrCtermCleavageNoLargerThan(
        21, 300, numNTermCleavages, numCTermCleavages);
    var timeGetAnn = sw.Elapsed;
    Console.WriteLine("Read Annotations in " + (timeGetAnn - timeEstimate).TotalSeconds + " Seconds");

    // LongCount rather than Count: the result is kept as a long, and Count() throws
    // OverflowException once the enumeration exceeds int.MaxValue items.
    var numSequences = annotationsAndOffsets.LongCount();
    var timeParForEach = sw.Elapsed;
    Console.WriteLine("Parallel ForEach in " + (timeParForEach - timeGetAnn).TotalSeconds + " Seconds");

    Console.WriteLine("NumPeptides: {0}", numSequences);
    sw.Stop();

    Console.WriteLine(@"{0:f4} sec", sw.Elapsed.TotalSeconds);
    //Assert.AreEqual(188961836, numSequences);
    Assert.AreEqual(expected, numSequences);
}
/// <summary>
/// Prints and counts sequence annotations obtained by removing leading residues from the intact
/// sequence annotations of a FASTA database, split into three tallies ("both", "N-term only",
/// "C-term only") according to the N- and C-terminal cleavage limits.
/// The test is skipped when the database file is missing.
/// </summary>
public void TestCountingProteoformsCloseToNTermOrCTerm()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const int minSequenceLength = 21;   // 21
    const int maxSequenceLength = 300;  // 300
    const int maxNumNTermCleavages = 1;
    const int maxNumCTermCleavages = 0;

    var stopwatch = System.Diagnostics.Stopwatch.StartNew();

    //const string dbFile = @"C:\cygwin\home\kims336\Data\TopDownQCShew\database\ID_002216_235ACCEA.fasta";
    const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\Short.fasta";
    if (!File.Exists(dbFile))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, dbFile);
    }

    var indexedDb = new IndexedDatabase(new FastaDatabase(dbFile));

    // Rewrites an annotation after dropping "cleavages" leading residues: the character just
    // before the remaining text becomes the prefix in front of the '.' separator.
    // The "Length - 4" below suggests a "X.SEQUENCE.Y" layout - TODO confirm.
    Func<string, int, string> cleave = (text, cleavages) =>
        cleavages == 0
            ? text
            : string.Format("{0}.{1}", text[1 + cleavages], text.Substring(2 + cleavages));

    long both = 0;
    long nTermOnly = 0;
    long cTermOnly = 0;

    // numCTermCleavages <= maxNumCTermCleavages
    foreach (var entry in indexedDb.IntactSequenceAnnotationsAndOffsets(
        minSequenceLength, int.MaxValue, maxNumCTermCleavages))
    {
        var annotation = entry.Annotation;
        var length = annotation.Length - 4;

        // Keep removing N-terminal residues while the remainder is still long enough.
        for (var cleavages = 0; length - cleavages >= minSequenceLength; cleavages++)
        {
            if (length - cleavages > maxSequenceLength)
            {
                continue;
            }

            if (cleavages > maxNumNTermCleavages)
            {
                cTermOnly++;
            }
            else
            {
                both++;
            }

            Console.WriteLine(cleave(annotation, cleavages));
        }
    }

    // numCTermCleavages > maxNumCTermCleavages
    foreach (var entry in indexedDb.IntactSequenceAnnotationsAndOffsetsWithCTermCleavagesLargerThan(
        minSequenceLength, int.MaxValue, maxNumCTermCleavages))
    {
        var annotation = entry.Annotation;
        var length = annotation.Length - 4;

        for (var cleavages = 0; cleavages <= maxNumNTermCleavages; cleavages++)
        {
            var remaining = length - cleavages;
            if (remaining < minSequenceLength || remaining > maxSequenceLength)
            {
                continue;
            }

            nTermOnly++;
            Console.WriteLine(cleave(annotation, cleavages));
        }
    }

    Console.WriteLine("Both: {0}", both);
    Console.WriteLine("N-term only: {0}", nTermOnly);
    Console.WriteLine("C-term only: {0}", cTermOnly);
    Console.WriteLine("All: {0}", both + nTermOnly + cTermOnly);

    stopwatch.Stop();
    Console.WriteLine(@"{0:f4} sec", stopwatch.Elapsed.TotalSeconds);
}
/// <summary>
/// Finds sequence tags in a single spectrum, looks them up in a FASTA database and prints, per
/// matched protein, each matched tag together with its flanking masses and their differences
/// from the masses of the protein's flanking subsequences.
/// The test is skipped when an input file is missing or the scan yields no product spectrum.
/// </summary>
public void TestGetProteinsWithTagMatchingSingleSpec()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string dataSet = @"H:\Research\Lewy\raw\Lewy_intact_07";
    const int minNumTagMatches = 1;
    const int scanNum = 2;  // previously 5158
    var aminoAcidSet = AminoAcidSet.GetStandardAminoAcidSet();

    // Derive the raw file from the dataset base path, as the sibling tests do. The path used to be
    // the empty string, which made File.Exists fail and the test skip unconditionally.
    var rawFilePath = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".raw");
    const string fastaFilePath = @"H:\Research\Lewy\ID_004858_0EE8CF61.fasta";

    if (!File.Exists(rawFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath);
    }

    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    var fastaDb = new FastaDatabase(fastaFilePath);
    var searchableDb = new SearchableDatabase(fastaDb);

    var run = PbfLcMsRun.GetLcMsRun(rawFilePath);
    var spec = run.GetSpectrum(scanNum) as ProductSpectrum;
    if (spec == null)
    {
        // The "as" cast yields null when the scan is missing or is not a product spectrum.
        Assert.Ignore(@"Skipping test {0} since scan {1} did not yield a product spectrum", methodName, scanNum);
    }

    var tagFinder = new SequenceTagFinder(spec, new Tolerance(5));
    var tags = tagFinder.GetAllSequenceTagString();

    var proteinsToTags = new Dictionary<string, IList<MatchedTag>>();
    foreach (var tag in tags)
    {
        var matchedIndices = searchableDb.FindAllMatchedSequenceIndices(tag.Sequence).ToArray();
        foreach (var index in matchedIndices)
        {
            var protein = fastaDb.GetProteinName(index);

            // Zero-based: StartIndex is used below as a Substring offset into the protein sequence.
            var startIndex = fastaDb.GetZeroBasedPositionInProtein(index);
            var matchedTag = new MatchedTag(tag, startIndex, 0.0);

            IList<MatchedTag> existingTags;
            if (proteinsToTags.TryGetValue(protein, out existingTags))
            {
                existingTags.Add(matchedTag);
            }
            else
            {
                proteinsToTags.Add(protein, new List<MatchedTag> { matchedTag });
            }
        }
    }

    // Proteins are visited from most to fewest matched tags, so the first one below the
    // threshold ends the report.
    foreach (var entry in proteinsToTags.OrderByDescending(e => e.Value.Count))
    {
        if (entry.Value.Count < minNumTagMatches)
        {
            break;
        }

        var proteinName = entry.Key;
        var proteinSequence = fastaDb.GetProteinSequence(proteinName);
        var protein = new Sequence(proteinSequence, aminoAcidSet);
        Console.WriteLine(proteinName + "\t" + entry.Value.Count);

        foreach (var matchedTag in entry.Value)
        {
            var seq = proteinSequence.Substring(matchedTag.StartIndex, matchedTag.EndIndex - matchedTag.StartIndex);
            var nTermMass = protein.GetMass(0, matchedTag.StartIndex);
            var cTermMass = protein.GetMass(matchedTag.EndIndex, protein.Count);
            Console.WriteLine("\t{0} ({1})\t{2}\t{3} ({4})\t{5}\t{6}\t{7}",
                matchedTag.NTermFlankingMass, (matchedTag.NTermFlankingMass - nTermMass),
                seq,
                matchedTag.CTermFlankingMass, (matchedTag.CTermFlankingMass - cTermMass),
                matchedTag.StartIndex,
                matchedTag.IsNTermFlankingMassReliable, matchedTag.IsCTermFlankingMassReliable);
        }
    }
}
/// <summary>
/// Loads a FASTA database and prints its number of entries.
/// The test is skipped when the database file is missing.
/// </summary>
public void TestFasta()
{
    var methodName = MethodBase.GetCurrentMethod().Name;

    const string fastaFilePath = @"\\protoapps\UserData\Jungkap\Lewy\db\ID_005140_7A170668.fasta";

    // Skip rather than fail when the network share is unreachable, like the other tests in this file.
    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    var db = new FastaDatabase(fastaFilePath);
    Console.WriteLine(db.GetNumEntries());
}
/// <summary>
/// Builds an 11-bin histogram of the difference between the rounded, rescaled mass and the
/// nominal mass of no-enzyme sequences of length 300 to 400, then prints counts and percentages.
/// Enumeration stops once the time limit is exceeded; the limit is checked every 100 sequences.
/// </summary>
public void TestNominalMassErrors()
{
    const int maxRuntimeSeconds = 60;
    const int minLength = 300;
    const int maxLength = 400;

    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    var timer = new System.Diagnostics.Stopwatch();

    var fastaFile = Utils.GetTestFile(
        methodName,
        Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"MSPathFinderT\ID_003962_71E1A1D4.fasta"));

    var db = new FastaDatabase(fastaFile.FullName);
    db.Read();
    var indexedDb = new IndexedDatabase(db);

    // Only the enumeration is timed, not the database load.
    timer.Start();

    var bins = new long[11];
    var centerBin = bins.Length / 2;
    var lastBin = bins.Length - 1;
    var aaSet = new AminoAcidSet();
    long sequenceCount = 0;

    foreach (var item in indexedDb.AnnotationsAndOffsetsNoEnzyme(minLength, maxLength))
    {
        sequenceCount++;

        // Drop the two leading and two trailing annotation characters.
        var text = item.Annotation;
        var composition = aaSet.GetComposition(text.Substring(2, text.Length - 4));

        var error = (int)Math.Round(composition.Mass * Constants.RescalingConstant) - composition.NominalMass;

        // Errors outside the histogram range are clamped into the first or last bin.
        var bin = Math.Min(Math.Max(error + centerBin, 0), lastBin);
        bins[bin]++;

        var timeLimitReached = sequenceCount % 100 == 0 && timer.Elapsed.TotalSeconds > maxRuntimeSeconds;
        if (timeLimitReached)
        {
            break;
        }
    }

    Console.WriteLine("Sequence count: {0:N0}", sequenceCount);
    Console.WriteLine("{0,10} {1,10} {2,10}", "Bin ", "Count", "Fraction");

    for (var bin = 0; bin < bins.Length; bin++)
    {
        var percentage = bins[bin] / (double)sequenceCount * 100;
        Console.WriteLine("{0,10:F1} {1,10:N0} {2,10:F1}%", bin - centerBin, bins[bin], percentage);
    }

    timer.Stop();
    Console.WriteLine(@"Elapsed Time: {0:F1} sec", timer.Elapsed.TotalSeconds);
}
/// <summary>
/// Times the extraction of full precursor-ion chromatograms for up to 100,000 tryptic peptide
/// annotations enumerated from a FASTA database.
/// The test is skipped when either input file is missing.
/// </summary>
public void TestRunningTimeChromGen()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const int maxNumPeptides = 100000;

    const string rafFilePath = @"C:\cygwin\home\kims336\Data\QCShewQE\QC_Shew_13_04_A_17Feb14_Samwise_13-07-28.raf";
    if (!File.Exists(rafFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rafFilePath);
    }

    var rafRun = new PbfLcMsRun(rafFilePath);
    var tolerance = new Tolerance(10);

    const string dbFile = @"D:\Research\Data\CommonContaminants\H_sapiens_Uniprot_SPROT_2013-05-01_withContam.fasta";
    if (!File.Exists(dbFile))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, dbFile);
    }

    var indexedDb = new IndexedDatabase(new FastaDatabase(dbFile));
    var aaSet = new AminoAcidSet(Modification.Carbamidomethylation);

    var timer = System.Diagnostics.Stopwatch.StartNew();

    var processed = 0;
    foreach (var peptide in indexedDb.AnnotationsAndOffsets(6, 30, 2, 2, Enzyme.Trypsin))
    {
        // Drop the two leading and two trailing annotation characters.
        var annotation = peptide.Annotation;
        var sequenceText = annotation.Substring(2, annotation.Length - 4);

        var composition = new Sequence(sequenceText, aaSet).Composition + Composition.H2O;

        // The second Ion argument is presumably the charge state - TODO confirm.
        var precursorIon = new Ion(composition, 2);
        var mz = precursorIon.GetMonoIsotopicMz();

        // The chromatogram is discarded; only the time spent producing it matters here.
        rafRun.GetFullPrecursorIonExtractedIonChromatogram(mz, tolerance);

        if (++processed == maxNumPeptides)
        {
            break;
        }
    }

    timer.Stop();
    Console.WriteLine(@"{0:f4} sec", timer.Elapsed.TotalSeconds);
}
/// <summary>
/// For every MS2 scan of a run, reports how many scans have sequence tags, how many have a tag
/// found in the FASTA database, and how many of the latter are absent from the identifications
/// with a q-value no larger than 0.01.
/// The test is skipped when any input file is missing.
/// </summary>
public void TestTagMatching()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string dataSet = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3";
    const int minTagLength = 8;
    const double qValueThreshold = 0.01;

    // Sequence tags
    var tagFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".seqtag");
    if (!File.Exists(tagFileName))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFileName);
    }

    var tagParser = new SequenceTagParser(tagFileName, minTagLength);

    // Raw file
    var rawFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".raw");
    if (!File.Exists(rawFileName))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFileName);
    }

    var run = PbfLcMsRun.GetLcMsRun(rawFileName);

    // Identification results
    const string resultFilePath = @"H:\Research\QCShew_TopDown\Production\M1_V092\QC_Shew_Intact_26Sep14_Bane_C2Column3_IcTda.tsv";
    if (!File.Exists(resultFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, resultFilePath);
    }

    var resultParser = new MsPathFinderParser(resultFilePath);
    var identifications = resultParser.GetIdWithQValuesNoLargerThan(qValueThreshold);

    // isIdentified[scan] is true for every scan that has an identification.
    var isIdentified = new bool[run.MaxLcScan + 1];
    foreach (var identification in identifications)
    {
        isIdentified[identification.Scan] = true;
    }

    // FASTA database
    const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";
    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    // const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.icsfldecoy.fasta";
    var searchableDb = new SearchableDatabase(new FastaDatabase(fastaFilePath));

    var totalMs2 = 0;
    var withTags = 0;
    var withMatchingTag = 0;
    var withMatchingTagButNoId = 0;

    foreach (var scanNum in run.GetScanNumbers(2))
    {
        totalMs2++;

        var tags = tagParser.GetSequenceTags(scanNum);
        if (tags == null)
        {
            continue;
        }

        withTags++;

        // One database hit is enough, so stop at the first matching tag.
        var foundInDatabase = false;
        foreach (var tag in tags)
        {
            if (searchableDb.Search(tag.Sequence) >= 0)
            {
                foundInDatabase = true;
                break;
            }
        }

        if (!foundInDatabase)
        {
            continue;
        }

        withMatchingTag++;
        if (!isIdentified[scanNum])
        {
            withMatchingTagButNoId++;
        }
    }

    var fractionWithTags = withTags / (float)totalMs2;
    var fractionWithMatchingTag = withMatchingTag / (float)totalMs2;
    var fractionWithMatchingTagButNoId = withMatchingTagButNoId / (float)totalMs2;

    Console.WriteLine("Tag length: {0}", minTagLength);
    Console.WriteLine("NumMs2Spectra: {0}", totalMs2);
    Console.WriteLine("NumMs2SpectraWithTags: {0} ({1})", withTags, fractionWithTags);
    Console.WriteLine("NumMs2SpectraWithMatchingTags: {0} ({1})", withMatchingTag, fractionWithMatchingTag);
    Console.WriteLine("NumMs2SpectraWithMatchingTagsWithNoId: {0} ({1})", withMatchingTagButNoId, fractionWithMatchingTagButNoId);
}
/// <summary>
/// For one MS1 feature (a mass and a scan range), collects the sequence tags of the matching MS2
/// scans, maps them onto database proteins and prints the matched tags of the protein with the
/// most tag matches, before and after merging them in a <c>MatchedTagSet</c>.
/// The test is skipped when any input file is missing.
/// </summary>
public void TestFeatureId()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    // Base path without extension; the actual input files are derived from it below.
    const string dataSet = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3";

    // Feature: 5236-5286 6-12 8480.3681 5
    const int minScanNum = 5236;
    const int maxScanNum = 5286;
    const double featureMass = 8480.3681;

    // Other features tried previously:
    //const int minScanNum = 7251;
    //const int maxScanNum = 7326;
    //const double featureMass = 32347.18;

    //const int minScanNum = 4451;
    //const int maxScanNum = 4541;
    //const double featureMass = 31267.95;

    var tolerance = new Tolerance(10);
    var relaxedTolerance = new Tolerance(20);

    const int minTagLength = 5;
    const int minMergedTagLength = 7;
    const int minNumTagMatches = 1;

    // Check the files that are actually opened. Testing the extensionless base path would skip
    // the test even when every input file is present.
    var rawFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".raw");
    if (!File.Exists(rawFileName))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFileName);
    }

    var featureFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".ms1ft");
    if (!File.Exists(featureFileName))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, featureFileName);
    }

    const string tagFileName = dataSet + ".seqtag";
    if (!File.Exists(tagFileName))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFileName);
    }

    const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";
    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    var run = PbfLcMsRun.GetLcMsRun(rawFileName);
    var aminoAcidSet = AminoAcidSet.GetStandardAminoAcidSet();

    var filter = new Ms1FtFilter(run, tolerance, featureFileName);

    // NOTE(review): both scan bounds are exclusive here - confirm that this is intended.
    var ms2ScanNums = filter.GetMatchingMs2ScanNums(featureMass)
        .Where(scanNum => scanNum > minScanNum && scanNum < maxScanNum)
        .ToArray();

    var fastaDb = new FastaDatabase(fastaFilePath);
    var searchableDb = new SearchableDatabase(fastaDb);

    var tagParser = new SequenceTagParser(tagFileName, minTagLength);

    var proteinsToTags = new Dictionary<string, IList<MatchedTag>>();
    foreach (var ms2ScanNum in ms2ScanNums)
    {
        var tags = tagParser.GetSequenceTags(ms2ScanNum);
        if (tags == null)
        {
            // No tags for this scan; TestTagMatching guards against the same null result.
            continue;
        }

        foreach (var tag in tags)
        {
            var matchedIndices = searchableDb.FindAllMatchedSequenceIndices(tag.Sequence).ToArray();
            foreach (var index in matchedIndices)
            {
                var protein = fastaDb.GetProteinName(index);
                var startIndex = fastaDb.GetZeroBasedPositionInProtein(index);
                var matchedTag = new MatchedTag(tag, startIndex, featureMass);

                IList<MatchedTag> existingTags;
                if (proteinsToTags.TryGetValue(protein, out existingTags))
                {
                    existingTags.Add(matchedTag);
                }
                else
                {
                    proteinsToTags.Add(protein, new List<MatchedTag> { matchedTag });
                }
            }
        }
    }

    foreach (var entry in proteinsToTags.OrderByDescending(e => e.Value.Count))
    {
        if (entry.Value.Count < minNumTagMatches)
        {
            break;
        }

        var proteinName = entry.Key;
        var proteinSequence = fastaDb.GetProteinSequence(proteinName);
        var protein = new Sequence(proteinSequence, aminoAcidSet);
        Console.WriteLine(proteinName + "\t" + entry.Value.Count);

        var matchedTagSet = new MatchedTagSet(proteinSequence, aminoAcidSet, tolerance, relaxedTolerance);

        Console.WriteLine("********** Before merging");
        foreach (var matchedTag in entry.Value)
        {
            var seq = proteinSequence.Substring(matchedTag.StartIndex, matchedTag.EndIndex - matchedTag.StartIndex);
            var nTermMass = protein.GetMass(0, matchedTag.StartIndex);
            var cTermMass = protein.GetMass(matchedTag.EndIndex, protein.Count);
            Console.WriteLine("\t{0}\t{1}\t{2}\t{3}\t{4}\t{5}",
                (matchedTag.NTermFlankingMass - nTermMass), seq, (matchedTag.CTermFlankingMass - cTermMass),
                matchedTag.StartIndex,
                matchedTag.IsNTermFlankingMassReliable, matchedTag.IsCTermFlankingMassReliable);

            matchedTagSet.Add(matchedTag);
        }

        Console.WriteLine("********** After merging");
        foreach (var matchedTag in matchedTagSet.Tags)
        {
            if (matchedTag.Length < minMergedTagLength)
            {
                continue;
            }

            var seq = proteinSequence.Substring(matchedTag.StartIndex, matchedTag.EndIndex - matchedTag.StartIndex);
            var nTermMass = protein.GetMass(0, matchedTag.StartIndex);
            var cTermMass = protein.GetMass(matchedTag.EndIndex, protein.Count);
            Console.WriteLine("\t{0}\t{1}\t{2}\t{3}\t{4}\t{5}",
                (matchedTag.NTermFlankingMass - nTermMass), seq, (matchedTag.CTermFlankingMass - cTermMass),
                matchedTag.StartIndex,
                matchedTag.IsNTermFlankingMassReliable, matchedTag.IsCTermFlankingMassReliable);
        }

        // Only the protein with the most tag matches is reported.
        break;
    }
}