/// <summary>
/// Indexes the sequence of the given database: builds its suffix array with
/// <c>SAIS.sufsort</c>, computes the LCP between each pair of neighboring
/// suffix-array entries, and hands those values to <c>InitializeLcps</c>
/// together with the freshly allocated left/right LCP arrays.
/// </summary>
/// <param name="fastaDatabase">Database whose sequence is indexed</param>
public SearchableDatabase(FastaDatabase fastaDatabase)
{
    FastaDatabase = fastaDatabase;
    _sequence = fastaDatabase.GetSequence();

    // One suffix-array slot per sequence element
    _suffixArray = new int[_sequence.Length];
    SAIS.sufsort(_sequence, _suffixArray, _sequence.Length);

    var suffixCount = _suffixArray.Length;

    // adjacentLcps[rank] holds the LCP of the suffixes at ranks (rank - 1) and rank;
    // rank 0 has no predecessor, so its entry is 0
    var adjacentLcps = new byte[suffixCount];
    adjacentLcps[0] = 0;
    for (var rank = 1; rank < suffixCount; rank++)
    {
        adjacentLcps[rank] = IndexedDatabase.GetLcp(_sequence, _suffixArray[rank - 1], _suffixArray[rank]);
    }

    _leftLcps = new byte[suffixCount];
    _rightLcps = new byte[suffixCount];
    InitializeLcps(adjacentLcps, _leftLcps, _rightLcps, 0, suffixCount - 1);
}
/// <summary>
/// Constructor: builds the suffix array for the database sequence and
/// initializes the left/right LCP arrays from the LCPs of neighboring suffixes
/// </summary>
/// <param name="fastaDatabase">Database providing the sequence to index</param>
public SearchableDatabase(FastaDatabase fastaDatabase)
{
    FastaDatabase = fastaDatabase;
    _sequence = fastaDatabase.GetSequence();

    // Suffix array over the whole sequence
    _suffixArray = new int[_sequence.Length];
    SAIS.sufsort(_sequence, _suffixArray, _sequence.Length);

    // LCP between each suffix-array entry and the entry just before it
    var lcpWithPrevious = new byte[_suffixArray.Length];
    lcpWithPrevious[0] = 0; // the first entry has nothing before it

    var position = 1;
    while (position < _suffixArray.Length)
    {
        var previousSuffix = _suffixArray[position - 1];
        var currentSuffix = _suffixArray[position];
        lcpWithPrevious[position] = IndexedDatabase.GetLcp(_sequence, previousSuffix, currentSuffix);
        position++;
    }

    _leftLcps = new byte[_suffixArray.Length];
    _rightLcps = new byte[_suffixArray.Length];

    // The last two arguments are the first and last suffix-array indices
    var lastIndex = _suffixArray.Length - 1;
    InitializeLcps(lcpWithPrevious, _leftLcps, _rightLcps, 0, lastIndex);
}
/// <summary>
/// Computes one LCP byte per sequence position (all positions except the last
/// element of the sequence) and writes the table to <c>_pLcpFilePath</c>,
/// followed by the database's file format id and last-write-time hash.
/// </summary>
/// <remarks>
/// Any existing file at <c>_pLcpFilePath</c> is deleted up front and the output
/// is opened with <see cref="FileMode.Create"/>, so the file never keeps
/// trailing bytes from an earlier, longer version.
/// </remarks>
private void CreatePermutedLongestCommonPrefixFile()
{
    if (File.Exists(_pLcpFilePath))
    {
        File.Delete(_pLcpFilePath);
    }

    var sequence = FastaDatabase.GetSequence();

    // The suffix sort covers every element of the sequence except the last one
    var suffixArray = new int[sequence.Length - 1];
    SAIS.sufsort(sequence, suffixArray, sequence.Length - 1);

    // The first suffix in sorted order is compared against the position of the excluded last element
    var prevIndex = sequence.Length - 1;

    // pLcp is indexed by sequence position, not by suffix-array rank
    var pLcp = new byte[suffixArray.Length];

    // Data dependency: cannot run in parallel
    foreach (var index in suffixArray)
    {
        pLcp[index] = GetLcp(sequence, prevIndex, index);
        prevIndex = index;
    }

    // FileMode.Create truncates if the file reappeared while the suffix array was being built;
    // FileMode.OpenOrCreate would overwrite in place and could leave stale bytes at the end
    using (var fs = new FileStream(_pLcpFilePath, FileMode.Create, FileAccess.Write))
    {
        // Write the whole table in one call instead of one WriteByte call per element
        fs.Write(pLcp, 0, pLcp.Length);

        // Trailer: format id and last-write-time hash of the source database
        fs.Write(BitConverter.GetBytes(FastaDatabase.FileFormatId), 0, sizeof(int));
        fs.Write(BitConverter.GetBytes(FastaDatabase.GetLastWriteTimeHash()), 0, sizeof(int));
    }
}
/// <summary>
/// Looks up every sequence tag (length >= 6) from a tag file in a searchable
/// database, collects the scans matched to each protein, and prints summary counts.
/// Calls Assert.Ignore when the FASTA file or the tag file is not found.
/// </summary>
public void CountMatchedScansPerProtein()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const int minTagLength = 6;

    const string fastaFilePath = @"D:\Research\Data\CommonContaminants\H_sapiens_Uniprot_SPROT_2013-05-01_withContam.fasta";
    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    var fastaDb = new FastaDatabase(fastaFilePath);
    var searchableDb = new SearchableDatabase(fastaDb);
    Console.WriteLine(@"Sequence length: {0}", fastaDb.GetSequence().Length);

    // Alternative tag files:
    //   H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3_seqtag.tsv
    //   \\protoapps\UserData\Jungkap\Co_culture\23B_pellet_TD_3Feb14_Bane_PL011402.seqtag
    const string tagFilePath = @"D:\MassSpecFiles\co_culture\23A_pellet_TD_3Feb14_Bane_PL011402.seqtag";
    if (!File.Exists(tagFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFilePath);
    }

    // Protein name -> set of scans having at least one tag that matched the protein
    var scansByProtein = new Dictionary<string, HashSet<int>>();
    var matchedPairCount = 0;

    // The first line of the tag file is a header
    foreach (var row in File.ReadAllLines(tagFilePath).Skip(1))
    {
        var fields = row.Split('\t');
        if (fields.Length != 3)
        {
            continue;
        }

        var scan = Convert.ToInt32(fields[0]);
        var tag = fields[1];
        if (tag.Length < minTagLength)
        {
            continue;
        }

        var proteinsForTag = searchableDb.FindAllMatchedSequenceIndices(tag)
            .Select(index => fastaDb.GetProteinName(index));

        foreach (var proteinName in proteinsForTag)
        {
            matchedPairCount++;

            HashSet<int> scansForProtein;
            if (!scansByProtein.TryGetValue(proteinName, out scansForProtein))
            {
                scansForProtein = new HashSet<int>();
                scansByProtein.Add(proteinName, scansForProtein);
            }

            scansForProtein.Add(scan);
        }
    }

    var numAllProteins = fastaDb.GetNumEntries();
    var numMatchedProteins = scansByProtein.Count;

    Console.WriteLine("NumAllProteins: {0}", numAllProteins);
    Console.WriteLine("NumMatchedProteins: {0}", numMatchedProteins);

    // Note: the average is taken over all database entries, not only the matched ones
    Console.WriteLine("AvgMatchedScansPerProtein: {0}", matchedPairCount / (float)numAllProteins);
}
/// <summary>
/// For each scan in a sequence-tag file, counts the distinct proteins matched by
/// its tags (length >= 3) in a searchable database, prints a histogram of those
/// counts, and reports how many identified scans (QValue not above 0.01 in the
/// result file) had a tag matching the identified protein.
/// Calls Assert.Ignore when any of the input files is not found.
/// </summary>
public void CountMatchedProteins()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const int minTagLength = 3;

    const string resultFilePath = @"H:\Research\ProMex\QC_Shew_Intact_26Sep14_Bane_C2Column3_IcTda.tsv";
    if (!File.Exists(resultFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, resultFilePath);
    }

    var parser = new TsvFileParser(resultFilePath);
    var scans = parser.GetData("Scan").Select(s => Convert.ToInt32(s)).ToArray();
    var proteinNames = parser.GetData("ProteinName").ToArray();
    var qValues = parser.GetData("QValue").Select(Convert.ToDouble).ToArray();

    // Scan -> identified protein, and scan -> "was the protein also hit by a tag?"
    var scanToProtein = new Dictionary<int, string>();
    var idTag = new Dictionary<int, bool>();

    // Stop at the first row whose QValue exceeds the threshold
    for (var row = 0; row < qValues.Length; row++)
    {
        if (qValues[row] > 0.01)
        {
            break;
        }

        var identifiedScan = scans[row];
        scanToProtein.Add(identifiedScan, proteinNames[row]);
        idTag.Add(identifiedScan, false);
    }

    const string rawFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3.raw";
    if (!File.Exists(rawFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath);
    }

    // The returned run is not used below; the call itself is kept unchanged
    var run = PbfLcMsRun.GetLcMsRun(rawFilePath);

    // Alternative databases:
    //   H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.icsfldecoy.fasta
    //   D:\Research\Data\CommonContaminants\H_sapiens_Uniprot_SPROT_2013-05-01_withContam.fasta
    const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";
    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    var fastaDb = new FastaDatabase(fastaFilePath);
    var searchableDb = new SearchableDatabase(fastaDb);
    Console.WriteLine("Sequence length: {0}", fastaDb.GetSequence().Length);

    const string tagFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3.seqtag";
    if (!File.Exists(tagFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFilePath);
    }

    // Number of distinct matched proteins -> number of scans with that many
    var hist = new Dictionary<int, int>();
    var scanSet = new HashSet<int>();
    HashSet<string> proteinsOfCurrentScan = null;
    var prevScan = -1;
    var totalNumMatches = 0L;

    // The first line of the tag file is a header
    foreach (var row in File.ReadAllLines(tagFilePath).Skip(1))
    {
        var fields = row.Split('\t');
        if (fields.Length < 3)
        {
            continue;
        }

        var scan = Convert.ToInt32(fields[0]);

        // Stays null when the scan has no identification
        string proteinId;
        if (!scanToProtein.TryGetValue(scan, out proteinId))
        {
            proteinId = null;
        }

        if (scan != prevScan)
        {
            // A new scan starts: record the finished scan in the histogram
            if (proteinsOfCurrentScan != null)
            {
                var distinctCount = proteinsOfCurrentScan.Count;
                int occurrences;
                hist.TryGetValue(distinctCount, out occurrences);
                hist[distinctCount] = occurrences + 1;
            }

            prevScan = scan;
            proteinsOfCurrentScan = new HashSet<string>();
        }

        scanSet.Add(scan);

        var tag = fields[1];
        if (tag.Length < minTagLength || proteinsOfCurrentScan == null)
        {
            continue;
        }

        var proteinsForTag = searchableDb.FindAllMatchedSequenceIndices(tag)
            .Select(index => fastaDb.GetProteinName(index));

        var numMatchesForThisTag = 0;
        foreach (var matchedProtein in proteinsForTag)
        {
            proteinsOfCurrentScan.Add(matchedProtein);
            numMatchesForThisTag++;

            if (proteinId != null && matchedProtein.Equals(proteinId))
            {
                idTag[scan] = true;
            }
        }

        totalNumMatches += numMatchesForThisTag;
    }

    // Record the last scan, which the loop above never flushes
    if (proteinsOfCurrentScan != null)
    {
        var distinctCount = proteinsOfCurrentScan.Count;
        int occurrences;
        hist.TryGetValue(distinctCount, out occurrences);
        hist[distinctCount] = occurrences + 1;
    }

    Console.WriteLine("AvgNumMatches: {0}", totalNumMatches / (float)scanSet.Count);

    Console.WriteLine("Histogram:");
    foreach (var bin in hist.OrderBy(e => e.Key))
    {
        Console.WriteLine("{0}\t{1}", bin.Key, bin.Value);
    }

    Console.WriteLine("NumId: {0}", idTag.Count);
    Console.WriteLine("NumIdByTag: {0}", idTag.Count(e => e.Value));
}