/// <summary>
/// Fills in coverage, length, molecular weight, description and sequence for every
/// protein in <c>MyProteinList</c> by matching its locus against the
/// <c>SequenceIdentifier</c> of the supplied FASTA items.
/// </summary>
/// <remarks>
/// Trailing '\r' and '\n' characters are stripped from each protein's locus (and the
/// trimmed value is stored back) before the lookup. Proteins without a matching FASTA
/// item are reported on the console and receive zeroed values plus a placeholder
/// description; their <c>Sequence</c> is left untouched.
/// </remarks>
/// <param name="fastaItems">FASTA entries to match against. When several entries share
/// an identifier, the first one in the list is used.</param>
/// <exception cref="ArgumentNullException"><paramref name="fastaItems"/> is null.</exception>
public void CalculateProteinCoverage(List<FastaItem> fastaItems)
{
    if (fastaItems == null)
    {
        throw new ArgumentNullException("fastaItems");
    }

    // Index the FASTA entries once instead of scanning the whole list for every protein.
    // Only the first entry per identifier is kept, which is what List.Find returned.
    Dictionary<string, FastaItem> itemsByIdentifier = new Dictionary<string, FastaItem>(fastaItems.Count);
    foreach (FastaItem fastaItem in fastaItems)
    {
        if (fastaItem == null || fastaItem.SequenceIdentifier == null)
        {
            // Entries without an identifier can never match a locus.
            continue;
        }

        if (!itemsByIdentifier.ContainsKey(fastaItem.SequenceIdentifier))
        {
            itemsByIdentifier.Add(fastaItem.SequenceIdentifier, fastaItem);
        }
    }

    // The dictionary is only read from here on, so concurrent lookups are safe.
    Parallel.ForEach(MyProteinList, myProtein =>
    {
        // Locate the corresponding fasta item
        myProtein.Locus = myProtein.Locus.TrimEnd('\r', '\n');

        FastaItem item;
        if (itemsByIdentifier.TryGetValue(myProtein.Locus, out item))
        {
            myProtein.Coverage = Math.Round(item.Coverage(myProtein.DistinctPeptides), 4);
            myProtein.Length = item.Sequence.Length;
            myProtein.MolWt = Math.Round(item.MonoisotopicMass, 1);
            myProtein.Description = item.Description;
            myProtein.Sequence = item.Sequence;
        }
        else
        {
            // The locus is not present in the supplied database
            Console.WriteLine("ERROR:: + Protein must be found in the database: " + myProtein.Locus);
            myProtein.Coverage = 0;
            myProtein.Length = 0;
            myProtein.MolWt = 0;
            myProtein.Description = "Description not found in database";
        }
    });
}
/// <summary>
/// For every item currently in <c>myItems</c>, builds a companion item whose identifier
/// is the original identifier prefixed with "Scrambled_", whose sequence is the result of
/// <c>ScrambleString</c> applied to the original sequence, and whose description is empty.
/// All companions are appended to <c>myItems</c> after the originals.
/// </summary>
public void IncludeScrambled()
{
    List<FastaItem> scrambledCopies = new List<FastaItem>(myItems.Count);

    // The copies are collected in a separate list so myItems is not modified while it is
    // being enumerated.
    foreach (FastaItem original in myItems)
    {
        FastaItem copy = new FastaItem
        {
            SequenceIdentifier = "Scrambled_" + original.SequenceIdentifier,
            Sequence = ScrambleString(original.Sequence),
            Description = ""
        };

        scrambledCopies.Add(copy);
    }

    myItems.AddRange(scrambledCopies);
}
/// <summary>
/// Reads FASTA records from <paramref name="sr"/> and appends them to <c>myItems</c>.
/// A line starting with "&gt;" opens a new record; every other line is appended to the
/// sequence of the record currently being read. Lines that appear before the first
/// header are ignored, and input without any header adds nothing.
/// </summary>
/// <remarks>
/// The reader is always closed before the method returns, including when parsing fails.
/// Headers whose identifier cannot be extracted get a generated "ProblematicID&lt;n&gt;"
/// identifier and are reported on the console.
/// </remarks>
/// <param name="sr">Reader supplying the FASTA text. It is closed by this method.</param>
/// <param name="removeReverse">When true, every entry of <c>myItems</c> whose identifier
/// contains "Reverse_" is removed once parsing has finished.</param>
/// <param name="dbType">Selects how the identifier and the description are extracted
/// from each header line.</param>
/// <param name="useRegexForTitleOrDescription">When true, only records whose identifier
/// or description matches <paramref name="theRegex"/> are kept.</param>
/// <param name="theRegex">Filter applied to identifier and description. Required when
/// <paramref name="useRegexForTitleOrDescription"/> is true; otherwise unused.</param>
/// <exception cref="ArgumentNullException"><paramref name="sr"/> is null, or
/// <paramref name="theRegex"/> is null while
/// <paramref name="useRegexForTitleOrDescription"/> is true.</exception>
public void ParseFile(
    StreamReader sr,
    bool removeReverse,
    DBTypes dbType,
    bool useRegexForTitleOrDescription = false,
    Regex theRegex = null
    )
{
    if (sr == null)
    {
        throw new ArgumentNullException("sr");
    }

    try
    {
        if (useRegexForTitleOrDescription && theRegex == null)
        {
            throw new ArgumentNullException("theRegex", "A regex is required when useRegexForTitleOrDescription is true.");
        }

        // Header patterns, one identifier/description pair per supported database layout
        Regex nextProtId = new Regex(@">nxp:([A-Z|0-9|\-|_]+) ", RegexOptions.Compiled);
        Regex giId = new Regex(@">gi\|([0-9]*)", RegexOptions.Compiled);
        Regex giDescription = new Regex(@">[^ ]* (.*)", RegexOptions.Compiled);
        Regex ipiId = new Regex(@"IPI:(IPI[0-9|\.]+)\|", RegexOptions.Compiled);
        Regex ipiDescription = new Regex(@"Tax_Id=[0-9]+ Gene_Symbol=(.*)", RegexOptions.Compiled);
        Regex uniProtId = new Regex(@"[a-z]+\|([A-Z|0-9|\-]+)\|", RegexOptions.Compiled);
        Regex firstTokenId = new Regex(@">(\S+)", RegexOptions.Compiled);
        // Everything after the first space; used for both UniProt and ID-space-description headers
        Regex afterFirstSpace = new Regex(@" (.*)", RegexOptions.Compiled);

        FastaItem item = null; // stays null until the first header line is seen
        int lineCounter = 0;
        int problematicCounter = 0;
        string read;

        while ((read = sr.ReadLine()) != null)
        {
            lineCounter++;

            if (!read.StartsWith(">"))
            {
                // Sequence line: belongs to the record opened by the most recent header
                if (item != null)
                {
                    item.Sequence += read;
                }

                continue;
            }

            // A new header completes the previous record, if there was one
            if (item != null)
            {
                StoreParsedItem(item, useRegexForTitleOrDescription, theRegex);
            }

            //---Parse search engine friendly
            item = new FastaItem();
            Match id;

            if (dbType.Equals(DBTypes.NCBInr))
            {
                id = giId.Match(read);
                item.Description = giDescription.Match(read).Groups[1].Value;
            }
            else if (dbType.Equals(DBTypes.IPI))
            {
                id = ipiId.Match(read);
                item.Description = ipiDescription.Match(read).Groups[1].Value;
            }
            else if (dbType.Equals(DBTypes.IDSpaceDescription))
            {
                id = firstTokenId.Match(read);
                item.Description = afterFirstSpace.Match(read).Groups[1].Value;
            }
            else if (dbType.Equals(DBTypes.UniProt))
            {
                id = uniProtId.Match(read);
                item.Description = afterFirstSpace.Match(read).Groups[1].Value;
            }
            else if (dbType.Equals(DBTypes.NeXtProt))
            {
                id = nextProtId.Match(read);

                // Header fields are separated by backslashes; the description is the
                // field introduced by "Pname=".
                string nameField = Array.Find(read.Split('\\'), a => a.StartsWith("Pname="));

                // A header without a Pname field gets an empty description, like an
                // unmatched description in the other branches.
                item.Description = nameField != null ? nameField.Remove(0, 6) : "";
            }
            else if (dbType.Equals(DBTypes.Contaminant))
            {
                // The whole header after '>' is the identifier; no description is extracted
                id = Regex.Match(read, @">(.*)");
                item.Description = "";
            }
            else
            {
                // Generic layout: first space-delimited token is the identifier, the rest
                // of the line (after one separator character) is the description.
                string[] cols = Regex.Split(read, " ");
                id = Regex.Match(cols[0], @">(.*)");

                int descriptionStart = id.Length + 1;
                if (descriptionStart <= read.Length)
                {
                    item.Description = read.Substring(descriptionStart);
                }
                else
                {
                    item.Description = "No Description";
                }
            }

            if (id.Groups[1].Value.Length == 0)
            {
                item.SequenceIdentifier = "ProblematicID" + problematicCounter++;
                Console.WriteLine("Problem parsing identifier: " + lineCounter + " problem:" + problematicCounter);
            }
            else
            {
                item.SequenceIdentifier = id.Groups[1].Value;
            }
            //------------
        }

        Console.WriteLine(lineCounter + "lines parsed in the fasta DB.");

        // The last record has no following header to flush it
        if (item != null)
        {
            StoreParsedItem(item, useRegexForTitleOrDescription, theRegex);
        }

        if (removeReverse)
        {
            myItems.RemoveAll(a => a.SequenceIdentifier.Contains("Reverse_"));
        }
    }
    finally
    {
        sr.Close();
    }
}

/// <summary>
/// Adds a completed record to <c>myItems</c>, applying the optional identifier/description
/// filter first.
/// </summary>
private void StoreParsedItem(FastaItem item, bool useRegexForTitleOrDescription, Regex theRegex)
{
    if (!useRegexForTitleOrDescription
        || theRegex.IsMatch(item.SequenceIdentifier)
        || theRegex.IsMatch(item.Description))
    {
        myItems.Add(item);
    }
}