/// <summary>
/// Replace sets of duplicate proteins (or peptides) in a dictionary with a single group node
/// </summary>
/// <remarks>
/// Names that are no longer present in <paramref name="entities"/> are skipped; this happens
/// when an earlier iteration removed the entry while merging it into a group.
/// Entries whose runtime type is not exactly <paramref name="entityType"/> are also skipped
/// (this includes groups added by earlier iterations).
/// </remarks>
/// <param name="entities">Nodes keyed by node name; modified in place</param>
/// <param name="entityNames">Names to examine, in order</param>
/// <param name="entityType">Type of node to group: typeof(Protein) or typeof(Peptide)</param>
/// <param name="globalIDTracker">Passed to the constructor of each new group</param>
/// <exception cref="Exception">
/// Thrown when duplicates were found for an entity but entityType is neither Protein nor Peptide
/// </exception>
void GroupProteinsOrPeptides(
    IDictionary<string, Node> entities,
    IReadOnlyList<string> entityNames,
    Type entityType,
    GlobalIDContainer globalIDTracker)
{
    for (var count = 0; count != entityNames.Count; count++)
    {
        // Get the protein or peptide with a single lookup; skip names that are no longer in the dictionary
        if (!entities.TryGetValue(entityNames[count], out var entity))
        {
            continue;
        }

        // Only proceed if the correct type
        if (entity.GetType() != entityType)
        {
            continue;
        }

        // Look for duplicates and add to a duplicate list
        var duplicates = new NodeChildren<Node>();
        duplicates.AddRange(FindDuplicates(entity));

        // Zero or one item means there is nothing to merge
        if (duplicates.Count <= 1)
        {
            continue;
        }

        // Create a protein or peptide group from the duplicates
        Group newGroup;

        if (entityType == typeof(Protein))
        {
            newGroup = new ProteinGroup(duplicates, globalIDTracker);
        }
        else if (entityType == typeof(Peptide))
        {
            newGroup = new PeptideGroup(duplicates, globalIDTracker);
        }
        else
        {
            throw new Exception("Invalid type: must be Protein or Peptide");
        }

        // Remove the grouped entities from the library, then add the new group in their place
        foreach (var duplicateItem in duplicates)
        {
            entities.Remove(duplicateItem.NodeName);
        }

        entities.Add(newGroup.NodeName, newGroup);
    }
}
/// <summary>
/// Get the text to use for a node's protein name (or list of protein names)
/// </summary>
/// <param name="node">Node whose NodeName is examined</param>
/// <param name="globalIDTracker">Used to convert the node name via IDListToNameListString</param>
/// <returns>
/// NodeName unchanged when Group.LIST_SEP_CHAR does not occur after the first character;
/// otherwise, the string returned by IDListToNameListString, with each separator replaced by "; "
/// </returns>
private static string GetProteinList(Node node, GlobalIDContainer globalIDTracker)
{
    var separatorIndex = node.NodeName.IndexOf(Group.LIST_SEP_CHAR);

    // Not found (-1) or found only at position 0: return the name as-is
    if (separatorIndex <= 0)
    {
        return node.NodeName;
    }

    var nameList = globalIDTracker.IDListToNameListString(node.NodeName, Group.LIST_SEP_CHAR);
    var separatorText = Group.LIST_SEP_CHAR.ToString();

    return nameList.Replace(separatorText, "; ");
}
/// <summary>
/// Group proteins having similar peptides
/// </summary>
/// <param name="peptideProteinMapList">List of protein to peptide mappings</param>
/// <param name="clusteredProteins">Parsimonious list of protein</param>
/// <param name="globalIDTracker">ID container created while processing</param>
/// <returns>The value of mProcessingSucceeded when the method returns</returns>
/// <remarks>
/// Runs on a separate thread which allows for deeper recursion.
/// The calling thread polls every 2 seconds and tries to abort the worker once it has run for
/// more than MAX_RUNTIME_HOURS hours.
/// An exception thrown by the worker is captured and re-thrown on the calling thread
/// (as PerformParsimony does); an exception that escapes a thread's entry method would
/// otherwise terminate the process.
/// </remarks>
// ReSharper disable once UnusedMember.Global
public bool PerformParsimonyThreaded(
    List<RowEntry> peptideProteinMapList,
    out List<Node> clusteredProteins,
    out GlobalIDContainer globalIDTracker)
{
    mPeptideProteinMapList = peptideProteinMapList;

    // Increase the stack size from 1 MB to 50 MB
    const int stackSizeInBytes = 50 * 1024 * 1024;

    // Since we're using a larger stack size, we can increase the maximum DFS recursion depth
    mMaxDFSRecursionDepth = 20000;

    // Holds any exception thrown by the worker so it can be re-thrown by the caller's thread
    Exception workerException = null;

    var thread = new Thread(() =>
    {
        try
        {
            PerformParsimonyWork();
        }
        catch (Exception ex) when (!(ex is ThreadAbortException))
        {
            // A thread abort is the result of the timeout handling below and is not reported as a failure
            workerException = ex;
        }
    }, stackSizeInBytes);

    thread.Start();

    var startTime = DateTime.UtcNow;

    while (true)
    {
        // Wait for 2 seconds
        thread.Join(2000);

        // Check whether the thread is still running
        if (!thread.IsAlive)
        {
            break;
        }

        // Check whether the thread has been running too long
        if (!(DateTime.UtcNow.Subtract(startTime).TotalHours > MAX_RUNTIME_HOURS))
        {
            continue;
        }

        OnErrorEvent("Parsimony calculations have run for over " + MAX_RUNTIME_HOURS + " hours; aborting");

        try
        {
            thread.Abort();
        }
        catch
        {
            // Ignore errors here; there is nothing more that can be done to stop the worker
        }

        break;
    }

    if (workerException != null)
    {
        // Re-throw on the calling thread, preserving the original exception type and stack trace
        System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(workerException).Throw();
    }

    clusteredProteins = mClusteredProteins;
    globalIDTracker = mGlobalIDTracker;

    return mProcessingSucceeded;
}
/// <summary>
/// Collapse redundant proteins, then redundant peptides, into single group nodes
/// </summary>
/// <param name="proteins">Protein nodes keyed by name; modified in place</param>
/// <param name="peptides">Peptide nodes keyed by name; modified in place</param>
/// <param name="globalIDTracker">Passed through to GroupProteinsOrPeptides</param>
public void CollapseNodes(
    Dictionary<string, Node> proteins,
    Dictionary<string, Node> peptides,
    GlobalIDContainer globalIDTracker)
{
    // Grouping adds and removes dictionary entries, so iterate over a snapshot of the keys
    var proteinKeySnapshot = new List<string>(proteins.Keys);
    GroupProteinsOrPeptides(proteins, proteinKeySnapshot, typeof(Protein), globalIDTracker);

    // The peptide keys are captured only after the proteins have been grouped
    var peptideKeySnapshot = new List<string>(peptides.Keys);
    GroupProteinsOrPeptides(peptides, peptideKeySnapshot, typeof(Peptide), globalIDTracker);
}
/// <summary>
/// Group proteins having similar peptides
/// </summary>
/// <param name="peptideProteinMapList">List of protein to peptide mappings</param>
/// <param name="clusteredProteins">Parsimonious list of protein</param>
/// <param name="globalIDTracker">ID container created while processing</param>
/// <returns>The value of mProcessingSucceeded after PerformParsimonyWork returns</returns>
/// <remarks>The work runs directly on the calling thread, using the default DFS recursion depth</remarks>
// ReSharper disable once UnusedMember.Global
public bool PerformParsimony(
    List<RowEntry> peptideProteinMapList,
    out List<Node> clusteredProteins,
    out GlobalIDContainer globalIDTracker)
{
    // Store the inputs that PerformParsimonyWork reads
    mMaxDFSRecursionDepth = DEFAULT_MAX_DFS_RECURSION_DEPTH;
    mPeptideProteinMapList = peptideProteinMapList;

    PerformParsimonyWork();

    // Hand the results back to the caller
    globalIDTracker = mGlobalIDTracker;
    clusteredProteins = mClusteredProteins;

    return mProcessingSucceeded;
}
/// <summary>
/// Run the node builder, node collapser, DFS, and cover steps on the given mappings,
/// write the resulting lines to the console, and optionally compare them to the expected lines
/// </summary>
/// <param name="peptideProteinMapList">Protein to peptide mappings to process</param>
/// <param name="expectedOutput">Expected output lines; the comparison is skipped when null or empty</param>
private void TestNodeBuilder(List<RowEntry> peptideProteinMapList, IReadOnlyList<string> expectedOutput)
{
    var builder = new NodeBuilder();
    var collapser = new NodeCollapser();
    var depthFirstSearch = new DFS();
    var coverAlgorithm = new Cover();
    var globalIDTracker = new GlobalIDContainer();

    builder.RunAlgorithm(peptideProteinMapList, out var proteins, out var peptides);
    collapser.RunAlgorithm(proteins, peptides, globalIDTracker);

    var clusteredProteinSets = depthFirstSearch.RunAlgorithm(proteins.Values.ToList());
    var clusteredProteins = coverAlgorithm.RunAlgorithm(clusteredProteinSets);

    var outLines = Utilities.ConvertNodesToStringList(clusteredProteins, globalIDTracker);

    foreach (var outputLine in outLines)
    {
        Console.WriteLine(outputLine);
    }

    // Nothing to compare against
    if (expectedOutput == null || expectedOutput.Count == 0)
    {
        return;
    }

    var lineIndex = 0;

    foreach (var actualLine in outLines)
    {
        if (lineIndex >= expectedOutput.Count)
        {
            Assert.Fail("Extra, unexpected output line: " + actualLine);
        }

        Assert.AreEqual(expectedOutput[lineIndex], actualLine, "Mismatch on line {0}", lineIndex + 1);
        lineIndex++;
    }

    if (outLines.Count < expectedOutput.Count)
    {
        Assert.Fail("Output did not include additional, expected lines, starting with " + expectedOutput[outLines.Count]);
    }
}
/// <summary>
/// Convert nodes and their children into tab-delimited "Protein / Peptide" lines
/// </summary>
/// <remarks>
/// The first line is a header. A child whose type is exactly PeptideGroup produces one line per
/// member of the group; any other child produces a single line.
/// The protein column comes from GetProteinList when the child is a PeptideGroup or the node is a
/// ProteinGroup; otherwise it is the node's NodeName.
/// </remarks>
/// <param name="outData">Nodes to convert</param>
/// <param name="globalIDTracker">Passed to GetProteinList</param>
/// <returns>Header line followed by one line per protein / peptide pair</returns>
public static List<string> ConvertNodesToStringList(List<Node> outData, GlobalIDContainer globalIDTracker)
{
    var outLines = new List<string>
    {
        string.Join("\t", "Protein", "Peptide")
    };

    foreach (var node in outData)
    {
        foreach (var child in node.Children)
        {
            if (child.GetType() == typeof(PeptideGroup))
            {
                // Expand the peptide group: one line for each grouped peptide
                var groupProteinColumn = GetProteinList(node, globalIDTracker);
                var peptideGroup = (Group)child;

                foreach (var member in peptideGroup.GetNodeGroup())
                {
                    outLines.Add($"{groupProteinColumn}\t{member.NodeName}");
                }

                continue;
            }

            // Single peptide: only protein groups need their name converted to a protein list
            var proteinColumn = node is ProteinGroup
                ? GetProteinList(node, globalIDTracker)
                : node.NodeName;

            outLines.Add($"{proteinColumn}\t{child.NodeName}");
        }
    }

    return outLines;
}
/// <summary>
/// Constructor: forwards the grouped nodes and the ID container to the base class constructor,
/// along with NodeTypeName.PeptideGroup as the node type
/// </summary>
/// <param name="groupedNodes">Nodes that make up this group; passed to the base constructor</param>
/// <param name="globalIDTracker">ID container; passed to the base constructor</param>
public PeptideGroup(NodeChildren <Node> groupedNodes, GlobalIDContainer globalIDTracker) : base(NodeTypeName.PeptideGroup, groupedNodes, globalIDTracker) { }
/// <summary>
/// Group proteins having similar peptides, using the mappings in mPeptideProteinMapList
/// </summary>
/// <remarks>
/// Runs, in order: NodeBuilder, NodeCollapser, DFS, and Cover.
/// The result is stored in mClusteredProteins and the ID container in mGlobalIDTracker.
/// mProcessingSucceeded is false while processing and is set to true only after every step
/// has produced a non-empty result; an Exception is thrown as soon as a step yields a null
/// or empty collection.
/// </remarks>
private void PerformParsimonyWork()
{
    // Local helper: throw an exception with the given message when isEmpty is true
    void ThrowIfEmpty(bool isEmpty, string message)
    {
        if (isEmpty)
        {
            throw new Exception(message);
        }
    }

    // Reset the results
    mProcessingSucceeded = false;
    mClusteredProteins = new List<Node>();
    mGlobalIDTracker = new GlobalIDContainer();

    // Prepare objects and algorithms
    var builder = new NodeBuilder();
    var collapser = new NodeCollapser();
    var depthFirstSearch = new DFS
    {
        MaxRecursionDepth = mMaxDFSRecursionDepth
    };
    var coverAlgorithm = new Cover();

    RegisterEvents(depthFirstSearch);
    RegisterEvents(coverAlgorithm);

    if (ShowProgressAtConsole)
    {
        Console.WriteLine();
        OnStatusEvent("Finding parsimonious protein groups");
    }

    // Step 1: build the protein and peptide nodes
    builder.RunAlgorithm(mPeptideProteinMapList, out var proteins, out var peptides);

    ThrowIfEmpty(proteins == null || proteins.Count == 0,
        "Error in PerformParsimony: Protein list is empty");

    ThrowIfEmpty(peptides == null || peptides.Count == 0,
        "Error in PerformParsimony: Peptide list is empty");

    // Step 2: collapse redundant nodes into groups
    collapser.RunAlgorithm(proteins, peptides, mGlobalIDTracker);

    ThrowIfEmpty(proteins == null || proteins.Count == 0,
        "Error in PerformParsimony after nodeCollapser.RunAlgorithm: Protein list is empty");

    ThrowIfEmpty(peptides == null || peptides.Count == 0,
        "Error in PerformParsimony after nodeCollapser.RunAlgorithm: Peptide list is empty");

    // Step 3: depth-first search over the protein nodes
    var proteinsWithChildren = proteins.Values.ToList();
    var clusteredProteinSets = depthFirstSearch.RunAlgorithm(proteinsWithChildren);

    ThrowIfEmpty(clusteredProteinSets == null || clusteredProteinSets.Count == 0,
        "Error in PerformParsimony: DFS returned an empty protein list");

    // Step 4: run the cover algorithm on the protein sets
    mClusteredProteins = coverAlgorithm.RunAlgorithm(clusteredProteinSets);

    ThrowIfEmpty(mClusteredProteins == null || mClusteredProteins.Count == 0,
        "Error in PerformParsimony: cover.RunAlgorithm returned an empty protein list");

    if (ShowProgressAtConsole)
    {
        OnStatusEvent("Iteration Complete, found {0} protein groups", mClusteredProteins.Count);
    }

    mProcessingSucceeded = true;
}
/// <summary>
/// Creates tab-delimited text file parsimonyResultsFilePath with the protein groups and member peptides
/// Creates tab-delimited text file proteinGroupMembersFilePath with the members of each protein group
/// </summary>
/// <remarks>
/// Group IDs are assigned sequentially, starting at 1, in the order the nodes appear in outData.
/// Both files are created with FileMode.Create.
/// </remarks>
/// <param name="outData">Protein nodes (single proteins or protein groups) to write</param>
/// <param name="parsimonyResultsFilePath">Path of the file that receives one line per group / peptide pair</param>
/// <param name="proteinGroupMembersFilePath">Path of the file that receives one line per group / protein pair</param>
/// <param name="globalIDTracker">Used to convert a protein group's node name via IDListToNameList</param>
public static void SaveResults(
    List<Node> outData,
    string parsimonyResultsFilePath,
    string proteinGroupMembersFilePath,
    GlobalIDContainer globalIDTracker)
{
    using var resultsWriter = new StreamWriter(
        new FileStream(parsimonyResultsFilePath, FileMode.Create, FileAccess.Write, FileShare.Read));

    using var proteinGroupsWriter = new StreamWriter(
        new FileStream(proteinGroupMembersFilePath, FileMode.Create, FileAccess.Write, FileShare.Read));

    resultsWriter.WriteLine("GroupID\tProtein_First\tPeptide\tProtein_List\tProtein_Count\tGroup_Count");
    proteinGroupsWriter.WriteLine("GroupID\tProtein");

    // First pass: step through the data, passing every peptide name to UpdatePeptideToProteinGroupMap
    // so the map is complete before any grouping line is written
    var peptideToProteinGroupMap = new Dictionary<string, int>();

    foreach (var proteinNode in outData)
    {
        foreach (var peptideName in GetPeptideNames(proteinNode))
        {
            UpdatePeptideToProteinGroupMap(peptideToProteinGroupMap, peptideName);
        }
    }

    // Second pass: write out the results
    var groupID = 0;

    foreach (var proteinNode in outData)
    {
        groupID++;

        string proteinFirst;
        string proteinNameOrList;
        int proteinsInGroupCount;

        // Append one or more lines to T_Parsimony_Group_Members.txt
        if (proteinNode.GetType() != typeof(ProteinGroup))
        {
            // Single protein
            proteinFirst = proteinNode.NodeName;
            proteinNameOrList = proteinNode.NodeName;
            proteinsInGroupCount = 1;

            WriteOutputGroupMemberLine(proteinGroupsWriter, groupID, proteinNameOrList);
        }
        else
        {
            var currentGroup = (ProteinGroup)proteinNode;
            proteinFirst = currentGroup.NodeNameFirst;

            var hasSeparator = currentGroup.NodeName.IndexOf(Group.LIST_SEP_CHAR) > 0;

            if (!hasSeparator)
            {
                // Note: this code should never be reached
                proteinNameOrList = currentGroup.NodeName;
                proteinsInGroupCount = 1;

                WriteOutputGroupMemberLine(proteinGroupsWriter, groupID, proteinNameOrList);
            }
            else
            {
                var proteinList = globalIDTracker.IDListToNameList(currentGroup.NodeName, Group.LIST_SEP_CHAR);

                proteinNameOrList = string.Join("; ", proteinList);
                proteinsInGroupCount = proteinList.Count;

                foreach (var proteinMember in proteinList)
                {
                    WriteOutputGroupMemberLine(proteinGroupsWriter, groupID, proteinMember);
                }
            }
        }

        // Append one or more lines to T_Parsimony_Grouping.txt
        foreach (var peptideName in GetPeptideNames(proteinNode))
        {
            WriteOutputGroupingLine(
                resultsWriter, peptideToProteinGroupMap, groupID,
                proteinFirst, peptideName, proteinNameOrList, proteinsInGroupCount);
        }
    }

    // Local helper: enumerate the peptide names below a protein node, in order,
    // replacing each child of type PeptideGroup with the names of its grouped peptides
    IEnumerable<string> GetPeptideNames(Node parentNode)
    {
        foreach (var child in parentNode.Children)
        {
            if (child.GetType() == typeof(PeptideGroup))
            {
                var currentPeptides = (Group)child;

                foreach (var groupedPeptide in currentPeptides.GetNodeGroup())
                {
                    yield return groupedPeptide.NodeName;
                }
            }
            else
            {
                yield return child.NodeName;
            }
        }
    }
}
/// <summary>
/// Entry point for this algorithm: forwards all three arguments to CollapseNodes
/// </summary>
/// <param name="protein">Protein nodes keyed by name</param>
/// <param name="pep">Peptide nodes keyed by name</param>
/// <param name="globalIDTracker">ID container passed through to CollapseNodes</param>
public void RunAlgorithm(
    Dictionary<string, Node> protein,
    Dictionary<string, Node> pep,
    GlobalIDContainer globalIDTracker)
    => CollapseNodes(protein, pep, globalIDTracker);