/// <summary>
/// Attempts to continue tracing a dangling link whose extension was
/// deferred while graph ends were being eroded.
/// </summary>
/// <param name="isForwardDirection">Direction of the dangling link.</param>
/// <param name="danglingLink">Link traced so far.</param>
/// <param name="node">Node at which tracing resumes.</param>
/// <param name="sameOrientation">Orientation of the link at <paramref name="node"/>.</param>
/// <param name="removeLast">When true, <paramref name="node"/> is removed from the
/// link before tracing resumes.</param>
/// <returns>Number of nodes in the link after extension; 0 when no link results.</returns>
private int ExtendDanglingLink(bool isForwardDirection, DeBruijnPath danglingLink, DeBruijnNode node, bool sameOrientation, bool removeLast)
{
    if (removeLast)
    {
        danglingLink.PathNodes.Remove(node);
    }

    DeBruijnPath extended;
    if (danglingLink.PathNodes.Count != 0)
    {
        // The link already has nodes: carry on from where it stopped.
        extended = this.TraceDanglingExtensionLink(isForwardDirection, danglingLink, node, sameOrientation);
    }
    else if (node.RightExtensionNodesCount == 0)
    {
        // Empty link and the node has no right extensions: start a fresh trace from it.
        extended = this.TraceDanglingExtensionLink(false, new DeBruijnPath(), node, true);
    }
    else if (node.LeftExtensionNodesCount == 0)
    {
        // Empty link and the node has no left extensions: start a fresh trace from it.
        extended = this.TraceDanglingExtensionLink(true, new DeBruijnPath(), node, true);
    }
    else
    {
        // Empty link and the node is not an end-point: nothing to report.
        return 0;
    }

    if (extended == null)
    {
        return 0;
    }

    return extended.PathNodes.Count;
}
/// <summary>
/// Lazily yields every path taken from the specified blocking collection
/// until the collection is marked complete and has been drained.
/// </summary>
/// <remarks>
/// Uses <c>GetConsumingEnumerable</c>, which blocks while the collection is
/// empty, instead of polling <c>IsCompleted</c>/<c>TryTake</c> in a tight loop
/// that keeps a core busy while producers are idle.
/// </remarks>
/// <param name="debruijnPaths">Blocking collection.</param>
/// <returns>IEnumerable of Debruijn Path.</returns>
private IEnumerable<DeBruijnPath> GetPaths(BlockingCollection<DeBruijnPath> debruijnPaths)
{
    // Kept as an iterator so that consumption stays deferred until the
    // caller starts enumerating, exactly as before.
    foreach (DeBruijnPath path in debruijnPaths.GetConsumingEnumerable())
    {
        yield return path;
    }
}
/// <summary>
/// Builds the indel data for a path: records its sequence length, its lowest
/// k-mer count, a copy of its nodes, and the sequence spelled by those nodes.
/// </summary>
/// <param name="originalPath">Path whose nodes are copied into this instance.</param>
/// <param name="kmerSize">K-mer length used to compute the sequence length and
/// to convert the nodes into a sequence.</param>
public IndelData(DeBruijnPath originalPath, int kmerSize)
{
    OkayData = true;

    var sourceNodes = originalPath.PathNodes;

    // Length is computed as the k-mer size plus one per additional node.
    SeqLength = (ushort)(kmerSize + (sourceNodes.Count - 1));
    MinKmerCount = sourceNodes.Min(v => v.KmerCount);

    // Work on a private copy of the node list.
    NodesInPath = sourceNodes.ToList();

    // Per the original note: decides the rough start and end of the sequence
    // and orients it. It must run before the sequence is built from NodesInPath.
    SetStartAndEnd();

    Seq = new DeBruijnPath(NodesInPath).ConvertToSequence(kmerSize);
}
/// <summary>
/// Detaches a dangling link from the graph by removing the link's last node
/// from the extensions of its neighbours.
/// Parallelization Note: the graph structure is modified here; the removal
/// goes through <c>RemoveExtensionThreadSafe</c>.
/// </summary>
/// <param name="nodes">Nodes of a single dangling link.</param>
/// <param name="lastNodes">Set of all nodes occurring at the end of dangling links.</param>
private static void RemoveLinkNodes(DeBruijnPath nodes, HashSet<DeBruijnNode> lastNodes)
{
    // Per the original note, only the last element of the link can still have
    // extensions that are valid parts of the graph.
    DeBruijnNode anchor = nodes.PathNodes.Last();

    // The anchor's own extension collection is only read here, never updated.
    foreach (DeBruijnNode neighbour in anchor.GetExtensionNodes())
    {
        // Skip neighbours that are themselves ends of dangling links, so their
        // extension data is left untouched (reduces conflicts between workers).
        if (lastNodes.Contains(neighbour))
        {
            continue;
        }

        neighbour.RemoveExtensionThreadSafe(anchor);
    }
}
/// <summary>
/// Picks the best path out of a cluster of redundant paths and returns the
/// remaining paths, which are the ones to be removed.
/// </summary>
/// <remarks>
/// The input list is modified in place: the best path is taken out of it and
/// every node that also occurs in the best path is stripped from the
/// remaining paths. The same list instance is returned.
/// </remarks>
/// <param name="divergingPaths">List of redundant paths.</param>
/// <returns>List of paths nodes to be deleted.</returns>
private static DeBruijnPathList ExtractBestPath(DeBruijnPathList divergingPaths)
{
    // Find "best" path. Except for best path, return rest for removal.
    int bestPathIndex = GetBestPath(divergingPaths);
    DeBruijnPath bestPath = divergingPaths.Paths[bestPathIndex];
    divergingPaths.Paths.RemoveAt(bestPathIndex);

    // Redundant paths can overlap with the best path. Build the membership
    // set once so each node is checked with a hash lookup instead of a
    // linear scan of the best path for every node of every other path.
    HashSet<DeBruijnNode> bestPathNodes = new HashSet<DeBruijnNode>(bestPath.PathNodes);

    foreach (var path in divergingPaths.Paths)
    {
        path.RemoveAll(n => bestPathNodes.Contains(n));
    }

    return divergingPaths;
}
/// <summary>
/// Decides whether 'node' may be appended to 'link' without breaking the
/// rules for dangling links, and appends it when allowed.
/// Returns null when the link has already reached the length threshold.
/// Returns the link unchanged when the node is already part of it, or when
/// the node is a low-coverage node offered to an empty link while erosion is
/// active.
/// </summary>
/// <param name="link">Dangling link</param>
/// <param name="node">Node to be added</param>
/// <param name="reachedErrorEndPoint">Set to true when tracing must stop
/// (loop found, node already marked, or length threshold reached)</param>
/// <returns>Updated dangling link, or null when the length threshold is reached</returns>
private DeBruijnPath CheckAndAddDanglingNode(DeBruijnPath link, DeBruijnNode node, out bool reachedErrorEndPoint)
{
    bool erodibleStart = _erodeThreshold != -1
        && link.PathNodes.Count == 0
        && node.KmerCount < _erodeThreshold;

    if (erodibleStart)
    {
        // An already marked node is treated as a loop, so tracing stops.
        // Otherwise the node is marked and tracing may continue.
        // The link itself is not updated in either case.
        bool alreadyMarked = node.IsMarked();
        if (!alreadyMarked)
        {
            node.MarkNode();
        }

        reachedErrorEndPoint = alreadyMarked;
        return link;
    }

    if (link.PathNodes.Contains(node))
    {
        // Loop in the link: stop, leaving the link as it is.
        reachedErrorEndPoint = true;
        return link;
    }

    if (link.PathNodes.Count >= _lengthThreshold)
    {
        // Too long to be a dangling link: stop and report no link.
        reachedErrorEndPoint = true;
        return null;
    }

    // No stop condition applies, so the node joins the link.
    reachedErrorEndPoint = false;
    link.PathNodes.Add(node);
    return link;
}
/// <summary>
/// Walks from a potential end of a dangling link along the extension edges
/// of the graph, collecting nodes until the other end, an ambiguous node, a
/// loop or the length threshold is met.
/// Parallelization Note: this method takes no locks on the graph; the only
/// lock protects the list of deferred extension tasks.
/// </summary>
/// <param name="isForwardDirection">Boolean indicating direction of dangling link</param>
/// <param name="link">Dangling Link</param>
/// <param name="node">Node that is next on the link</param>
/// <param name="sameOrientation">Orientation of link</param>
/// <returns>The traced link, or null when the length threshold was reached or
/// the extension was deferred to a queued task</returns>
private DeBruijnPath TraceDanglingExtensionLink(bool isForwardDirection, DeBruijnPath link, DeBruijnNode node, bool sameOrientation)
{
    while (true)
    {
        // Pick the extensions lying ahead of and behind the walk.
        bool walkLeft = isForwardDirection ^ sameOrientation;
        Dictionary<DeBruijnNode, DeBruijnEdge> ahead = walkLeft ? node.LeftExtensionNodes : node.RightExtensionNodes;
        Dictionary<DeBruijnNode, DeBruijnEdge> behind = walkLeft ? node.RightExtensionNodes : node.LeftExtensionNodes;
        bool stop;

        if (ahead.Count == 0)
        {
            // Other end of the dangling link: add this node and finish.
            return CheckAndAddDanglingNode(link, node, out stop);
        }

        if (behind.Count > 1)
        {
            // Ambiguity behind us: the link is returned without this node.
            if (_erodeThreshold == -1 || node.IsMarked())
            {
                return link;
            }

            // Erosion is active and the node is unmarked: defer the extension.
            lock (_danglingLinkExtensionTasks)
            {
                _danglingLinkExtensionTasks.Add(new Task<int>((o) => ExtendDanglingLink(isForwardDirection, link, node, sameOrientation, false), TaskCreationOptions.None));
            }

            return null;
        }

        if (ahead.Count > 1)
        {
            // Ambiguity ahead: the link is returned including this node.
            link = CheckAndAddDanglingNode(link, node, out stop);
            if (_erodeThreshold == -1 || stop || node.IsMarked())
            {
                return link;
            }

            lock (_danglingLinkExtensionTasks)
            {
                _danglingLinkExtensionTasks.Add(new Task<int>((o) => ExtendDanglingLink(isForwardDirection, link, node, sameOrientation, true), TaskCreationOptions.None));
            }

            return null;
        }

        // Exactly one extension ahead and at most one behind: keep walking.
        link = CheckAndAddDanglingNode(link, node, out stop);
        if (stop)
        {
            // Loop found or length threshold reached.
            return link;
        }

        KeyValuePair<DeBruijnNode, DeBruijnEdge> step = ahead.First();
        node = step.Key;
        sameOrientation = sameOrientation == step.Value.IsSameOrientation;
    }
}
/// <summary>
/// Erode ends of graph that have coverage less than given erodeThreshold.
/// While scanning, dangling links are also traced and their lengths recorded,
/// to get an idea of the different lengths at which to run the dangling
/// links purger step. Dangling links themselves are not removed here.
/// After each scan <c>RemoveErodedNodes</c> is called on the graph and the
/// nodes it hands back are scanned next, until it hands back none.
/// Locks: the parallel scan takes no locks on the graph itself.
/// </summary>
/// <param name="graph">Input graph</param>
/// <param name="erodeThreshold">Threshold for erosion; -1 disables erosion</param>
/// <returns>Sorted set of lengths of dangling links detected</returns>
/// <exception cref="ArgumentNullException">graph is null</exception>
public IEnumerable<int> ErodeGraphEnds(DeBruijnGraph graph, int erodeThreshold = -1)
{
    if (graph == null)
    {
        throw new ArgumentNullException("graph");
    }

    _erodeThreshold = erodeThreshold;
    _danglingLinkLengths = new SortedSet<int>();
    _danglingLinkExtensionTasks = new List<Task<int>>();

    ICollection<DeBruijnNode> graphNodes = graph.Nodes;
    while (graphNodes != null && graphNodes.Count > 0)
    {
        // Make graphNodes into an Array so that Range Partitioning can be used.
        DeBruijnNode[] graphNodesList = graphNodes.ToArray();

        // Size the ranges from the set actually being scanned (not the whole
        // graph), so later rounds over the smaller end-point sets are still
        // spread across processors and the size can never be 0 while work
        // remains. Integer ceiling division avoids float rounding.
        int processorCount = Environment.ProcessorCount;
        int rangeSize = (int)(((long)graphNodesList.Length + processorCount - 1) / processorCount);

        _danglingLinkLengths.UnionWith(
            Partitioner.Create(0, graphNodesList.Length, rangeSize).AsParallel().SelectMany(chunk =>
            {
                SortedSet<int> linkLengths = new SortedSet<int>();
                for (int i = chunk.Item1; i < chunk.Item2; i++)
                {
                    DeBruijnNode node = graphNodesList[i];
                    if (node.ExtensionsCount == 0)
                    {
                        if (_erodeThreshold != -1 && node.KmerCount < _erodeThreshold)
                        {
                            // Mark node for erosion
                            node.MarkNode();
                        }
                        else
                        {
                            // Single node island
                            linkLengths.Add(1);
                        }
                    }
                    else if (node.RightExtensionNodes.Count == 0)
                    {
                        // End of possible dangling link
                        // Traceback to see if it is part of a dangling link
                        DeBruijnPath link = TraceDanglingExtensionLink(false, new DeBruijnPath(), node, true);
                        if (link != null && link.PathNodes.Count > 0)
                        {
                            linkLengths.Add(link.PathNodes.Count);
                        }
                    }
                    else if (node.LeftExtensionNodes.Count == 0)
                    {
                        // End of possible dangling link
                        // Traceback to see if it is part of a dangling link
                        DeBruijnPath link = TraceDanglingExtensionLink(true, new DeBruijnPath(), node, true);
                        if (link != null && link.PathNodes.Count > 0)
                        {
                            linkLengths.Add(link.PathNodes.Count);
                        }
                    }
                }

                return linkLengths;
            }));

        // Remove eroded nodes. In the out parameter, get the list of new
        // end-points that was created by removing eroded nodes.
        RemoveErodedNodes(graph, out graphNodes);
    }

    // Erosion is over: switch it off before the deferred extensions run.
    _erodeThreshold = -1;
    ExtendDanglingLinks();
    return _danglingLinkLengths;
}
/// <summary>
/// Erode ends of graph that have coverage less than given erosionThreshold.
/// While scanning, dangling links are also traced and their lengths recorded,
/// to get an idea of the different lengths at which to run the dangling
/// links purger step. Dangling links themselves are not removed here.
/// After each scan <c>RemoveErodedNodes</c> is called on the graph and the
/// nodes it returns are scanned next, until it returns none.
/// Locks: the parallel scan takes no locks on the graph itself; lengths are
/// handed to a single collector task through a blocking collection.
/// </summary>
/// <param name="graph">Input graph.</param>
/// <param name="erosionThreshold">Threshold for erosion; -1 disables erosion.</param>
/// <returns>Sorted set of lengths of dangling links detected.</returns>
/// <exception cref="ArgumentNullException">graph is null.</exception>
public IEnumerable<int> ErodeGraphEnds(DeBruijnGraph graph, int erosionThreshold = -1)
{
    if (graph == null)
    {
        throw new ArgumentNullException("graph");
    }

    this.erodeThreshold = erosionThreshold;
    this.danglingLinkLengths = new SortedSet<int>();
    this.danglingLinkExtensionTasks = new List<Task<int>>();

    IEnumerable<DeBruijnNode> graphNodes = graph.GetNodes();

    using (BlockingCollection<int> linkLengths = new BlockingCollection<int>())
    {
        // Single consumer: blocks while the collection is empty rather than
        // spinning, and ends once adding is completed and the queue is drained.
        Task collectionTask = Task.Factory.StartNew(() =>
        {
            foreach (int length in linkLengths.GetConsumingEnumerable())
            {
                this.danglingLinkLengths.Add(length);
            }
        });

        try
        {
            bool continueSearching = true;
            while (continueSearching)
            {
                continueSearching = false;

                Parallel.ForEach(
                    graphNodes,
                    (node) =>
                    {
                        continueSearching = true;
                        if (node.ExtensionsCount == 0)
                        {
                            if (erosionThreshold != -1 && node.KmerCount < erosionThreshold)
                            {
                                // Mark node for erosion
                                node.MarkNodeForDelete();
                            }
                            else
                            {
                                // Single node island.
                                linkLengths.Add(1);
                            }
                        }
                        else if (node.RightExtensionNodesCount == 0)
                        {
                            // End of possible dangling link
                            // Trace back to see if it is part of a dangling link.
                            DeBruijnPath link = TraceDanglingExtensionLink(false, new DeBruijnPath(), node, true);
                            if (link != null && link.PathNodes.Count > 0)
                            {
                                linkLengths.Add(link.PathNodes.Count);
                            }
                        }
                        else if (node.LeftExtensionNodesCount == 0)
                        {
                            // End of possible dangling link
                            // Trace back to see if it is part of a dangling link.
                            DeBruijnPath link = TraceDanglingExtensionLink(true, new DeBruijnPath(), node, true);
                            if (link != null && link.PathNodes.Count > 0)
                            {
                                linkLengths.Add(link.PathNodes.Count);
                            }
                        }
                    });

                // Remove eroded nodes and get the new end-points that were
                // created by removing them; those are scanned next.
                IList<DeBruijnNode> nodes = RemoveErodedNodes(graph);
                if (nodes.Count == 0)
                {
                    break;
                }

                graphNodes = nodes;
            }
        }
        finally
        {
            // Always release the collector, even if the scan threw, and wait
            // for it before the collection is disposed.
            linkLengths.CompleteAdding();
            Task.WaitAll(collectionTask);
        }
    }

    // Erosion is over: reset the FIELD (the original assigned the parameter,
    // which had no effect) so that the deferred extensions run with erosion
    // switched off.
    this.erodeThreshold = -1;
    this.ExtendDanglingLinks();
    return this.danglingLinkLengths;
}
/// <summary>
/// Starting from potential end of dangling link, trace back along
/// extension edges in graph to find if it is a valid dangling link.
/// Parallelization Note: this method takes no locks on the graph; the only
/// lock protects the list of deferred extension tasks.
/// </summary>
/// <remarks>
/// Self references are skipped when choosing the next node. If the only
/// onward extension of a node is the node itself, the trace stops there and
/// the link collected so far is returned (it is treated like a loop).
/// </remarks>
/// <param name="isForwardDirection">Boolean indicating direction of dangling link.</param>
/// <param name="link">Dangling Link.</param>
/// <param name="node">Node that is next on the link.</param>
/// <param name="sameOrientation">Orientation of link.</param>
/// <returns>The traced link, or null when <c>CheckAndAddDanglingNode</c>
/// returned null or the extension was deferred to a queued task.</returns>
private DeBruijnPath TraceDanglingExtensionLink(bool isForwardDirection, DeBruijnPath link, DeBruijnNode node, bool sameOrientation)
{
    while (true)
    {
        // Get extension counts going in same and opposite directions.
        // Note that these counts include self references.
        bool traceLeft = isForwardDirection ^ sameOrientation;
        int sameDirectionExtensionsCount = traceLeft ? node.LeftExtensionNodesCount : node.RightExtensionNodesCount;
        int oppDirectionExtensionsCount = traceLeft ? node.RightExtensionNodesCount : node.LeftExtensionNodesCount;
        bool reachedEndPoint;

        if (sameDirectionExtensionsCount == 0)
        {
            // Found other end of dangling link
            // Add this and return.
            return this.CheckAndAddDanglingNode(link, node, out reachedEndPoint);
        }

        if (oppDirectionExtensionsCount > 1)
        {
            // Have reached a point of ambiguity. Return list without updating it.
            if (this.erodeThreshold != -1 && !node.IsMarkedForDelete)
            {
                lock (this.danglingLinkExtensionTasks)
                {
                    // This task essentially just returns back to this method after other ones are removed
                    this.danglingLinkExtensionTasks.Add(new Task<int>((o) => this.ExtendDanglingLink(isForwardDirection, link, node, sameOrientation, false), TaskCreationOptions.None));
                }

                return null;
            }

            return link;
        }

        if (sameDirectionExtensionsCount > 1)
        {
            // Have reached a point of ambiguity. Return list after updating it.
            link = this.CheckAndAddDanglingNode(link, node, out reachedEndPoint);
            if (this.erodeThreshold != -1 && reachedEndPoint != true && !node.IsMarkedForDelete)
            {
                lock (this.danglingLinkExtensionTasks)
                {
                    this.danglingLinkExtensionTasks.Add(new Task<int>((o) => this.ExtendDanglingLink(isForwardDirection, link, node, sameOrientation, true), TaskCreationOptions.None));
                }

                return null;
            }

            return link;
        }

        // (sameDirectionExtensions == 1 && oppDirectionExtensions <= 1)
        // Continue trace back. Add this node to that list and keep going.
        link = this.CheckAndAddDanglingNode(link, node, out reachedEndPoint);
        if (reachedEndPoint)
        {
            // Loop is found or threshold length has been exceeded.
            return link;
        }

        // The extensions are only fetched here, where they are needed.
        // Self references are skipped.
        // TODO (carried over; the original note was cut off): presumably the
        // k-mer size should be forced large enough that self references
        // cannot occur.
        var onward = (traceLeft
                ? node.GetLeftExtensionNodesWithOrientation()
                : node.GetRightExtensionNodesWithOrientation())
            .Where(x => x.Key != node)
            .ToList();

        if (onward.Count == 0)
        {
            // The single onward extension was a self reference. Previously
            // this reached First() on an empty collection and threw
            // InvalidOperationException; there is nowhere left to go, so the
            // link collected so far is returned.
            return link;
        }

        // Still in loop, so just follow the extension and keep going.
        var item = onward[0];
        node = item.Key;
        sameOrientation = !(sameOrientation ^ item.Value);
    }
}
/// <summary>
/// Walks from a potential end of a dangling link along the extension edges
/// of the graph, collecting nodes until the other end, an ambiguous node, or
/// a stop reported by <c>CheckAndAddDanglingNode</c> is met.
/// Parallelization Note: this method takes no locks on the graph; the only
/// lock protects the list of deferred extension tasks.
/// </summary>
/// <param name="isForwardDirection">Boolean indicating direction of dangling link.</param>
/// <param name="link">Dangling Link.</param>
/// <param name="node">Node that is next on the link.</param>
/// <param name="sameOrientation">Orientation of link.</param>
/// <returns>The traced link, or null when <c>CheckAndAddDanglingNode</c>
/// returned null or the extension was deferred to a queued task.</returns>
private DeBruijnPath TraceDanglingExtensionLink(bool isForwardDirection, DeBruijnPath link, DeBruijnNode node, bool sameOrientation)
{
    while (true)
    {
        // Work out which side lies ahead of the walk and which lies behind.
        bool walkLeft = isForwardDirection ^ sameOrientation;
        int aheadCount = walkLeft ? node.LeftExtensionNodesCount : node.RightExtensionNodesCount;
        int behindCount = walkLeft ? node.RightExtensionNodesCount : node.LeftExtensionNodesCount;
        Dictionary<DeBruijnNode, bool> ahead = walkLeft
            ? node.GetLeftExtensionNodesWithOrientation()
            : node.GetRightExtensionNodesWithOrientation();
        bool stop;

        if (aheadCount == 0)
        {
            // Other end of the dangling link: add this node and finish.
            return CheckAndAddDanglingNode(link, node, out stop);
        }

        if (behindCount > 1)
        {
            // Ambiguity behind us: the link is returned without this node.
            if (this.erodeThreshold == -1 || node.IsMarkedForDelete)
            {
                return link;
            }

            // Erosion is active and the node is unmarked: queue a task that
            // re-enters this method later through ExtendDanglingLink.
            lock (this.danglingLinkExtensionTasks)
            {
                this.danglingLinkExtensionTasks.Add(new Task<int>(_ => ExtendDanglingLink(isForwardDirection, link, node, sameOrientation, false), TaskCreationOptions.None));
            }

            return null;
        }

        if (aheadCount > 1)
        {
            // Ambiguity ahead: the link is returned including this node.
            link = CheckAndAddDanglingNode(link, node, out stop);
            if (this.erodeThreshold == -1 || stop || node.IsMarkedForDelete)
            {
                return link;
            }

            lock (this.danglingLinkExtensionTasks)
            {
                this.danglingLinkExtensionTasks.Add(new Task<int>(_ => ExtendDanglingLink(isForwardDirection, link, node, sameOrientation, true), TaskCreationOptions.None));
            }

            return null;
        }

        // Exactly one extension ahead and at most one behind: keep walking.
        link = CheckAndAddDanglingNode(link, node, out stop);
        if (stop)
        {
            // Loop found or length threshold reached.
            return link;
        }

        KeyValuePair<DeBruijnNode, bool> step = ahead.First();
        node = step.Key;
        sameOrientation = sameOrientation == step.Value;
    }
}
/// <summary>
/// Follows a set of diverging paths in the given direction.
/// Every unfinished path is extended by one node per round until the paths
/// share a node (converge), every path is finished, or the path length
/// threshold is exceeded.
/// On convergence, a path cluster holding the paths, each cut off at the
/// shared node, is added to the list of redundant paths.
/// </summary>
/// <param name="startNode">Node at starting point of divergence.</param>
/// <param name="divergingNodes">Diverging nodes with their orientation.</param>
/// <param name="isRightExtension">Bool indicating direction of divergence.</param>
/// <param name="redundantPaths">List of redundant paths; locked while it is appended to.</param>
private void TraceDivergingExtensionPaths(
    DeBruijnNode startNode,
    Dictionary<DeBruijnNode, bool> divergingNodes,
    bool isRightExtension,
    List<DeBruijnPathList> redundantPaths)
{
    // One path per diverging node: each begins at the start node and differs
    // in its second node and orientation.
    List<PathWithOrientation> branches = divergingNodes
        .Select(n => new PathWithOrientation(startNode, n.Key, n.Value))
        .ToList();

    // Paths that are no longer followed: they ran out of extensions or ran
    // into one of their own nodes, possibly before the others did.
    var completed = new List<PathWithOrientation>(branches.Count);

    // Every path starts with two nodes; each round adds at most one more.
    for (int pathLength = 2;
         pathLength <= this.pathLengthThreshold && completed.Count != branches.Count;
         pathLength++)
    {
        // Only the paths present when the round starts are extended; paths
        // forked during the round wait for the next one.
        int branchesThisRound = branches.Count;
        for (int b = 0; b < branchesThisRound; b++)
        {
            PathWithOrientation branch = branches[b];
            if (completed.Contains(branch))
            {
                continue;
            }

            // Go left when already heading left in the same orientation, or
            // when heading right with a different orientation.
            bool goLeft = isRightExtension ^ branch.IsSameOrientation;
            DeBruijnNode tip = branch.Nodes.Last();
            var successors = goLeft
                ? tip.GetLeftExtensionNodesWithOrientation()
                : tip.GetRightExtensionNodesWithOrientation();

            if (successors.Count == 0)
            {
                // The path ends here, so it is not followed any further.
                completed.Add(branch);
                continue;
            }

            // When the path forks, keep an unmodified copy to clone from.
            PathWithOrientation snapshot = successors.Count > 1 ? new PathWithOrientation(branch) : null;

            int position = 0;
            foreach (KeyValuePair<DeBruijnNode, bool> successor in successors)
            {
                if (position++ > 0)
                {
                    // Second and later successors each continue on a copy.
                    branch = new PathWithOrientation(snapshot);
                    branches.Add(branch);
                }

                if (branch.Nodes.Contains(successor.Key))
                {
                    // Treated as a loop in the path.
                    // TODO: Not necessarily true, could overlap with itself but go out the other way
                    completed.Add(branch);
                    continue;
                }

                // Update path orientation, then append the successor.
                branch.IsSameOrientation = branch.IsSameOrientation == successor.Value;
                branch.Nodes.Add(successor.Key);
            }
        }

        // Convergence check: look for the first node of the first path (the
        // start node excluded) that occurs in every other path.
        // TODO: brute-force search; a first step would be to search only
        // over the smallest possible path.
        var firstBranchNodes = branches[0].Nodes;
        DeBruijnNode meetingNode = firstBranchNodes
            .Skip(1)
            .FirstOrDefault(candidate => branches.Skip(1).All(other => other.Nodes.Contains(candidate)));

        if (meetingNode == null)
        {
            continue;
        }

        // Converged: cut every path off at the shared node so that all paths
        // in the cluster end with the same node.
        DeBruijnPathList cluster = new DeBruijnPathList(branches.Count);
        foreach (PathWithOrientation convergedPath in branches)
        {
            DeBruijnPath trimmed;
            if (meetingNode != convergedPath.Nodes.Last())
            {
                int meetingIndex = convergedPath.Nodes.IndexOf(meetingNode);
                trimmed = new DeBruijnPath(convergedPath.Nodes.Take(meetingIndex + 1));
            }
            else
            {
                trimmed = new DeBruijnPath(convergedPath.Nodes);
            }

            cluster.AddPath(trimmed);
        }

        lock (redundantPaths)
        {
            // Redundant paths found
            redundantPaths.Add(cluster);
        }

        return;
    }
}