/// <summary>
/// Finds nodes that lie on redundant paths of the graph.
/// Every node having more than one extension on its right or left side is
/// used as a starting point; the diverging extensions are traced from there
/// (the tracing helper is expected to stop when the paths converge or a
/// length threshold is exceeded). The collected path sets are de-duplicated
/// and the best path of each set is detached, so that only the paths to be
/// removed are returned.
/// Locks: this method itself only reads from the graph.
/// </summary>
/// <param name="deBruijnGraph">De Bruijn Graph.</param>
/// <returns>List of path nodes to be deleted.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="deBruijnGraph"/> is null.</exception>
public DeBruijnPathList DetectErroneousNodes(DeBruijnGraph deBruijnGraph)
{
    if (deBruijnGraph == null)
    {
        throw new ArgumentNullException("deBruijnGraph");
    }

    DeBruijnGraph.ValidateGraph(deBruijnGraph);
    this.graph = deBruijnGraph;

    List<DeBruijnPathList> candidatePaths = new List<DeBruijnPathList>();

    Parallel.ForEach(deBruijnGraph.GetNodes(), current =>
    {
        // Ambiguity is checked on the right side first, then on the left side.
        bool ambiguousOnRight = current.RightExtensionNodesCount > 1;
        if (ambiguousOnRight)
        {
            TraceDivergingExtensionPaths(
                current,
                current.GetRightExtensionNodesWithOrientation(),
                true,
                candidatePaths);
        }

        bool ambiguousOnLeft = current.LeftExtensionNodesCount > 1;
        if (ambiguousOnLeft)
        {
            TraceDivergingExtensionPaths(
                current,
                current.GetLeftExtensionNodesWithOrientation(),
                false,
                candidatePaths);
        }
    });

    List<DeBruijnPathList> uniquePaths = RemoveDuplicates(candidatePaths);
    return DetachBestPath(uniquePaths);
}
/// <summary>
/// Checks that a graph instance was supplied.
/// </summary>
/// <param name="graph">Input graph.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="graph"/> is null.</exception>
public static void ValidateGraph(DeBruijnGraph graph)
{
    if (graph != null)
    {
        // Nothing else is validated here.
        return;
    }

    throw new ArgumentNullException("graph");
}
/// <summary>
/// Builds contig sequences from the given graph.
/// Ambiguous extensions are excluded, invalid extensions are purged from
/// every node, and the simple paths of the graph are returned.
/// The coverage threshold is reset to -1 for this operation.
/// </summary>
/// <param name="deBruijnGraph">De Bruijn graph.</param>
/// <returns>List of contig data.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="deBruijnGraph"/> is null.</exception>
public IEnumerable<ISequence> Build(DeBruijnGraph deBruijnGraph)
{
    if (deBruijnGraph == null)
    {
        throw new ArgumentNullException("deBruijnGraph");
    }

    _graph = deBruijnGraph;
    _coverageThreshold = -1;
    DeBruijnGraph.ValidateGraph(deBruijnGraph);

    ExcludeAmbiguousExtensions();

    Parallel.ForEach(_graph.GetNodes(), graphNode =>
    {
        graphNode.PurgeInvalidExtensions();
    });

    IEnumerable<ISequence> contigs = GetSimplePaths(true);
    return contigs;
}
/// <summary>
/// Builds contigs from the graph and removes the graph nodes that the
/// simple-path pass marked for deletion (contigs whose coverage is below
/// the given threshold).
/// </summary>
/// <param name="deBruijnGraph">DeBruijn Graph.</param>
/// <param name="coverageThresholdForContigs">Coverage threshold for contigs; must be positive.</param>
/// <returns>Number of nodes removed.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="deBruijnGraph"/> is null.</exception>
/// <exception cref="ArgumentException">Thrown when the threshold is zero or negative.</exception>
public long RemoveLowCoverageContigs(DeBruijnGraph deBruijnGraph, double coverageThresholdForContigs)
{
    if (deBruijnGraph == null)
    {
        throw new ArgumentNullException("deBruijnGraph");
    }

    bool thresholdIsPositive = coverageThresholdForContigs > 0;
    if (!thresholdIsPositive)
    {
        throw new ArgumentException("For removing low coverage contigs, coverage threshold should be a positive number");
    }

    _coverageThreshold = coverageThresholdForContigs;
    _graph = deBruijnGraph;
    DeBruijnGraph.ValidateGraph(deBruijnGraph);

    ExcludeAmbiguousExtensions();

    Parallel.ForEach(deBruijnGraph.GetNodes(), graphNode =>
    {
        graphNode.ComputeValidExtensions();
    });

    // Called with 'false'; the returned value is not used here.
    GetSimplePaths(false);

    Parallel.ForEach(deBruijnGraph.GetNodes(), graphNode =>
    {
        graphNode.UndoAmbiguousExtensions();
    });

    long removedCount = deBruijnGraph.RemoveMarkedNodes();
    return removedCount;
}
/// <summary>
/// Validates the node sequences and the left/right edge counts of a graph
/// against the expected values referenced by the given xml node.
/// </summary>
/// <param name="graph">graph object</param>
/// <param name="nodeName">xml node name used for different testcases</param>
internal void ValidateGraph(DeBruijnGraph graph, string nodeName)
{
    // Locations of the expected values, as configured in the xml file.
    string sequencesSource = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.NodesSequenceNode);
    string leftEdgesSource = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.NodesLeftEdgesCountNode);
    string rightEdgesSource = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.NodeRightEdgesCountNode);

    // Expected values are comma separated; line breaks are stripped first.
    string[] expectedLeftCounts = ReadStringFromFile(leftEdgesSource).Replace("\r\n", "").Split(',');
    string[] expectedRightCounts = ReadStringFromFile(rightEdgesSource).Replace("\r\n", "").Split(',');
    string[] expectedSequences = ReadStringFromFile(sequencesSource).Replace("\r\n", "").Split(',');

    for (int index = 0; index < expectedSequences.Length; index++)
    {
        string expectedSequence = expectedSequences[index];

        // A node matches if its sequence or its reverse complement equals the expected text.
        DeBruijnNode matchingNode = graph.GetNodes().First(candidate =>
            graph.GetNodeSequence(candidate).ConvertToString() == expectedSequence
            || graph.GetNodeSequence(candidate).GetReverseComplementedSequence().ConvertToString() == expectedSequence);

        string expectedLeft = expectedLeftCounts[index];
        string expectedRight = expectedRightCounts[index];

        // Due to parallelization the left and right edge counts can be swapped
        // while processing, so each actual count may match either expected count.
        string actualLeft = matchingNode.LeftExtensionNodesCount.ToString((IFormatProvider)null);
        Assert.IsTrue(actualLeft == expectedLeft || actualLeft == expectedRight);

        string actualRight = matchingNode.RightExtensionNodesCount.ToString((IFormatProvider)null);
        Assert.IsTrue(actualRight == expectedLeft || actualRight == expectedRight);
    }
}
/// <summary>
/// Erode ends of graph that have coverage less than given erosion threshold.
/// As optimization, dangling links are also detected and their lengths are
/// recorded. No dangling link removal is done at this step; the lengths give
/// an idea of the different lengths at which to run the dangling links
/// purger step.
/// Link lengths are produced by the parallel node scan and gathered into
/// the sorted set by a single consumer task.
/// </summary>
/// <param name="graph">Input graph.</param>
/// <param name="erosionThreshold">Threshold for erosion; -1 disables erosion of isolated nodes.</param>
/// <returns>List of lengths of dangling links detected.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="graph"/> is null.</exception>
public IEnumerable<int> ErodeGraphEnds(DeBruijnGraph graph, int erosionThreshold = -1)
{
    if (graph == null)
    {
        throw new ArgumentNullException("graph");
    }

    this.erodeThreshold = erosionThreshold;
    this.danglingLinkLengths = new SortedSet<int>();
    this.danglingLinkExtensionTasks = new List<Task<int>>();

    IEnumerable<DeBruijnNode> graphNodes = graph.GetNodes();
    int threshold = erosionThreshold;

    using (BlockingCollection<int> linkLengths = new BlockingCollection<int>())
    {
        // Consumer: the only writer of danglingLinkLengths. GetConsumingEnumerable
        // blocks while the collection is empty (instead of spinning on TryTake)
        // and ends once CompleteAdding has been called and the queue is drained.
        Task collectionTask = Task.Run(() =>
        {
            foreach (int length in linkLengths.GetConsumingEnumerable())
            {
                this.danglingLinkLengths.Add(length);
            }
        });

        try
        {
            // Producer: scan the current end-point candidates, erode, and repeat
            // with the end-points returned by the erosion until none remain.
            while (true)
            {
                Parallel.ForEach(graphNodes, node =>
                {
                    if (node.ExtensionsCount == 0)
                    {
                        if (threshold != -1 && node.KmerCount < threshold)
                        {
                            // Mark node for erosion.
                            node.MarkNodeForDelete();
                        }
                        else
                        {
                            // Single node island.
                            linkLengths.Add(1);
                        }
                    }
                    else if (node.RightExtensionNodesCount == 0)
                    {
                        // End of possible dangling link.
                        // Trace back to see if it is part of a dangling link.
                        DeBruijnPath link = TraceDanglingExtensionLink(false, new DeBruijnPath(), node, true);
                        if (link != null && link.PathNodes.Count > 0)
                        {
                            linkLengths.Add(link.PathNodes.Count);
                        }
                    }
                    else if (node.LeftExtensionNodesCount == 0)
                    {
                        // End of possible dangling link.
                        // Trace back to see if it is part of a dangling link.
                        DeBruijnPath link = TraceDanglingExtensionLink(true, new DeBruijnPath(), node, true);
                        if (link != null && link.PathNodes.Count > 0)
                        {
                            linkLengths.Add(link.PathNodes.Count);
                        }
                    }
                });

                // Remove eroded nodes and get the end-points that must be re-examined.
                IList<DeBruijnNode> nodes = RemoveErodedNodes(graph);
                if (nodes.Count == 0)
                {
                    break;
                }

                graphNodes = nodes;
            }
        }
        finally
        {
            // Always release the consumer, even when the scan throws; otherwise
            // the consumer task would never finish. Wait for it before the
            // collection is disposed.
            linkLengths.CompleteAdding();
            Task.WaitAll(collectionTask);
        }
    }

    ExtendDanglingLinks();
    return this.danglingLinkLengths;
}
/// <summary>
/// Deletes the nodes marked for erosion and lets the remaining nodes drop
/// their extensions to marked nodes. Deleting nodes can create new
/// end-points, which must be checked for dangling links.
///
/// TODO: Perhaps refactor code so that the graph is only manipulated by itself?
/// Might make it easier to implement future performance improvements, or cost performance
/// </summary>
/// <param name="graph">De Bruijn Graph.</param>
/// <returns>
/// The nodes that were end-points before their extensions were updated or are
/// end-points afterwards; an empty list when no node was removed.
/// </returns>
private static IList<DeBruijnNode> RemoveErodedNodes(DeBruijnGraph graph)
{
    long removedCount = graph.RemoveMarkedNodes();
    if (removedCount <= 0)
    {
        // Nothing was eroded, so no end-points need to be re-examined.
        return new List<DeBruijnNode>();
    }

    List<DeBruijnNode> endPoints = graph.GetNodes()
        .AsParallel()
        .Where(candidate =>
        {
            // The end-point state must be captured before the extensions are updated.
            bool endPointBefore = candidate.LeftExtensionNodesCount == 0
                || candidate.RightExtensionNodesCount == 0;

            candidate.RemoveMarkedExtensions();

            if (endPointBefore)
            {
                return true;
            }

            // Check if this is a new end point.
            return candidate.LeftExtensionNodesCount == 0
                || candidate.RightExtensionNodesCount == 0;
        })
        .ToList();

    return endPoints;
}
/// <summary>
/// Removes the nodes that are part of dangling links from the graph.
/// The extensions are updated per link through RemoveLinkNodes, then the
/// nodes of all links are deleted from the graph.
/// </summary>
/// <param name="deBruijnGraph">Input graph.</param>
/// <param name="nodesList">List of dangling link nodes.</param>
/// <exception cref="ArgumentNullException">Thrown when either argument is null.</exception>
public void RemoveErroneousNodes(DeBruijnGraph deBruijnGraph, DeBruijnPathList nodesList)
{
    // Argument Validation
    if (deBruijnGraph == null)
    {
        throw new ArgumentNullException("deBruijnGraph");
    }

    if (nodesList == null)
    {
        throw new ArgumentNullException("nodesList");
    }

    // The last node of every link, collected up front for RemoveLinkNodes.
    var linkEndNodes = new HashSet<DeBruijnNode>(
        nodesList.Paths.Select(path => path.PathNodes.Last()));

    // Update extensions link by link while flattening all links into one node sequence.
    var nodesToDelete = nodesList.Paths.AsParallel().SelectMany(path =>
    {
        RemoveLinkNodes(path, linkEndNodes);
        return path.PathNodes;
    });

    // Delete nodes from graph.
    deBruijnGraph.RemoveNodes(nodesToDelete);
}
/// <summary>
/// Detects the nodes that are part of dangling links.
/// A node without any extension is reported as a single-node path; a node
/// missing extensions on exactly one side is traced back to find out
/// whether it terminates a dangling link.
/// Locks: this method itself only reads from the graph.
/// </summary>
/// <param name="deBruijnGraph">Input graph.</param>
/// <returns>List of nodes in dangling links.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="deBruijnGraph"/> is null.</exception>
public DeBruijnPathList DetectErroneousNodes(DeBruijnGraph deBruijnGraph)
{
    if (deBruijnGraph == null)
    {
        throw new ArgumentNullException("deBruijnGraph");
    }

    var detectedLinks = new ConcurrentBag<DeBruijnPath>();

    Parallel.ForEach(deBruijnGraph.GetNodes(), current =>
    {
        if (current.ExtensionsCount == 0)
        {
            // Single node island
            detectedLinks.Add(new DeBruijnPath(current));
            return;
        }

        // First argument of the trace call: false when the right side is
        // empty, true when only the left side is empty.
        bool traceFlag;
        if (current.RightExtensionNodesCount == 0)
        {
            traceFlag = false;
        }
        else if (current.LeftExtensionNodesCount == 0)
        {
            traceFlag = true;
        }
        else
        {
            // Extensions on both sides: not the end of a link.
            return;
        }

        // End of possible dangling link.
        // Trace back to see if it is part of a dangling link.
        var tracedLink = TraceDanglingExtensionLink(traceFlag, new DeBruijnPath(), current, true);

        // If the first node is below the threshold it is not added, leaving a
        // link with no nodes, so the count check is needed.
        if (tracedLink != null && tracedLink.PathNodes.Count > 0)
        {
            detectedLinks.Add(tracedLink);
        }
    });

    return new DeBruijnPathList(detectedLinks);
}
/// <summary>
/// Removes nodes that are part of redundant paths.
/// The neighbors of the deleted nodes that survive have their extension to
/// the deleted node removed, then the nodes are deleted from the graph.
/// </summary>
/// <param name="deBruijnGraph">De Bruijn graph.</param>
/// <param name="nodesList">Path nodes to be deleted.</param>
/// <exception cref="ArgumentNullException">Thrown when either argument is null.</exception>
public void RemoveErroneousNodes(DeBruijnGraph deBruijnGraph, DeBruijnPathList nodesList)
{
    // Validate the argument itself, not the previously stored field: the field
    // may be unset on a fresh instance even though a valid graph was passed in.
    if (deBruijnGraph == null)
    {
        throw new ArgumentNullException("deBruijnGraph");
    }

    DeBruijnGraph.ValidateGraph(deBruijnGraph);

    if (nodesList == null)
    {
        throw new ArgumentNullException("nodesList");
    }

    this.graph = deBruijnGraph;

    // Neighbors of all nodes have to be updated.
    HashSet<DeBruijnNode> deleteNodes = new HashSet<DeBruijnNode>(
        nodesList.Paths.AsParallel().SelectMany(nl => nl.PathNodes));

    // Update extensions for deletion.
    // The extension data of a node being deleted is only read here; only
    // nodes that remain in the graph are modified, through the thread-safe
    // removal method.
    Parallel.ForEach(
        deleteNodes,
        node =>
        {
            foreach (DeBruijnNode extension in node.GetExtensionNodes())
            {
                // If the neighbor is also to be deleted, there is no point in updating it.
                if (!deleteNodes.Contains(extension))
                {
                    extension.RemoveExtensionThreadSafe(node);
                }
            }
        });

    // Delete nodes from graph.
    this.graph.RemoveNodes(deleteNodes);
}
/// <summary>
/// Creates a new de Bruijn graph for the configured k-mer length, stores it
/// in <c>Graph</c>, and builds it from the sequence reads.
/// </summary>
protected virtual void CreateGraph()
{
    // The graph is published through the property first, then populated.
    this.Graph = new DeBruijnGraph(this.kmerLength);
    this.Graph.Build(this.sequenceReads);
}
/// <summary>
/// Builds a DeBruijnGraph from the reads of the configured FASTA file and
/// validates the resulting graph against the expected values of the test case.
/// </summary>
/// <param name="nodeName">xml node name used for different testcases</param>
internal void ValidateDeBruijnGraphBuild(string nodeName)
{
    string inputFile = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.FilePathNode);
    string kmerLengthText = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.KmerLengthNode);

    using (FastAParser fastaParser = new FastAParser(inputFile))
    {
        // Get the input reads.
        IEnumerable<ISequence> parsedReads = fastaParser.Parse();

        this.KmerLength = int.Parse(kmerLengthText, (IFormatProvider)null);

        // Replace any previously stored reads with the parsed ones.
        this.SequenceReads.Clear();
        this.SetSequenceReads(parsedReads.ToList());

        // Build the graph and compare it with the expected nodes and edges.
        DeBruijnGraph builtGraph = new DeBruijnGraph(this.KmerLength);
        builtGraph.Build(this.SequenceReads);
        ValidateGraph(builtGraph, nodeName);
    }

    ApplicationLog.WriteLine(@"Padena BVT : DeBruijnGraph Build() validation for Padena step2 completed successfully");
}