/// <summary>
/// Deletes the nodes that lie on the supplied redundant paths and detaches
/// them from every neighbor that stays in the graph.
/// </summary>
/// <param name="deBruijnGraph">De Bruijn graph.</param>
/// <param name="nodesList">Path nodes to be deleted.</param>
public void RemoveErroneousNodes(DeBruijnGraph deBruijnGraph, DeBruijnPathList nodesList)
{
    DeBruijnGraph.ValidateGraph(deBruijnGraph);

    if (nodesList == null)
    {
        throw new ArgumentNullException("nodesList");
    }

    this.graph = deBruijnGraph;

    // Flatten all paths into a single set of nodes scheduled for removal.
    var pathNodes = nodesList.Paths.AsParallel().SelectMany(path => path.PathNodes);
    HashSet<DeBruijnNode> doomedNodes = new HashSet<DeBruijnNode>(pathNodes);

    // Each doomed node's own extension table is only read here; the writes go
    // to surviving neighbors through RemoveExtensionThreadSafe.
    Parallel.ForEach(
        doomedNodes,
        doomed =>
        {
            foreach (DeBruijnNode neighbor in doomed.GetExtensionNodes())
            {
                // A neighbor that is itself being deleted needs no update.
                if (doomedNodes.Contains(neighbor))
                {
                    continue;
                }

                neighbor.RemoveExtensionThreadSafe(doomed);
            }
        });

    // Finally drop the nodes themselves from the graph.
    this.graph.RemoveNodes(doomedNodes);
}
/// <summary>
/// Finds nodes that sit on redundant paths.
/// Every node with more than one extension on either side is used as a
/// starting point; its diverging extensions are handed to
/// TraceDivergingExtensionPaths, which collects candidate path sets.
/// Duplicate path sets are then removed and DetachBestPath produces the
/// list that is returned for removal.
/// NOTE(review): the candidate list is shared by all parallel iterations;
/// presumably TraceDivergingExtensionPaths synchronizes its writes - confirm.
/// </summary>
/// <param name="deBruijnGraph">De Bruijn Graph.</param>
/// <returns>List of path nodes to be deleted.</returns>
public DeBruijnPathList DetectErroneousNodes(DeBruijnGraph deBruijnGraph)
{
    if (deBruijnGraph == null)
    {
        throw new ArgumentNullException("deBruijnGraph");
    }

    DeBruijnGraph.ValidateGraph(deBruijnGraph);
    this.graph = deBruijnGraph;

    List<DeBruijnPathList> candidatePaths = new List<DeBruijnPathList>();

    Parallel.ForEach(
        deBruijnGraph.GetNodes(),
        branchPoint =>
        {
            // Ambiguity is checked on the right side first, then on the left.
            if (branchPoint.RightExtensionNodesCount > 1)
            {
                TraceDivergingExtensionPaths(
                    branchPoint,
                    branchPoint.GetRightExtensionNodesWithOrientation(),
                    true,
                    candidatePaths);
            }

            if (branchPoint.LeftExtensionNodesCount > 1)
            {
                TraceDivergingExtensionPaths(
                    branchPoint,
                    branchPoint.GetLeftExtensionNodesWithOrientation(),
                    false,
                    candidatePaths);
            }
        });

    List<DeBruijnPathList> uniquePaths = RemoveDuplicates(candidatePaths);
    return DetachBestPath(uniquePaths);
}
/// <summary>
/// Builds contig sequences from the given graph.
/// </summary>
/// <param name="graph">De Bruijn graph</param>
/// <returns>List of contig data</returns>
public IList<ISequence> Build(DeBruijnGraph graph)
{
    // State is stored before validation, exactly as callers have always seen it.
    this._graph = graph;

    // NOTE(review): -1 presumably means "no coverage filtering" - confirm against GetSimplePaths.
    this._coverageThreshold = -1;

    DeBruijnGraph.ValidateGraph(this._graph);

    this.ExcludeAmbiguousExtensions();

    // Let every node purge its invalid extensions, in parallel.
    this._graph.Nodes
        .AsParallel()
        .ForAll(graphNode => graphNode.PurgeInvalidExtensions());

    IList<ISequence> contigs = this.GetSimplePaths();
    return contigs;
}
/// <summary>
/// Builds contig sequences from the given graph.
/// </summary>
/// <param name="deBruijnGraph">De Bruijn graph.</param>
/// <returns>List of contig data.</returns>
public IEnumerable<ISequence> Build(DeBruijnGraph deBruijnGraph)
{
    if (deBruijnGraph == null)
    {
        throw new ArgumentNullException("deBruijnGraph");
    }

    _graph = deBruijnGraph;

    // NOTE(review): NaN presumably disables coverage filtering - confirm against GetSimplePaths.
    _coverageThreshold = double.NaN;

    DeBruijnGraph.ValidateGraph(deBruijnGraph);

    ExcludeAmbiguousExtensions();

    // Let every node purge its invalid extensions, in parallel.
    Parallel.ForEach(
        _graph.GetNodes(),
        graphNode => graphNode.PurgeInvalidExtensions());

    IEnumerable<ISequence> contigs = GetSimplePaths(true);
    return contigs;
}
/// <summary>
/// Builds contigs from the graph and removes the graph nodes that end up
/// marked, i.e. those belonging to contigs whose coverage is below the
/// specified threshold.
/// </summary>
/// <param name="graph">DeBruijn Graph</param>
/// <param name="coverageThreshold">Coverage Threshold for contigs</param>
/// <returns>Number of nodes removed</returns>
public int RemoveLowCoverageContigs(DeBruijnGraph graph, double coverageThreshold)
{
    bool thresholdIsPositive = !(coverageThreshold <= 0);
    if (!thresholdIsPositive)
    {
        throw new ArgumentException("For removing low coverage contigs, coverage threshold should be a positive number");
    }

    this._coverageThreshold = coverageThreshold;
    this._graph = graph;
    DeBruijnGraph.ValidateGraph(this._graph);

    this.ExcludeAmbiguousExtensions();

    this._graph.Nodes
        .AsParallel()
        .ForAll(graphNode => graphNode.ComputeValidExtensions());

    // The paths themselves are not needed here; the call is made for its effect on the nodes.
    this.GetSimplePaths();

    this._graph.Nodes
        .AsParallel()
        .ForAll(graphNode => graphNode.UndoAmbiguousExtensions());

    int removedCount = this._graph.Nodes.RemoveWhere(graphNode => graphNode.IsMarked());
    return removedCount;
}
/// <summary>
/// Marks every node whose k-mer count is below CoverageCutOff for deletion,
/// has every node drop its extensions to marked nodes, and then removes the
/// marked nodes from the graph.
/// </summary>
/// <param name="graph">De Bruijn Graph.</param>
public void RemoveLowCoverageNodes(DeBruijnGraph graph)
{
    DeBruijnGraph.ValidateGraph(graph);

    bool runningOnMac =
        Bio.CrossPlatform.Environment.GetRunningPlatform() == Bio.CrossPlatform.Environment.Platform.Mac;

    if (runningOnMac)
    {
        // On Mac the two passes run sequentially.
        foreach (var candidate in graph.GetNodes())
        {
            if (candidate.KmerCount < CoverageCutOff)
            {
                candidate.MarkNodeForDelete();
            }
        }

        foreach (var survivor in graph.GetNodes())
        {
            survivor.RemoveMarkedExtensions();
        }
    }
    else
    {
        // Pass 1: flag low-coverage nodes.
        ParallelOptions markOptions = new ParallelOptions() { MaxDegreeOfParallelism = Environment.ProcessorCount };
        Parallel.ForEach(
            graph.GetNodes(),
            markOptions,
            candidate =>
            {
                if (candidate.KmerCount < CoverageCutOff)
                {
                    candidate.MarkNodeForDelete();
                }
            });

        // Pass 2: every node forgets its links to flagged nodes.
        Parallel.ForEach(
            graph.GetNodes(),
            survivor =>
            {
                survivor.RemoveMarkedExtensions();
            });
    }

    // Links to the flagged nodes were dropped above, so they can now be removed.
    graph.RemoveMarkedNodes();
}
/// <summary>
/// Builds contigs from the graph and removes the graph nodes that end up
/// marked, i.e. those belonging to contigs whose coverage is below the
/// specified threshold.
/// </summary>
/// <param name="deBruijnGraph">DeBruijn Graph.</param>
/// <param name="coverageThresholdForContigs">Coverage Threshold for contigs.</param>
/// <returns>Number of nodes removed.</returns>
public long RemoveLowCoverageContigs(DeBruijnGraph deBruijnGraph, double coverageThresholdForContigs)
{
    if (deBruijnGraph == null)
    {
        throw new ArgumentNullException("deBruijnGraph");
    }

    if (coverageThresholdForContigs <= 0)
    {
        throw new ArgumentException("For removing low coverage contigs, coverage threshold should be a positive number");
    }

    _coverageThreshold = coverageThresholdForContigs;
    _graph = deBruijnGraph;
    DeBruijnGraph.ValidateGraph(deBruijnGraph);

    ExcludeAmbiguousExtensions();

    Parallel.ForEach(
        deBruijnGraph.GetNodes(),
        graphNode => graphNode.ComputeValidExtensions());

    // The paths themselves are not needed here; the call is made for its effect on the nodes.
    GetSimplePaths(false);

    Parallel.ForEach(
        deBruijnGraph.GetNodes(),
        graphNode => graphNode.UndoAmbiguousExtensions());

    long removedCount = deBruijnGraph.RemoveMarkedNodes();
    return removedCount;
}
/// <summary>
/// Removes every node that is not visited when walking the graph outward
/// from the given reference nodes. Remaining nodes drop their extensions to
/// the removed nodes.
/// </summary>
/// <param name="graph">De Bruijn Graph.</param>
/// <param name="referenceNodes">Nodes from which the connectivity walk is started.</param>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="referenceNodes"/> is null.
/// </exception>
public void RemoveUnconnectedNodes(DeBruijnGraph graph, IEnumerable<DeBruijnNode> referenceNodes)
{
    DeBruijnGraph.ValidateGraph(graph);

    // Fail before touching any visit state; previously a null sequence only
    // surfaced as a NullReferenceException after the flags had been reset.
    if (referenceNodes == null)
    {
        throw new ArgumentNullException("referenceNodes");
    }

    // Mark all nodes as not visited.
    graph.SetNodeVisitState(false);

    // Visit everything reachable from the reference nodes. Nodes already
    // visited through an earlier reference node are skipped.
    foreach (DeBruijnNode node in referenceNodes)
    {
        if (!node.IsVisited)
        {
            visitAllConnectedNodes(node);
        }
    }

    // Any node that was never visited is marked for deletion.
    Parallel.ForEach(
        graph.GetNodes(),
        new ParallelOptions() { MaxDegreeOfParallelism = Environment.ProcessorCount },
        x =>
        {
            if (!x.IsVisited)
            {
                x.MarkNodeForDelete();
            }
        });

    // Every node forgets its links to marked nodes.
    Parallel.ForEach(
        graph.GetNodes(),
        (node) =>
        {
            node.RemoveMarkedExtensions();
        });

    // Finally remove the marked nodes from the graph.
    graph.RemoveMarkedNodes();
}
/// <summary>
/// Finds nodes that sit on redundant paths.
/// Every node with more than one extension on either side is used as a
/// starting point; its diverging extensions are handed to
/// TraceDivergingExtensionPaths, which collects candidate path sets.
/// Duplicate path sets are then removed and DetachBestPath produces the
/// list that is returned for removal.
/// NOTE(review): the candidate list is shared by all parallel iterations;
/// presumably TraceDivergingExtensionPaths synchronizes its writes - confirm.
/// </summary>
/// <param name="graph">De Bruijn Graph</param>
/// <returns>List of path nodes to be deleted</returns>
public DeBruijnPathList DetectErroneousNodes(DeBruijnGraph graph)
{
    DeBruijnGraph.ValidateGraph(graph);
    this._graph = graph;

    List<DeBruijnPathList> candidatePaths = new List<DeBruijnPathList>();

    Parallel.ForEach(
        this._graph.Nodes,
        branchPoint =>
        {
            // Ambiguity is checked on the right side first, then on the left.
            if (branchPoint.RightExtensionNodes.Count > 1)
            {
                TraceDivergingExtensionPaths(branchPoint, branchPoint.RightExtensionNodes, true, candidatePaths);
            }

            if (branchPoint.LeftExtensionNodes.Count > 1)
            {
                TraceDivergingExtensionPaths(branchPoint, branchPoint.LeftExtensionNodes, false, candidatePaths);
            }
        });

    List<DeBruijnPathList> uniquePaths = RemoveDuplicates(candidatePaths);
    return DetachBestPath(uniquePaths);
}
/// <summary>
/// Marks for deletion every node whose k-mer equals the all-'A' or the
/// all-'G' k-mer of the graph's k-mer length, or whose ContainsSelfReference
/// is true. Every node then drops its extensions to marked nodes, and the
/// marked nodes are removed from the graph.
/// </summary>
/// <param name="graph">De Bruijn Graph.</param>
/// <returns>Number of nodes matched and marked by this call.</returns>
public static int RemovePathologicalNodes(DeBruijnGraph graph)
{
    DeBruijnGraph.ValidateGraph(graph);

    // Encoded k-mer made only of 'A'.
    var polyABytes = Enumerable.Repeat((byte)'A', graph.KmerLength).ToArray();
    var polyASequence = new Bio.Sequence(Bio.Alphabets.DNA, polyABytes, false);
    var polyAKmer = KmerData32.GetKmers(polyASequence, graph.KmerLength).First().KmerData;

    // Encoded k-mer made only of 'G'.
    var polyGBytes = Enumerable.Repeat((byte)'G', graph.KmerLength).ToArray();
    var polyGSequence = new Bio.Sequence(Bio.Alphabets.DNA, polyGBytes, false);
    var polyGKmer = KmerData32.GetKmers(polyGSequence, graph.KmerLength).First().KmerData;

    // This loop is sequential and the counter is a local, so a plain
    // increment is sufficient; no atomic operation is needed.
    var badNodeCount = 0;
    foreach (var x in graph.GetNodes())
    {
        if (x.NodeValue.KmerData == polyAKmer
            || x.NodeValue.KmerData == polyGKmer
            || x.ContainsSelfReference)
        {
            x.MarkNodeForDelete();
            badNodeCount++;
        }
    }

    // Every node forgets its links to marked nodes.
    foreach (var node in graph.GetNodes())
    {
        node.RemoveMarkedExtensions();
    }

    // Links to the marked nodes were dropped above, so they can now be removed.
    graph.RemoveMarkedNodes();

    return badNodeCount;
}