/// <summary>
/// Removes low-coverage nodes from the graph: every node whose KmerCount is
/// below CoverageCutOff is marked for deletion, all nodes then drop their
/// extensions that were marked, and finally the marked nodes are removed.
/// </summary>
/// <param name="graph">De Bruijn Graph.</param>
public void RemoveLowCoverageNodes(DeBruijnGraph graph)
{
    DeBruijnGraph.ValidateGraph(graph);

    // The parallel loops are skipped on Mac and replaced by plain sequential loops.
    // NOTE(review): presumably a workaround for a platform-specific threading issue - confirm.
    bool runningOnMac =
        Bio.CrossPlatform.Environment.GetRunningPlatform() == Bio.CrossPlatform.Environment.Platform.Mac;

    if (runningOnMac)
    {
        // Pass 1: flag every node whose coverage is below the cut-off.
        foreach (var candidate in graph.GetNodes())
        {
            if (candidate.KmerCount < CoverageCutOff)
            {
                candidate.MarkNodeForDelete();
            }
        }

        // Pass 2: let every node drop its marked extensions.
        foreach (var current in graph.GetNodes())
        {
            current.RemoveMarkedExtensions();
        }
    }
    else
    {
        var markingOptions = new ParallelOptions() { MaxDegreeOfParallelism = Environment.ProcessorCount };

        // Pass 1: flag every node whose coverage is below the cut-off.
        Parallel.ForEach(
            graph.GetNodes(),
            markingOptions,
            candidate =>
            {
                if (candidate.KmerCount < CoverageCutOff)
                {
                    candidate.MarkNodeForDelete();
                }
            });

        // Pass 2: let every node drop its marked extensions.
        Parallel.ForEach(
            graph.GetNodes(),
            current =>
            {
                current.RemoveMarkedExtensions();
            });
    }

    // Extensions have been cleaned up above, so the marked nodes can now be removed.
    graph.RemoveMarkedNodes();
}
/// <summary>
/// Builds simple paths (contigs) over the graph and removes the nodes that
/// end up marked for deletion, using the supplied contig coverage threshold.
/// Ambiguous extensions are excluded while the paths are built and are
/// restored before the marked nodes are removed.
/// </summary>
/// <param name="deBruijnGraph">DeBruijn Graph.</param>
/// <param name="coverageThresholdForContigs">Coverage threshold for contigs; must be greater than zero.</param>
/// <returns>Number of nodes removed.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="deBruijnGraph"/> is null.</exception>
/// <exception cref="ArgumentException">Thrown when <paramref name="coverageThresholdForContigs"/> is zero or negative.</exception>
public long RemoveLowCoverageContigs(DeBruijnGraph deBruijnGraph, double coverageThresholdForContigs)
{
    if (deBruijnGraph == null)
    {
        throw new ArgumentNullException("deBruijnGraph");
    }

    if (coverageThresholdForContigs <= 0)
    {
        throw new ArgumentException("For removing low coverage contigs, coverage threshold should be a positive number");
    }

    // Stash the inputs on the instance before validating the graph.
    // NOTE(review): presumably read by ExcludeAmbiguousExtensions / GetSimplePaths - confirm.
    _coverageThreshold = coverageThresholdForContigs;
    _graph = deBruijnGraph;

    DeBruijnGraph.ValidateGraph(deBruijnGraph);

    // Temporarily hide ambiguous extensions, then recompute each node's valid extensions.
    ExcludeAmbiguousExtensions();
    Parallel.ForEach(
        deBruijnGraph.GetNodes(),
        node =>
        {
            node.ComputeValidExtensions();
        });

    GetSimplePaths(false);

    // Restore the ambiguous extensions that were hidden above.
    Parallel.ForEach(
        deBruijnGraph.GetNodes(),
        node =>
        {
            node.UndoAmbiguousExtensions();
        });

    long removedNodeCount = deBruijnGraph.RemoveMarkedNodes();
    return removedNodeCount;
}
/// <summary>
/// Removes every node that is not connected to any of the supplied reference nodes.
/// All nodes are first reset to "not visited", the nodes connected to each reference
/// node are visited, and whatever is still unvisited afterwards is marked for deletion
/// and removed from the graph.
/// </summary>
/// <param name="graph">De Bruijn Graph.</param>
/// <param name="referenceNodes">Nodes from which the connected part of the graph to keep is explored.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="referenceNodes"/> is null.</exception>
public void RemoveUnconnectedNodes(DeBruijnGraph graph, IEnumerable<DeBruijnNode> referenceNodes)
{
    // Basic strategy: start at all reference nodes, find everything that is
    // not connected to them, and remove it.
    DeBruijnGraph.ValidateGraph(graph);

    // Reject a missing reference list up front, before the visit state of the graph
    // is touched; otherwise the failure would only surface inside the loop below as
    // a NullReferenceException, after the graph had already been modified.
    if (referenceNodes == null)
    {
        throw new ArgumentNullException("referenceNodes");
    }

    // Mark all nodes as not visited.
    graph.SetNodeVisitState(false);

    // Now visit everyone that is connected to the reference somehow.
    // This loop should spend basically all its time on the first node.
    foreach (DeBruijnNode node in referenceNodes)
    {
        if (!node.IsVisited)
        {
            visitAllConnectedNodes(node);
        }
    }

    // Now mark any unvisited node for deletion.
    Parallel.ForEach(
        graph.GetNodes(),
        new ParallelOptions() { MaxDegreeOfParallelism = Environment.ProcessorCount },
        x =>
        {
            if (!x.IsVisited)
            {
                x.MarkNodeForDelete();
            }
        });

    // Let every node drop its marked extensions.
    Parallel.ForEach(
        graph.GetNodes(),
        (node) =>
        {
            node.RemoveMarkedExtensions();
        });

    // Now delete them; since they are not connected to anything we are keeping,
    // there is no need to alter the graph structure.
    graph.RemoveMarkedNodes();
}
/// <summary>
/// Removes the nodes currently marked for deletion and, if any were removed,
/// has every remaining node drop its marked extensions.
/// After nodes are deleted, some new end-points might be created, and these need
/// to be checked for dangling links. The returned list contains every node that
/// has no left or no right extension, either before or after its marked
/// extensions were removed.
///
/// TODO: Perhaps refactor code so that the graph is only manipulated by itself?
/// Might make it easier to implement future performance improvements, or cost performance
/// </summary>
/// <param name="graph">De Bruijn Graph.</param>
/// <returns>
/// The end-point nodes described above, or an empty list when no node was removed.
/// </returns>
private static IList<DeBruijnNode> RemoveErodedNodes(DeBruijnGraph graph)
{
    // Nothing was removed, so no extension tables need updating.
    if (graph.RemoveMarkedNodes() <= 0)
    {
        return new List<DeBruijnNode>();
    }

    return graph.GetNodes()
        .AsParallel()
        .Where(candidate =>
        {
            // Record the end-point status before the extension table is updated.
            bool endPointBefore =
                candidate.LeftExtensionNodesCount == 0 || candidate.RightExtensionNodesCount == 0;

            candidate.RemoveMarkedExtensions();

            if (endPointBefore)
            {
                return true;
            }

            // Check if this is a new end point.
            return candidate.LeftExtensionNodesCount == 0 || candidate.RightExtensionNodesCount == 0;
        })
        .ToList();
}
/// <summary>
/// Removes "pathological" nodes from the graph: nodes whose k-mer is the all-'A'
/// or all-'G' k-mer of the graph's k-mer length, and nodes that contain a
/// self reference. Matching nodes are marked for deletion, every node then drops
/// its marked extensions, and the marked nodes are removed.
/// </summary>
/// <param name="graph">De Bruijn Graph.</param>
/// <returns>Number of nodes that were marked as pathological.</returns>
public static int RemovePathologicalNodes(DeBruijnGraph graph)
{
    DeBruijnGraph.ValidateGraph(graph);

    // Build the k-mer data for the two homopolymer k-mers (all 'A', then all 'G').
    var homopolymerKmers = new[] { (byte)'A', (byte)'G' }
        .Select(symbol =>
        {
            var repeated = Enumerable.Repeat(symbol, graph.KmerLength).ToArray();
            var homopolymer = new Bio.Sequence(Bio.Alphabets.DNA, repeated, false);
            return KmerData32.GetKmers(homopolymer, graph.KmerLength).First().KmerData;
        })
        .ToArray();
    var polyAKmer = homopolymerKmers[0];
    var polyGKmer = homopolymerKmers[1];

    var markedCount = 0;
    foreach (var candidate in graph.GetNodes())
    {
        bool isPathological = candidate.NodeValue.KmerData == polyAKmer
            || candidate.NodeValue.KmerData == polyGKmer
            || candidate.ContainsSelfReference;
        if (!isPathological)
        {
            continue;
        }

        candidate.MarkNodeForDelete();
        Interlocked.Increment(ref markedCount);
    }

    // Let every node drop its marked extensions.
    foreach (var current in graph.GetNodes())
    {
        current.RemoveMarkedExtensions();
    }

    // Extensions have been cleaned up above, so the marked nodes can now be removed.
    graph.RemoveMarkedNodes();

    return markedCount;
}