/// <summary>
/// Flags every node that is marked for deletion as deleted and, when at least one
/// node was flagged, removes the marked extensions from all nodes of the graph.
/// Removing nodes can expose new end-points that must be re-checked for dangling
/// links; those end-points are the return value of this method.
/// </summary>
/// <param name="graph">De Bruijn Graph.</param>
/// <returns>
/// Nodes that had no left or no right extension before or after their marked
/// extensions were removed; an empty list when no node was flagged.
/// </returns>
private static IList<DeBruijnNode> RemoveErodedNodes(DeBruijnGraph graph)
{
    bool anyNodeDeleted = false;

    Parallel.ForEach(
        graph.GetNodes(),
        candidate =>
        {
            if (!candidate.IsMarkedForDelete)
            {
                return;
            }

            candidate.IsDeleted = true;

            // Shared flag is written from several threads, but always with the same value.
            anyNodeDeleted = true;
        });

    if (!anyNodeDeleted)
    {
        return new List<DeBruijnNode>();
    }

    return graph.GetNodes()
        .AsParallel()
        .Where(current =>
        {
            bool hadOpenEnd = current.LeftExtensionNodesCount == 0
                || current.RightExtensionNodesCount == 0;

            current.RemoveMarkedExtensions();

            // Keep old end-points, plus nodes that became end-points just now.
            return hadOpenEnd
                || current.LeftExtensionNodesCount == 0
                || current.RightExtensionNodesCount == 0;
        })
        .ToList();
}
/// <summary>
/// Builds the graph for the small read set with k = 6 and checks the node count,
/// that every node (or its reverse complement) is an expected k-mer, and the total
/// number of extensions.
/// </summary>
public void TestDeBruijnGraphBuilderSmall()
{
    const int SmallKmerLength = 6;

    List<ISequence> smallReads = TestInputs.GetSmallReads();
    this.KmerLength = SmallKmerLength;
    this.SequenceReads.Clear();
    this.SetSequenceReads(smallReads);
    this.CreateGraph();

    DeBruijnGraph builtGraph = this.Graph;
    Assert.AreEqual(20, builtGraph.NodeCount);

    HashSet<string> expectedKmers = GetGraphNodesForSmallReads();
    foreach (DeBruijnNode current in builtGraph.GetNodes())
    {
        char[] forwardSymbols = builtGraph.GetNodeSequence(current)
            .Select(symbol => (char)symbol)
            .ToArray();
        string forward = new string(forwardSymbols);

        char[] reverseSymbols = builtGraph.GetNodeSequence(current)
            .GetReverseComplementedSequence()
            .Select(symbol => (char)symbol)
            .ToArray();
        string reverseComplement = new string(reverseSymbols);

        // A node may be stored in either orientation.
        Assert.IsTrue(expectedKmers.Contains(forward) || expectedKmers.Contains(reverseComplement));
    }

    long totalEdges = builtGraph.GetNodes().Select(n => n.ExtensionsCount).Sum();
    Assert.AreEqual(51, totalEdges);
}
/// <summary>
/// Builds the graph for the tiny read set with k = 3 and checks the node count,
/// that each expected k-mer is present in one of its two orientations, and the
/// total number of extensions.
/// </summary>
public void TestDeBruijnGraphBuilderTiny()
{
    const int TinyKmerLength = 3;

    List<ISequence> tinyReads = TestInputs.GetTinyReads();
    this.KmerLength = TinyKmerLength;
    this.SequenceReads.Clear();
    this.SetSequenceReads(tinyReads);
    this.CreateGraph();

    DeBruijnGraph builtGraph = this.Graph;
    Assert.AreEqual(9, builtGraph.NodeCount);

    HashSet<string> nodeStrings = new HashSet<string>(
        builtGraph.GetNodes().Select(
            n => new string(builtGraph.GetNodeSequence(n).Select(symbol => (char)symbol).ToArray())));

    // Each entry is a k-mer together with its reverse complement; either may be stored.
    string[][] expectedKmerPairs =
    {
        new[] { "ATG", "CAT" },
        new[] { "TGC", "GCA" },
        new[] { "GCC", "GGC" },
        new[] { "TCC", "GGA" },
        new[] { "CCT", "AGG" },
        new[] { "CTA", "TAG" },
        new[] { "TAT", "ATA" },
        new[] { "ATC", "GAT" },
        new[] { "CTC", "GAG" },
    };

    foreach (string[] pair in expectedKmerPairs)
    {
        Assert.IsTrue(nodeStrings.Contains(pair[0]) || nodeStrings.Contains(pair[1]));
    }

    long totalEdges = builtGraph.GetNodes().Select(n => n.ExtensionsCount).Sum();
    Assert.AreEqual(31, totalEdges);
}
/// <summary>
/// Collects diverging path sets that start at nodes with more than one extension
/// on a side, and keeps only the sets whose member paths differ in node count.
/// Nodes that contain a self reference are skipped as starting points.
/// </summary>
/// <param name="deBruijnGraph">Graph to scan.</param>
/// <returns>Path lists with unequal path lengths, after duplicate and embedded-path removal.</returns>
private List<DeBruijnPathList> GetIndelPaths(DeBruijnGraph deBruijnGraph)
{
    var divergingPaths = new List<DeBruijnPathList>();

    // NOTE(review): the shared list is handed to TraceDivergingExtensionPaths from
    // several threads; presumably the callee synchronizes its additions — confirm.
    Parallel.ForEach(
        deBruijnGraph.GetNodes(),
        start =>
        {
            if (start.ContainsSelfReference)
            {
                return;
            }

            // Ambiguity has to be checked on both sides of the node.
            if (start.RightExtensionNodesCount > 1)
            {
                TraceDivergingExtensionPaths(start, start.GetRightExtensionNodesWithOrientation(), true, divergingPaths);
            }

            if (start.LeftExtensionNodesCount > 1)
            {
                TraceDivergingExtensionPaths(start, start.GetLeftExtensionNodesWithOrientation(), false, divergingPaths);
            }
        });

    RedundantPathsPurger.ValidatePathsAreFromSameDirection(divergingPaths, deBruijnGraph.KmerLength);

    // Keep only the sets in which not all paths have the same number of nodes.
    var indelPaths = divergingPaths
        .Where(pathSet => pathSet.Paths
            .Select(path => path.PathNodes.Count)
            .Distinct()
            .Count() != 1)
        .ToList();

    //TODO: Could merge the two filters here
    indelPaths = RemoveDuplicates(indelPaths);
    indelPaths = RemoveEmbeddedPaths(indelPaths);
    return indelPaths;
}
/// <summary>
/// Detects nodes that lie on redundant paths.
/// Every node with more than one extension on a side is used as a starting point:
/// the diverging extensions are traced, the collected path sets are de-duplicated,
/// and the result of <c>DetachBestPath</c> on those sets is returned.
/// </summary>
/// <param name="deBruijnGraph">De Bruijn Graph.</param>
/// <returns>List of path nodes to be deleted.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="deBruijnGraph"/> is null.</exception>
public DeBruijnPathList DetectErroneousNodes(DeBruijnGraph deBruijnGraph)
{
    if (deBruijnGraph == null)
    {
        throw new ArgumentNullException("deBruijnGraph");
    }

    DeBruijnGraph.ValidateGraph(deBruijnGraph);
    this.graph = deBruijnGraph;

    List<DeBruijnPathList> collectedPaths = new List<DeBruijnPathList>();

    Parallel.ForEach(
        deBruijnGraph.GetNodes(),
        branchPoint =>
        {
            // Ambiguity has to be checked on both sides of the node.
            bool ambiguousOnRight = branchPoint.RightExtensionNodesCount > 1;
            if (ambiguousOnRight)
            {
                TraceDivergingExtensionPaths(
                    branchPoint,
                    branchPoint.GetRightExtensionNodesWithOrientation(),
                    true,
                    collectedPaths);
            }

            bool ambiguousOnLeft = branchPoint.LeftExtensionNodesCount > 1;
            if (ambiguousOnLeft)
            {
                TraceDivergingExtensionPaths(
                    branchPoint,
                    branchPoint.GetLeftExtensionNodesWithOrientation(),
                    false,
                    collectedPaths);
            }
        });

    collectedPaths = RemoveDuplicates(collectedPaths);
    return DetachBestPath(collectedPaths);
}
/// <summary>
/// Marks every node whose k-mer count is below <c>CoverageCutOff</c> for deletion,
/// removes the marked extensions from all nodes, and finally removes the marked
/// nodes from the graph. The work runs serially on Mac and in parallel elsewhere.
/// </summary>
/// <param name="graph">De Bruijn Graph.</param>
public void RemoveLowCoverageNodes(DeBruijnGraph graph)
{
    DeBruijnGraph.ValidateGraph(graph);

    bool runningOnMac =
        Bio.CrossPlatform.Environment.GetRunningPlatform() == Bio.CrossPlatform.Environment.Platform.Mac;

    if (runningOnMac)
    {
        // Serial variant: mark first, then clean up the extension tables.
        foreach (var candidate in graph.GetNodes())
        {
            if (candidate.KmerCount < CoverageCutOff)
            {
                candidate.MarkNodeForDelete();
            }
        }

        foreach (var remaining in graph.GetNodes())
        {
            remaining.RemoveMarkedExtensions();
        }
    }
    else
    {
        // Parallel variant of the same two passes.
        Parallel.ForEach(
            graph.GetNodes(),
            new ParallelOptions() { MaxDegreeOfParallelism = Environment.ProcessorCount },
            candidate =>
            {
                if (candidate.KmerCount < CoverageCutOff)
                {
                    candidate.MarkNodeForDelete();
                }
            });

        Parallel.ForEach(
            graph.GetNodes(),
            remaining =>
            {
                remaining.RemoveMarkedExtensions();
            });
    }

    // Extensions pointing at marked nodes are gone, so the nodes can now be dropped.
    graph.RemoveMarkedNodes();
}
/// <summary>
/// Creates a generator for the given assembly graph and condenses its nodes into
/// meta nodes. Construction fails if any node is still unvisited afterwards.
/// </summary>
/// <param name="assemblyGraph">Graph to condense.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="assemblyGraph"/> is null.</exception>
/// <exception cref="InvalidOperationException">
/// Thrown when at least one node is not marked visited after the meta nodes were created.
/// </exception>
public GraphGenerator(DeBruijnGraph assemblyGraph)
{
    // Fail with a descriptive error instead of a NullReferenceException further down.
    if (assemblyGraph == null)
    {
        throw new ArgumentNullException("assemblyGraph");
    }

    this._graph = assemblyGraph;
    CreateMetaNodes();

    // Verify all nodes were visited while the meta nodes were built.
    if (_graph.GetNodes().Any(x => !x.IsVisited))
    {
        // A specific exception type; still caught by existing catch (Exception) handlers.
        throw new InvalidOperationException("Failed to visit all nodes!");
    }
}
/// <summary>
/// Build contigs from graph. For contigs whose coverage is less than
/// the specified threshold, remove graph nodes belonging to them.
/// </summary>
/// <param name="deBruijnGraph">DeBruijn Graph.</param>
/// <param name="coverageThresholdForContigs">Coverage Threshold for contigs. Must be a positive number.</param>
/// <returns>Number of nodes removed.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="deBruijnGraph"/> is null.</exception>
/// <exception cref="ArgumentException">
/// Thrown when <paramref name="coverageThresholdForContigs"/> is NaN, zero or negative.
/// </exception>
public long RemoveLowCoverageContigs(DeBruijnGraph deBruijnGraph, double coverageThresholdForContigs)
{
    if (deBruijnGraph == null)
    {
        throw new ArgumentNullException("deBruijnGraph");
    }

    // NaN compares false against everything, so "<= 0" alone would let it through.
    // It must be rejected explicitly: this class also uses NaN internally to mean
    // "no coverage threshold", which would silently disable the removal.
    if (double.IsNaN(coverageThresholdForContigs) || coverageThresholdForContigs <= 0)
    {
        throw new ArgumentException("For removing low coverage contigs, coverage threshold should be a positive number");
    }

    this._coverageThreshold = coverageThresholdForContigs;
    this._graph = deBruijnGraph;
    DeBruijnGraph.ValidateGraph(deBruijnGraph);

    // Temporarily hide ambiguous extensions so that only simple paths are traced.
    this.ExcludeAmbiguousExtensions();
    Parallel.ForEach(deBruijnGraph.GetNodes(), n => n.ComputeValidExtensions());

    // No contig sequences are requested here; the return value is not used.
    this.GetSimplePaths(false);

    // Restore the extensions that were hidden above.
    Parallel.ForEach(deBruijnGraph.GetNodes(), n => n.UndoAmbiguousExtensions());

    return deBruijnGraph.RemoveMarkedNodes();
}
/// <summary>
/// Detect nodes that are part of dangling links.
/// Producer threads scan the graph and push candidate paths into a blocking
/// collection while a separate task turns the collected paths into the result list.
/// </summary>
/// <param name="deBruijnGraph">Input graph.</param>
/// <returns>List of nodes in dangling links.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="deBruijnGraph"/> is null.</exception>
public DeBruijnPathList DetectErroneousNodes(DeBruijnGraph deBruijnGraph)
{
    if (deBruijnGraph == null)
    {
        throw new ArgumentNullException("deBruijnGraph");
    }

    BlockingCollection<DeBruijnPath> debruijnPaths = new BlockingCollection<DeBruijnPath>();
    DeBruijnPathList danglingNodesList = null;

    // Consumer: builds the result from whatever the producers add.
    // NOTE(review): GetPaths presumably drains the collection until adding is completed — confirm.
    Task collectionTask = Task.Factory.StartNew(() =>
    {
        danglingNodesList = new DeBruijnPathList(this.GetPaths(debruijnPaths));
    });

    try
    {
        Parallel.ForEach(
            deBruijnGraph.GetNodes(),
            (node) =>
            {
                if (node.ExtensionsCount == 0)
                {
                    // Single node island
                    debruijnPaths.Add(new DeBruijnPath(node));
                }
                else if (node.RightExtensionNodesCount == 0)
                {
                    // End of possible dangling link
                    // Trace back to see if it is part of a dangling link
                    var link = TraceDanglingExtensionLink(false, new DeBruijnPath(), node, true);
                    if (link != null)
                    {
                        debruijnPaths.Add(link);
                    }
                }
                else if (node.LeftExtensionNodesCount == 0)
                {
                    // End of possible dangling link
                    // Trace back to see if it is part of a dangling link
                    var link = TraceDanglingExtensionLink(true, new DeBruijnPath(), node, true);
                    if (link != null)
                    {
                        debruijnPaths.Add(link);
                    }
                }
            });
    }
    finally
    {
        // Always signal completion, even when a producer throws; otherwise the
        // consumer task could wait on the collection forever.
        debruijnPaths.CompleteAdding();
    }

    Task.WaitAll(collectionTask);
    return danglingNodesList;
}
/// <summary>
/// Marks extensions as invalid on every side of a node that is ambiguous, i.e. that
/// has more than one extension on that side or belongs to a palindromic node.
/// On unambiguous sides, a single extension that points back to the node itself
/// (self loop) is marked invalid as well.
/// Locks: none are taken; extensions are only marked invalid, not deleted, and
/// (per the original author's note) concurrent writers store the same value.
/// </summary>
private void ExcludeAmbiguousExtensions()
{
    Parallel.ForEach(
        _graph.GetNodes(),
        current =>
        {
            bool palindromic = current.IsPalindrome(this._graph.KmerLength);

            // ----- left side -----
            if (palindromic || current.LeftExtensionNodesCount > 1)
            {
                // Ambiguous: invalidate the link in both directions.
                foreach (DeBruijnNode leftNeighbour in current.GetLeftExtensionNodes())
                {
                    leftNeighbour.MarkExtensionInvalid(current);
                    current.MarkLeftExtensionAsInvalid(leftNeighbour);
                }
            }
            else if (current.LeftExtensionNodesCount == 1
                && current.GetLeftExtensionNodes().First() == current)
            {
                // Self loop on the left.
                current.MarkLeftExtensionAsInvalid(current);
            }

            // ----- right side -----
            if (palindromic || current.RightExtensionNodesCount > 1)
            {
                // Ambiguous: invalidate the link in both directions.
                foreach (DeBruijnNode rightNeighbour in current.GetRightExtensionNodes())
                {
                    rightNeighbour.MarkExtensionInvalid(current);
                    current.MarkRightExtensionAsInvalid(rightNeighbour);
                }
            }
            else if (current.RightExtensionNodesCount == 1
                && current.GetRightExtensionNodes().First() == current)
            {
                // Self loop on the right.
                current.MarkRightExtensionAsInvalid(current);
            }
        });
}
/// <summary>
/// Resets the visit state of every node, extends candidate deletion paths from each
/// node of the graph, and wraps every collected path in a <c>DeletionAnalysis</c>.
/// A failure while extending from one node is written to the console and does not
/// stop the scan.
/// </summary>
/// <param name="graph">Graph to search; also stored in <c>LargeDeletionFinder.graph</c>.</param>
/// <param name="assembly">Not read by this method.</param>
/// <returns>The deletion reports, which are also stored in <c>DeletionReports</c>.</returns>
public List<DeletionAnalysis> FindAllDeletions(DeBruijnGraph graph, MitochondrialAssembly assembly)
{
    LargeDeletionFinder.graph = graph;
    KmerLength = graph.KmerLength;

    // Start from a clean slate: nothing in the graph counts as visited yet.
    graph.GetNodes()
        .AsParallel()
        .ForAll(n => n.ResetVisitState());

    // Try every node as a starting point and collect all paths found from it.
    foreach (DeBruijnNode startNode in graph.GetNodes())
    {
        try
        {
            PossibleDeletionPaths.AddRange(ExtendFromStartNode(startNode));
        }
        catch (Exception failure)
        {
            // Best effort: report and continue with the next start node.
            Console.WriteLine(failure.Message);
        }
    }

    DeletionReports = PossibleDeletionPaths
        .Select(path => new DeletionAnalysis(path))
        .ToList();

    return DeletionReports;
}
/// <summary>
/// Removes every node that is still unvisited after a traversal started from the
/// reference nodes: all nodes are reset to unvisited, the traversal is run from each
/// reference node, and whatever remains unvisited is marked and removed.
/// </summary>
/// <param name="graph">De Bruijn Graph.</param>
/// <param name="referenceNodes">Nodes from which the traversal is started.</param>
public void RemoveUnconnectedNodes(DeBruijnGraph graph, IEnumerable<DeBruijnNode> referenceNodes)
{
    DeBruijnGraph.ValidateGraph(graph);

    // Start with every node unvisited.
    graph.SetNodeVisitState(false);

    // Traverse from each reference node that an earlier traversal has not reached.
    // This loop should spend basically all its time on the first node.
    foreach (DeBruijnNode seed in referenceNodes)
    {
        if (!seed.IsVisited)
        {
            visitAllConnectedNodes(seed);
        }
    }

    // Anything the traversal never reached is marked for deletion.
    Parallel.ForEach(
        graph.GetNodes(),
        new ParallelOptions() { MaxDegreeOfParallelism = Environment.ProcessorCount },
        candidate =>
        {
            if (!candidate.IsVisited)
            {
                candidate.MarkNodeForDelete();
            }
        });

    Parallel.ForEach(
        graph.GetNodes(),
        remaining =>
        {
            remaining.RemoveMarkedExtensions();
        });

    // The marked nodes are not connected to anything that is kept,
    // so no further change to the graph structure is needed.
    graph.RemoveMarkedNodes();
}
/// <summary>
/// Detects nodes that are part of dangling links.
/// Collected are: isolated nodes, non-empty links traced back from nodes that lack
/// extensions on one side, and self-referencing nodes with exactly one extension.
/// </summary>
/// <param name="deBruijnGraph">Input graph.</param>
/// <returns>List of nodes in dangling links.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="deBruijnGraph"/> is null.</exception>
public DeBruijnPathList DetectErroneousNodes(DeBruijnGraph deBruijnGraph)
{
    if (deBruijnGraph == null)
    {
        throw new ArgumentNullException("deBruijnGraph");
    }

    ConcurrentBag<DeBruijnPath> foundPaths = new ConcurrentBag<DeBruijnPath>();

    Parallel.ForEach(
        deBruijnGraph.GetNodes(),
        current =>
        {
            if (current.ExtensionsCount == 0)
            {
                // Single node island
                foundPaths.Add(new DeBruijnPath(current));
                return;
            }

            bool missingRight = current.RightExtensionNodesCount == 0;
            if (missingRight || current.LeftExtensionNodesCount == 0)
            {
                // End of a possible dangling link: trace back from it. The first
                // argument is false when the right side is the open one, true otherwise.
                var tracedLink = TraceDanglingExtensionLink(!missingRight, new DeBruijnPath(), current, true);

                // If the first node is below the threshold it is not added, leaving
                // a link without nodes, hence the count check.
                if (tracedLink != null && tracedLink.PathNodes.Count > 0)
                {
                    foundPaths.Add(tracedLink);
                }

                return;
            }

            // Self-referencing node whose only extension is the self reference.
            if (current.ContainsSelfReference && current.ExtensionsCount == 1)
            {
                foundPaths.Add(new DeBruijnPath(current));
            }
        });

    return new DeBruijnPathList(foundPaths);
}
/// <summary>
/// Builds contig sequences from the graph: ambiguous extensions are excluded and
/// purged, then the simple paths of the remaining graph are returned.
/// The coverage threshold is set to NaN for this run.
/// </summary>
/// <param name="deBruijnGraph">De Bruijn graph.</param>
/// <returns>List of contig data.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="deBruijnGraph"/> is null.</exception>
public IEnumerable<ISequence> Build(DeBruijnGraph deBruijnGraph)
{
    if (deBruijnGraph == null)
    {
        throw new ArgumentNullException("deBruijnGraph");
    }

    this._graph = deBruijnGraph;
    this._coverageThreshold = double.NaN;

    DeBruijnGraph.ValidateGraph(deBruijnGraph);

    this.ExcludeAmbiguousExtensions();

    Parallel.ForEach(
        _graph.GetNodes(),
        graphNode =>
        {
            graphNode.PurgeInvalidExtensions();
        });

    return this.GetSimplePaths(true);
}
/// <summary>
/// Removes nodes whose k-mer equals the all-'A' or the all-'G' k-mer of the graph's
/// k-mer length, as well as nodes that contain a self reference. The extension
/// tables of all nodes are cleaned up before the marked nodes are removed.
/// </summary>
/// <param name="graph">De Bruijn Graph.</param>
/// <returns>Number of nodes that were marked for deletion.</returns>
public static int RemovePathologicalNodes(DeBruijnGraph graph)
{
    DeBruijnGraph.ValidateGraph(graph);

    // Homopolymer k-mer built from 'A'.
    var polyASymbols = Enumerable.Repeat((byte)'A', graph.KmerLength).ToArray();
    var polyASequence = new Bio.Sequence(Bio.Alphabets.DNA, polyASymbols, false);
    var polyAKmer = KmerData32.GetKmers(polyASequence, graph.KmerLength).First().KmerData;

    // Homopolymer k-mer built from 'G'.
    var polyGSymbols = Enumerable.Repeat((byte)'G', graph.KmerLength).ToArray();
    var polyGSequence = new Bio.Sequence(Bio.Alphabets.DNA, polyGSymbols, false);
    var polyGKmer = KmerData32.GetKmers(polyGSequence, graph.KmerLength).First().KmerData;

    var markedCount = 0;
    foreach (var candidate in graph.GetNodes())
    {
        bool isPathological = candidate.NodeValue.KmerData == polyAKmer
            || candidate.NodeValue.KmerData == polyGKmer
            || candidate.ContainsSelfReference;

        if (!isPathological)
        {
            continue;
        }

        candidate.MarkNodeForDelete();
        Interlocked.Increment(ref markedCount);
    }

    foreach (var remaining in graph.GetNodes())
    {
        remaining.RemoveMarkedExtensions();
    }

    // Extensions pointing at the marked nodes are gone, so the nodes can be dropped.
    graph.RemoveMarkedNodes();
    return markedCount;
}
/// <summary>
/// Condense redundant paths down to simple paths.
/// Every node of the graph is inspected once, in parallel:
/// islands (no extensions at all) are either emitted as a sequence or, when a
/// coverage threshold is set, marked for deletion if their k-mer count is below it;
/// nodes with exactly one extension on one side and none on the other are used as
/// starting points for <c>TraceSimplePath</c>.
/// </summary>
/// <returns>List of simple paths.</returns>
private IList<ISequence> CreateMegaNodes()
{
    // The original body was wrapped in a stray "foreach (DeBruijnNode node in ...)":
    // it returned during the first iteration, had no return for an empty graph,
    // and its loop variable clashed with the lambda parameter below. One parallel
    // pass over the nodes is all that is needed.
    // NOTE(review): createContigSequences is not declared in this method; presumably
    // it is a member of the enclosing class — confirm.
    IList<ISequence> paths = new List<ISequence>();

    Parallel.ForEach(
        this._graph.GetNodes(),
        node =>
        {
            int validLeftExtensionsCount = node.LeftExtensionNodesCount;
            int validRightExtensionsCount = node.RightExtensionNodesCount;

            if (validLeftExtensionsCount + validRightExtensionsCount == 0)
            {
                // Island. Check coverage
                if (Double.IsNaN(_coverageThreshold))
                {
                    if (createContigSequences)
                    {
                        // The list is shared between worker threads.
                        lock (paths)
                        {
                            paths.Add(_graph.GetNodeSequence(node));
                        }
                    }
                }
                else
                {
                    if (node.KmerCount < _coverageThreshold)
                    {
                        node.MarkNodeForDelete();
                    }
                }
            }
            else if (validLeftExtensionsCount == 1 && validRightExtensionsCount == 0)
            {
                TraceSimplePath(paths, node, false, createContigSequences);
            }
            else if (validRightExtensionsCount == 1 && validLeftExtensionsCount == 0)
            {
                TraceSimplePath(paths, node, true, createContigSequences);
            }
        });

    return paths;
}
/// <summary>
/// Resets the visit state of all nodes and creates one <c>MetaNode</c> for every
/// node that is still unvisited when the scan reaches it. The created meta nodes
/// are appended to <c>MetaNodes</c>.
/// </summary>
private void CreateMetaNodes()
{
    _graph.SetNodeVisitState(false);

    // Condense linearly connected nodes into "MetaNodes".
    // Note: Loop avoids stack overflow.
    // NOTE(review): the MetaNode constructor presumably marks the nodes it absorbs
    // as visited, which is what lets later iterations skip them — confirm.
    foreach (DeBruijnNode candidate in _graph.GetNodes())
    {
        if (!candidate.IsVisited)
        {
            var condensed = new MetaNode(candidate, _graph);
            MetaNodes.Add(condensed);
        }
    }
}
/// <summary>
/// Removes the nodes marked for erosion from the graph and, when at least one node
/// was removed, strips the marked extensions from all remaining nodes.
/// Removing nodes can expose new end-points that must be re-checked for dangling
/// links; those end-points are the return value of this method.
///
/// TODO: Perhaps refactor code so that the graph is only manipulated by itself?
/// Might make it easier to implement future performance improvements, or cost performance
/// </summary>
/// <param name="graph">De Bruijn Graph.</param>
/// <returns>
/// Nodes that had no left or no right extension before or after their marked
/// extensions were removed; an empty list when no node was removed.
/// </returns>
private static IList<DeBruijnNode> RemoveErodedNodes(DeBruijnGraph graph)
{
    if (graph.RemoveMarkedNodes() <= 0)
    {
        // Nothing was eroded, so no new end-points can exist.
        return new List<DeBruijnNode>();
    }

    return graph.GetNodes()
        .AsParallel()
        .Where(current =>
        {
            bool hadOpenEnd = current.LeftExtensionNodesCount == 0
                || current.RightExtensionNodesCount == 0;

            current.RemoveMarkedExtensions();

            // Keep old end-points, plus nodes that became end-points just now.
            return hadOpenEnd
                || current.LeftExtensionNodesCount == 0
                || current.RightExtensionNodesCount == 0;
        })
        .ToList();
}
/// <summary>
/// Erode ends of graph that have coverage less than given erosion threshold.
/// Isolated nodes below the threshold are marked for deletion and removed; after each
/// removal round the newly exposed end-points are scanned again, until a round
/// yields no further end-points.
/// As optimization, dangling links are traced at the same time and only their
/// lengths are recorded (no link is removed here), to get an idea of the lengths
/// at which to run the dangling links purger step.
/// </summary>
/// <param name="graph">Input graph.</param>
/// <param name="erosionThreshold">Threshold for erosion; -1 disables erosion of isolated nodes.</param>
/// <returns>List of lengths of dangling links detected.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="graph"/> is null.</exception>
public IEnumerable<int> ErodeGraphEnds(DeBruijnGraph graph, int erosionThreshold = -1)
{
    if (graph == null)
    {
        throw new ArgumentNullException("graph");
    }

    this.erodeThreshold = erosionThreshold;
    this.danglingLinkLengths = new SortedSet<int>();
    this.danglingLinkExtensionTasks = new List<Task<int>>();

    IEnumerable<DeBruijnNode> graphNodes = graph.GetNodes();

    using (BlockingCollection<int> linkLengths = new BlockingCollection<int>())
    {
        // Single consumer: the sorted set is only touched from this task while the
        // scan runs. GetConsumingEnumerable blocks while the collection is empty and
        // ends once adding is completed, so the task does not spin on an empty queue.
        Task collectionTask = Task.Factory.StartNew(() =>
        {
            foreach (int length in linkLengths.GetConsumingEnumerable())
            {
                this.danglingLinkLengths.Add(length);
            }
        });

        try
        {
            bool continueSearching = true;
            while (continueSearching)
            {
                continueSearching = false;

                Parallel.ForEach(
                    graphNodes,
                    (node) =>
                    {
                        // At least one node was examined in this round.
                        continueSearching = true;

                        if (node.ExtensionsCount == 0)
                        {
                            if (erosionThreshold != -1 && node.KmerCount < erosionThreshold)
                            {
                                // Mark node for erosion
                                node.MarkNodeForDelete();
                            }
                            else
                            {
                                // Single node island.
                                linkLengths.Add(1);
                            }
                        }
                        else if (node.RightExtensionNodesCount == 0)
                        {
                            // End of possible dangling link
                            // Trace back to see if it is part of a dangling link.
                            DeBruijnPath link = TraceDanglingExtensionLink(false, new DeBruijnPath(), node, true);
                            if (link != null && link.PathNodes.Count > 0)
                            {
                                linkLengths.Add(link.PathNodes.Count);
                            }
                        }
                        else if (node.LeftExtensionNodesCount == 0)
                        {
                            // End of possible dangling link
                            // Trace back to see if it is part of a dangling link.
                            DeBruijnPath link = TraceDanglingExtensionLink(true, new DeBruijnPath(), node, true);
                            if (link != null && link.PathNodes.Count > 0)
                            {
                                linkLengths.Add(link.PathNodes.Count);
                            }
                        }
                    });

                // Remove eroded nodes and get the end-points that the removal exposed.
                IList<DeBruijnNode> nodes = RemoveErodedNodes(graph);
                if (nodes.Count == 0)
                {
                    break;
                }

                // Only the new end-points need to be scanned in the next round.
                graphNodes = nodes;
            }
        }
        finally
        {
            // Always release the consumer, even when the scan throws, and wait for it
            // so that the collection is not disposed while it is still being read.
            linkLengths.CompleteAdding();
            Task.WaitAll(collectionTask);
        }
    }

    // NOTE(review): this only resets the local parameter, which is not read again;
    // the erodeThreshold field keeps its value. Possibly the field was meant — confirm.
    erosionThreshold = -1;

    this.ExtendDanglingLinks();
    return this.danglingLinkLengths;
}