예제 #1
0
        /// <summary>
        /// Collects the nodes that lie on redundant paths of the graph.
        /// Every node that has more than one extension on a side is used as a start point:
        /// its extensions on that side are handed to TraceDivergingExtensionPaths together
        /// with a shared result list. The gathered path lists are then passed through
        /// RemoveDuplicates, and the outcome of DetachBestPath on them is returned.
        /// As a side effect the supplied graph is stored in this.graph.
        /// NOTE(review): the original comment states that no locking is needed because this
        /// method and its callees only read the graph. The shared result list is filled from
        /// inside a parallel loop by TraceDivergingExtensionPaths - confirm that the callee
        /// synchronizes its writes.
        /// </summary>
        /// <param name="deBruijnGraph">De Bruijn Graph.</param>
        /// <returns>List of path nodes to be deleted.</returns>
        /// <exception cref="ArgumentNullException">deBruijnGraph is null.</exception>
        public DeBruijnPathList DetectErroneousNodes(DeBruijnGraph deBruijnGraph)
        {
            if (deBruijnGraph == null)
            {
                throw new ArgumentNullException("deBruijnGraph");
            }

            DeBruijnGraph.ValidateGraph(deBruijnGraph);
            this.graph = deBruijnGraph;

            var divergingPaths = new List<DeBruijnPathList>();

            Parallel.ForEach(deBruijnGraph.GetNodes(), startNode =>
            {
                // Ambiguity is looked for on the right side first, then on the left side.
                if (startNode.RightExtensionNodesCount > 1)
                {
                    TraceDivergingExtensionPaths(
                        startNode,
                        startNode.GetRightExtensionNodesWithOrientation(),
                        true,
                        divergingPaths);
                }

                if (startNode.LeftExtensionNodesCount > 1)
                {
                    TraceDivergingExtensionPaths(
                        startNode,
                        startNode.GetLeftExtensionNodesWithOrientation(),
                        false,
                        divergingPaths);
                }
            });

            List<DeBruijnPathList> uniquePaths = RemoveDuplicates(divergingPaths);
            return DetachBestPath(uniquePaths);
        }
예제 #2
0
 /// <summary>
 /// Guards against a missing graph: a null reference is rejected,
 /// any other value is accepted without further checks.
 /// </summary>
 /// <param name="graph">Input graph.</param>
 /// <exception cref="ArgumentNullException">graph is null.</exception>
 public static void ValidateGraph(DeBruijnGraph graph)
 {
     if (graph != null)
     {
         // Nothing else is verified about the graph.
         return;
     }

     throw new ArgumentNullException("graph");
 }
예제 #3
0
        /// <summary>
        /// Produces contig sequences for the given graph.
        /// The graph is stored in _graph and the coverage threshold is reset to -1 before
        /// ExcludeAmbiguousExtensions runs; afterwards PurgeInvalidExtensions is invoked on
        /// every node in parallel and the result of GetSimplePaths(true) is returned.
        /// </summary>
        /// <param name="deBruijnGraph">De Bruijn graph.</param>
        /// <returns>List of contig data.</returns>
        /// <exception cref="ArgumentNullException">deBruijnGraph is null.</exception>
        public IEnumerable<ISequence> Build(DeBruijnGraph deBruijnGraph)
        {
            if (deBruijnGraph == null)
            {
                throw new ArgumentNullException("deBruijnGraph");
            }

            // Reset the builder state for this run.
            _coverageThreshold = -1;
            _graph = deBruijnGraph;

            DeBruijnGraph.ValidateGraph(deBruijnGraph);
            ExcludeAmbiguousExtensions();

            var allNodes = _graph.GetNodes();
            Parallel.ForEach(allNodes, graphNode => graphNode.PurgeInvalidExtensions());

            return GetSimplePaths(true);
        }
예제 #4
0
        /// <summary>
        /// Build contigs from graph. For contigs whose coverage is less than 
        /// the specified threshold, remove graph nodes belonging to them.
        /// The graph and threshold are stored in _graph and _coverageThreshold, ambiguous
        /// extensions are excluded, valid extensions are computed on every node in parallel,
        /// GetSimplePaths(false) is run, the ambiguous extensions are restored on every node
        /// and finally the nodes marked in the graph are removed.
        /// </summary>
        /// <param name="deBruijnGraph">DeBruijn Graph.</param>
        /// <param name="coverageThresholdForContigs">Coverage Threshold for contigs. Must be greater than zero.</param>
        /// <returns>Number of nodes removed.</returns>
        /// <exception cref="ArgumentNullException">deBruijnGraph is null.</exception>
        /// <exception cref="ArgumentException">coverageThresholdForContigs is zero, negative or NaN.</exception>
        public long RemoveLowCoverageContigs(DeBruijnGraph deBruijnGraph, double coverageThresholdForContigs)
        {
            if (deBruijnGraph == null)
            {
                throw new ArgumentNullException("deBruijnGraph");
            }

            // Written as a negated "greater than" so that double.NaN, for which every
            // comparison evaluates to false, is rejected together with zero and negatives.
            if (!(coverageThresholdForContigs > 0))
            {
                throw new ArgumentException("For removing low coverage contigs, coverage threshold should be a positive number");
            }

            _coverageThreshold = coverageThresholdForContigs;
            _graph = deBruijnGraph;
            DeBruijnGraph.ValidateGraph(deBruijnGraph);
            ExcludeAmbiguousExtensions();
            Parallel.ForEach(deBruijnGraph.GetNodes(), n => n.ComputeValidExtensions());

            // The returned paths are not used here; the call is made with 'false' and its
            // result is discarded before the marked nodes are removed below.
            GetSimplePaths(false);
            Parallel.ForEach(deBruijnGraph.GetNodes(), n => n.UndoAmbiguousExtensions());
            return deBruijnGraph.RemoveMarkedNodes();
        }
예제 #5
0
        /// <summary>
        /// Checks the nodes of a built graph against expected data: for every expected node
        /// sequence a graph node with that sequence (or its reverse complement) must exist,
        /// and its left/right extension counts must match the expected counts.
        /// The expected values are read from files whose names come from the xml configuration.
        /// </summary>
        /// <param name="graph">graph object</param>
        /// <param name="nodeName">xml node name used for different testcases</param>
        internal void ValidateGraph(DeBruijnGraph graph, string nodeName)
        {
            string sequencesFile = utilityObj.xmlUtil.GetTextValue(nodeName,
              Constants.NodesSequenceNode);
            string leftCountsFile = utilityObj.xmlUtil.GetTextValue(nodeName,
              Constants.NodesLeftEdgesCountNode);
            string rightCountsFile = utilityObj.xmlUtil.GetTextValue(nodeName,
              Constants.NodeRightEdgesCountNode);

            // Each file holds one comma separated list; line breaks are stripped first.
            string[] expectedLeftCounts = ReadStringFromFile(leftCountsFile).Replace("\r\n", "").Split(',');
            string[] expectedRightCounts = ReadStringFromFile(rightCountsFile).Replace("\r\n", "").Split(',');
            string[] expectedSequences = ReadStringFromFile(sequencesFile).Replace("\r\n", "").Split(',');

            for (int index = 0; index < expectedSequences.Length; index++)
            {
                // Locate the graph node for this expected sequence, in either orientation.
                DeBruijnNode matchingNode = graph.GetNodes().First(candidate =>
                    graph.GetNodeSequence(candidate).ConvertToString() == expectedSequences[index]
                    || graph.GetNodeSequence(candidate).GetReverseComplementedSequence().ConvertToString() == expectedSequences[index]);

                // Due to parallelization the left and right edge counts can be swapped
                // while processing, so each actual count may match either expected count.
                string actualLeftCount = matchingNode.LeftExtensionNodesCount.ToString((IFormatProvider)null);
                Assert.IsTrue(
                  actualLeftCount == expectedLeftCounts[index] ||
                  actualLeftCount == expectedRightCounts[index]);

                string actualRightCount = matchingNode.RightExtensionNodesCount.ToString((IFormatProvider)null);
                Assert.IsTrue(
                  actualRightCount == expectedLeftCounts[index] ||
                  actualRightCount == expectedRightCounts[index]);
            }
        }
예제 #6
0
        /// <summary>
        /// Erode ends of graph that have coverage less than given erodeThreshold.
        /// As optimization, we also check for dangling links and keep track of the
        /// lengths of the links found. Dangling links are not removed at this step;
        /// this is done to get an idea of the different lengths at 
        /// which to run the dangling links purger step.
        /// This method returns the lengths of dangling links found.
        /// Note: the method modifies the graph - end nodes below the threshold are marked
        /// for deletion and then removed through RemoveErodedNodes.
        /// </summary>
        /// <param name="graph">Input graph.</param>
        /// <param name="erosionThreshold">Threshold for erosion. With -1 no node is marked for erosion.</param>
        /// <returns>List of lengths of dangling links detected.</returns>
        /// <exception cref="ArgumentNullException">graph is null.</exception>
        public IEnumerable<int> ErodeGraphEnds(DeBruijnGraph graph, int erosionThreshold = -1)
        {
            if (graph == null)
            {
                throw new ArgumentNullException("graph");
            }

            this.erodeThreshold = erosionThreshold;
            this.danglingLinkLengths = new SortedSet<int>();
            this.danglingLinkExtensionTasks = new List<Task<int>>();

            IEnumerable<DeBruijnNode> graphNodes = graph.GetNodes();

            using (BlockingCollection<int> linkLengths = new BlockingCollection<int>())
            {
                // Start consumer task. It is the only writer of danglingLinkLengths.
                // GetConsumingEnumerable blocks while the collection is empty and ends once
                // adding is completed, so the task does not spin on an empty collection.
                // TODO: This length list should contain elements from ~1-19, decide if 
                // having a separate thread keep those numbers sorted is worthwhile
                Task collectionTask = Task.Run(() =>
                {
                    foreach (int length in linkLengths.GetConsumingEnumerable())
                    {
                        this.danglingLinkLengths.Add(length);
                    }
                });

                try
                {
                    // And now the producer.
                    bool continueSearching = true;
                    while (continueSearching)
                    {
                        // Set back to true as soon as at least one node is visited below.
                        continueSearching = false;

                        int threshold = erosionThreshold;
                        Parallel.ForEach(graphNodes, node =>
                            {
                                continueSearching = true;
                                if (node.ExtensionsCount == 0)
                                {
                                    if (threshold != -1 && node.KmerCount < threshold)
                                    {
                                        // Mark node for erosion
                                        node.MarkNodeForDelete();
                                    }
                                    else
                                    {
                                        // Single node island.
                                        linkLengths.Add(1);
                                    }
                                }
                                else if (node.RightExtensionNodesCount == 0)
                                {
                                    // End of possible dangling link
                                    // Trace back to see if it is part of a dangling link.
                                    DeBruijnPath link = TraceDanglingExtensionLink(false, new DeBruijnPath(), node, true);
                                    if (link != null && link.PathNodes.Count > 0)
                                    {
                                        linkLengths.Add(link.PathNodes.Count);
                                    }
                                }
                                else if (node.LeftExtensionNodesCount == 0)
                                {
                                    // End of possible dangling link
                                    // Trace back to see if it is part of a dangling link.
                                    DeBruijnPath link = TraceDanglingExtensionLink(true, new DeBruijnPath(), node, true);
                                    if (link != null && link.PathNodes.Count > 0)
                                    {
                                        linkLengths.Add(link.PathNodes.Count);
                                    }
                                }
                            });

                        // Remove eroded nodes. The returned list holds the end-points that
                        // have to be examined in the next round.
                        IList<DeBruijnNode> nodes = RemoveErodedNodes(graph);
                        if (nodes.Count == 0)
                        {
                            break;
                        }

                        graphNodes = nodes;
                    }
                }
                finally
                {
                    // Always release the consumer, even when the producer failed, and wait
                    // for it so the collection is not disposed while still being read.
                    linkLengths.CompleteAdding();
                    Task.WaitAll(collectionTask);
                }
            }

            ExtendDanglingLinks();
            return this.danglingLinkLengths;
        }
예제 #7
0
        /// <summary>
        /// Removes the nodes marked in the graph and, when at least one node was removed,
        /// calls RemoveMarkedExtensions on every remaining node.
        /// Nodes that lack a left or right extension, either before or after that clean-up,
        /// are collected as end-points that still have to be checked for dangling links.
        /// 
        /// TODO: Perhaps refactor code so that the graph is only manipulated by itself?
        /// Might make it easier to implement future performance improvements, or cost performance
        /// </summary>
        /// <param name="graph">De Bruijn Graph.</param>
        /// <returns>The end-point nodes found; an empty list when no node was removed.</returns>
        private static IList<DeBruijnNode> RemoveErodedNodes(DeBruijnGraph graph)
        {
            bool nothingRemoved = !(graph.RemoveMarkedNodes() > 0);
            if (nothingRemoved)
            {
                return new List<DeBruijnNode>();
            }

            return graph.GetNodes()
                .AsParallel()
                .Where(candidate =>
                {
                    // Remember the state before the extension tables are cleaned up.
                    bool endPointBefore = candidate.LeftExtensionNodesCount == 0
                        || candidate.RightExtensionNodesCount == 0;

                    candidate.RemoveMarkedExtensions();

                    if (endPointBefore)
                    {
                        return true;
                    }

                    // Otherwise keep the node only if the clean-up turned it into an end-point.
                    return candidate.LeftExtensionNodesCount == 0
                        || candidate.RightExtensionNodesCount == 0;
                })
                .ToList();
        }
예제 #8
0
        /// <summary>
        /// Deletes the nodes of the given dangling links from the graph.
        /// The last node of every path is gathered first; each path is then passed to
        /// RemoveLinkNodes together with that set, and all path nodes are handed to the
        /// graph for removal.
        /// </summary>
        /// <param name="deBruijnGraph">Input graph.</param>
        /// <param name="nodesList">List of dangling link nodes.</param>
        /// <exception cref="ArgumentNullException">deBruijnGraph or nodesList is null.</exception>
        public void RemoveErroneousNodes(DeBruijnGraph deBruijnGraph, DeBruijnPathList nodesList)
        {
            if (deBruijnGraph == null)
            {
                throw new ArgumentNullException("deBruijnGraph");
            }

            if (nodesList == null)
            {
                throw new ArgumentNullException("nodesList");
            }

            // Final node of every dangling path.
            var pathEndNodes = new HashSet<DeBruijnNode>();
            foreach (var danglingPath in nodesList.Paths)
            {
                pathEndNodes.Add(danglingPath.PathNodes.Last());
            }

            // The query is evaluated lazily while the graph consumes it, so the extension
            // updates of a path happen when its nodes are handed over for deletion.
            var nodesToRemove = nodesList.Paths.AsParallel().SelectMany(linkPath =>
            {
                RemoveLinkNodes(linkPath, pathEndNodes);
                return linkPath.PathNodes;
            });

            deBruijnGraph.RemoveNodes(nodesToRemove);
        }
예제 #9
0
        /// <summary>
        /// Finds the nodes that belong to dangling links.
        /// All graph nodes are visited in parallel: a node without any extension is reported
        /// as a single node path, and a node missing the extensions of exactly one side is
        /// traced with TraceDanglingExtensionLink. Results are gathered in a concurrent bag.
        /// </summary>
        /// <param name="deBruijnGraph">Input graph.</param>
        /// <returns>List of nodes in dangling links.</returns>
        /// <exception cref="ArgumentNullException">deBruijnGraph is null.</exception>
        public DeBruijnPathList DetectErroneousNodes(DeBruijnGraph deBruijnGraph)
        {
            if (deBruijnGraph == null)
            {
                throw new ArgumentNullException("deBruijnGraph");
            }

            var foundLinks = new ConcurrentBag<DeBruijnPath>();

            Parallel.ForEach(deBruijnGraph.GetNodes(), endNode =>
            {
                if (endNode.ExtensionsCount == 0)
                {
                    // Single node island
                    foundLinks.Add(new DeBruijnPath(endNode));
                    return;
                }

                // Work out which side of the node is open; nodes with extensions on
                // both sides cannot be the end of a dangling link.
                bool missingLeftSide;
                if (endNode.RightExtensionNodesCount == 0)
                {
                    missingLeftSide = false;
                }
                else if (endNode.LeftExtensionNodesCount == 0)
                {
                    missingLeftSide = true;
                }
                else
                {
                    return;
                }

                // End of possible dangling link: trace back to see if it is part of one.
                var tracedLink = TraceDanglingExtensionLink(missingLeftSide, new DeBruijnPath(), endNode, true);

                // If the first node is below the threshold it is not added, leaving a
                // link with no nodes, so the count has to be checked as well.
                if (tracedLink != null && tracedLink.PathNodes.Count > 0)
                {
                    foundLinks.Add(tracedLink);
                }
            });

            return new DeBruijnPathList(foundLinks);
        }
예제 #10
0
 /// <summary>
 /// Rejects a null graph reference; any non-null graph passes without further checks.
 /// </summary>
 /// <param name="graph">Input graph.</param>
 /// <exception cref="ArgumentNullException">graph is null.</exception>
 public static void ValidateGraph(DeBruijnGraph graph)
 {
     if (graph != null)
     {
         // A non-null graph is considered valid.
         return;
     }

     throw new ArgumentNullException("graph");
 }
예제 #11
0
        /// <summary>
        /// Removes nodes that are part of redundant paths. 
        /// The extensions of all neighbors that stay in the graph are updated first,
        /// then the nodes themselves are removed from the graph.
        /// As a side effect the supplied graph is stored in this.graph.
        /// </summary>
        /// <param name="deBruijnGraph">De Bruijn graph.</param>
        /// <param name="nodesList">Path nodes to be deleted.</param>
        /// <exception cref="ArgumentNullException">deBruijnGraph or nodesList is null.</exception>
        public void RemoveErroneousNodes(DeBruijnGraph deBruijnGraph, DeBruijnPathList nodesList)
        {
            // Validate the argument itself. The field this.graph may legitimately still be
            // unset when this method is the first one called on the instance.
            if (deBruijnGraph == null)
            {
                throw new ArgumentNullException("deBruijnGraph");
            }

            DeBruijnGraph.ValidateGraph(deBruijnGraph);

            if (nodesList == null)
            {
                throw new ArgumentNullException("nodesList");
            }

            this.graph = deBruijnGraph;

            // Neighbors of all nodes have to be updated.
            HashSet<DeBruijnNode> deleteNodes = new HashSet<DeBruijnNode>(
                nodesList.Paths.AsParallel().SelectMany(nl => nl.PathNodes));

            // Update extensions for deletion
            // No need for read-write lock as deleteNode's dictionary is being read, 
            // and only other graph node's dictionaries are updated.
            Parallel.ForEach(
                deleteNodes,
                node =>
                {
                    foreach (DeBruijnNode extension in node.GetExtensionNodes())
                    {
                        // A neighbor that is deleted as well does not need to be updated.
                        if (!deleteNodes.Contains(extension))
                        {
                            extension.RemoveExtensionThreadSafe(node);
                        }
                    }
                });

            // Delete nodes from graph
            this.graph.RemoveNodes(deleteNodes);
        }
예제 #12
0
 /// <summary>
 /// Step 1: Building k-mers from sequence reads
 /// Step 2: Build de bruijn graph for input set of k-mers.
 /// Assigns a new DeBruijnGraph created with this.kmerLength to Graph and
 /// then calls Build on it with this.sequenceReads.
 /// NOTE(review): the original comment said this sets the _assemblerGraph field;
 /// the code shown here only assigns Graph - confirm what backs it.
 /// </summary>
 protected virtual void CreateGraph()
 {
     // The graph is assigned before it is built, so Graph refers to the new
     // instance while Build is running.
     Graph = new DeBruijnGraph(this.kmerLength);
     Graph.Build(this.sequenceReads);
 }
예제 #13
0
        /// <summary>
        /// Builds a DeBruijnGraph from the reads of the configured FASTA file with
        /// DeBruijnGraph.Build() and validates the resulting nodes via ValidateGraph.
        /// File path and kmer length are looked up in the xml configuration.
        /// </summary>
        /// <param name="nodeName">xml node name used for different testcases</param>
        internal void ValidateDeBruijnGraphBuild(string nodeName)
        {
            string readsFilePath = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.FilePathNode);
            string kmerLengthText = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.KmerLengthNode);

            using (FastAParser parser = new FastAParser(readsFilePath))
            {
                // Get the input reads; they are materialized below while the parser is open.
                IEnumerable<ISequence> parsedReads = parser.Parse();

                this.KmerLength = int.Parse(kmerLengthText, (IFormatProvider)null);
                this.SequenceReads.Clear();
                this.SetSequenceReads(parsedReads.ToList());

                // Build the graph from the stored reads and compare it with the expected data.
                DeBruijnGraph builtGraph = new DeBruijnGraph(this.KmerLength);
                builtGraph.Build(this.SequenceReads);
                ValidateGraph(builtGraph, nodeName);
            }

            ApplicationLog.WriteLine(@"Padena BVT : DeBruijnGraph Build() validation for Padena step2 completed successfully");
        }