/// <summary>
        /// Deletes from the graph every node that belongs to one of the supplied redundant paths.
        /// Before deletion, each surviving neighbor of a doomed node has its extension
        /// to that node removed.
        /// </summary>
        /// <param name="deBruijnGraph">De Bruijn graph.</param>
        /// <param name="nodesList">Path nodes to be deleted.</param>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="nodesList"/> is null.</exception>
        public void RemoveErroneousNodes(DeBruijnGraph deBruijnGraph, DeBruijnPathList nodesList)
        {
            DeBruijnGraph.ValidateGraph(deBruijnGraph);
            if (nodesList == null)
            {
                throw new ArgumentNullException("nodesList");
            }

            this.graph = deBruijnGraph;

            // Flatten all paths into one set so membership checks below are cheap.
            var doomedNodes = new HashSet<DeBruijnNode>(
                nodesList.Paths.AsParallel().SelectMany(path => path.PathNodes));

            // The set itself is only read inside the parallel loop; the only writes go to
            // neighbors that are NOT in the set, through RemoveExtensionThreadSafe.
            Parallel.ForEach(doomedNodes, doomedNode =>
            {
                foreach (DeBruijnNode neighbor in doomedNode.GetExtensionNodes())
                {
                    // A neighbor that is going away too needs no bookkeeping.
                    if (doomedNodes.Contains(neighbor))
                    {
                        continue;
                    }

                    neighbor.RemoveExtensionThreadSafe(doomedNode);
                }
            });

            // Finally drop the nodes themselves.
            this.graph.RemoveNodes(doomedNodes);
        }
Example #2
0
        /// <summary>
        /// Step 3: Remove dangling links from graph.
        /// Does nothing when no purger is configured or the dangling threshold is not positive.
        /// </summary>
        protected void UnDangleGraph()
        {
            if (this.DanglingLinksPurger == null || this.DanglingLinksThreshold <= 0)
            {
                return;
            }

            // Optimization: rather than blindly stepping the threshold one by one, first
            // find out which dangling-link lengths to purge and only run the purger for those.
            this.DanglingLinksPurger.LengthThreshold = this.DanglingLinksThreshold - 1;

            IGraphEndsEroder eroder = this.DanglingLinksPurger as IGraphEndsEroder;
            IEnumerable<int> lengthsToPurge;
            if (eroder == null || !this.AllowErosion)
            {
                // No erosion available/allowed: try every length below the threshold.
                lengthsToPurge = Enumerable.Range(1, this.DanglingLinksThreshold - 1);
            }
            else
            {
                // The eroder reports dangling-link lengths and, while doing so, erodes
                // low-coverage graph ends.
                // TODO: Verify that this enumerates all dangling ends. The concern: if dangling ends
                // of length 7 and 2 arrive at a node which would itself be a dangling node of length 2
                // without them, a dangling end of 9 might not be reported.
                lengthsToPurge = eroder.ErodeGraphEnds(this.Graph, this.ErosionThreshold);
            }

            // Erosion is to be done only once. Reset erode threshold to -1.
            this.ErosionThreshold = -1;

            // First pass: purge at each reported length, skipping lengths larger than the graph.
            foreach (int length in lengthsToPurge)
            {
                if (this.Graph.NodeCount < length)
                {
                    continue;
                }

                this.DanglingLinksPurger.LengthThreshold = length;
                DeBruijnPathList detected = this.DanglingLinksPurger.DetectErroneousNodes(this.Graph);
                this.DanglingLinksPurger.RemoveErroneousNodes(this.Graph, detected);
            }

            // Removing dangling links can expose new ones, so keep purging at the full
            // threshold until a pass finds nothing (or the graph becomes too small).
            while (true)
            {
                if (this.Graph.NodeCount < this.DanglingLinksThreshold)
                {
                    break;
                }

                this.DanglingLinksPurger.LengthThreshold = this.DanglingLinksThreshold;
                DeBruijnPathList detected = this.DanglingLinksPurger.DetectErroneousNodes(this.Graph);
                this.DanglingLinksPurger.RemoveErroneousNodes(this.Graph, detected);

                if (detected == null || detected.Paths.Count == 0)
                {
                    break;
                }
            }
        }
Example #3
0
        /// <summary>
        /// Step 3: Remove dangling links from graph.
        /// A no-op when no purger is set or the dangle threshold is not positive.
        /// </summary>
        protected void UnDangleGraph()
        {
            if (_danglingLinksPurger == null || _dangleThreshold <= 0)
            {
                return;
            }

            // Optimization: determine which dangling-link lengths to purge up front
            // and run the purger only for those, instead of at every increment.
            _danglingLinksPurger.LengthThreshold = _dangleThreshold - 1;

            IGraphEndsEroder eroder = _danglingLinksPurger as IGraphEndsEroder;
            IEnumerable<int> lengthsToPurge;
            if (eroder == null || !_isErosionEnabled)
            {
                // Fall back to every length below the dangle threshold.
                lengthsToPurge = Enumerable.Range(1, _dangleThreshold - 1);
            }
            else
            {
                // While collecting dangling-link lengths the eroder also erodes low coverage ends.
                lengthsToPurge = eroder.ErodeGraphEnds(_graph, _erosionThreshold);
            }

            // Erosion is to be done only once. Reset erode threshold to -1.
            _erosionThreshold = -1;

            foreach (int length in lengthsToPurge)
            {
                if (_graph.Nodes.Count < length)
                {
                    continue;
                }

                _danglingLinksPurger.LengthThreshold = length;
                DeBruijnPathList detected = _danglingLinksPurger.DetectErroneousNodes(_graph);
                _danglingLinksPurger.RemoveErroneousNodes(_graph, detected);
            }

            // Purging may create fresh dangling links; repeat at the full threshold
            // until a pass detects nothing or the graph is smaller than the threshold.
            bool foundAny;
            do
            {
                foundAny = false;
                if (_graph.Nodes.Count >= _dangleThreshold)
                {
                    _danglingLinksPurger.LengthThreshold = _dangleThreshold;
                    DeBruijnPathList detected = _danglingLinksPurger.DetectErroneousNodes(_graph);
                    _danglingLinksPurger.RemoveErroneousNodes(_graph, detected);
                    foundAny = detected != null && detected.Paths.Count > 0;
                }
            }
            while (foundAny);
        }
Example #4
0
        /// <summary>
        /// Detect nodes that are part of dangling links.
        /// Graph nodes are scanned in parallel and candidate paths are handed to a
        /// separate task through a blocking collection; that task builds the result list.
        /// Locks: Method only does reads. No locking necessary here or its callees.
        /// </summary>
        /// <param name="deBruijnGraph">Input graph.</param>
        /// <returns>List of nodes in dangling links.</returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="deBruijnGraph"/> is null.</exception>
        public DeBruijnPathList DetectErroneousNodes(DeBruijnGraph deBruijnGraph)
        {
            if (deBruijnGraph == null)
            {
                throw new ArgumentNullException("deBruijnGraph");
            }

            BlockingCollection<DeBruijnPath> debruijnPaths = new BlockingCollection<DeBruijnPath>();

            // Consumer: started before the producers so paths can be collected while
            // the scan is still running.
            DeBruijnPathList danglingNodesList = null;
            Task collectionTask = Task.Factory.StartNew(() =>
            {
                danglingNodesList = new DeBruijnPathList(this.GetPaths(debruijnPaths));
            });

            try
            {
                // Producers: classify every node and emit a path for each dangling link found.
                Parallel.ForEach(
                    deBruijnGraph.GetNodes(),
                    (node) =>
                {
                    if (node.ExtensionsCount == 0)
                    {
                        // Single node island
                        debruijnPaths.Add(new DeBruijnPath(node));
                    }
                    else if (node.RightExtensionNodesCount == 0)
                    {
                        // End of possible dangling link
                        // Trace back to see if it is part of a dangling link
                        var link = TraceDanglingExtensionLink(false, new DeBruijnPath(), node, true);
                        if (link != null)
                        {
                            debruijnPaths.Add(link);
                        }
                    }
                    else if (node.LeftExtensionNodesCount == 0)
                    {
                        // End of possible dangling link
                        // Trace back to see if it is part of a dangling link
                        var link = TraceDanglingExtensionLink(true, new DeBruijnPath(), node, true);
                        if (link != null)
                        {
                            debruijnPaths.Add(link);
                        }
                    }
                });
            }
            finally
            {
                // Always signal completion, even when the scan faults. Otherwise the
                // consumer task (presumably draining the collection until it is
                // completed - verify against GetPaths) could block forever.
                debruijnPaths.CompleteAdding();
            }

            // Wait for the consumer to finish building the list; rethrows its failures.
            Task.WaitAll(collectionTask);

            return danglingNodesList;
        }
Example #5
0
        /// <summary>
        /// Detect nodes that are part of dangling links.
        /// Every graph node is inspected in parallel; islands, traced dangling links with at
        /// least one node, and nodes whose single extension is a self reference are reported.
        /// Locks: Method only does reads. No locking necessary here or its callees.
        /// </summary>
        /// <param name="deBruijnGraph">Input graph.</param>
        /// <returns>List of nodes in dangling links.</returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="deBruijnGraph"/> is null.</exception>
        public DeBruijnPathList DetectErroneousNodes(DeBruijnGraph deBruijnGraph)
        {
            if (deBruijnGraph == null)
            {
                throw new ArgumentNullException("deBruijnGraph");
            }

            var collected = new ConcurrentBag<DeBruijnPath>();

            Parallel.ForEach(deBruijnGraph.GetNodes(), candidate =>
            {
                // Single node island: no extensions at all.
                if (candidate.ExtensionsCount == 0)
                {
                    collected.Add(new DeBruijnPath(candidate));
                    return;
                }

                // Nothing to the right: possible end of a dangling link, trace it back.
                if (candidate.RightExtensionNodesCount == 0)
                {
                    var traced = TraceDanglingExtensionLink(false, new DeBruijnPath(), candidate, true);
                    if (traced != null && traced.PathNodes.Count > 0)
                    {
                        collected.Add(traced);
                    }

                    return;
                }

                // Nothing to the left: same treatment in the other direction.
                if (candidate.LeftExtensionNodesCount == 0)
                {
                    var traced = TraceDanglingExtensionLink(true, new DeBruijnPath(), candidate, true);

                    // If the first node is below the threshold it is not added, leaving a
                    // link with no nodes, so the emptiness check is needed.
                    if (traced != null && traced.PathNodes.Count > 0)
                    {
                        collected.Add(traced);
                    }

                    return;
                }

                // A node whose only extension is a reference to itself.
                if (candidate.ContainsSelfReference && candidate.ExtensionsCount == 1)
                {
                    collected.Add(new DeBruijnPath(candidate));
                }
            });

            return new DeBruijnPathList(collected);
        }
Example #6
0
        /// <summary>
        /// Detect nodes that are part of dangling links.
        /// The node array is split into index ranges (about one per processor) that are
        /// scanned in parallel; each range yields the dangling links found in it.
        /// An empty graph yields an empty list.
        /// Locks: Method only does reads. No locking necessary here or its callees.
        /// </summary>
        /// <param name="graph">Input graph</param>
        /// <returns>List of nodes in dangling links</returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="graph"/> is null.</exception>
        public DeBruijnPathList DetectErroneousNodes(DeBruijnGraph graph)
        {
            if (graph == null)
            {
                throw new ArgumentNullException("graph");
            }

            DeBruijnNode[] graphNodesArray = graph.Nodes.ToArray();

            // Partitioner.Create rejects an empty range and a non-positive range size,
            // so a graph without nodes must be handled before partitioning.
            if (graphNodesArray.Length == 0)
            {
                return new DeBruijnPathList(ParallelEnumerable.Empty<DeBruijnPath>());
            }

            // Ceiling of Length / ProcessorCount in integer arithmetic (Length >= 1 here),
            // avoiding float rounding on very large arrays.
            int rangeSize = ((graphNodesArray.Length - 1) / Environment.ProcessorCount) + 1;

            DeBruijnPathList danglingNodesList = new DeBruijnPathList(
                Partitioner.Create(0, graphNodesArray.Length, rangeSize).AsParallel().SelectMany(chunk =>
            {
                // chunk is a half-open index range [Item1, Item2) into graphNodesArray.
                List<DeBruijnPath> danglingLinks = new List<DeBruijnPath>();
                for (int i = chunk.Item1; i < chunk.Item2; i++)
                {
                    DeBruijnNode node = graphNodesArray[i];
                    if (node.ExtensionsCount == 0)
                    {
                        // Single node island
                        danglingLinks.Add(new DeBruijnPath(node));
                    }
                    else if (node.RightExtensionNodes.Count == 0)
                    {
                        // End of possible dangling link
                        // Traceback to see if it is part of a dangling link
                        var link = TraceDanglingExtensionLink(false, new DeBruijnPath(), node, true);
                        if (link != null)
                        {
                            danglingLinks.Add(link);
                        }
                    }
                    else if (node.LeftExtensionNodes.Count == 0)
                    {
                        // End of possible dangling link
                        // Traceback to see if it is part of a dangling link
                        var link = TraceDanglingExtensionLink(true, new DeBruijnPath(), node, true);
                        if (link != null)
                        {
                            danglingLinks.Add(link);
                        }
                    }
                }

                return danglingLinks;
            }));

            return danglingNodesList;
        }
Example #7
0
        /// <summary>
        /// Extract best path from list of paths. For the current cluster
        /// of paths, return only those that should be removed.
        /// The best path is taken out of <paramref name="divergingPaths"/> and any node it
        /// shares with the remaining paths is stripped from them; the input list is
        /// modified in place and returned.
        /// </summary>
        /// <param name="divergingPaths">List of redundant paths.</param>
        /// <returns>List of paths nodes to be deleted.</returns>
        private static DeBruijnPathList ExtractBestPath(DeBruijnPathList divergingPaths)
        {
            // Find "best" path. Except for best path, return rest for removal
            int bestPathIndex = GetBestPath(divergingPaths);
            if (bestPathIndex < 0)
            {
                // GetBestPath found no candidate (e.g. the list is empty): there is no
                // best path to keep, so the list is returned untouched instead of
                // indexing out of range.
                return divergingPaths;
            }

            DeBruijnPath bestPath = divergingPaths.Paths[bestPathIndex];

            divergingPaths.Paths.RemoveAt(bestPathIndex);

            // There can be overlap between redundant paths.
            // Remove path nodes that occur in best path. A set gives constant-time
            // membership checks instead of rescanning the best path for every node.
            HashSet<DeBruijnNode> bestPathNodes = new HashSet<DeBruijnNode>(bestPath.PathNodes);
            foreach (var path in divergingPaths.Paths)
            {
                path.RemoveAll(n => bestPathNodes.Contains(n));
            }

            return divergingPaths;
        }
Example #8
0
        /// <summary>
        /// Picks the winner among diverging paths: the path whose nodes have the largest
        /// total k-mer count. On ties the earliest such path wins.
        /// </summary>
        /// <param name="divergingPaths">List of diverging paths.</param>
        /// <returns>Index of the best path, or -1 when no path scores above -1 (e.g. the list is empty).</returns>
        private static int GetBestPath(DeBruijnPathList divergingPaths)
        {
            long bestScore = -1;
            int bestIndex = -1;

            for (int index = 0; index < divergingPaths.Paths.Count; index++)
            {
                // Score of a path = sum of the k-mer counts of its nodes.
                long score = divergingPaths.Paths[index].PathNodes.Sum(node => node.KmerCount);
                if (score <= bestScore)
                {
                    continue;
                }

                bestScore = score;
                bestIndex = index;
            }

            return bestIndex;
        }
Example #9
0
        /// <summary>
        /// Removes nodes that are part of dangling links.
        /// Each path is passed to RemoveLinkNodes together with the set of every path's last
        /// node, and the path's nodes are then handed to the graph for removal.
        /// </summary>
        /// <param name="deBruijnGraph">Input graph.</param>
        /// <param name="nodesList">List of dangling link nodes.</param>
        /// <exception cref="ArgumentNullException">Thrown when either argument is null.</exception>
        public void RemoveErroneousNodes(DeBruijnGraph deBruijnGraph, DeBruijnPathList nodesList)
        {
            if (deBruijnGraph == null)
            {
                throw new ArgumentNullException("deBruijnGraph");
            }

            if (nodesList == null)
            {
                throw new ArgumentNullException("nodesList");
            }

            // Terminal node of every dangling link, gathered up front.
            var terminalNodes = new HashSet<DeBruijnNode>(
                nodesList.Paths.Select(path => path.PathNodes.Last()));

            // Lazy parallel query: as RemoveNodes enumerates it, each link's extensions
            // are updated and the link's nodes are yielded for deletion.
            var nodesToDelete = nodesList.Paths.AsParallel().SelectMany(link =>
            {
                RemoveLinkNodes(link, terminalNodes);
                return link.PathNodes;
            });

            deBruijnGraph.RemoveNodes(nodesToDelete);
        }
Example #10
0
        /// <summary>
        /// Calls indel genotypes from a collection of graph paths.
        /// Each usable path is aligned against the matching section of the reference; the
        /// indels found are grouped by start position and, per position, the minimum k-mer
        /// count of every path is credited to the allele that path carries there.
        /// </summary>
        /// <param name="paths">Paths to call indels from.</param>
        /// <param name="graph">Graph the paths come from; supplies the k-mer length.</param>
        /// <returns>One genotype per distinct indel start; empty when no path yields usable data.</returns>
        public static List<ContinuousFrequencyIndelGenotype> CallIndelsFromPathCollection(DeBruijnPathList paths, DeBruijnGraph graph)
        {
            var sequences = paths.Paths
                                 .Select(z => new IndelData(z, graph.KmerLength))
                                 .Where(z => z.OkayData)
                                 .ToList();
            if (sequences.Count == 0)
            {
                return new List<ContinuousFrequencyIndelGenotype>();
            }

            // Reference window covering every candidate, padded on both sides.
            var regionStart = sequences.Min(x => x.LikelyStart) - IndelPathCollection.AlignmentPadding - graph.KmerLength;
            var regionEnd = sequences.Max(x => x.LikelyEnd) + IndelPathCollection.AlignmentPadding + graph.KmerLength;
            var reference = HaploGrepSharp.ReferenceGenome.GetReferenceSequenceSection(regionStart, regionEnd);

            // Aligner configuration.
            var aligner = new Bio.Algorithms.Alignment.SmithWatermanAligner
            {
                SimilarityMatrix = new Bio.SimilarityMatrices.DiagonalSimilarityMatrix(1, -1),
                GapOpenCost = -2,
                GapExtensionCost = -1
            };

            // Align every sequence and remember the indels each one produced.
            var indelsBySequence = new Dictionary<IndelData, List<IndelLocation>>();
            foreach (var sequence in sequences)
            {
                // Note, do not change alignment order here.
                var alignment = aligner.Align(reference.Seq, sequence.Seq);
                var aligned = alignment[0].PairwiseAlignedSequences[0];
                indelsBySequence[sequence] = FindIndels(aligned);
            }

            // Group the distinct indels by where they start.
            var indelsByStart = indelsBySequence.Values
                                                .SelectMany(z => z)
                                                .Distinct()
                                                .GroupBy(z => z.Start);
            var genotypes = new List<ContinuousFrequencyIndelGenotype>(10);

            // Note: Typically there will only be one Indel location per alignment
            foreach (var group in indelsByStart)
            {
                var alleles = group.ToList();
                var distinctKinds = alleles.Select(x => x.DeletionOnReference).Distinct().Count();
                if (distinctKinds > 1)
                {
                    throw new NotImplementedException("Same location had indels present on both reference and reads.  This is an edge case not handled");
                }

                // Add a fake reference ("no indel") allele, then sort.
                // NOTE(review): the counting below treats index 0 as the no-indel allele,
                // so it presumably sorts first - confirm against IndelLocation's ordering.
                var first = alleles[0];
                var noIndel = new IndelLocation(first.DeletionOnReference, first.Start, 0)
                {
                    InsertedSequence = String.Empty
                };
                alleles.Add(noIndel);
                alleles.Sort();

                double[] counts = new double[alleles.Count];
                foreach (var sequence in sequences)
                {
                    var observed = indelsBySequence[sequence].FirstOrDefault(x => x.Start == first.Start);
                    if (observed == null)
                    {
                        // No indel at this position: credit slot 0.
                        counts[0] += sequence.MinKmerCount;
                        continue;
                    }

                    // Locate the matching allele, searching from slot 1 onwards.
                    int slot = alleles.FindIndex(1, allele => allele.Equals(observed));
                    if (slot < 0)
                    {
                        throw new InvalidProgramException("Point should never be reached");
                    }

                    counts[slot] += sequence.MinKmerCount;
                }

                var types = alleles.Select(x => x.InsertedSequence).ToList();
                var referenceLocation = HaploGrepSharp.ReferenceGenome.ConvertTorCRSPosition(first.Start + regionStart);
                genotypes.Add(new ContinuousFrequencyIndelGenotype(first.DeletionOnReference, types, counts, referenceLocation));
            }

            return genotypes;
        }
Example #11
0
 /// <summary>
 /// Gets the end node of a redundant path cluster.
 /// Every path in the input belongs to the same cluster, so they all share the
 /// same start and end node; the last node of the first path is returned.
 /// </summary>
 /// <param name="paths">List of redundant paths.</param>
 /// <returns>End node of redundant path cluster.</returns>
 private static DeBruijnNode GetEndNode(DeBruijnPathList paths)
 {
     var firstPath = paths.Paths.First();
     DeBruijnNode endNode = firstPath.PathNodes.Last();
     return endNode;
 }
        /// <summary>
        /// Traces diverging paths in given direction.
        /// For each path in the set of diverging paths, extend path by one node
        /// at a time. Continue this until all diverging paths converge to a
        /// single node or length threshold is exceeded.
        /// If paths converge, add path cluster containing list of redundant
        /// path nodes to list of redundant paths and return.
        /// If the threshold is exceeded or every path finishes without convergence,
        /// nothing is added.
        /// </summary>
        /// <param name="startNode">Node at starting point of divergence.</param>
        /// <param name="divergingNodes">List of diverging nodes, each mapped to the orientation flag used to seed its path.</param>
        /// <param name="isRightExtension">Bool indicating direction of divergence.</param>
        /// <param name="redundantPaths">List of redundant paths; appended to under a lock on the list itself.</param>
        private void TraceDivergingExtensionPaths(
            DeBruijnNode startNode,
            Dictionary <DeBruijnNode, bool> divergingNodes,
            bool isRightExtension,
            List <DeBruijnPathList> redundantPaths)
        {
            // Make a new path for each diverging node: all share the same start node and
            // differ in their second node and its orientation.
            List <PathWithOrientation> divergingPaths = new List <PathWithOrientation>(
                divergingNodes.Select(n =>
                                      new PathWithOrientation(startNode, n.Key, n.Value)));
            // Starts at 2, presumably because each seeded path already holds the start
            // node plus one diverging node.
            int maxDivergingPathLength = 2;

            // Extend paths till length threshold is exceeded.
            // In case paths converge within threshold, we return from inside the while.

            // Make a list of paths that we have finished following, this would be any path that
            // has reached a divergent end, possibly before the others have.
            var finishedPaths = new List <PathWithOrientation>(divergingPaths.Count);

            while (maxDivergingPathLength <= this.pathLengthThreshold && finishedPaths.Count != divergingPaths.Count)
            {
                // Extend each path in cluster. While performing path extension
                // also keep track of whether they have converged.
                // The count is captured up front because branching appends new paths to
                // divergingPaths during this pass; those are extended in the next pass.
                var startCount = divergingPaths.Count;
                for (int k = 0; k < startCount; k++)
                {
                    var path = divergingPaths [k];
                    if (finishedPaths.Contains(path))
                    {
                        continue;
                    }

                    /* We go left if we are already heading left in the same orientation, or if
                     * we are heading right with a different orientation */
                    var grabLeftNext = isRightExtension ^ path.IsSameOrientation;
                    var endNode      = path.Nodes.Last();
                    var nextNodes    = grabLeftNext ? endNode.GetLeftExtensionNodesWithOrientation() : endNode.GetRightExtensionNodesWithOrientation();
                    // If this path ends, we don't continue to follow it.
                    if (nextNodes.Count == 0)
                    {
                        finishedPaths.Add(path);
                        continue;
                    }
                    // When the path branches, keep an unextended copy so every additional
                    // branch can start from the state before this step.
                    PathWithOrientation oldPath = null;
                    if (nextNodes.Count > 1)
                    {
                        oldPath = new PathWithOrientation(path);
                    }

                    for (int i = 0; i < nextNodes.Count; i++)
                    {
                        KeyValuePair <DeBruijnNode, bool> nextNode = nextNodes.ElementAt(i);
                        // The first branch extends the existing path in place; each further
                        // branch gets a copy of the pre-extension path, added to the cluster.
                        if (i > 0)
                        {
                            path = new PathWithOrientation(oldPath);
                            divergingPaths.Add(path);
                        }
                        if (path.Nodes.Contains(nextNode.Key))
                        {
                            // Loop in path: stop following it.
                            // TODO: Not necessarily true, could overlap with itself but go out the other way
                            finishedPaths.Add(path);
                            continue;
                        }
                        else
                        {
                            // Update path orientation (stays "same" only when the current flag
                            // and the next node's flag agree), then append the node.
                            path.IsSameOrientation = !(path.IsSameOrientation ^ nextNode.Value);
                            path.Nodes.Add(nextNode.Key);
                        }
                    }
                }
                maxDivergingPathLength++;

                /* Now check for convergence: this is true if all paths can end with the same node,
                 * equivalent to all paths having the same node somewhere.
                 * The scan walks the first path from index 1 (skipping index 0, the shared
                 * start node) and takes the first node that every other path also contains.
                 * TODO: Slow implementation, brute force over all paths and nodes;
                 * a first step would be to search only over the smallest possible path */
                var          firstPathNodes = divergingPaths[0].Nodes;
                DeBruijnNode endingNode     = null;
                for (int i = 1; i < firstPathNodes.Count; i++)
                {
                    var presentInAll = true;
                    var cur_node     = firstPathNodes[i];
                    for (int k = 1; k < divergingPaths.Count; k++)
                    {
                        var c_path = divergingPaths[k];
                        if (!c_path.Nodes.Contains(cur_node))
                        {
                            presentInAll = false;
                            break;
                        }
                    }
                    if (presentInAll)
                    {
                        endingNode = cur_node;
                        break;
                    }
                }
                // Paths have been extended. Act on convergence if a common node was found.
                if (endingNode != null)
                {
                    DeBruijnPathList dpl = new DeBruijnPathList(divergingPaths.Count);
                    // They have all converged: trim off any nodes past the common ending node.
                    for (int i = 0; i < divergingPaths.Count; i++)
                    {
                        var          cur_path = divergingPaths[i];
                        DeBruijnPath dp;
                        if (endingNode != cur_path.Nodes.Last())
                        {
                            // Keep everything up to and including the ending node.
                            var indexOfEnd = cur_path.Nodes.IndexOf(endingNode);
                            dp = new DeBruijnPath(cur_path.Nodes.Take(indexOfEnd + 1));
                        }
                        else
                        {
                            dp = new DeBruijnPath(cur_path.Nodes);
                        }
                        dpl.AddPath(dp);
                    }

                    // Note: all paths have the same end node.
                    // The shared result list is guarded by locking on the list itself.
                    lock (redundantPaths)
                    {
                        // Redundant paths found
                        redundantPaths.Add(dpl);
                    }
                    return;
                }
            }
        }