/// <summary>
/// Deletes every node that belongs to one of the supplied redundant paths.
/// Surviving neighbors are detached from the doomed nodes before the nodes
/// themselves are removed from the graph.
/// </summary>
/// <param name="deBruijnGraph">De Bruijn graph.</param>
/// <param name="nodesList">Path nodes to be deleted.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="nodesList"/> is null.</exception>
public void RemoveErroneousNodes(DeBruijnGraph deBruijnGraph, DeBruijnPathList nodesList)
{
    DeBruijnGraph.ValidateGraph(deBruijnGraph);

    if (nodesList == null)
    {
        throw new ArgumentNullException("nodesList");
    }

    this.graph = deBruijnGraph;

    // Flatten all paths into a single de-duplicated set of nodes to delete.
    var flattenedPathNodes = nodesList.Paths.AsParallel().SelectMany(path => path.PathNodes);
    HashSet<DeBruijnNode> doomedNodes = new HashSet<DeBruijnNode>(flattenedPathNodes);

    // Detach the doomed nodes from their surviving neighbors.
    // No read-write lock is taken: the doomed node's own extensions are only read,
    // and only other nodes' extension tables are updated (through the thread-safe call).
    Parallel.ForEach(
        doomedNodes,
        doomed =>
        {
            foreach (DeBruijnNode neighbor in doomed.GetExtensionNodes())
            {
                // A neighbor that is itself being deleted does not need updating.
                if (doomedNodes.Contains(neighbor))
                {
                    continue;
                }

                neighbor.RemoveExtensionThreadSafe(doomed);
            }
        });

    // Finally drop the nodes from the graph.
    this.graph.RemoveNodes(doomedNodes);
}
/// <summary>
/// Step 3: Remove dangling links from graph.
/// Does nothing when no purger is configured or the dangling threshold is not positive.
/// </summary>
protected void UnDangleGraph()
{
    if (this.DanglingLinksPurger == null || this.DanglingLinksThreshold <= 0)
    {
        return;
    }

    DeBruijnPathList danglingNodes = null;

    // Optimization: rather than raising the threshold one step at a time and
    // re-running the purger, first find which dangling-link lengths actually
    // occur in the graph and run the purger only for those lengths.
    this.DanglingLinksPurger.LengthThreshold = this.DanglingLinksThreshold - 1;

    // When the purger can also erode graph ends (and erosion is allowed), collecting
    // the lengths erodes low coverage ends as well; this marks any node below a
    // threshold for deletion. Otherwise every length from 1 up to
    // (threshold - 1) is tried.
    // TODO: Verify that this does enumerate all dangling ends. The concern is that if
    // dangling ends of length 7 and 2 arrive at a node which itself would be a dangling
    // node of length 2 without these "dangling ends", then a dangling end of 9
    // (which it would be without either the 7 or 2 end) might not be reported.
    IGraphEndsEroder graphEndsEroder = this.DanglingLinksPurger as IGraphEndsEroder;
    IEnumerable<int> danglingLengths = (graphEndsEroder != null && this.AllowErosion)
        ? graphEndsEroder.ErodeGraphEnds(this.Graph, this.ErosionThreshold)
        : Enumerable.Range(1, this.DanglingLinksThreshold - 1);

    // Erosion is to be done only once. Reset erode threshold to -1.
    this.ErosionThreshold = -1;

    // Purge once for each collected length, skipping lengths larger than the graph.
    foreach (int threshold in danglingLengths)
    {
        if (this.Graph.NodeCount < threshold)
        {
            continue;
        }

        this.DanglingLinksPurger.LengthThreshold = threshold;
        danglingNodes = this.DanglingLinksPurger.DetectErroneousNodes(this.Graph);
        this.DanglingLinksPurger.RemoveErroneousNodes(this.Graph, danglingNodes);
    }

    // Removing dangling links can in turn create more dangling links, so keep
    // purging at the full threshold until a pass finds nothing to remove
    // (or the graph becomes smaller than the threshold).
    while (this.Graph.NodeCount >= this.DanglingLinksThreshold)
    {
        this.DanglingLinksPurger.LengthThreshold = this.DanglingLinksThreshold;
        danglingNodes = this.DanglingLinksPurger.DetectErroneousNodes(this.Graph);
        this.DanglingLinksPurger.RemoveErroneousNodes(this.Graph, danglingNodes);

        if (danglingNodes == null || danglingNodes.Paths.Count == 0)
        {
            break;
        }
    }
}
/// <summary>
/// Step 3: Remove dangling links from graph.
/// Does nothing when no purger is configured or the dangle threshold is not positive.
/// </summary>
protected void UnDangleGraph()
{
    if (_danglingLinksPurger == null || _dangleThreshold <= 0)
    {
        return;
    }

    DeBruijnPathList danglingNodes = null;

    // Optimization: rather than raising the threshold one step at a time and
    // re-running the purger, first find which dangling-link lengths actually
    // occur in the graph and run the purger only for those lengths.
    _danglingLinksPurger.LengthThreshold = _dangleThreshold - 1;

    // When the purger can also erode graph ends (and erosion is enabled), collecting
    // the lengths erodes the low coverage ends as well. Otherwise every length
    // from 1 up to (threshold - 1) is tried.
    IGraphEndsEroder graphEndsEroder = _danglingLinksPurger as IGraphEndsEroder;
    IEnumerable<int> danglingLengths = (graphEndsEroder != null && _isErosionEnabled)
        ? graphEndsEroder.ErodeGraphEnds(_graph, _erosionThreshold)
        : Enumerable.Range(1, _dangleThreshold - 1);

    // Erosion is to be done only once. Reset erode threshold to -1.
    _erosionThreshold = -1;

    // Purge once for each collected length, skipping lengths larger than the graph.
    foreach (int threshold in danglingLengths)
    {
        if (_graph.Nodes.Count < threshold)
        {
            continue;
        }

        _danglingLinksPurger.LengthThreshold = threshold;
        danglingNodes = _danglingLinksPurger.DetectErroneousNodes(_graph);
        _danglingLinksPurger.RemoveErroneousNodes(_graph, danglingNodes);
    }

    // Removing dangling links can in turn create more dangling links, so keep
    // purging at the full threshold until a pass finds nothing to remove
    // (or the graph becomes smaller than the threshold).
    while (_graph.Nodes.Count >= _dangleThreshold)
    {
        _danglingLinksPurger.LengthThreshold = _dangleThreshold;
        danglingNodes = _danglingLinksPurger.DetectErroneousNodes(_graph);
        _danglingLinksPurger.RemoveErroneousNodes(_graph, danglingNodes);

        if (danglingNodes == null || danglingNodes.Paths.Count == 0)
        {
            break;
        }
    }
}
/// <summary>
/// Detect nodes that are part of dangling links.
/// Graph nodes are scanned in parallel (producer side) while a separate task
/// gathers the discovered paths into the result list (consumer side).
/// Locks: Method only does reads. No locking necessary here or its callees.
/// </summary>
/// <param name="deBruijnGraph">Input graph.</param>
/// <returns>List of nodes in dangling links.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="deBruijnGraph"/> is null.</exception>
public DeBruijnPathList DetectErroneousNodes(DeBruijnGraph deBruijnGraph)
{
    if (deBruijnGraph == null)
    {
        throw new ArgumentNullException("deBruijnGraph");
    }

    BlockingCollection<DeBruijnPath> debruijnPaths = new BlockingCollection<DeBruijnPath>();
    DeBruijnPathList danglingNodesList = null;

    // Consumer: builds the result list from the paths as they are produced.
    // NOTE(review): presumably GetPaths drains the collection until adding is completed - confirm.
    Task collectionTask = Task.Factory.StartNew(() =>
    {
        danglingNodesList = new DeBruijnPathList(this.GetPaths(debruijnPaths));
    });

    try
    {
        // Producer: classify every node and publish the dangling links found.
        Parallel.ForEach(
            deBruijnGraph.GetNodes(),
            node =>
            {
                if (node.ExtensionsCount == 0)
                {
                    // Single node island
                    debruijnPaths.Add(new DeBruijnPath(node));
                }
                else if (node.RightExtensionNodesCount == 0)
                {
                    // End of possible dangling link
                    // Trace back to see if it is part of a dangling link
                    var link = TraceDanglingExtensionLink(false, new DeBruijnPath(), node, true);
                    if (link != null)
                    {
                        debruijnPaths.Add(link);
                    }
                }
                else if (node.LeftExtensionNodesCount == 0)
                {
                    // End of possible dangling link
                    // Trace back to see if it is part of a dangling link
                    var link = TraceDanglingExtensionLink(true, new DeBruijnPath(), node, true);
                    if (link != null)
                    {
                        debruijnPaths.Add(link);
                    }
                }
            });
    }
    finally
    {
        // Always signal completion, even when the scan throws; otherwise the
        // consumer task could stay blocked waiting for more items forever.
        debruijnPaths.CompleteAdding();
    }

    Task.WaitAll(collectionTask);

    return danglingNodesList;
}
/// <summary>
/// Detect nodes that are part of dangling links.
/// Every graph node is inspected in parallel; isolated nodes, dangling links traced
/// from dead ends, and nodes whose only extension is a self reference are collected.
/// Locks: Method only does reads. No locking necessary here or its callees.
/// </summary>
/// <param name="deBruijnGraph">Input graph.</param>
/// <returns>List of nodes in dangling links.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="deBruijnGraph"/> is null.</exception>
public DeBruijnPathList DetectErroneousNodes(DeBruijnGraph deBruijnGraph)
{
    if (deBruijnGraph == null)
    {
        throw new ArgumentNullException("deBruijnGraph");
    }

    var collectedPaths = new ConcurrentBag<DeBruijnPath>();

    Parallel.ForEach(
        deBruijnGraph.GetNodes(),
        node =>
        {
            if (node.ExtensionsCount == 0)
            {
                // Single node island
                collectedPaths.Add(new DeBruijnPath(node));
                return;
            }

            // A node with no extensions on one side is the end of a possible dangling link.
            // The right side is checked first; the left side only when the right has extensions.
            bool hasNoRightExtensions = node.RightExtensionNodesCount == 0;
            if (hasNoRightExtensions || node.LeftExtensionNodesCount == 0)
            {
                // Trace back to see if it is part of a dangling link.
                // First argument: false when the right side is empty, true when the left side is.
                var link = TraceDanglingExtensionLink(!hasNoRightExtensions, new DeBruijnPath(), node, true);

                // If the first node is below the threshold it is not added, leaving a link
                // with no nodes, so the emptiness check is needed.
                if (link != null && link.PathNodes.Count > 0)
                {
                    collectedPaths.Add(link);
                }

                return;
            }

            // A node whose single extension is a reference to itself is collected on its own.
            if (node.ContainsSelfReference && node.ExtensionsCount == 1)
            {
                collectedPaths.Add(new DeBruijnPath(node));
            }
        });

    return new DeBruijnPathList(collectedPaths);
}
/// <summary>
/// Detect nodes that are part of dangling links.
/// The node array is split into one range per processor and the ranges are scanned in parallel.
/// An empty graph yields an empty list.
/// Locks: Method only does reads. No locking necessary here or its callees.
/// </summary>
/// <param name="graph">Input graph</param>
/// <returns>List of nodes in dangling links</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="graph"/> is null.</exception>
public DeBruijnPathList DetectErroneousNodes(DeBruijnGraph graph)
{
    if (graph == null)
    {
        throw new ArgumentNullException("graph");
    }

    DeBruijnNode[] graphNodesArray = graph.Nodes.ToArray();

    // Partitioner.Create rejects an empty range and a non-positive range size,
    // so a graph without nodes must be handled before partitioning.
    if (graphNodesArray.Length == 0)
    {
        return new DeBruijnPathList(Enumerable.Empty<DeBruijnPath>());
    }

    // Ceiling of (node count / processor count) in integer arithmetic; always at least 1 here.
    int rangeSize = ((graphNodesArray.Length - 1) / Environment.ProcessorCount) + 1;

    DeBruijnPathList danglingNodesList = new DeBruijnPathList(
        Partitioner.Create(0, graphNodesArray.Length, rangeSize).AsParallel().SelectMany(chunk =>
        {
            List<DeBruijnPath> danglingLinks = new List<DeBruijnPath>();
            for (int i = chunk.Item1; i < chunk.Item2; i++)
            {
                DeBruijnNode node = graphNodesArray[i];
                if (node.ExtensionsCount == 0)
                {
                    // Single node island
                    danglingLinks.Add(new DeBruijnPath(node));
                }
                else if (node.RightExtensionNodes.Count == 0)
                {
                    // End of possible dangling link
                    // Traceback to see if it is part of a dangling link
                    var link = TraceDanglingExtensionLink(false, new DeBruijnPath(), node, true);
                    if (link != null)
                    {
                        danglingLinks.Add(link);
                    }
                }
                else if (node.LeftExtensionNodes.Count == 0)
                {
                    // End of possible dangling link
                    // Traceback to see if it is part of a dangling link
                    var link = TraceDanglingExtensionLink(true, new DeBruijnPath(), node, true);
                    if (link != null)
                    {
                        danglingLinks.Add(link);
                    }
                }
            }

            return danglingLinks;
        }));

    return danglingNodesList;
}
/// <summary>
/// Extract best path from list of paths. For the current cluster
/// of paths, return only those that should be removed.
/// The input list is modified in place: the best path is taken out of it and
/// nodes shared with the best path are stripped from the remaining paths.
/// </summary>
/// <param name="divergingPaths">List of redundant paths.</param>
/// <returns>List of paths nodes to be deleted (the same instance that was passed in).</returns>
private static DeBruijnPathList ExtractBestPath(DeBruijnPathList divergingPaths)
{
    // Find "best" path. Except for best path, return rest for removal
    int bestPathIndex = GetBestPath(divergingPaths);
    DeBruijnPath bestPath = divergingPaths.Paths[bestPathIndex];
    divergingPaths.Paths.RemoveAt(bestPathIndex);

    // There can be overlap between redundant paths.
    // Build the lookup once so each membership test is O(1) instead of
    // a linear scan of the best path for every node of every other path.
    HashSet<DeBruijnNode> bestPathNodes = new HashSet<DeBruijnNode>(bestPath.PathNodes);

    // Remove path nodes that occur in best path
    foreach (var path in divergingPaths.Paths)
    {
        path.RemoveAll(n => bestPathNodes.Contains(n));
    }

    return divergingPaths;
}
/// <summary>
/// Gets the best path from the list of diverging paths.
/// The best path is the one whose nodes have the largest total k-mer count.
/// When several paths tie, the first one encountered wins.
/// </summary>
/// <param name="divergingPaths">List of diverging paths.</param>
/// <returns>Index of the best path, or -1 when the list holds no paths.</returns>
private static int GetBestPath(DeBruijnPathList divergingPaths)
{
    long bestScore = -1;
    int bestIndex = -1;

    int currentIndex = 0;
    foreach (var candidate in divergingPaths.Paths)
    {
        // Score of a path = sum of the 'count' of its k-mers.
        long score = candidate.PathNodes.Sum(n => n.KmerCount);

        // Strict comparison keeps the earliest path on ties.
        if (score > bestScore)
        {
            bestScore = score;
            bestIndex = currentIndex;
        }

        currentIndex++;
    }

    return bestIndex;
}
/// <summary>
/// Removes nodes that are part of dangling links.
/// Paths that contain no nodes are ignored.
/// </summary>
/// <param name="deBruijnGraph">Input graph.</param>
/// <param name="nodesList">List of dangling link nodes.</param>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="deBruijnGraph"/> or <paramref name="nodesList"/> is null.
/// </exception>
public void RemoveErroneousNodes(DeBruijnGraph deBruijnGraph, DeBruijnPathList nodesList)
{
    // Argument Validation
    if (deBruijnGraph == null)
    {
        throw new ArgumentNullException("deBruijnGraph");
    }

    if (nodesList == null)
    {
        throw new ArgumentNullException("nodesList");
    }

    // A traced link can end up with no nodes at all; such a path has no last node
    // (Last() would throw) and contributes nothing to remove, so skip it up front.
    var nonEmptyPaths = nodesList.Paths.Where(path => path.PathNodes.Any()).ToList();

    HashSet<DeBruijnNode> lastNodes = new HashSet<DeBruijnNode>(
        nonEmptyPaths.Select(path => path.PathNodes.Last()));

    // Update extensions and Delete nodes from graph.
    deBruijnGraph.RemoveNodes(
        nonEmptyPaths.AsParallel().SelectMany(nodes =>
        {
            RemoveLinkNodes(nodes, lastNodes);
            return nodes.PathNodes;
        }));
}
/// <summary>
/// Calls indels from a collection of graph paths. Each usable path is aligned against
/// the matching section of the reference, the indels found are grouped by start
/// position, and one genotype is produced per position with counts accumulated
/// from the minimum k-mer count of each path.
/// </summary>
/// <param name="paths">Paths to call indels from.</param>
/// <param name="graph">Graph the paths come from; supplies the k-mer length.</param>
/// <returns>One genotype per distinct indel start position; empty when no path is usable.</returns>
/// <exception cref="NotImplementedException">
/// Thrown when one start position has indels both on the reference and on the reads.
/// </exception>
public static List<ContinuousFrequencyIndelGenotype> CallIndelsFromPathCollection(DeBruijnPathList paths, DeBruijnGraph graph)
{
    var sequences = paths.Paths
        .Select(z => new IndelData(z, graph.KmerLength))
        .Where(z => z.OkayData)
        .ToList();
    if (sequences.Count == 0)
    {
        return new List<ContinuousFrequencyIndelGenotype>();
    }

    // Get the reference sequence, padded on both sides.
    var regionStart = sequences.Min(x => x.LikelyStart) - IndelPathCollection.AlignmentPadding - graph.KmerLength;
    var regionEnd = sequences.Max(x => x.LikelyEnd) + IndelPathCollection.AlignmentPadding + graph.KmerLength;
    var reference = HaploGrepSharp.ReferenceGenome.GetReferenceSequenceSection(regionStart, regionEnd);

    // Setup the aligner with appropriate parameters.
    var aligner = new Bio.Algorithms.Alignment.SmithWatermanAligner
    {
        SimilarityMatrix = new Bio.SimilarityMatrices.DiagonalSimilarityMatrix(1, -1),
        GapOpenCost = -2,
        GapExtensionCost = -1
    };

    // Execute the alignments and record the indels found for every sequence.
    var indelsBySequence = new Dictionary<IndelData, List<IndelLocation>>();
    foreach (var sequence in sequences)
    {
        // Note, do not change alignment order here.
        var alignment = aligner.Align(reference.Seq, sequence.Seq);
        var pairwise = alignment[0].PairwiseAlignedSequences[0];
        indelsBySequence[sequence] = FindIndels(pairwise);
    }

    // Now to group indels by unique starts and collect them.
    var locations = indelsBySequence.Values
        .SelectMany(z => z)
        .ToList()
        .Distinct()
        .GroupBy(z => z.Start);

    var toReturn = new List<ContinuousFrequencyIndelGenotype>(10);

    // Note: Typically there will only be one Indel location per alignment
    foreach (var locationGroup in locations)
    {
        var alleles = locationGroup.ToList();

        var indelKinds = alleles.Select(x => x.DeletionOnReference).Distinct().Count();
        if (indelKinds > 1)
        {
            throw new NotImplementedException("Same location had indels present on both reference and reads. This is an edge case not handled");
        }

        // Add a fake reference allele (length 0, nothing inserted).
        var first = alleles[0];
        var referenceAllele = new IndelLocation(first.DeletionOnReference, first.Start, 0)
        {
            InsertedSequence = String.Empty
        };
        alleles.Add(referenceAllele);

        // Sort by location.
        alleles.Sort();

        // Now add counts from each sequence.
        double[] counts = new double[alleles.Count];
        foreach (var sequence in sequences)
        {
            var atLoc = indelsBySequence[sequence].FirstOrDefault(x => x.Start == first.Start);
            if (atLoc == null)
            {
                // No indel at this position: the count goes to slot 0.
                counts[0] += sequence.MinKmerCount;
                continue;
            }

            // Slot 0 is skipped when looking for the matching allele.
            int alleleIndex = alleles.FindIndex(1, x => x.Equals(atLoc));
            if (alleleIndex < 0)
            {
                throw new InvalidProgramException("Point should never be reached");
            }

            counts[alleleIndex] += sequence.MinKmerCount;
        }

        var types = alleles.Select(x => x.InsertedSequence).ToList();
        var ref_loc = HaploGrepSharp.ReferenceGenome.ConvertTorCRSPosition(first.Start + regionStart);
        toReturn.Add(new ContinuousFrequencyIndelGenotype(first.DeletionOnReference, types, counts, ref_loc));
    }

    return toReturn;
}
/// <summary>
/// Gets end node of redundant path cluster.
/// All paths in the input belong to one redundant path cluster, so they
/// share the same start and end node; the last node of the first path is returned.
/// </summary>
/// <param name="paths">List of redundant paths.</param>
/// <returns>End node of redundant path cluster.</returns>
private static DeBruijnNode GetEndNode(DeBruijnPathList paths)
{
    var firstPath = paths.Paths.First();
    var endNode = firstPath.PathNodes.Last();
    return endNode;
}
/// <summary>
/// Traces diverging paths in given direction.
/// Every unfinished path is extended by one node per round (branching into copies
/// where a node has several extensions) until all paths share a common node or the
/// length threshold is exceeded.
/// If the paths converge, each one is trimmed to end at the common node and the
/// resulting cluster is appended to the list of redundant paths.
/// </summary>
/// <param name="startNode">Node at starting point of divergence.</param>
/// <param name="divergingNodes">List of diverging nodes.</param>
/// <param name="isRightExtension">Bool indicating direction of divergence.</param>
/// <param name="redundantPaths">List of redundant paths.</param>
private void TraceDivergingExtensionPaths(
    DeBruijnNode startNode,
    Dictionary<DeBruijnNode, bool> divergingNodes,
    bool isRightExtension,
    List<DeBruijnPathList> redundantPaths)
{
    // Seed one path per diverging neighbor: same start node, differing second
    // node, with the orientation recorded for that neighbor.
    List<PathWithOrientation> divergingPaths = new List<PathWithOrientation>(
        divergingNodes.Select(n => new PathWithOrientation(startNode, n.Key, n.Value)));

    // Paths that are no longer followed: they hit a dead end or looped back onto
    // themselves, possibly before the other paths did.
    var finishedPaths = new List<PathWithOrientation>(divergingPaths.Count);

    // Every seeded path already holds two nodes. Keep extending until the length
    // threshold is exceeded or every path is finished; convergence returns early.
    for (int currentLength = 2;
         currentLength <= this.pathLengthThreshold && finishedPaths.Count != divergingPaths.Count;
         currentLength++)
    {
        // Only paths present at the start of the round are extended;
        // branches created during the round wait for the next one.
        int pathsAtRoundStart = divergingPaths.Count;
        for (int pathIndex = 0; pathIndex < pathsAtRoundStart; pathIndex++)
        {
            var path = divergingPaths[pathIndex];
            if (finishedPaths.Contains(path))
            {
                continue;
            }

            // We go left if we are already heading left in the same orientation,
            // or if we are heading right with a different orientation.
            bool goLeft = isRightExtension ^ path.IsSameOrientation;
            var tailNode = path.Nodes.Last();
            var candidates = goLeft
                ? tailNode.GetLeftExtensionNodesWithOrientation()
                : tailNode.GetRightExtensionNodesWithOrientation();

            // If this path ends, we don't continue to follow it.
            if (candidates.Count == 0)
            {
                finishedPaths.Add(path);
                continue;
            }

            // When the path branches, keep an untouched copy to clone for every extra branch.
            PathWithOrientation snapshot = candidates.Count > 1 ? new PathWithOrientation(path) : null;

            for (int branch = 0; branch < candidates.Count; branch++)
            {
                KeyValuePair<DeBruijnNode, bool> next = candidates.ElementAt(branch);

                // The first branch extends the path itself; later ones extend a deep copy.
                if (branch > 0)
                {
                    path = new PathWithOrientation(snapshot);
                    divergingPaths.Add(path);
                }

                if (path.Nodes.Contains(next.Key))
                {
                    // Loop in path.
                    // TODO: Not necessarily true, could overlap with itself but go out the other way.
                    finishedPaths.Add(path);
                }
                else
                {
                    // Update path orientation: stays "same" only when both flags agree.
                    path.IsSameOrientation = path.IsSameOrientation == next.Value;
                    path.Nodes.Add(next.Key);
                }
            }
        }

        // Check for convergence: the paths converge when some node of the first path
        // (other than the shared start node) is present in every other path.
        // TODO: Slow brute-force implementation; a first step would be to search only
        // over the smallest possible path.
        DeBruijnNode endingNode = divergingPaths[0].Nodes
            .Skip(1)
            .FirstOrDefault(candidate => divergingPaths.Skip(1).All(other => other.Nodes.Contains(candidate)));

        if (endingNode == null)
        {
            continue;
        }

        // Converged: trim off any nodes after the common node so that
        // all paths have the same end node.
        DeBruijnPathList cluster = new DeBruijnPathList(divergingPaths.Count);
        foreach (var tracedPath in divergingPaths)
        {
            DeBruijnPath trimmedPath;
            if (endingNode != tracedPath.Nodes.Last())
            {
                int endIndex = tracedPath.Nodes.IndexOf(endingNode);
                trimmedPath = new DeBruijnPath(tracedPath.Nodes.Take(endIndex + 1));
            }
            else
            {
                trimmedPath = new DeBruijnPath(tracedPath.Nodes);
            }

            cluster.AddPath(trimmedPath);
        }

        // Redundant paths found; the result list is locked while it is appended to.
        lock (redundantPaths)
        {
            redundantPaths.Add(cluster);
        }

        return;
    }
}