/// <summary>
/// Builds a De Bruijn graph from the small read set with a kmer length of 6 and
/// checks the node count, that every node sequence (or its reverse complement)
/// is among the expected node strings, and the total extension count.
/// </summary>
public void TestDeBruijnGraphBuilderSmall()
{
    const int kmerSize = 6;
    List<ISequence> smallReads = TestInputs.GetSmallReads();

    this.KmerLength = kmerSize;
    this.SequenceReads.Clear();
    this.SetSequenceReads(smallReads);
    this.CreateGraph();

    DeBruijnGraph builtGraph = this.Graph;
    Assert.AreEqual(20, builtGraph.NodeCount);

    HashSet<string> expectedNodes = GetGraphNodesForSmallReads();
    foreach (DeBruijnNode graphNode in builtGraph.GetNodes())
    {
        // A node may be stored in either orientation, so accept the forward
        // text or the reverse-complemented text.
        char[] forwardSymbols = builtGraph.GetNodeSequence(graphNode)
            .Select(symbol => (char)symbol)
            .ToArray();
        string forwardText = new string(forwardSymbols);

        char[] reverseSymbols = builtGraph.GetNodeSequence(graphNode)
            .GetReverseComplementedSequence()
            .Select(symbol => (char)symbol)
            .ToArray();
        string reverseText = new string(reverseSymbols);

        bool isExpectedNode = expectedNodes.Contains(forwardText) || expectedNodes.Contains(reverseText);
        Assert.IsTrue(isExpectedNode);
    }

    long edgeTotal = builtGraph.GetNodes().Select(n => n.ExtensionsCount).Sum();
    Assert.AreEqual(51, edgeTotal);
}
/// <summary>
/// Some path clusters appear twice: once traced in the forward direction and
/// once in the opposite direction. This method keeps only one of each such pair.
/// </summary>
/// <param name="redundantPathClusters">List of path clusters, possibly containing mirrored pairs.</param>
/// <returns>List of unique path clusters.</returns>
private List<DeBruijnPathList> RemoveDuplicates(List<DeBruijnPathList> redundantPathClusters)
{
    // Split the clusters into two groups keyed by "has a mirrored counterpart":
    // key == true  -> some cluster in the list starts where this one ends and ends where this one starts;
    // key == false -> no such counterpart exists.
    List<IGrouping<bool, DeBruijnPathList>> partitions = redundantPathClusters
        .AsParallel()
        .GroupBy(cluster => redundantPathClusters.Any(other =>
            GetStartNode(cluster) == GetEndNode(other) && GetEndNode(cluster) == GetStartNode(other)))
        .ToList();

    List<DeBruijnPathList> distinctClusters = new List<DeBruijnPathList>();
    foreach (IGrouping<bool, DeBruijnPathList> partition in partitions)
    {
        if (partition.Key)
        {
            // Every cluster here has a mirrored counterpart. Keep only the one whose
            // start node sequence is ordinally greater than or equal to its end node
            // sequence, so that one member of each mirrored pair survives.
            distinctClusters.AddRange(
                partition.AsParallel().Where(cluster =>
                {
                    string startText = _graph.GetNodeSequence(GetStartNode(cluster)).ToString();
                    string endText = _graph.GetNodeSequence(GetEndNode(cluster)).ToString();
                    return string.CompareOrdinal(startText, endText) >= 0;
                }));
            continue;
        }

        // Clusters without a mirrored counterpart are kept unchanged.
        distinctClusters.AddRange(partition);
    }

    return distinctClusters;
}
/// <summary>
/// Get simple paths in the graph.
/// </summary>
/// <param name="createContigSequences">
/// When true, sequences of island nodes are added to the returned list
/// (only if the coverage threshold is -1). The flag is also forwarded to TraceSimplePath.
/// </param>
/// <returns>List of simple paths.</returns>
private List<ISequence> GetSimplePaths(bool createContigSequences)
{
    // Clear the visit flags first so that nodes missed by the parallel pass can be found afterwards.
    _graph.SetNodeVisitState(false);

    List<ISequence> simplePaths = new List<ISequence>();
    Parallel.ForEach(_graph.GetNodes(), current =>
    {
        int leftCount = current.LeftExtensionNodesCount;
        int rightCount = current.RightExtensionNodesCount;

        if (leftCount + rightCount == 0)
        {
            // Island: a node without any extensions.
            current.IsVisited = true;

            if (_coverageThreshold != -1)
            {
                // A threshold is set: flag low-coverage islands for deletion.
                if (current.KmerCount < _coverageThreshold)
                {
                    current.MarkNodeForDelete();
                }
            }
            else if (createContigSequences)
            {
                // The list is shared between worker threads, so guard the insertion.
                lock (simplePaths)
                {
                    simplePaths.Add(_graph.GetNodeSequence(current));
                }
            }

            return;
        }

        if (leftCount == 1 && rightCount == 0)
        {
            // End node with a single extension on the left.
            TraceSimplePath(simplePaths, current, false, createContigSequences, true);
        }
        else if (rightCount == 1 && leftCount == 0)
        {
            // End node with a single extension on the right.
            TraceSimplePath(simplePaths, current, true, createContigSequences, true);
        }
    });

    // All paths starting from ends have now been found. Nodes entirely enclosed in a
    // circular loop (small plasmids, etc.) were skipped above; they are handled here
    // sequentially, which also avoids picking up nodes of the same circle concurrently.
    foreach (var unvisited in _graph.GetUnvisitedNodes())
    {
        TraceSimplePath(simplePaths, unvisited, true, createContigSequences, false);
    }

    // Reset the flags again; likely unnecessary, since any method relying on the
    // visit state should reset it independently.
    _graph.SetNodeVisitState(false);

    return simplePaths;
}
/// <summary>
/// Converts the scaffold path into its sequence.
/// Note: the first entry of this path is removed while building the sequence,
/// and every node on the path is marked.
/// </summary>
/// <param name="graph">De Bruijn graph.</param>
/// <param name="kmerLength">Kmer Length.</param>
/// <returns>Scaffold Sequence.</returns>
public ISequence BuildSequenceFromPath(DeBruijnGraph graph, int kmerLength)
{
    if (graph == null)
    {
        throw new ArgumentNullException("graph");
    }

    DeBruijnNode firstNode = this[0].Key;
    bool isForwardDirection = this[0].Value.IsSameOrientation;
    firstNode.MarkNode();

    // Seed the scaffold with the complete sequence of the first contig.
    ISequence scaffold = new Sequence(Alphabets.DNA);
    scaffold.InsertRange(0, graph.GetNodeSequence(firstNode).ToString());
    this.RemoveAt(0);

    // There is overlap of (k-1) symbols between adjacent contigs.
    int overlap = kmerLength > 1 ? kmerLength - 1 : kmerLength;

    bool sameOrientation = true;
    foreach (KeyValuePair<DeBruijnNode, DeBruijnEdge> step in this)
    {
        // The orientation stays "same" only while it agrees with the edge's orientation flag.
        sameOrientation = sameOrientation == step.Value.IsSameOrientation;

        ISequence contig;
        if (sameOrientation)
        {
            contig = graph.GetNodeSequence(step.Key);
        }
        else
        {
            contig = graph.GetNodeSequence(step.Key).ReverseComplement;
        }

        // Extend the scaffold using symbols from the contig beyond the overlap.
        int extensionLength = contig.Count - overlap;
        if (isForwardDirection)
        {
            scaffold.InsertRange(scaffold.Count, contig.Range(overlap, extensionLength).ToString());
        }
        else
        {
            scaffold.InsertRange(0, contig.Range(0, extensionLength).ToString());
        }

        step.Key.MarkNode();
    }

    return scaffold;
}
/// <summary>
/// Performs Breadth First Search to traverse through graph to generate scaffold paths.
/// </summary>
/// <param name="graph">Contig Overlap Graph.</param>
/// <param name="contigPairedReadMaps">InterContig Distances.</param>
/// <param name="kmerLength">Length of Kmer; must be positive.</param>
/// <param name="depth">Depth to which graph is searched; must be positive.</param>
/// <returns>List of paths/scaffold</returns>
/// <exception cref="ArgumentNullException">graph or contigPairedReadMaps is null.</exception>
/// <exception cref="ArgumentException">kmerLength or depth is not positive.</exception>
public IList<ScaffoldPath> FindPaths(
    DeBruijnGraph graph,
    ContigMatePairs contigPairedReadMaps,
    int kmerLength,
    int depth = 10)
{
    if (graph == null)
    {
        throw new ArgumentNullException("graph");
    }

    if (contigPairedReadMaps == null)
    {
        throw new ArgumentNullException("contigPairedReadMaps");
    }

    if (kmerLength <= 0)
    {
        throw new ArgumentException(Resource.KmerLength);
    }

    if (depth <= 0)
    {
        throw new ArgumentException(Resource.Depth);
    }

    _graph = graph;
    _kmerLength = kmerLength;
    _depth = depth;

    List<ScaffoldPath> collectedPaths = new List<ScaffoldPath>();
    Parallel.ForEach(_graph.Nodes, (DeBruijnNode contigNode) =>
    {
        // Only contigs that have mate-pair information are used as traversal roots.
        Dictionary<ISequence, IList<ValidMatePair>> matePairsOfContig;
        bool hasMatePairs = contigPairedReadMaps.TryGetValue(
            graph.GetNodeSequence(contigNode),
            out matePairsOfContig);
        if (!hasMatePairs)
        {
            return;
        }

        List<ScaffoldPath> pathsFromNode = TraverseGraph(contigNode, matePairsOfContig);

        // The result list is shared between worker threads.
        lock (collectedPaths)
        {
            collectedPaths.AddRange(pathsFromNode);
        }
    });

    return collectedPaths;
}
/// <summary>
/// Get simple paths in the graph.
/// </summary>
/// <param name="createContigSequences">
/// When true, sequences of island nodes are added to the returned list
/// (only if the coverage threshold is -1). The flag is also forwarded to TraceSimplePath.
/// </param>
/// <returns>List of simple paths.</returns>
private List<ISequence> GetSimplePaths(bool createContigSequences)
{
    List<ISequence> simplePaths = new List<ISequence>();

    Parallel.ForEach(this.graph.GetNodes(), candidate =>
    {
        int leftCount = candidate.LeftExtensionNodesCount;
        int rightCount = candidate.RightExtensionNodesCount;
        bool isIsland = leftCount + rightCount == 0;

        if (isIsland)
        {
            // Island: either report it (no threshold configured) or check its coverage.
            bool thresholdDisabled = coverageThreshold == -1;
            if (thresholdDisabled && createContigSequences)
            {
                // The list is shared between worker threads, so guard the insertion.
                lock (simplePaths)
                {
                    simplePaths.Add(graph.GetNodeSequence(candidate));
                }
            }
            else if (!thresholdDisabled && candidate.KmerCount < coverageThreshold)
            {
                candidate.MarkNodeForDelete();
            }
        }
        else if (leftCount == 1 && rightCount == 0)
        {
            // End node with a single extension on the left.
            TraceSimplePath(simplePaths, candidate, false, createContigSequences);
        }
        else if (rightCount == 1 && leftCount == 0)
        {
            // End node with a single extension on the right.
            TraceSimplePath(simplePaths, candidate, true, createContigSequences);
        }
    });

    return simplePaths;
}
/// <summary>
/// Scans every node of the graph in parallel and collects simple paths:
/// island nodes are either reported or checked against the coverage threshold,
/// and nodes with exactly one extension on a single side are handed to TraceSimplePath.
/// </summary>
/// <returns>List of simple paths (empty when the graph has no nodes).</returns>
private IList<ISequence> CreateMegaNodes()
{
    // NOTE(review): createContigSequences is not declared inside this method; it is
    // assumed to be a member of the enclosing class - confirm.
    //
    // The scan is performed exactly once. The previous version wrapped it in a
    // foreach over the same nodes that returned during its first iteration, which
    // left the method without a return value for an empty graph and re-declared
    // the name "node" inside the lambda.
    IList<ISequence> paths = new List<ISequence>();

    Parallel.ForEach(this._graph.GetNodes(), node =>
    {
        int validLeftExtensionsCount = node.LeftExtensionNodesCount;
        int validRightExtensionsCount = node.RightExtensionNodesCount;

        if (validLeftExtensionsCount + validRightExtensionsCount == 0)
        {
            // Island. Check coverage; a NaN threshold means no threshold is applied.
            if (Double.IsNaN(_coverageThreshold))
            {
                if (createContigSequences)
                {
                    // The list is shared between worker threads, so guard the insertion.
                    lock (paths)
                    {
                        paths.Add(_graph.GetNodeSequence(node));
                    }
                }
            }
            else
            {
                if (node.KmerCount < _coverageThreshold)
                {
                    node.MarkNodeForDelete();
                }
            }
        }
        else if (validLeftExtensionsCount == 1 && validRightExtensionsCount == 0)
        {
            // End node with a single extension on the left.
            TraceSimplePath(paths, node, false, createContigSequences);
        }
        else if (validRightExtensionsCount == 1 && validLeftExtensionsCount == 0)
        {
            // End node with a single extension on the right.
            TraceSimplePath(paths, node, true, createContigSequences);
        }
    });

    return paths;
}
/// <summary>
/// Generate sequences from list of contig nodes.
/// The contig graph is disposed before this method returns, including when
/// building the sequences throws.
/// </summary>
/// <param name="contigGraph">Contig Overlap Graph. Disposed by this method.</param>
/// <param name="paths">Scaffold paths.</param>
/// <returns>
/// List of sequences of scaffolds, followed by the sequences of all graph nodes
/// that were not marked while building the scaffolds.
/// </returns>
/// <exception cref="ArgumentNullException">contigGraph or paths is null.</exception>
protected IList<ISequence> GenerateScaffold(
    DeBruijnGraph contigGraph,
    IList<ScaffoldPath> paths)
{
    if (contigGraph == null)
    {
        throw new ArgumentNullException("contigGraph");
    }

    if (paths == null)
    {
        throw new ArgumentNullException("paths");
    }

    try
    {
        // Materialize the scaffold sequences first: BuildSequenceFromPath marks the
        // nodes it uses, and the query below depends on those marks.
        List<ISequence> scaffolds = paths
            .AsParallel()
            .Select(t => t.BuildSequenceFromPath(contigGraph, _kmerLength))
            .ToList();

        // Nodes that are still unmarked did not take part in any scaffold;
        // their sequences are appended as they are.
        scaffolds.AddRange(
            contigGraph.Nodes
                .AsParallel()
                .Where(t => !t.IsMarked())
                .Select(t => contigGraph.GetNodeSequence(t)));

        return scaffolds;
    }
    finally
    {
        // Release the graph even if sequence construction failed.
        contigGraph.Dispose();
    }
}
/// <summary>
/// Builds a De Bruijn graph from the tiny read set with a kmer length of 3 and
/// checks the node count, the presence of each expected kmer (in either
/// orientation) and the total extension count.
/// </summary>
public void TestDeBruijnGraphBuilderTiny()
{
    const int kmerSize = 3;
    List<ISequence> tinyReads = TestInputs.GetTinyReads();

    this.KmerLength = kmerSize;
    this.SequenceReads.Clear();
    this.SetSequenceReads(tinyReads);
    this.CreateGraph();

    DeBruijnGraph builtGraph = this.Graph;
    Assert.AreEqual(9, builtGraph.NodeCount);

    // Text form of every node sequence in the graph.
    HashSet<string> nodeTexts = new HashSet<string>(
        builtGraph.GetNodes().Select(
            n => new string(builtGraph.GetNodeSequence(n).Select(symbol => (char)symbol).ToArray())));

    // Each row holds a kmer and its reverse complement; either one may be stored in the graph.
    string[][] expectedKmerPairs =
    {
        new[] { "ATG", "CAT" },
        new[] { "TGC", "GCA" },
        new[] { "GCC", "GGC" },
        new[] { "TCC", "GGA" },
        new[] { "CCT", "AGG" },
        new[] { "CTA", "TAG" },
        new[] { "TAT", "ATA" },
        new[] { "ATC", "GAT" },
        new[] { "CTC", "GAG" },
    };

    foreach (string[] kmerPair in expectedKmerPairs)
    {
        Assert.IsTrue(nodeTexts.Contains(kmerPair[0]) || nodeTexts.Contains(kmerPair[1]));
    }

    long edgeTotal = builtGraph.GetNodes().Select(n => n.ExtensionsCount).Sum();
    Assert.AreEqual(31, edgeTotal);
}
/// <summary>
/// Builds a meta node starting at <paramref name="startNode"/>. The start node is
/// classified and, depending on its type, either becomes a meta node on its own or
/// is extended along the chain it belongs to. The start node is flagged as visited.
/// </summary>
/// <param name="startNode">Node to start from; must not have been visited yet.</param>
/// <param name="graph">Graph the node belongs to; supplies the kmer length and node sequences.</param>
/// <exception cref="ArgumentNullException">startNode or graph is null.</exception>
/// <exception cref="InvalidOperationException">startNode has already been visited.</exception>
public MetaNode(DeBruijnNode startNode, DeBruijnGraph graph)
{
    if (startNode == null)
    {
        throw new ArgumentNullException("startNode");
    }

    if (graph == null)
    {
        throw new ArgumentNullException("graph");
    }

    this.NodeNumber = GraphGenerator.NodeCount++;
    KmerLength = graph.KmerLength;
    if (startNode.IsVisited)
    {
        throw new InvalidOperationException("If a node has been visited it should not form a metanode, suggests an infinite recursion problem");
    }

    NODE_TYPE type = ClassifyNode(startNode);
    startNode.IsVisited = true;

    // NOTE(review): ExtendChain and MakeCircle are presumed to update ConstituentNodes
    // and contigSequence as a side effect; the statement order below relies on that - confirm.
    if (type == NODE_TYPE.NEXUS || type == NODE_TYPE.ISLAND || type == NODE_TYPE.END_LOOPS_ON_ITSELF)
    {
        // Either of these become their own thing.
        ConstituentNodes.Add(startNode);
        contigSequence = new List<byte>(graph.GetNodeSequence(startNode));
        Sequence = (new Sequence((IAlphabet)NoGapDnaAlphabet.Instance, contigSequence.ToArray())).ConvertToString(0, contigSequence.Count);
    }
    else if (type == NODE_TYPE.LINK_IN_CHAIN)
    {
        contigSequence = new List<byte>(graph.GetNodeSequence(startNode));
        if (!VerifyNotCircular(startNode))
        {
            MakeCircle(startNode, graph);
        }
        else
        {
            // Go right first.
            contigSequence = new List<byte>(graph.GetNodeSequence(startNode));
            ExtendChain(startNode, true, graph);

            // Copy the right-hand information and clear it out.
            var tmpRightSeq = contigSequence.ToArray();

            // Skip the first node.
            var tmpRightNodes = ConstituentNodes.Skip(1).ToArray();
            ConstituentNodes.Clear();
            contigSequence.Clear();

            // Now go left.
            ExtendChain(startNode, false, graph);

            // Now combine: left part in reversed order followed by the right part.
            ConstituentNodes.Reverse();
            ConstituentNodes.AddRange(tmpRightNodes);
            var tmpSequence = new Sequence(DnaAlphabet.Instance, contigSequence.ToArray());
            tmpSequence = new Sequence(tmpSequence.GetReverseComplementedSequence());
            string leftSequence = "";
            if (tmpSequence.Count > 0)
            {
                leftSequence = tmpSequence.ConvertToString(0, tmpSequence.Count);
            }

            tmpSequence = new Sequence(DnaAlphabet.Instance, tmpRightSeq);
            Sequence = leftSequence + tmpSequence.ConvertToString(0, (tmpSequence.Count));
            contigSequence = new Sequence(DnaAlphabet.Instance, Sequence).ToList();
        }
    }
    else if (type == NODE_TYPE.GO_LEFT)
    {
        contigSequence = new List<byte>(graph.GetNodeSequence(startNode).GetReverseComplementedSequence());
        ExtendChain(startNode, false, graph);
        var tmpSequence = new Sequence(DnaAlphabet.Instance, contigSequence.ToArray());

        // Somewhat confusing - originally built the RC of sequence, so RCing again
        // to get correct orientation for neighbors.
        tmpSequence = new Sequence(tmpSequence.GetReverseComplementedSequence());
        contigSequence = tmpSequence.ToList();
        Sequence = tmpSequence.ConvertToString(0, tmpSequence.Count);

        // Flip it so nodes and sequence are in order.
        ConstituentNodes.Reverse();
    }
    else if (type == NODE_TYPE.GO_RIGHT)
    {
        contigSequence = new List<byte>(graph.GetNodeSequence(startNode));
        ExtendChain(startNode, true, graph);
        var tmpSequence = new Sequence(DnaAlphabet.Instance, contigSequence.ToArray());
        Sequence = tmpSequence.ConvertToString(0, tmpSequence.Count);
    }

    Cement();
}
/// <summary>
/// Assembles a small read set containing a palindromic contig, maps mate pairs
/// onto the contigs and verifies the scaffold paths found by TracePath.
/// </summary>
public void TracePathTestWithPalindromicContig()
{
    const int kmerLength = 6;
    const int dangleThreshold = 3;
    const int redundantThreshold = 7;

    // Each row is { read symbols, display id }.
    string[][] readDefinitions =
    {
        new[] { "ATGCCTC", ">10.x1:abc" },
        new[] { "CCTCCTAT", "1" },
        new[] { "TCCTATC", "2" },
        new[] { "TGCCTCCT", "3" },
        new[] { "ATCTTAGC", "4" },
        new[] { "CTATCTTAG", "5" },
        new[] { "CTTAGCG", "6" },
        new[] { "GCCTCCTAT", ">8.x1:abc" },
        new[] { "TAGCGCGCTA", ">8.y1:abc" },
        new[] { "AGCGCGC", ">9.x1:abc" },
        new[] { "TTTTTT", "7" },
        new[] { "TTTTTAAA", "8" },
        new[] { "TAAAAA", "9" },
        new[] { "TTTTAG", "10" },
        new[] { "TTTAGC", "11" },
        new[] { "GCGCGCCGCGCG", "12" },
    };

    List<ISequence> sequences = new List<ISequence>();
    foreach (string[] definition in readDefinitions)
    {
        Sequence seq = new Sequence(Alphabets.DNA, definition[0]);
        seq.DisplayID = definition[1];
        sequences.Add(seq);
    }

    // Build and clean the De Bruijn graph.
    KmerLength = kmerLength;
    SequenceReads.Clear();
    AddSequenceReads(sequences);
    CreateGraph();
    DanglingLinksThreshold = dangleThreshold;
    DanglingLinksPurger = new DanglingLinksPurger(dangleThreshold);
    RedundantPathLengthThreshold = redundantThreshold;
    RedundantPathsPurger = new RedundantPathsPurger(redundantThreshold);
    UnDangleGraph();
    RemoveRedundancy();

    // Map reads and mate pairs onto the contigs.
    IList<ISequence> contigs = BuildContigs();
    ReadContigMapper mapper = new ReadContigMapper();
    ReadContigMap maps = mapper.Map(contigs, sequences, kmerLength);
    MatePairMapper builder = new MatePairMapper();
    CloneLibrary.Instance.AddLibrary("abc", (float)5, (float)15);
    ContigMatePairs pairedReads = builder.MapContigToMatePairs(sequences, maps);

    OrientationBasedMatePairFilter filter = new OrientationBasedMatePairFilter();
    ContigMatePairs overlap = filter.FilterPairedReads(pairedReads, 0);
    DistanceCalculator dist = new DistanceCalculator();
    dist.CalculateDistance(overlap);

    // Trace scaffold paths through the contig graph.
    Graph.BuildContigGraph(contigs, this.KmerLength);
    TracePath path = new TracePath();
    IList<ScaffoldPath> paths = path.FindPaths(Graph, overlap, kmerLength, 3);

    // Expected value goes first so that failure messages report expected/actual correctly.
    Assert.AreEqual(3, paths.Count);
    Assert.AreEqual(3, paths.First().Count);

    ScaffoldPath scaffold = paths.First();
    DeBruijnGraph graph = Graph;
    Assert.AreEqual("ATGCCTCCTATCTTAGC", graph.GetNodeSequence(scaffold[0].Key).ToString());
    Assert.AreEqual("TTAGCGCG", graph.GetNodeSequence(scaffold[1].Key).ToString());
    Assert.AreEqual("GCGCGC", graph.GetNodeSequence(scaffold[2].Key).ToString());
}
/// <summary>
/// Builds scaffolds from list of reads and contigs.
/// The supplied contig list is not modified.
/// </summary>
/// <param name="reads">List of reads</param>
/// <param name="contigs">List of contigs</param>
/// <param name="kmerLength">Kmer Length; must be positive.</param>
/// <param name="depth">Depth for graph traversal; must be positive.</param>
/// <param name="redundancy">Number of mate pairs required to create a link between two contigs; must not be negative.
/// Hierarchical Scaffolding With Bambus
/// by: Mihai Pop, Daniel S. Kosack, Steven L. Salzberg
/// Genome Research, Vol. 14, No. 1. (January 2004), pp. 149-159.</param>
/// <returns>List of scaffold sequences</returns>
/// <exception cref="ArgumentNullException">contigs or reads is null.</exception>
/// <exception cref="ArgumentException">kmerLength or depth is not positive, or redundancy is negative.</exception>
public IList<ISequence> BuildScaffold(
    IList<ISequence> reads,
    IList<ISequence> contigs,
    int kmerLength,
    int depth = 10,
    int redundancy = 2)
{
    if (contigs == null)
    {
        throw new ArgumentNullException("contigs");
    }

    if (null == reads)
    {
        throw new ArgumentNullException("reads");
    }

    if (kmerLength <= 0)
    {
        throw new ArgumentException(Properties.Resource.KmerLength);
    }

    if (depth <= 0)
    {
        throw new ArgumentException(Resource.Depth);
    }

    if (redundancy < 0)
    {
        throw new ArgumentException(Resource.NegativeRedundancy);
    }

    _depth = depth;
    _redundancy = redundancy;
    _kmerLength = kmerLength;

    // Only reads without ambiguous or gap symbols are used.
    IList<ISequence> readSeqs = reads.AsParallel().Where(s => s.All<ISequenceItem>(c => !c.IsAmbiguous && !c.IsGap)).ToList();

    // Step1: Generate contig overlap graph.
    DeBruijnGraph contigGraph = GenerateContigOverlapGraph(contigs);

    // Contigs whose node has no extensions are left out of the mapping steps.
    // The pruning is done on a private copy so that the caller's list is not
    // mutated and fixed-size or read-only lists (e.g. arrays) do not cause
    // IList.Remove to throw NotSupportedException.
    IList<ISequence> connectedContigs = new List<ISequence>(contigs);
    IEnumerable<DeBruijnNode> isolatedNodes = contigGraph.Nodes.Where(t => t.ExtensionsCount == 0);
    foreach (DeBruijnNode node in isolatedNodes)
    {
        connectedContigs.Remove(contigGraph.GetNodeSequence(node));
    }

    // Step2: Map Reads to contigs.
    ReadContigMap readContigMap = ReadContigMap(connectedContigs, readSeqs);
    connectedContigs = null;

    // Step3: Generate Contig Mate Pair Map.
    ContigMatePairs contigMatePairs = MapPairedReadsToContigs(readContigMap, readSeqs);
    readContigMap = null;

    // Step4: Filter Paired Reads.
    contigMatePairs = FilterReadsBasedOnOrientation(contigMatePairs);

    // Step5: Distance Calculation.
    CalculateDistanceBetweenContigs(contigMatePairs);

    // Step6: Trace Scaffold Paths.
    IList<ScaffoldPath> paths = TracePath(contigGraph, contigMatePairs);
    contigMatePairs = null;

    // Step7: Assemble paths.
    PathPurger(paths);

    // Step8: Generate sequence of scaffolds.
    return GenerateScaffold(contigGraph, paths);
}
/// <summary>
/// Checks if the sequence in the node is a palindrome.
/// A sequence is a palindrome if it is the same as its reverse complement.
/// Reference: http://en.wikipedia.org/wiki/Palindromic_sequence
/// </summary>
/// <param name="node">DeBruijn graph node</param>
/// <returns>Boolean indicating if node represents a palindromic sequence</returns>
private bool IsPalindrome(DeBruijnNode node)
{
    ISequence nodeSequence = _graph.GetNodeSequence(node);
    string forwardText = nodeSequence.ToString();
    string reverseComplementText = nodeSequence.ReverseComplement.ToString();

    // string.Equals(string, string) is an ordinal, case-sensitive comparison.
    return string.Equals(forwardText, reverseComplementText);
}
/// <summary>
/// Add left extension of the nodes to queue.
/// </summary>
/// <param name="node">Current node.</param>
/// <param name="search">Queue for BFS.</param>
/// <param name="paths">List of paths</param>
/// <param name="familyTree">Nodes visited for construction of paths; may be null.</param>
/// <param name="contigPairedReadMap">Contig and valid mate pair map.</param>
private void LeftExtension(
    KeyValuePair<DeBruijnNode, DeBruijnEdge> node,
    Queue<Paths> search,
    List<Paths> paths,
    ScaffoldPath familyTree,
    Dictionary<ISequence, IList<ValidMatePair>> contigPairedReadMap)
{
    if (node.Key.LeftExtensionNodes.Count <= 0)
    {
        // No left extensions: the path ends at the current node.
        Paths terminalPath = new Paths();
        if (familyTree != null)
        {
            terminalPath.FamilyTree.AddRange(familyTree);
        }

        terminalPath.FamilyTree.Add(node);

        if (CoversAllMappedContigs(terminalPath, contigPairedReadMap))
        {
            paths.Add(terminalPath);
        }

        return;
    }

    foreach (KeyValuePair<DeBruijnNode, DeBruijnEdge> leftChild in node.Key.LeftExtensionNodes)
    {
        Paths candidate = new Paths();
        candidate.CurrentNode = leftChild;
        if (familyTree != null)
        {
            candidate.FamilyTree.AddRange(familyTree);
        }

        candidate.FamilyTree.Add(node);
        candidate.NodeOrientation = false;

        // Keep searching while the distance constraint holds, the depth limit is not
        // reached and some contig of the mate pair map is still missing from the path.
        bool keepSearching = DistanceConstraint(candidate, contigPairedReadMap)
            && candidate.FamilyTree.Count < _depth
            && !CoversAllMappedContigs(candidate, contigPairedReadMap);

        if (keepSearching)
        {
            search.Enqueue(candidate);
        }
        else if (CoversAllMappedContigs(candidate, contigPairedReadMap))
        {
            paths.Add(candidate);
        }
    }
}

/// <summary>
/// Checks whether every contig of the mate pair map is the sequence of
/// some node in the family tree of the given path.
/// </summary>
/// <param name="candidate">Path whose family tree is inspected.</param>
/// <param name="contigPairedReadMap">Contig and valid mate pair map.</param>
/// <returns>True if every key of the map matches a node sequence of the family tree.</returns>
private bool CoversAllMappedContigs(
    Paths candidate,
    Dictionary<ISequence, IList<ValidMatePair>> contigPairedReadMap)
{
    return contigPairedReadMap.All(
        entry => candidate.FamilyTree.Any(step => entry.Key == _graph.GetNodeSequence(step.Key)));
}