/// <summary>
/// Builds the overlap graph for the supplied contigs using the k-mer length
/// currently stored on this instance.
/// </summary>
/// <param name="contigs">Contig sequences to load into the graph.</param>
/// <returns>The newly built contig graph.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="contigs"/> is null.</exception>
protected ContigGraph GenerateContigOverlapGraph(IList<ISequence> contigs)
{
    if (contigs == null)
    {
        throw new ArgumentNullException("contigs");
    }

    var overlapGraph = new ContigGraph();
    overlapGraph.BuildContigGraph(contigs, this.kmerLength);

    return overlapGraph;
}
/// <summary>
/// Stitches the contig sequences referenced by this scaffold path into one sequence.
/// </summary>
/// <remarks>
/// Side effects: the first entry is removed from this path, and every node that
/// contributes to the result is marked via MarkNode.
/// </remarks>
/// <param name="graph">Contig graph used to look up the sequence of each node.</param>
/// <param name="kmerLength">K-mer length; adjacent contigs are treated as overlapping by (k-1) symbols.</param>
/// <returns>
/// The assembled scaffold sequence, or null when the path contains no entry
/// after the first one.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="graph"/> is null.</exception>
public ISequence BuildSequenceFromPath(ContigGraph graph, int kmerLength)
{
    if (graph == null)
    {
        throw new ArgumentNullException("graph");
    }

    // The first entry seeds the scaffold and decides on which side later contigs are attached.
    Node firstNode = this[0].Key;
    bool appendToEnd = this[0].Value.IsSameOrientation;
    firstNode.MarkNode();

    List<byte> symbols = new List<byte>();
    symbols.AddRange(graph.GetNodeSequence(firstNode));
    this.RemoveAt(0);

    // Adjacent contigs share (k-1) symbols; a k of 1 or less is used unchanged.
    int overlap = kmerLength > 1 ? kmerLength - 1 : kmerLength;

    bool sameStrand = true;
    ISequence contigSequence = null;
    foreach (KeyValuePair<Node, Edge> step in this)
    {
        // The strand is kept when the edge keeps orientation and flipped otherwise.
        sameStrand = sameStrand == step.Value.IsSameOrientation;

        if (sameStrand)
        {
            contigSequence = graph.GetNodeSequence(step.Key);
        }
        else
        {
            contigSequence = graph.GetNodeSequence(step.Key).GetReverseComplementedSequence();
        }

        // Only the symbols outside the overlap are added to the scaffold.
        if (appendToEnd)
        {
            symbols.AddRange(
                contigSequence.GetSubSequence(overlap, contigSequence.Count - overlap));
        }
        else
        {
            symbols.InsertRange(
                0,
                contigSequence.GetSubSequence(0, contigSequence.Count - overlap));
        }

        step.Key.MarkNode();
    }

    if (contigSequence == null)
    {
        return null;
    }

    return new Sequence(contigSequence.Alphabet, symbols.ToArray(), false);
}
/// <summary>
/// Traverses the contig overlap graph to generate scaffold paths. Every graph node
/// whose sequence has an entry in <paramref name="contigPairedReadMaps"/> is used as
/// a starting point; the search itself is delegated to TraverseGraph.
/// </summary>
/// <remarks>
/// The graph, k-mer length and search depth are stored on this instance before the
/// traversal starts (presumably for use by TraverseGraph). Start nodes are processed
/// with Parallel.ForEach, so the order of paths in the result is not fixed by this method.
/// </remarks>
/// <param name="overlapGraph">Contig Overlap Graph.</param>
/// <param name="contigPairedReadMaps">InterContig Distances.</param>
/// <param name="lengthOfKmer">Length of Kmer; must be greater than zero.</param>
/// <param name="searchDepth">Depth to which graph is searched; must be greater than zero.</param>
/// <returns>List of paths/scaffold.</returns>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="overlapGraph"/> or <paramref name="contigPairedReadMaps"/> is null.
/// </exception>
/// <exception cref="ArgumentException">
/// Thrown when <paramref name="lengthOfKmer"/> or <paramref name="searchDepth"/> is not positive.
/// </exception>
public IList<ScaffoldPath> FindPaths(
    ContigGraph overlapGraph,
    ContigMatePairs contigPairedReadMaps,
    int lengthOfKmer,
    int searchDepth = 10)
{
    if (overlapGraph == null)
    {
        // Report the real parameter name so callers can locate the offending argument.
        throw new ArgumentNullException("overlapGraph");
    }

    if (contigPairedReadMaps == null)
    {
        throw new ArgumentNullException("contigPairedReadMaps");
    }

    if (lengthOfKmer <= 0)
    {
        throw new ArgumentException(Resource.KmerLength);
    }

    if (searchDepth <= 0)
    {
        throw new ArgumentException(Resource.Depth);
    }

    this.graph = overlapGraph;
    this.kmerLength = lengthOfKmer;
    this.depth = searchDepth;

    List<ScaffoldPath> scaffoldPaths = new List<ScaffoldPath>();

    Parallel.ForEach(
        overlapGraph.Nodes,
        (Node node) =>
        {
            Dictionary<ISequence, IList<ValidMatePair>> contigPairedReadMap;
            if (contigPairedReadMaps.TryGetValue(overlapGraph.GetNodeSequence(node), out contigPairedReadMap))
            {
                List<ScaffoldPath> scaffoldPath = TraverseGraph(node, contigPairedReadMap);

                // The result list is shared by all workers, so appends are serialized.
                lock (scaffoldPaths)
                {
                    scaffoldPaths.AddRange(scaffoldPath);
                }
            }
        });

    return scaffoldPaths;
}
/// <summary>
/// Converts scaffold paths into sequences and appends the sequences of all graph
/// nodes that are still unmarked afterwards.
/// </summary>
/// <remarks>
/// The supplied graph is disposed before this method returns.
/// </remarks>
/// <param name="contigGraph">Contig overlap graph the paths refer to.</param>
/// <param name="paths">Scaffold paths to convert.</param>
/// <returns>Scaffold sequences followed by the sequences of unmarked nodes.</returns>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="contigGraph"/> or <paramref name="paths"/> is null.
/// </exception>
protected IList<ISequence> GenerateScaffold(ContigGraph contigGraph, IList<ScaffoldPath> paths)
{
    if (contigGraph == null)
    {
        throw new ArgumentNullException("contigGraph");
    }

    if (paths == null)
    {
        throw new ArgumentNullException("paths");
    }

    // Materialized up front: building path sequences marks nodes, and the
    // unmarked-node query below must observe those marks.
    List<ISequence> result = paths
        .AsParallel()
        .Select(path => path.BuildSequenceFromPath(contigGraph, this.kmerLength))
        .ToList();

    IEnumerable<Node> unmarkedNodes = contigGraph.Nodes
        .AsParallel()
        .Where(node => !node.IsMarked());

    result.AddRange(
        unmarkedNodes
            .AsParallel()
            .Select(node => contigGraph.GetNodeSequence(node)));

    contigGraph.Dispose();

    return result;
}
/// <summary>
/// Finds scaffold paths in the contig overlap graph by delegating to the configured
/// path tracer, passing along this instance's k-mer length and search depth.
/// </summary>
/// <param name="contigGraph">Contig Graph.</param>
/// <param name="contigMatePairs">Contig Mate Pair map.</param>
/// <returns>List of Scaffold Paths.</returns>
protected IList<ScaffoldPath> TracePath(ContigGraph contigGraph, ContigMatePairs contigMatePairs)
{
    IList<ScaffoldPath> scaffoldPaths = this.tracePath.FindPaths(
        contigGraph,
        contigMatePairs,
        this.kmerLength,
        this.depthField);

    return scaffoldPaths;
}
/// <summary>
/// Runs the scaffolding pipeline: builds the contig overlap graph, maps reads and
/// mate pairs onto contigs, filters and measures the pairs, traces scaffold paths
/// and finally converts those paths into sequences.
/// </summary>
/// <param name="reads">List of reads.</param>
/// <param name="contigs">List of contigs.</param>
/// <param name="lengthofKmer">Kmer Length; must be greater than zero.</param>
/// <param name="depth">Depth for graph traversal; must be greater than zero.</param>
/// <param name="redundancy">Number of mate pairs required to create a link between two contigs;
/// must not be negative.
/// Hierarchical Scaffolding With Bambus
/// by: Mihai Pop, Daniel S. Kosack, Steven L. Salzberg
/// Genome Research, Vol. 14, No. 1. (January 2004), pp. 149-159.</param>
/// <returns>List of scaffold sequences.</returns>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="contigs"/> or <paramref name="reads"/> is null.
/// </exception>
/// <exception cref="ArgumentException">
/// Thrown when <paramref name="lengthofKmer"/> or <paramref name="depth"/> is not positive,
/// or when <paramref name="redundancy"/> is negative.
/// </exception>
public IList<ISequence> BuildScaffold(
    IEnumerable<ISequence> reads,
    IList<ISequence> contigs,
    int lengthofKmer,
    int depth = 10,
    int redundancy = 2)
{
    if (contigs == null)
    {
        throw new ArgumentNullException("contigs");
    }

    if (reads == null)
    {
        throw new ArgumentNullException("reads");
    }

    if (lengthofKmer <= 0)
    {
        throw new ArgumentException(Properties.Resource.KmerLength);
    }

    if (depth <= 0)
    {
        throw new ArgumentException(Resource.Depth);
    }

    if (redundancy < 0)
    {
        throw new ArgumentException(Resource.NegativeRedundancy);
    }

    this.depthField = depth;
    this.redundancyField = redundancy;
    this.kmerLength = lengthofKmer;

    IEnumerable<ISequence> validatedReads = ValidateReads(reads);

    // Step 1: build the overlap graph from a private copy of the contig list,
    // then drop from that copy every contig whose node has no extensions.
    IList<ISequence> mappableContigs = new List<ISequence>(contigs);
    ContigGraph overlapGraph = GenerateContigOverlapGraph(mappableContigs);
    foreach (Node isolatedNode in overlapGraph.Nodes.Where(n => n.ExtensionsCount == 0))
    {
        mappableContigs.Remove(overlapGraph.GetNodeSequence(isolatedNode));
    }

    // Step 2: map reads to the remaining contigs.
    ReadContigMap readToContig = ReadContigMap(mappableContigs, validatedReads);
    mappableContigs = null;

    // Step 3: derive the contig mate-pair map.
    ContigMatePairs matePairs = MapPairedReadsToContigs(readToContig, validatedReads);
    readToContig = null;

    // Step 4: filter paired reads by orientation.
    matePairs = FilterReadsBasedOnOrientation(matePairs);

    // Step 5: calculate distances between contigs.
    CalculateDistanceBetweenContigs(matePairs);

    // Step 6: trace scaffold paths through the overlap graph.
    IList<ScaffoldPath> scaffoldPaths = TracePath(overlapGraph, matePairs);
    matePairs = null;

    // Step 7: assemble (purge) the traced paths.
    PathPurger(scaffoldPaths);

    // Step 8: turn the surviving paths into scaffold sequences.
    return GenerateScaffold(overlapGraph, scaffoldPaths);
}
/// <summary>
/// Feeds PathPurger a set of scaffold paths built from overlapping runs of the same
/// contig nodes and checks that exactly one path survives and that it matches the
/// full contig list (as judged by Compare).
/// </summary>
public void PathPurger1()
{
    const int KmerLength = 7;
    ISequence sequence = new Sequence(Alphabets.DNA, "GATTCAAGGGCTGGGGG");
    IList<ISequence> contigsSequence = SequenceToKmerBuilder.GetKmerSequences(sequence, KmerLength).ToList();
    ContigGraph graph = new ContigGraph();
    graph.BuildContigGraph(contigsSequence, KmerLength);
    List<Node> contigs = graph.Nodes.ToList();

    // Node runs, one per scaffold path, in the order the paths are handed to the purger.
    // The (11, 0) run is intentionally empty.
    List<List<Node>> nodeRuns = new List<List<Node>>
    {
        contigs,
        contigs.GetRange(2, 5),
        contigs.GetRange(3, 5),
        contigs.GetRange(6, 5),
        contigs.GetRange(0, 11),
        contigs.GetRange(7, 4),
        contigs.GetRange(11, 0),
        contigs.GetRange(2, 9),
        contigs.GetRange(1, 10),
    };

    IList<ScaffoldPath> paths = new List<ScaffoldPath>();
    foreach (List<Node> run in nodeRuns)
    {
        ScaffoldPath path = new ScaffoldPath();
        foreach (Node node in run)
        {
            path.Add(new KeyValuePair<Node, Edge>(node, null));
        }

        paths.Add(path);
    }

    PathPurger assembler = new PathPurger();
    assembler.PurgePath(paths);

    // Expected value first, so a failure reports expected/actual the right way round.
    Assert.AreEqual(1, paths.Count);
    Assert.IsTrue(Compare(paths.First(), contigs));
}
/// <summary>
/// Builds contigs from a fixed read set (k = 5), maps mate pairs onto them, and checks
/// that TracePath.FindPaths with a search depth of 3 yields three paths, the first of
/// which has three nodes with the expected contig sequences.
/// </summary>
public void TracePathTestWithPalindromicContig()
{
    const int kmerLengthConst = 5;
    const int dangleThreshold = 3;
    const int redundantThreshold = 6;

    var sequences = new List<ISequence>()
    {
        new Sequence(Alphabets.DNA, "ATGCCTC") { ID = "0" },
        new Sequence(Alphabets.DNA, "CCTCCTAT") { ID = "1" },
        new Sequence(Alphabets.DNA, "TCCTATC") { ID = "2" },
        new Sequence(Alphabets.DNA, "TGCCTCCT") { ID = "3" },
        new Sequence(Alphabets.DNA, "ATCTTAGC") { ID = "4" },
        new Sequence(Alphabets.DNA, "CTATCTTAG") { ID = "5" },
        new Sequence(Alphabets.DNA, "CTTAGCG") { ID = "6" },
        new Sequence(Alphabets.DNA, "GCCTCCTAT") { ID = "7" },
        new Sequence(Alphabets.DNA, "TAGCGCGCTA") { ID = "8" },
        new Sequence(Alphabets.DNA, "AGCGCGC") { ID = "9" },
        new Sequence(Alphabets.DNA, "TTTTTT") { ID = "10" },
        new Sequence(Alphabets.DNA, "TTTTTAAA") { ID = "11" },
        new Sequence(Alphabets.DNA, "TAAAAA") { ID = "12" },
        new Sequence(Alphabets.DNA, "TTTTAG") { ID = "13" },
        new Sequence(Alphabets.DNA, "TTTAGC") { ID = "14" },
        new Sequence(Alphabets.DNA, "GCGCGCCGCGCG") { ID = "15" },
    };

    // Build the graph from the reads and clean it up before extracting contigs.
    KmerLength = kmerLengthConst;
    SequenceReads.Clear();
    SetSequenceReads(sequences);
    CreateGraph();
    DanglingLinksThreshold = dangleThreshold;
    DanglingLinksPurger = new DanglingLinksPurger(dangleThreshold);
    RedundantPathLengthThreshold = redundantThreshold;
    RedundantPathsPurger = new RedundantPathsPurger(redundantThreshold);
    UnDangleGraph();
    RemoveRedundancy();

    IList<ISequence> contigs = BuildContigs().ToList();

    // Map reads and mate pairs to contigs, then filter and compute distances.
    ReadContigMapper mapper = new ReadContigMapper();
    ReadContigMap maps = mapper.Map(contigs, sequences, kmerLengthConst);
    MatePairMapper builder = new MatePairMapper();
    CloneLibrary.Instance.AddLibrary("abc", 5, 15);
    ContigMatePairs pairedReads = builder.MapContigToMatePairs(sequences, maps);
    OrientationBasedMatePairFilter filter = new OrientationBasedMatePairFilter();
    ContigMatePairs overlap = filter.FilterPairedReads(pairedReads, 0);
    DistanceCalculator dist = new DistanceCalculator(overlap);
    overlap = dist.CalculateDistance();

    ContigGraph graph = new ContigGraph();
    graph.BuildContigGraph(contigs, this.KmerLength);
    TracePath path = new TracePath();
    IList<ScaffoldPath> paths = path.FindPaths(graph, overlap, kmerLengthConst, 3);

    // Expected value first, matching the string assertions below.
    Assert.AreEqual(3, paths.Count);
    Assert.AreEqual(3, paths.First().Count);

    ScaffoldPath scaffold = paths.First();
    Assert.AreEqual("ATGCCTCCTATCTTAGC", graph.GetNodeSequence(scaffold[0].Key).ConvertToString());
    Assert.AreEqual("TTAGCGCG", graph.GetNodeSequence(scaffold[1].Key).ConvertToString());
    Assert.AreEqual("GCGCGC", graph.GetNodeSequence(scaffold[2].Key).ConvertToString());
}
/// <summary>
/// Builds contigs from a fixed read set (k = 6), maps mate pairs onto them, and checks
/// that TracePath.FindPaths with a search depth of 3 yields three paths, the first of
/// which has three nodes with the expected contig sequences.
/// </summary>
public void TracePathTestWithPalindromicContig()
{
    const int KmerLengthConst = 6;
    const int DangleThreshold = 3;
    const int RedundantThreshold = 7;

    // Each row is { symbols, ID }, listed in the order the reads are added.
    string[][] readData =
    {
        new[] { "ATGCCTC", ">10.x1:abc" },
        new[] { "CCTCCTAT", "1" },
        new[] { "TCCTATC", "2" },
        new[] { "TGCCTCCT", "3" },
        new[] { "ATCTTAGC", "4" },
        new[] { "CTATCTTAG", "5" },
        new[] { "CTTAGCG", "6" },
        new[] { "GCCTCCTAT", ">8.x1:abc" },
        new[] { "TAGCGCGCTA", ">8.y1:abc" },
        new[] { "AGCGCGC", ">9.x1:abc" },
        new[] { "TTTTTT", "7" },
        new[] { "TTTTTAAA", "8" },
        new[] { "TAAAAA", "9" },
        new[] { "TTTTAG", "10" },
        new[] { "TTTAGC", "11" },
        new[] { "GCGCGCCGCGCG", "12" },
    };

    List<ISequence> sequences = new List<ISequence>();
    foreach (string[] read in readData)
    {
        Sequence seq = new Sequence(Alphabets.DNA, read[0].Select(a => (byte)a).ToArray());
        seq.ID = read[1];
        sequences.Add(seq);
    }

    // Build the graph from the reads and clean it up before extracting contigs.
    KmerLength = KmerLengthConst;
    SequenceReads.Clear();
    SetSequenceReads(sequences);
    CreateGraph();
    DanglingLinksThreshold = DangleThreshold;
    DanglingLinksPurger = new DanglingLinksPurger(DangleThreshold);
    RedundantPathLengthThreshold = RedundantThreshold;
    RedundantPathsPurger = new RedundantPathsPurger(RedundantThreshold);
    UnDangleGraph();
    RemoveRedundancy();

    IList<ISequence> contigs = BuildContigs().ToList();

    // Map reads and mate pairs to contigs, then filter and compute distances.
    ReadContigMapper mapper = new ReadContigMapper();
    ReadContigMap maps = mapper.Map(contigs, sequences, KmerLengthConst);
    MatePairMapper builder = new MatePairMapper();
    CloneLibrary.Instance.AddLibrary("abc", (float)5, (float)15);
    ContigMatePairs pairedReads = builder.MapContigToMatePairs(sequences, maps);
    OrientationBasedMatePairFilter filter = new OrientationBasedMatePairFilter();
    ContigMatePairs overlap = filter.FilterPairedReads(pairedReads, 0);
    DistanceCalculator dist = new DistanceCalculator(overlap);
    overlap = dist.CalculateDistance();

    ContigGraph graph = new ContigGraph();
    graph.BuildContigGraph(contigs, this.KmerLength);
    TracePath path = new TracePath();
    IList<ScaffoldPath> paths = path.FindPaths(graph, overlap, KmerLengthConst, 3);

    // Expected value first, so a failure reports expected/actual the right way round.
    Assert.AreEqual(3, paths.Count);
    Assert.AreEqual(3, paths.First().Count);

    ScaffoldPath scaffold = paths.First();
    string[] expectedContigs = { "ATGCCTCCTATCTTAGC", "TTAGCGCG", "GCGCGC" };
    for (int i = 0; i < expectedContigs.Length; i++)
    {
        // Compare as strings so a mismatch shows the actual contig, not just "false".
        string actual = new string(graph.GetNodeSequence(scaffold[i].Key).Select(a => (char)a).ToArray());
        Assert.AreEqual(expectedContigs[i], actual);
    }
}