/// <summary>
/// Checks k-mer construction (k = 2) for the DNA sequence stored under the
/// given XML node, either through <c>SequenceToKmerBuilder</c> or through the
/// <c>KmersOfSequence</c> constructor.
/// </summary>
/// <param name="nodeName">XML node that holds the input sequence and the expected values.</param>
/// <param name="IsKmerBuilder">
/// True to build through <c>SequenceToKmerBuilder</c> (the k-mer count is asserted as well);
/// false to construct <c>KmersOfSequence</c> directly.
/// </param>
private void ValidateKmer(string nodeName, bool IsKmerBuilder)
{
    // All four values are read up front, whichever path is taken.
    string sequenceText = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.SequenceNode1);
    string expectedKmerCount = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.KmrSeqCountNode);
    string expectedKmerSeq = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.KmerSequenceNode);
    string expectedKmerPos = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.PositionsNode);

    ISequence seq = new Sequence(Alphabets.DNA, sequenceText);

    // Obtain the k-mers through the requested route.
    KmersOfSequence kmers = IsKmerBuilder
        ? new SequenceToKmerBuilder().Build(seq, 2)
        : new KmersOfSequence(seq, 2);

    // Only the builder route validates the number of k-mers.
    if (IsKmerBuilder)
    {
        Assert.AreEqual(expectedKmerCount, kmers.Kmers.Count.ToString((IFormatProvider)null));
    }

    // Both routes validate the base sequence text and the reported length.
    string actualKmerSeq = new String(kmers.BaseSequence.Select(a => (char)a).ToArray());
    Assert.AreEqual(expectedKmerSeq, actualKmerSeq);
    Assert.AreEqual(expectedKmerPos, kmers.Length.ToString((IFormatProvider)null));
}
/// <summary>
/// Builds the graph from the given sequences: creates one <c>DeBruijnNode</c>
/// per entry of the k-mer dictionary and then generates the adjacency
/// (edge) information between those nodes.
/// </summary>
/// <param name="sequences">List of input sequences.</param>
/// <param name="kmerLength">Kmer length; must be positive.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="sequences"/> is null.</exception>
/// <exception cref="ArgumentException">Thrown when <paramref name="kmerLength"/> is zero or negative.</exception>
public void Build(IList<ISequence> sequences, int kmerLength)
{
    if (sequences == null)
    {
        throw new ArgumentNullException("sequences");
    }

    if (kmerLength <= 0)
    {
        throw new ArgumentException(Properties.Resource.KmerLengthShouldBePositive);
    }

    _baseSequence = new List<ISequence>(sequences);

    var kmersData = SequenceToKmerBuilder.BuildPaDeNAKmerDictionary(sequences, kmerLength);

    // The query is lazy; it is executed when the HashSet constructor enumerates it.
    var nodeQuery = kmersData.Values.AsParallel().Select(entry =>
    {
        if (entry.KeyHasSameOrientation)
        {
            entry.Node = new DeBruijnNode(
                kmerLength,
                entry.Kmer.SequenceIndex,
                entry.Kmer.KmerPosition,
                entry.Kmer.Count,
                entry.Kmer.CountRC);
        }
        else
        {
            // Opposite orientation: the two count arguments are swapped.
            entry.Node = new DeBruijnNode(
                kmerLength,
                entry.Kmer.SequenceIndex,
                entry.Kmer.KmerPosition,
                entry.Kmer.CountRC,
                entry.Kmer.Count);
        }

        // The k-mer details are dropped once the node has been created.
        entry.Kmer = null;
        return entry.Node;
    });

    _kmerNodes = new HashSet<DeBruijnNode>(nodeQuery);

    // Add edge information.
    GenerateAdjacency(kmersData, kmerLength);
}
/// <summary>
/// Marks each De Bruijn node that corresponds to a k-mer occurring at exactly
/// one position in the reference genome with that position, then reports
/// painting statistics through <c>RaiseMessage</c>.
/// </summary>
/// <exception cref="InvalidOperationException">
/// Thrown when a unique reference position cannot be stored in the 16-bit
/// value written to the node.
/// </exception>
/// <exception cref="InvalidProgramException">
/// Thrown when more than 95% of the considered k-mers were skipped for
/// occurring at multiple reference locations.
/// </exception>
protected void PaintKmersWithReference()
{
    List<int> missingLocs = new List<int>();
    var refKmerPositions = SequenceToKmerBuilder.BuildKmerDictionary(ReferenceGenome.ReferenceSequence, this.KmerLength);
    int kmersPainted = 0;
    int kmersSkipped = 0;
    DeBruijnGraph graph = this.Graph;
    long totalNodes = graph.NodeCount;

    foreach (var v in refKmerPositions)
    {
        ISequence seq = v.Key;
        IList<long> locations = v.Value;

        // K-mers found at several reference locations are ambiguous and only counted.
        if (locations.Count != 1)
        {
            kmersSkipped += locations.Count;
            continue;
        }

        long position = locations[0];

        var kmerData = new KmerData32();
        kmerData.SetKmerData(seq, 0, this.KmerLength);
        DeBruijnNode matchingNode = graph.KmerManager.SetNewOrGetOld(kmerData, false);

        // A null result is treated as "reference k-mer not present in the graph".
        if (matchingNode == null)
        {
            missingLocs.Add((int)position);
            continue;
        }

        // Validate before narrowing: checking for a negative value after the cast
        // misses positions that wrap around to a non-negative short.
        if (position < 0 || position > short.MaxValue)
        {
            throw new InvalidOperationException(
                "Reference position " + position + " is outside the range a node can store (0-" + short.MaxValue + ").");
        }

        matchingNode.ReferenceGenomePosition = (short)position;
        kmersPainted++;
    }

    // Diagnostic dump of missing locations; deliberately disabled by the constant 'false'.
    if (false && OutputDiagnosticInformation)
    {
        using (StreamWriter sw = new StreamWriter("OutMissing.csv"))
        {
            foreach (int i in missingLocs)
            {
                sw.WriteLine(i.ToString());
            }
        }
    }

    // Missing k-mers are not part of this denominator (painted + skipped only).
    double percentKmersSkipped = 100.0 * (kmersSkipped) / ((double)(kmersPainted + kmersSkipped));
    if (percentKmersSkipped > 95.0)
    {
        throw new InvalidProgramException("Reference Genome Skipped over 95% of Kmers");
    }

    double percentHit = kmersPainted / (double)refKmerPositions.Count;
    RaiseMessage("A total of " + (100.0 * percentHit).ToString() + "% nodes in the reference were painted");

    PercentNodesPainted = 100.0 * kmersPainted / (double)totalNodes;
    RaiseMessage(PercentNodesPainted.ToString("n2") + " % of nodes painted, for a total of " + kmersPainted.ToString() + " painted.");
    RaiseMessage(percentKmersSkipped.ToString("n2") + " % of Kmers were skipped for being in multiple locations");
}
/// <summary>
/// Builds the difference list between the two sequences stored under the given
/// XML node and validates the feature count and the first reported feature
/// against the expected values from the same node.
/// </summary>
/// <param name="node">Xml node name.</param>
/// <param name="additionalParameter">
/// Selects the alphabet used to create the sequences: <c>Assemble</c> creates
/// protein sequences, <c>Consensus</c> creates DNA sequences. For any other
/// value the sequences stay null.
/// </param>
private void ValidateComputeFeature(string node, AssemblyParameters additionalParameter)
{
    // Inputs and expectations from the XML configuration.
    string firstSequence = utilityObj.xmlUtil.GetTextValue(node, Constants.SequenceNode1);
    string secondSequence = utilityObj.xmlUtil.GetTextValue(node, Constants.SequenceNode2);
    string kmerLength = utilityObj.xmlUtil.GetTextValue(node, Constants.KmerLengthNode);
    string expectedFeatureCount = utilityObj.xmlUtil.GetTextValue(node, Constants.FeatureCount);
    string expectedFeature = utilityObj.xmlUtil.GetTextValue(node, Constants.FeatureName);
    string expectedFeatureType = utilityObj.xmlUtil.GetTextValue(node, Constants.FeatureType);
    string expectedStartIndex = utilityObj.xmlUtil.GetTextValue(node, Constants.StartIndexNode);
    string expectedEndIndex = utilityObj.xmlUtil.GetTextValue(node, Constants.EndIndexNode);

    ISequence seq1 = null;
    ISequence seq2 = null;

    // Create the sequences with the alphabet that matches the scenario.
    if (additionalParameter == AssemblyParameters.Assemble)
    {
        seq1 = new Sequence(Alphabets.Protein, firstSequence);
        seq2 = new Sequence(Alphabets.Protein, secondSequence);
    }
    else if (additionalParameter == AssemblyParameters.Consensus)
    {
        seq1 = new Sequence(Alphabets.DNA, firstSequence);
        seq2 = new Sequence(Alphabets.DNA, secondSequence);
    }

    var kmerBuilder = new SequenceToKmerBuilder();

    // Parsed once, at the point where the original first needed it.
    int k = int.Parse(kmerLength, null);

    KmersOfSequence kmerList = kmerBuilder.Build(seq1, k);
    List<WordMatch> nodes = WordMatch.BuildMatchTable(kmerList, seq2, k);
    List<WordMatch> matchList = WordMatch.GetMinimalList(nodes, k);
    List<DifferenceNode> diffNode = DifferenceNode.BuildDiffList(matchList, seq1, seq2);
    List<DifferenceNode.CompareFeature> features = DifferenceNode.OutputDiffList(diffNode, seq1, seq2);

    // Validate the differences.
    Assert.AreEqual(expectedFeatureCount, features.Count.ToString((IFormatProvider)null));

    var firstFeature = features[0];
    Assert.AreEqual(expectedFeature, firstFeature.Feature);
    Assert.AreEqual(expectedFeatureType, firstFeature.FeatureType);
    Assert.AreEqual(expectedStartIndex, firstFeature.Start.ToString((IFormatProvider)null));
    Assert.AreEqual(expectedEndIndex, firstFeature.End.ToString((IFormatProvider)null));

    string logMessage = string.Format(null, "Kmer P1 : Validated DifferenceNodes successfully.");
    ApplicationLog.WriteLine(logMessage);
}
/// <summary>
/// Compares the first sequence of two FASTA files using 2-mers and writes the
/// <c>Feature</c> text of every detected difference to the console.
/// If either file is not recognised as FASTA, a message is printed and the
/// method returns without comparing.
/// </summary>
/// <param name="firstFile">Path of the first FASTA file.</param>
/// <param name="secondFile">Path of the second FASTA file.</param>
public static void FindDifferencesInSequences(string firstFile, string secondFile)
{
    if (!SequenceParsers.IsFasta(firstFile))
    {
        Console.WriteLine("Nieprawidlowy format pierwszego pliku!");
        return;
    }

    if (!SequenceParsers.IsFasta(secondFile))
    {
        Console.WriteLine("Nieprawidlowy format drugiego pliku!");
        return;
    }

    // Parse the first file.
    var firstParser = SequenceParsers.FindParserByFileName(firstFile);
    firstParser.Alphabet = AmbiguousProteinAlphabet.Instance;
    var firstSequenceList = firstParser.Parse();
    var firstFileSequences = Helper.ConvertIenumerableToList(firstSequenceList);

    // Parse the second file. The parser must be looked up for secondFile;
    // using firstFile here would compare the first file with itself.
    var secondParser = SequenceParsers.FindParserByFileName(secondFile);
    secondParser.Alphabet = AmbiguousProteinAlphabet.Instance;
    var secondSequenceList = secondParser.Parse();
    var secondFileSequences = Helper.ConvertIenumerableToList(secondSequenceList);

    // Only the first sequence of each file takes part in the comparison.
    var firstSequence = firstFileSequences.First();
    var secondSequence = secondFileSequences.First();

    // Build the k-mer list and the match table.
    var kmerBuilder = new SequenceToKmerBuilder();
    var kmerList = kmerBuilder.Build(firstSequence, 2);
    var nodes = WordMatch.BuildMatchTable(kmerList, secondSequence, 2);
    var matchTable = new List<WordMatch>(nodes);
    var matchList = WordMatch.GetMinimalList(matchTable, 2);
    var minimalMatches = new List<WordMatch>(matchList);

    // Find the differences between the nodes.
    var diffNode = DifferenceNode.BuildDiffList(minimalMatches, firstSequence, secondSequence);
    var differences = new List<DifferenceNode>(diffNode);
    var features = DifferenceNode.OutputDiffList(differences, firstSequence, secondSequence);

    foreach (var compareFeature in features)
    {
        Console.WriteLine(compareFeature.Feature);
    }
}
/// <summary>
/// Compares "AAAAAA" with "AAATAA" using 2-mers and verifies the four
/// difference features that are reported.
/// </summary>
public void SequenceCompare()
{
    ISequence seq1 = new Sequence(Alphabets.DNA, "AAAAAA");
    ISequence seq2 = new Sequence(Alphabets.DNA, "AAATAA");

    SequenceToKmerBuilder kmerBuilder = new SequenceToKmerBuilder();
    KmersOfSequence kmers = kmerBuilder.Build(seq1, 2);
    List<WordMatch> nodes = WordMatch.BuildMatchTable(kmers, seq1, seq2, 2);
    List<WordMatch> matchList = WordMatch.GetMinimalList(nodes, 2);
    List<DifferenceNode> diffNode = DifferenceNode.BuildDiffList(matchList, seq1, seq2);
    List<DifferenceNode.CompareFeature> features = DifferenceNode.OutputDiffList(diffNode, seq1, seq2);

    // Expected value first, actual value second, so that a failure message
    // reports the two values the right way round.
    Assert.AreEqual(4, features.Count);
    Assert.AreEqual("Insertion of 1 bases in 2 ", features[0].Feature);
    Assert.AreEqual("REPLACE", features[1].FeatureType);
    Assert.AreEqual("Insertion of 1 bases in 1 ", features[2].Feature);
    Assert.AreEqual("REPLACE", features[3].FeatureType);
}
/// <summary>
/// Compares the two DNA sequences configured under
/// <c>Constants.SequenceCompareNode</c> using 2-mers and verifies the four
/// difference features that are reported.
/// </summary>
public void ValidateSequenceCompare()
{
    string firstSequence = utilityObj.xmlUtil.GetTextValue(Constants.SequenceCompareNode, Constants.SequenceNode1);
    string secondSequence = utilityObj.xmlUtil.GetTextValue(Constants.SequenceCompareNode, Constants.SequenceNode2);
    string replace = utilityObj.xmlUtil.GetTextValue(Constants.SequenceCompareNode, Constants.ReplaceNode);

    ISequence seq1 = new Sequence(Alphabets.DNA, firstSequence);
    ISequence seq2 = new Sequence(Alphabets.DNA, secondSequence);

    var kmerBuilder = new SequenceToKmerBuilder();
    KmersOfSequence kmers = kmerBuilder.Build(seq1, 2);
    List<WordMatch> nodes = WordMatch.BuildMatchTable(kmers, seq2, 2);
    List<WordMatch> matchList = WordMatch.GetMinimalList(nodes, 2);
    List<DifferenceNode> diffNode = DifferenceNode.BuildDiffList(matchList, seq1, seq2);
    List<DifferenceNode.CompareFeature> features = DifferenceNode.OutputDiffList(diffNode, seq1, seq2);

    // Validate the behavior. Expected value first, actual value second, so that
    // a failure message reports the two values the right way round.
    Assert.AreEqual(4, features.Count);
    Assert.AreEqual(Constants.InsertionOfOneBaseIn2, features[0].Feature);
    Assert.AreEqual(replace, features[1].FeatureType);
    Assert.AreEqual(Constants.InsertionOfOneBaseIn1, features[2].Feature);
    Assert.AreEqual(replace, features[3].FeatureType);
}
/// <summary>
/// Checks k-mer construction (k = 2) for the DNA sequence stored under the
/// given XML node.
/// </summary>
/// <param name="nodeName">XML node that holds the input sequence and the expected values.</param>
/// <param name="IsKmerBuilder">
/// True to build through <c>SequenceToKmerBuilder</c> (count, sequence and length are asserted);
/// false to construct <c>KmersOfSequence</c> directly (sequence and length are asserted).
/// </param>
static void ValidateKmer(string nodeName, bool IsKmerBuilder)
{
    // All four values are read up front, whichever path is taken.
    string sequenceText = Utility._xmlUtil.GetTextValue(nodeName, Constants.SequenceNode1);
    string expectedKmerCount = Utility._xmlUtil.GetTextValue(nodeName, Constants.KmrSeqCountNode);
    string expectedKmerSeq = Utility._xmlUtil.GetTextValue(nodeName, Constants.KmerSequenceNode);
    string expectedKmerPos = Utility._xmlUtil.GetTextValue(nodeName, Constants.PositionsNode);

    ISequence seq = new Sequence(Alphabets.DNA, sequenceText);

    // Direct construction: handled first and returned early.
    if (!IsKmerBuilder)
    {
        KmersOfSequence direct = new KmersOfSequence(seq, 2);

        Assert.AreEqual(expectedKmerSeq, direct.BaseSequence.ToString());
        Assert.AreEqual(expectedKmerPos, direct.Length.ToString());
        return;
    }

    // Builder route: the number of k-mers is validated as well.
    SequenceToKmerBuilder builder = new SequenceToKmerBuilder();
    KmersOfSequence built = builder.Build(seq, 2);

    Assert.AreEqual(expectedKmerCount, built.Kmers.Count.ToString());
    Assert.AreEqual(expectedKmerSeq, built.BaseSequence.ToString());
    Assert.AreEqual(expectedKmerPos, built.Length.ToString());
}
/// <summary>
/// Aligns reads to contigs using the k-mer method of alignment.
/// A k-mer dictionary is built from the reads; every k-mer of each contig
/// (or its reverse complement) is looked up in it, and each read that shares
/// a k-mer with a contig is mapped onto that contig by <c>MapRead</c>.
/// </summary>
/// <param name="contigs">List of contig sequences.</param>
/// <param name="reads">List of read sequences.</param>
/// <param name="kmerLength">Kmer length.</param>
/// <returns>One <c>Contig</c> per input contig, holding its consensus and the reads mapped onto it.</returns>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="contigs"/> or <paramref name="reads"/> is null.
/// </exception>
public static IList<Contig> ReadContigAlignment(IList<ISequence> contigs, IList<ISequence> reads, int kmerLength)
{
    if (contigs == null)
    {
        throw new ArgumentNullException("contigs");
    }

    if (reads == null)
    {
        throw new ArgumentNullException("reads");
    }

    KmerIndexerDictionary map = SequenceToKmerBuilder.BuildKmerDictionary(reads, kmerLength);

    // For every contig, record for each k-mer position the read k-mers that match it.
    // NOTE(review): AsParallel() is used without AsOrdered(), so the order of the
    // results is not guaranteed to follow the order of 'contigs'.
    IList<ContigIndex> contigDatas = contigs.AsParallel().Select(contig =>
    {
        IEnumerable<ISequence> kmers = SequenceToKmerBuilder.GetKmerSequences(contig, kmerLength);
        ContigIndex index = new ContigIndex(contig);
        IList<KmerIndexer> positions;

        foreach (ISequence kmer in kmers)
        {
            if (map.TryGetValue(kmer, out positions) ||
                map.TryGetValue(kmer.GetReverseComplementedSequence(), out positions))
            {
                index.ContigReadMatchIndexes.Add(positions);
            }
            else
            {
                // Keep one entry per k-mer position, even when nothing matches.
                index.ContigReadMatchIndexes.Add(new List<KmerIndexer>());
            }
        }

        return index;
    }).ToList();

    return contigDatas.Select(contigData =>
    {
        IList<Task<IList<ReadMap>>> tasks = new List<Task<IList<ReadMap>>>();

        // Read indices for which a task has been started, in start order so that
        // visitedReads[i] corresponds to tasks[i]. The set gives constant-time
        // membership checks instead of scanning the list for every k-mer hit.
        IList<long> visitedReads = new List<long>();
        HashSet<long> seenReads = new HashSet<long>();

        // Start one mapping task per distinct read that shares a k-mer with this contig.
        for (int index = 0; index < contigData.ContigReadMatchIndexes.Count; index++)
        {
            int readPosition = index;
            foreach (KmerIndexer kmer in contigData.ContigReadMatchIndexes[index])
            {
                // The dictionary was built from the reads, so this index identifies a read.
                long readIndex = kmer.SequenceIndex;
                if (seenReads.Add(readIndex))
                {
                    visitedReads.Add(readIndex);

                    // NOTE(review): because the lambda takes one parameter, this call binds to
                    // StartNew(Func<object, TResult>, object state); AttachedToParent is passed
                    // as the state object, not as a creation option. Left as-is to keep behavior.
                    tasks.Add(
                        Task<IList<ReadMap>>.Factory.StartNew(
                            t => MapRead(readPosition, contigData.ContigReadMatchIndexes, readIndex, kmerLength),
                            TaskCreationOptions.AttachedToParent));
                }
            }
        }

        Contig contigOutputStructure = new Contig();
        contigOutputStructure.Consensus = contigData.ContigSequence;

        for (int index = 0; index < visitedReads.Count; index++)
        {
            // Accessing Result blocks until the mapping task has finished.
            foreach (ReadMap maps in tasks[index].Result)
            {
                Contig.AssembledSequence assembledSeq = new Contig.AssembledSequence()
                {
                    Length = maps.Length,
                    Position = maps.StartPositionOfContig,
                    ReadPosition = maps.StartPositionOfRead,
                    Sequence = reads.ElementAt(visitedReads[index])
                };

                string contigText = new string(
                    contigOutputStructure.Consensus
                        .GetSubSequence(assembledSeq.Position, assembledSeq.Length)
                        .Select(a => (char)a).ToArray());
                string readText = new string(
                    assembledSeq.Sequence
                        .GetSubSequence(assembledSeq.ReadPosition, assembledSeq.Length)
                        .Select(a => (char)a).ToArray());

                // If the overlapping regions differ as text, the read is flagged as
                // both reversed and complemented.
                bool sameText = contigText.Equals(readText);
                assembledSeq.IsComplemented = !sameText;
                assembledSeq.IsReversed = !sameText;

                contigOutputStructure.Sequences.Add(assembledSeq);
            }
        }

        return contigOutputStructure;
    }).ToList();
}
/// <summary>
/// Maps reads to contigs. A k-mer dictionary is built from the contigs; every
/// k-mer of each read (or its reverse complement) is looked up in it, and each
/// contig that shares a k-mer with the read is mapped by <c>MapRead</c>.
/// </summary>
/// <param name="contigs">List of sequences of contigs.</param>
/// <param name="reads">List of input reads.</param>
/// <param name="kmerLength">Length of kmer.</param>
/// <returns>Contig read map, keyed by read ID.</returns>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="contigs"/> or <paramref name="reads"/> is null.
/// </exception>
/// <exception cref="ArgumentException">
/// Raised inside the parallel loop when two reads share the same ID;
/// Parallel.ForEach surfaces it wrapped in an AggregateException.
/// </exception>
public ReadContigMap Map(IList<ISequence> contigs, IEnumerable<ISequence> reads, int kmerLength)
{
    if (contigs == null)
    {
        throw new ArgumentNullException("contigs");
    }

    if (reads == null)
    {
        throw new ArgumentNullException("reads");
    }

    KmerIndexerDictionary map = SequenceToKmerBuilder.BuildKmerDictionary(contigs, kmerLength);
    ReadContigMap maps = new ReadContigMap();

    Parallel.ForEach(reads, readSequence =>
    {
        IEnumerable<ISequence> kmers = SequenceToKmerBuilder.GetKmerSequences(readSequence, kmerLength);
        ReadIndex read = new ReadIndex(readSequence);

        // Record only the k-mers of the read that occur in some contig.
        foreach (ISequence kmer in kmers)
        {
            IList<KmerIndexer> positions;
            if (map.TryGetValue(kmer, out positions) ||
                map.TryGetValue(kmer.GetReverseComplementedSequence(), out positions))
            {
                read.ContigReadMatchIndexes.Add(positions);
            }
        }

        IList<Task<IList<ReadMap>>> tasks = new List<Task<IList<ReadMap>>>();

        // Contig indices for which a task has been started, in start order so that
        // visitedContigs[i] corresponds to tasks[i]. The set gives constant-time
        // membership checks instead of scanning the list for every k-mer hit.
        IList<long> visitedContigs = new List<long>();
        HashSet<long> seenContigs = new HashSet<long>();

        // Start one mapping task per distinct contig that shares a k-mer with this read.
        for (int index = 0; index < read.ContigReadMatchIndexes.Count; index++)
        {
            int readPosition = index;
            foreach (KmerIndexer kmer in read.ContigReadMatchIndexes[index])
            {
                long contigIndex = kmer.SequenceIndex;
                if (seenContigs.Add(contigIndex))
                {
                    visitedContigs.Add(contigIndex);

                    // NOTE(review): because the lambda takes one parameter, this call binds to
                    // StartNew(Func<object, TResult>, object state); AttachedToParent is passed
                    // as the state object, not as a creation option. Left as-is to keep behavior.
                    tasks.Add(
                        Task<IList<ReadMap>>.Factory.StartNew(
                            t => MapRead(
                                readPosition,
                                read.ContigReadMatchIndexes,
                                contigIndex,
                                read.ReadSequence.Count,
                                kmerLength),
                            TaskCreationOptions.AttachedToParent));
                }
            }
        }

        // Collect the results per contig; accessing Result blocks until the task is done.
        var overlapMaps = new Dictionary<ISequence, IList<ReadMap>>();
        for (int index = 0; index < visitedContigs.Count; index++)
        {
            overlapMaps.Add(contigs.ElementAt(visitedContigs[index]), tasks[index].Result);
        }

        // The result map is shared by all parallel iterations.
        lock (maps)
        {
            if (maps.ContainsKey(read.ReadSequence.ID))
            {
                throw new ArgumentException(
                    string.Format(CultureInfo.CurrentCulture, Resource.DuplicatingReadIds, read.ReadSequence.ID));
            }

            maps.Add(read.ReadSequence.ID, overlapMaps);
        }
    });

    return maps;
}
/// <summary>
/// Builds a contig graph from the 7-mers of a fixed DNA sequence, creates one
/// path over all nodes plus several paths over sub-ranges of those nodes, and
/// verifies that <c>PathPurger.PurgePath</c> leaves exactly one path, which
/// must match the full node list according to <c>Compare</c>.
/// </summary>
public void PathPurger1()
{
    const int KmerLength = 7;
    ISequence sequence = new Sequence(Alphabets.DNA, "GATTCAAGGGCTGGGGG");
    IList<ISequence> contigsSequence = SequenceToKmerBuilder.GetKmerSequences(sequence, KmerLength).ToList();
    ContigGraph graph = new ContigGraph();
    graph.BuildContigGraph(contigsSequence, KmerLength);
    List<Node> contigs = graph.Nodes.ToList();

    // { start, count } ranges of the node list, in the order the paths are added.
    // The { 11, 0 } entry produces an empty path.
    int[][] subPathRanges =
    {
        new[] { 2, 5 },
        new[] { 3, 5 },
        new[] { 6, 5 },
        new[] { 0, 11 },
        new[] { 7, 4 },
        new[] { 11, 0 },
        new[] { 2, 9 },
        new[] { 1, 10 }
    };

    IList<ScaffoldPath> paths = new List<ScaffoldPath>();

    // First path: every node of the graph.
    ScaffoldPath fullPath = new ScaffoldPath();
    foreach (Node node in contigs)
    {
        fullPath.Add(new KeyValuePair<Node, Edge>(node, null));
    }

    paths.Add(fullPath);

    // Remaining paths: one per configured sub-range.
    foreach (int[] range in subPathRanges)
    {
        ScaffoldPath path = new ScaffoldPath();
        foreach (Node node in contigs.GetRange(range[0], range[1]))
        {
            path.Add(new KeyValuePair<Node, Edge>(node, null));
        }

        paths.Add(path);
    }

    PathPurger assembler = new PathPurger();
    assembler.PurgePath(paths);

    // Expected value first, actual value second, so that a failure message
    // reports the two values the right way round.
    Assert.AreEqual(1, paths.Count);
    Assert.IsTrue(Compare(paths.First(), contigs));
}