Exemple #1
0
        /// <summary>
        /// Assembles the scaffold test reads, takes the first contig and checks that
        /// Helper.GetSequenceRange(contig, 2, 3) yields three symbols whose text
        /// occurs inside the text of the full contig.
        /// </summary>
        public void ValidateGetSequenceRange()
        {
            using (var denovo = new ParallelDeNovoAssembler())
            {
                denovo.KmerLength = 6;
                denovo.DanglingLinksThreshold = 3;
                denovo.RedundantPathLengthThreshold = 7;

                denovo.ScaffoldRedundancy = 0;
                denovo.Depth = 3;
                CloneLibrary.Instance.AddLibrary("abc", (float)5, (float)20);

                var assembly = (PadenaAssembly)denovo.Assemble(GetReadsForScaffolds(), true);
                ISequence firstContig = assembly.ContigSequences[0];
                ISequence slice = Helper.GetSequenceRange(firstContig, 2, 3);
                Assert.AreEqual(3, slice.Count);

                // Convert both sequences to text, one symbol per character.
                char[] contigChars = firstContig.Select(symbol => (char)symbol).ToArray();
                char[] sliceChars = slice.Select(symbol => (char)symbol).ToArray();
                string contigText = new string(contigChars);
                string sliceText = new string(sliceChars);

                bool sliceFoundInContig = contigText.Contains(sliceText);
                Assert.IsTrue(sliceFoundInContig);
            }
        }
Exemple #2
0
        /// <summary>
        /// Runs the assembler on the scaffold test reads and hands the resulting contigs
        /// and scaffolds to AlignmentHelpers.CompareSequenceLists together with the
        /// expected sequence sets.
        /// </summary>
        public void AssemblerTestWithScaffoldBuilder()
        {
            using (var denovo = new ParallelDeNovoAssembler())
            {
                denovo.KmerLength = 6;
                denovo.DanglingLinksThreshold = 3;
                denovo.RedundantPathLengthThreshold = 7;

                denovo.ScaffoldRedundancy = 0;
                denovo.Depth = 3;
                CloneLibrary.Instance.AddLibrary("abc", 5, 20);

                var assembly = (PadenaAssembly)denovo.Assemble(TestInputs.GetReadsForScaffolds(), true);

                // Contigs expected from the assembly.
                var contigsWanted = new HashSet<string>
                {
                    "TTTTTT",
                    "CGCGCG",
                    "TTAGCGCG",
                    "CGCGCCGCGC",
                    "GCGCGC",
                    "TTTTTA",
                    "TTTTAA",
                    "TTTAAA",
                    "TTTTAGC",
                    "ATGCCTCCTATCTTAGC"
                };
                AlignmentHelpers.CompareSequenceLists(contigsWanted, assembly.ContigSequences);

                // Scaffolds expected from the assembly.
                var scaffoldsWanted = new HashSet<string>
                {
                    "ATGCCTCCTATCTTAGCGCGC",
                    "TTTAAA",
                    "TTTTTT",
                    "TTTTAGC",
                    "TTTTAA",
                    "CGCGCCGCGC",
                    "TTTTTA",
                    "CGCGCG"
                };
                AlignmentHelpers.CompareSequenceLists(scaffoldsWanted, assembly.Scaffolds);
            }
        }
Exemple #3
0
        /// <summary>
        /// Emits the assembled sequences through a FastAFormatter: to the contig file when a
        /// diagnostic file prefix is set, otherwise to standard output.
        /// </summary>
        /// <param name="assembly">Assembly whose AssembledSequences are written.</param>
        protected void writeContigs(PadenaAssembly assembly)
        {
            // Nothing was assembled: report it and stop.
            if (assembly.AssembledSequences.Count == 0)
            {
                Output.WriteLine(OutputLevel.Results, "\tNo sequences assembled.");
                return;
            }

            ensureContigNames(assembly.AssembledSequences);

            if (string.IsNullOrEmpty(this.DiagnosticFilePrefix))
            {
                // No diagnostic prefix: announce the count, then stream the contigs to standard output.
                Output.WriteLine(OutputLevel.Information, "\tAssembled Sequence Results: {0} sequences", assembly.AssembledSequences.Count);
                using (var consoleFormatter = new FastAFormatter())
                {
                    consoleFormatter.Open(new StreamWriter(Console.OpenStandardOutput()));
                    consoleFormatter.MaxSymbolsAllowedPerLine = decideOutputWidth();
                    consoleFormatter.AutoFlush = true;
                    foreach (ISequence contig in assembly.AssembledSequences)
                    {
                        consoleFormatter.Write(contig);
                    }
                }

                return;
            }

            // A diagnostic prefix is set: write the contigs to ContigFileName and report afterwards.
            using (var fileFormatter = new FastAFormatter(ContigFileName))
            {
                fileFormatter.AutoFlush = true;
                foreach (ISequence contig in assembly.AssembledSequences)
                {
                    fileFormatter.Write(contig);
                }
            }

            Output.WriteLine(OutputLevel.Information, "\tWrote {0} sequences to {1}", assembly.AssembledSequences.Count, ContigFileName);
        }
Exemple #4
0
        /// <summary>
        /// Adds two DNA contigs to a PadenaAssembly and checks that ToString() returns
        /// them in insertion order, each followed by "\r\n".
        /// </summary>
        public void TestPadenaAssemblyToString()
        {
            ISequence seq1 = new Sequence(Alphabets.DNA, "ATGAAGGCAATACTAGTAGT");
            ISequence seq2 = new Sequence(Alphabets.DNA, "ACAAAAGCAAC");
            IList <ISequence> contigList = new List <ISequence> { seq1, seq2 };

            PadenaAssembly denovoAssembly = new PadenaAssembly();
            denovoAssembly.AddContigs(contigList);

            string actualString   = denovoAssembly.ToString();
            string expectedString = "ATGAAGGCAATACTAGTAGT\r\nACAAAAGCAAC\r\n";

            // Assert.AreEqual takes (expected, actual); passing them in that order keeps
            // the failure message ("Expected ... But was ...") truthful.
            Assert.AreEqual(expectedString, actualString);
        }
        /// <summary>
        /// Checks PadenaAssembly.ToString() twice: once for contigs built from literal DNA
        /// strings, and once for contigs whose sequences and alphabet are read through
        /// the XML utility. In both cases the contigs are expected in insertion order,
        /// each followed by "\r\n".
        /// </summary>
        public void ValidatePadenaAssemblyToString()
        {
            ISequence seq1 = new Sequence(Alphabets.DNA, "ATGAAGGCAATACTAGTAGT");
            ISequence seq2 = new Sequence(Alphabets.DNA, "ACAAAAGCAAC");
            IList <ISequence> contigList = new List <ISequence> { seq1, seq2 };

            var denovoAssembly = new PadenaAssembly();
            denovoAssembly.AddContigs(contigList);

            string actualString   = denovoAssembly.ToString();
            string expectedString = "ATGAAGGCAATACTAGTAGT\r\nACAAAAGCAAC\r\n";

            // Assert.AreEqual takes (expected, actual); this order also matches the
            // second assertion at the end of this method.
            Assert.AreEqual(expectedString, actualString);

            // read sequences from xml
            string sequence1 = this.utilityObj.xmlUtil.GetTextValue(Constants.AssemblyAlgorithmNodeName,
                                                                    Constants.SequenceNode1);
            string sequence2 = this.utilityObj.xmlUtil.GetTextValue(Constants.AssemblyAlgorithmNodeName,
                                                                    Constants.SequenceNode2);
            IAlphabet alphabet =
                Utility.GetAlphabet(this.utilityObj.xmlUtil.GetTextValue(Constants.AssemblyAlgorithmNodeName,
                                                                         Constants.AlphabetNameNode));

            var seq3 = new Sequence(alphabet, sequence1);
            var seq4 = new Sequence(alphabet, sequence2);
            IList <ISequence> contigList1 = new List <ISequence> { seq3, seq4 };

            var denovoAssembly1 = new PadenaAssembly();
            denovoAssembly1.AddContigs(contigList1);

            string actualString1   = denovoAssembly1.ToString();
            string expectedString1 = "GCCAAAATTTAGGC\r\nTTATGGCGCCCACGGA\r\n";

            Assert.AreEqual(expectedString1, actualString1);
        }
Exemple #6
0
        /// <summary>
        /// Runs the assembler on the scaffold test reads and checks that it produces
        /// 10 contigs and 8 scaffolds, each of which (or its reverse complement) is a
        /// member of the corresponding expected set.
        /// </summary>
        public void AssemblerTestWithScaffoldBuilder()
        {
            const int kmerLength         = 6;
            const int dangleThreshold    = 3;
            const int redundantThreshold = 7;

            using (ParallelDeNovoAssembler assembler = new ParallelDeNovoAssembler())
            {
                assembler.KmerLength                   = kmerLength;
                assembler.DanglingLinksThreshold       = dangleThreshold;
                assembler.RedundantPathLengthThreshold = redundantThreshold;

                assembler.ScaffoldRedundancy = 0;
                assembler.Depth = 3;
                CloneLibrary.Instance.AddLibrary("abc", (float)5, (float)20);

                PadenaAssembly result = (PadenaAssembly)assembler.Assemble(TestInputs.GetReadsForScaffolds(), true);

                Assert.AreEqual(10, result.ContigSequences.Count());

                HashSet <string> expectedContigs = new HashSet <string>
                {
                    "GCGCGC",
                    "TTTTTT",
                    "TTTTTA",
                    "TTTTAA",
                    "TTTAAA",
                    "ATGCCTCCTATCTTAGC",
                    "TTTTAGC",
                    "TTAGCGCG",
                    "CGCGCCGCGC",
                    "CGCGCG"
                };

                foreach (ISequence contig in result.ContigSequences)
                {
                    string contigSeq = new string(contig.Select(a => (char)a).ToArray());

                    // A contig may be reported on either strand, so accept its reverse complement too.
                    // The message names the offending sequence so a failure can be diagnosed from the log.
                    Assert.IsTrue(
                        expectedContigs.Contains(contigSeq) ||
                        expectedContigs.Contains(contigSeq.GetReverseComplement(new char[contigSeq.Length])),
                        "Found unknown contig " + contigSeq);
                }

                Assert.AreEqual(8, result.Scaffolds.Count());
                HashSet <string> expectedScaffolds = new HashSet <string>
                {
                    "ATGCCTCCTATCTTAGCGCGC",
                    "TTTTTT",
                    "TTTTTA",
                    "TTTTAA",
                    "TTTAAA",
                    "CGCGCCGCGC",
                    "TTTTAGC",
                    "CGCGCG"
                };

                foreach (ISequence scaffold in result.Scaffolds)
                {
                    string scaffoldSeq = new string(scaffold.Select(a => (char)a).ToArray());

                    // Same strand-insensitive membership check as for the contigs.
                    Assert.IsTrue(
                        expectedScaffolds.Contains(scaffoldSeq) ||
                        expectedScaffolds.Contains(scaffoldSeq.GetReverseComplement(new char[scaffoldSeq.Length])),
                        "Found unknown scaffold " + scaffoldSeq);
                }
            }
        }
        /// <summary>
        /// Checks PadenaAssembly.ToString() twice: once for contigs built from literal DNA
        /// strings, and once for contigs whose sequences and alphabet are read through
        /// the XML utility. In both cases the contigs are expected in insertion order,
        /// each followed by "\r\n".
        /// </summary>
        public void ValidatePadenaAssemblyToString()
        {
            ISequence seq1 = new Sequence(Alphabets.DNA, "ATGAAGGCAATACTAGTAGT");
            ISequence seq2 = new Sequence(Alphabets.DNA, "ACAAAAGCAAC");
            IList<ISequence> contigList = new List<ISequence> { seq1, seq2 };
            var denovoAssembly = new PadenaAssembly();
            denovoAssembly.AddContigs(contigList);

            string actualString = denovoAssembly.ToString();
            string expectedString = "ATGAAGGCAATACTAGTAGT\r\nACAAAAGCAAC\r\n";

            // Assert.AreEqual takes (expected, actual); this order also matches the
            // second assertion at the end of this method.
            Assert.AreEqual(expectedString, actualString);

            // read sequences from xml
            string sequence1 = this.utilityObj.xmlUtil.GetTextValue(Constants.AssemblyAlgorithmNodeName,
                                                               Constants.SequenceNode1);
            string sequence2 = this.utilityObj.xmlUtil.GetTextValue(Constants.AssemblyAlgorithmNodeName,
                                                               Constants.SequenceNode2);
            IAlphabet alphabet =
                Utility.GetAlphabet(this.utilityObj.xmlUtil.GetTextValue(Constants.AssemblyAlgorithmNodeName,
                                                                    Constants.AlphabetNameNode));

            var seq3 = new Sequence(alphabet, sequence1);
            var seq4 = new Sequence(alphabet, sequence2);
            IList<ISequence> contigList1 = new List<ISequence> { seq3, seq4 };
            var denovoAssembly1 = new PadenaAssembly();
            denovoAssembly1.AddContigs(contigList1);

            string actualString1 = denovoAssembly1.ToString();
            string expectedString1 = "GCCAAAATTTAGGC\r\nTTATGGCGCCCACGGA\r\n";
            Assert.AreEqual(expectedString1, actualString1);
        }
Exemple #8
0
        //TODO: This is really a PADENA Test, needs to be renamed.
        /// <summary>
        /// Assembles the scaffold test reads and checks that 10 contigs and 8 scaffolds
        /// come back, each of which is listed as expected either directly or as its
        /// reverse complement.
        /// </summary>
        public void ValidateGetReverseComplement()
        {
            using (var denovo = new ParallelDeNovoAssembler())
            {
                denovo.KmerLength = 6;
                denovo.DanglingLinksThreshold = 3;
                denovo.RedundantPathLengthThreshold = 7;

                denovo.ScaffoldRedundancy = 0;
                denovo.Depth = 3;
                CloneLibrary.Instance.AddLibrary("abc", 5, 20);

                var assembly = (PadenaAssembly)denovo.Assemble(GetReadsForScaffolds(), true);

                var knownContigs = new List<string>
                {
                    "TTTTTT",
                    "TTAGCGCG",
                    "CGCGCCGCGC",
                    "CGCGCG",
                    "GCGCGC",
                    "TTTTTA",
                    "TTTTAGC",
                    "TTTTAA",
                    "TTTAAA",
                    "ATGCCTCCTATCTTAGC"
                };

                Assert.AreEqual(10, assembly.ContigSequences.Count());

                foreach (ISequence contig in assembly.ContigSequences)
                {
                    string forward = contig.ConvertToString();

                    // Only compute the reverse complement when the forward strand is not listed.
                    bool recognised = knownContigs.Contains(forward);
                    if (!recognised)
                    {
                        var reverse = forward.GetReverseComplement(new char[forward.Length]);
                        recognised = knownContigs.Contains(reverse);
                    }

                    Assert.IsTrue(recognised, "Found unknown contig " + forward);
                }

                Assert.AreEqual(8, assembly.Scaffolds.Count());

                var knownScaffolds = new List<string>
                {
                    "ATGCCTCCTATCTTAGCGCGC",
                    "CGCGCG",
                    "CGCGCCGCGC",
                    "TTTTTA",
                    "TTTTTT",
                    "TTTTAGC",
                    "TTTTAA",
                    "TTTAAA"
                };

                foreach (ISequence scaffold in assembly.Scaffolds)
                {
                    string forward = scaffold.ConvertToString();

                    bool recognised = knownScaffolds.Contains(forward);
                    if (!recognised)
                    {
                        var reverse = forward.GetReverseComplement(new char[forward.Length]);
                        recognised = knownScaffolds.Contains(reverse);
                    }

                    Assert.IsTrue(recognised, "Found unknown scaffold " + forward);
                }
            }
        }
Exemple #9
0
        /// <summary>
        /// Runs the assembly pipeline on the supplied reads: builds the graph, purges
        /// low-coverage, pathological and reference-unconnected nodes, removes dangling links
        /// and redundant SNP/indel paths, attempts a MitochondrialAssembly, runs deletion
        /// finding and finally builds the contigs. Progress is reported through RaiseMessage
        /// and a number of summary values are stored on this instance along the way.
        /// </summary>
        /// <param name="inputSequences">Reads to assemble; must not be null.</param>
        /// <returns>
        /// A PadenaAssembly holding the contigs sorted from longest to shortest, or
        /// <c>null</c> when the graph has fewer than 100 nodes after creation.
        /// </returns>
        /// <exception cref="ArgumentNullException">Thrown when inputSequences is null.</exception>
        public override PadenaAssembly Assemble(IEnumerable <ISequence> inputSequences)
        {
            if (inputSequences == null)
            {
                throw new ArgumentNullException("inputSequences");
            }
            // Step 0: Load the reference genome as a fasta file.
            // Remove ambiguous reads and set up fields for assembler process
            this.Initialize();

            // Step 1, 2: Create k-mers from reads and build de bruijn graph and paint them with the reference
            // The stopwatch is reused (Reset/Start) to time each stage separately.
            System.Diagnostics.Stopwatch sw = System.Diagnostics.Stopwatch.StartNew();
            this.CreateGraphStarted();
            this.CreateGraph(inputSequences);
            this.CreateGraphEnded();
            sw.Stop();
            this.NodeCountReport();
            this.TaskTimeSpanReport(sw.Elapsed);

            // Number of graph nodes flagged as belonging to the reference.
            int count = this.Graph.GetNodes().Where(x => x.IsInReference).Count();

            ReferenceNodeCountAfterCreation = count;
            // Baseline for PercentageOfScannedReadsUsed, computed near the end of this method.
            TotalSequencingBP = Graph.GetNodes().Sum(x => x.KmerCount * KmerLength);
            RaiseMessage("A total of: " + count.ToString() + " nodes remain from the reference");
            RaiseMessage("A total of: " + this.Graph.NodeCount + " nodes are in the graph");

            NodeCountAfterCreation   = Graph.NodeCount;
            SkippedReadsAfterQCCount = Graph.SkippedSequencesCount;
            ReadCount = Graph.ProcessedSequencesCount;

            // Give up early on very small graphs; callers must be prepared for a null result.
            if (NodeCountAfterCreation < 100)
            {
                return(null);
            }

            //Step 2.1, Remove nodes are below coverage cutoff
            sw.Reset();
            sw.Start();
            // Estimate and set default value for erosion and coverage thresholds
            this.EstimateDefaultThresholds();
            int sqrtCoverageCutOff = this.CalculateSqrtOfMedianCoverageCutoff();
            // Use the sqrt-based cutoff when forced; otherwise cap it at AlternateMinimumNodeCount.
            var coverageCutOff     = ForceSqrtThreshold ? sqrtCoverageCutOff : Math.Min(sqrtCoverageCutOff, AlternateMinimumNodeCount);

            // MinNodeCountSet overrides whatever was chosen above, including ForceSqrtThreshold.
            if (MTAssembleArguments.MinNodeCountSet)
            {
                coverageCutOff = AlternateMinimumNodeCount;
            }
            //var coverageCutOff = sqrtCoverageCutOff;
            KmerCutOff = coverageCutOff;
            if (OutputIntermediateGraphSteps)
            {
                OutputNodeCountHistograms("PreFiltered", coverageCutOff);
            }
            long originalNodes = this.Graph.NodeCount;
            ThresholdCoverageNodePurger snr = new ThresholdCoverageNodePurger(coverageCutOff);

            snr.RemoveLowCoverageNodes(Graph);
            // NOTE(review): this stores (node count before purge) / (node count after purge), which is a
            // ratio >= 1 rather than a fraction of nodes removed as the name suggests, and it becomes
            // Infinity/NaN if the purge empties the graph - confirm the intended formula.
            PercentNodesRemovedByLowCoverageOrThreshold = originalNodes / (double)this.Graph.NodeCount;
            sw.Stop();

            TaskTimeSpanReport(sw.Elapsed);
            RaiseMessage("Finished removing nodes with less than " + snr.CoverageCutOff.ToString() + " counts");
            NodeCountReport();
            NodeCountAfterCoveragePurge = Graph.NodeCount;
            sw.Reset();
            sw.Start();
            RaiseMessage("Start removing unconnected nodes");

            // Remove pathological nodes
            var badNodes = PathologicalSequencePurger.RemovePathologicalNodes(Graph);

            if (badNodes > 0)
            {
                RaiseMessage("WARNING!!!! Found and removed " + badNodes.ToString() + " pathological nodes.  These were removed.", false);
            }

            //Step 2.2 Remove nodes that are not connected to the reference genome
            UnlinkedToReferencePurger remover = new UnlinkedToReferencePurger();

            remover.RemoveUnconnectedNodes(Graph, referenceNodes);
            RaiseMessage("Finished removing unconnected nodes");
            this.NodeCountReport();
            // NOTE(review): this is recorded before the undangle step below runs, despite the name - confirm.
            NodeCountAfterUndangle = Graph.NodeCount;
            //outputVisualization ("PostUnconnectedFilter");

            // Step 2.3: Remove dangling links from graph
            ///NIGEL: This also removes the low coverage nodes
            // Restart() already resets the stopwatch, so the preceding Reset() is redundant.
            sw.Reset();
            sw.Restart();
            this.UndangleGraphStarted();
            this.UnDangleGraph();
            this.UndangleGraphEnded();
            sw.Stop();

            this.TaskTimeSpanReport(sw.Elapsed);
            this.NodeCountReport();
            if (OutputIntermediateGraphSteps)
            {
                outputVisualization("PostUndangleFilter");
            }

            // Step 4: Remove redundant SNP and indel paths from graph
            // NOTE(review): the format strings in this step contain no {0} placeholder, so the
            // DateTime.Now argument never appears in the emitted messages.
            RaiseMessage(string.Format(CultureInfo.CurrentCulture, "Starting to remove redundant paths", DateTime.Now));
            this.RemoveRedundancyStarted();
            RaiseMessage(string.Format(CultureInfo.CurrentCulture, "Starting to remove SNPs", DateTime.Now));
            this.RemoveRedundancy();
            RaiseMessage(string.Format(CultureInfo.CurrentCulture, "Finished  removing SNPs", DateTime.Now));
            this.NodeCountReport();
            //Now remove redundant indel paths as well
            //TODO: For historic reasons this is largely similar to the snp remover, which isn't so great...
            RaiseMessage(string.Format(CultureInfo.CurrentCulture, "Starting to call INDELs", DateTime.Now));
            // The returned value is not read again in this method; the call is made for its effect on the graph.
            var indels = CallAndRemoveIndels();

            RaiseMessage(string.Format(CultureInfo.CurrentCulture, "Finished calling and removing small INDELs paths", DateTime.Now));
            this.NodeCountReport();
            // Perform dangling link purger step once more.
            // This is done to remove any links created by redundant paths purger.
            this.UnDangleGraph();
            RaiseMessage(string.Format(CultureInfo.CurrentCulture, "Finished removing all redundant paths", DateTime.Now));
            this.NodeCountReport();

            //STEP 4.2 Rerun the unlinked to reference purger after graph is cleaned
            // presumably clears per-node visit marks left by the earlier passes - verify against ChangeNodeVisitFlag
            ChangeNodeVisitFlag(false);
            remover = new UnlinkedToReferencePurger();
            remover.RemoveUnconnectedNodes(Graph, referenceNodes);
            this.RemoveRedundancyEnded();
            this.NodeCountReport();

            NodeCountAfterRedundancyRemoval = Graph.NodeCount;
            if (OutputIntermediateGraphSteps)
            {
                outputVisualization("Post-redundant-path-removal");
            }
            //Now attempt to assemble and find deletions
            var attemptedAssembly = new MitochondrialAssembly(Graph, DiagnosticFileOutputPrefix);

            FinalMetaNodeCount = attemptedAssembly.AllNodesInGraph.Count;
            SuccessfulAssembly = attemptedAssembly.SuccessfulAssembly;
            if (SuccessfulAssembly)
            {
                SuccessfulAssemblyLength = attemptedAssembly.AssemblyLength;
                MinSplitPercentage       = attemptedAssembly.MinimumGreedySplit;
                if (OutputDiagnosticInformation)
                {
                    var outReport = attemptedAssembly.OutputAssembly(DiagnosticFileOutputPrefix);
                    if (outReport != null)
                    {
                        //TODO: This matching is really crappy, need to find a better way to propogate this on up.
                        BestHaplotypeScore                = outReport.BestHit.Rank;
                        SecondBestHaplotypeScore          = outReport.SecondBestHit.Rank;
                        BestMatchingHaplotype             = outReport.BestHit.node.haplogroup.id;
                        SecondBestMatchingHaplotype       = outReport.SecondBestHit.node.haplogroup.id;
                        NumberOfEquallyGoodHaplotypes     = outReport.NumberOfEquallyGoodBestHits;
                        PolymorphismsMatchingHaplotype    = outReport.BestHit.NumberOfMatchingPolymorphisms;
                        PolymorphismsMissingFromHaplotype = outReport.BestHit.NumberOfPolymorphismsMissingFromHaplotype;
                        PolymorphismsMissingFromGenotype  = outReport.BestHit.NumberOfPolymorphismsMissingFromGenotype;
                    }
                }
                else
                {
                    // NOTE(review): this branch runs when the assembly SUCCEEDED but
                    // OutputDiagnosticInformation is false, so the "assembly failed" text looks
                    // wrong here - confirm whether this else (or its message) is intended.
                    RaiseMessage("Greedy assembly skipped as assembly failed.");
                }
            }
            else
            {
                RaiseMessage("Greedy assembly skipped as assembly failed.");
            }

            //Now find deletions
            this.OutputGraphicAndFindDeletion(attemptedAssembly);
            // Same per-node sum as TotalSequencingBP, taken over the nodes still in the graph.
            PercentageOfScannedReadsUsed = Graph.GetNodes().Sum(x => x.KmerCount * KmerLength) / (double)TotalSequencingBP;
            RaiseMessage("Used a total of " + PercentageOfScannedReadsUsed.ToString("p") + " of basepairs in reads for assembly");

            // Step 5: Build Contigs - This is essentially independent of deletion finding
            this.BuildContigsStarted();
            List <ISequence> contigSequences = this.BuildContigs().ToList();

            contigSequences.ForEach(x => ReferenceGenome.AssignContigToMTDNA(x));
            // Negated comparison: longest contigs first.
            contigSequences.Sort((x, y) => - x.Count.CompareTo(y.Count));
            this.BuildContigsEnded();

            PadenaAssembly result = new PadenaAssembly();

            result.AddContigs(contigSequences);
            long totalLength = contigSequences.Sum(x => x.Count);

            RaiseMessage("Assembled " + totalLength.ToString() + " bases of sequence in " + contigSequences.Count.ToString() + " contigs.");
            // N50 is only computed when at least one contig was built.
            if (contigSequences.Count > 0)
            {
                N50 = CalculateN50(contigSequences, totalLength);
                RaiseMessage("N50: " + N50.ToString());
            }
            // Report the final reference-node and total node counts.
            count = this.Graph.GetNodes().Where(x => x.IsInReference).Count();
            RaiseMessage("A total of: " + count.ToString() + " nodes remain from the reference");
            RaiseMessage("A total of: " + this.Graph.NodeCount + " nodes are in the graph");
            return(result);
        }