/// <summary>
/// Builds a suffix tree over the first sequence obtained from the sequence provider
/// and reports how many edges that tree contains.
/// </summary>
/// <param name="sequenceData">
/// Model whose <c>FileName</c> and <c>Content</c> are handed to the sequence provider.
/// When it is null, both values are passed to the provider as null.
/// </param>
/// <returns>The <c>EdgesCount</c> of the newly built suffix tree.</returns>
public long GetEdgesCount(SequenceModel sequenceData)
{
    var providedSequences = _sequenceProvider.Provide(sequenceData?.FileName, sequenceData?.Content);
    var firstSequence = providedSequences?.First();

    // The tree is kept in the field so that it replaces any previously built tree.
    _suffixTree = new MultiWaySuffixTree(firstSequence);

    return _suffixTree.EdgesCount;
}
/// <summary>
/// Builds a suffix tree over the first sequence obtained from the sequence provider
/// and searches that same sequence against the tree for matches unique in the reference.
/// </summary>
/// <param name="sequenceData">
/// Model whose <c>FileName</c> and <c>Content</c> are handed to the sequence provider.
/// When it is null, both values are passed to the provider as null.
/// </param>
/// <returns>The result of <c>SearchMatchesUniqueInReference</c> for the sequence.</returns>
public IEnumerable<Match> GetUniqueMatches(SequenceModel sequenceData)
{
    var providedSequences = _sequenceProvider.Provide(sequenceData?.FileName, sequenceData?.Content);
    var firstSequence = providedSequences?.First();

    // The same sequence serves as both the reference (tree) and the query.
    _suffixTree = new MultiWaySuffixTree(firstSequence);

    return _suffixTree.SearchMatchesUniqueInReference(firstSequence);
}
/// <summary>
/// Validates the find-matches suffix tree test cases for the given configuration node.
/// All reference sequences are concatenated (each followed by a '+' separator byte) into a
/// single reference, a suffix tree is built over it, and every search sequence is matched
/// against that tree. The collected matches are then validated according to
/// <paramref name="additionalParam"/>.
/// </summary>
/// <param name="nodeName">Configuration node name the test inputs are read from.</param>
/// <param name="isFilePath">
/// True to read reference and query sequences from FastA files named in the configuration;
/// false to read the sequences themselves from the configuration.
/// </param>
/// <param name="additionalParam">Selects which validation is applied to the matches.</param>
void ValidateFindMatchSuffixGeneralTestCases(string nodeName, bool isFilePath, AdditionalParameters additionalParam)
{
    Sequence referenceSeqs;
    List<ISequence> searchSeqList = new List<ISequence>();

    if (isFilePath)
    {
        // Gets the reference sequence from the FastA file
        string filePath = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.FilePathNode);
        Assert.IsNotNull(filePath);
        ApplicationLog.WriteLine(string.Format((IFormatProvider)null,
            "NUCmer BVT : Successfully validated the File Path '{0}'.", filePath));

        using (FastAParser parser = new FastAParser(filePath))
        {
            // Materialize once: the parsed sequences are needed both for the byte
            // concatenation and for the alphabet lookup, and re-enumerating Parse()
            // would run the parser a second time.
            List<ISequence> referenceSeqList = parser.Parse().ToList();

            List<Byte> byteList = new List<Byte>();
            foreach (ISequence seq in referenceSeqList)
            {
                byteList.AddRange(seq);
                byteList.Add((byte)'+');
            }

            referenceSeqs = new Sequence(
                AlphabetExtensions.GetMummerAlphabet(referenceSeqList[0].Alphabet),
                byteList.ToArray());
        }

        // Gets the query sequence from the FastA file
        string queryFilePath = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.SearchSequenceFilePathNode);
        Assert.IsNotNull(queryFilePath);
        ApplicationLog.WriteLine(string.Format((IFormatProvider)null,
            "NUCmer BVT : Successfully validated the File Path '{0}'.", queryFilePath));

        // The query parser is disposed as well; the sequences are copied into the list
        // before the parser goes out of scope.
        using (FastAParser queryParser = new FastAParser(queryFilePath))
        {
            searchSeqList.AddRange(queryParser.Parse());
        }
    }
    else
    {
        // Gets the reference & search sequences from the configuration file
        string[] referenceSequences = utilityObj.xmlUtil.GetTextValues(nodeName, Constants.ReferenceSequencesNode);
        string[] searchSequences = utilityObj.xmlUtil.GetTextValues(nodeName, Constants.SearchSequencesNode);

        IAlphabet seqAlphabet = Utility.GetAlphabet(
            utilityObj.xmlUtil.GetTextValue(nodeName, Constants.AlphabetNameNode));

        List<ISequence> refSeqList = new List<ISequence>();
        for (int i = 0; i < referenceSequences.Length; i++)
        {
            refSeqList.Add(new Sequence(seqAlphabet, encodingObj.GetBytes(referenceSequences[i])));
        }

        List<Byte> byteListQuery = new List<Byte>();
        foreach (ISequence seq in refSeqList)
        {
            byteListQuery.AddRange(seq);
            byteListQuery.Add((byte)'+');
        }

        referenceSeqs = new Sequence(
            AlphabetExtensions.GetMummerAlphabet(refSeqList[0].Alphabet),
            byteListQuery.ToArray());

        for (int i = 0; i < searchSequences.Length; i++)
        {
            searchSeqList.Add(new Sequence(seqAlphabet, encodingObj.GetBytes(searchSequences[i])));
        }
    }

    string mumLength = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.MUMLengthNode);

    // Builds the suffix tree for the concatenated reference sequence.
    MultiWaySuffixTree suffixTreeBuilder = new MultiWaySuffixTree(referenceSeqs);
    suffixTreeBuilder.MinLengthOfMatch = long.Parse(mumLength, null);

    // Keyed by query sequence; Add throws if the same sequence instance appears twice.
    Dictionary<ISequence, IEnumerable<Match>> matches = new Dictionary<ISequence, IEnumerable<Match>>();
    for (int i = 0; i < searchSeqList.Count; i++)
    {
        matches.Add(searchSeqList[i], suffixTreeBuilder.SearchMatchesUniqueInReference(searchSeqList[i]));
    }

    List<Match> mums = new List<Match>();
    foreach (var a in matches.Values)
    {
        mums.AddRange(a);
    }

    switch (additionalParam)
    {
        case AdditionalParameters.FindUniqueMatches:
            // Validates the Unique Matches.
            ApplicationLog.WriteLine("NUCmer BVT : Validating the Unique Matches");
            Assert.IsTrue(ValidateUniqueMatches(mums, nodeName, additionalParam, isFilePath));
            Console.WriteLine(
                "NUCmer BVT : Successfully validated the all the unique matches for the sequences.");
            break;
        case AdditionalParameters.PerformClusterBuilder:
            // Validates the matches for the cluster builder case.
            ApplicationLog.WriteLine(
                "NUCmer BVT : Validating the Unique Matches using Cluster Builder");
            Assert.IsTrue(ValidateUniqueMatches(mums, nodeName, additionalParam, isFilePath));
            Console.WriteLine(
                "NUCmer BVT : Successfully validated the all the cluster builder matches for the sequences.");
            break;
        default:
            break;
    }

    ApplicationLog.WriteLine(
        "NUCmer BVT : Successfully validated the all the unique matches for the sequences.");
}
/// <summary>
/// Entry point of the tool. Parses the reference FastA file (first file argument) and the
/// query FastA file (second file argument), builds a suffix tree over the first reference
/// sequence, finds matches for every query sequence using the option selected on the
/// command line (/maxmatch, /mum or /mumreference), writes them, and, when verbose output
/// is requested, reports timing and memory statistics on the error stream.
/// Any exception is passed to <c>DisplayException</c>.
/// </summary>
/// <param name="args">Command line arguments; when empty the help text is printed.</param>
static void Main(string[] args)
{
    try
    {
        Stopwatch swMumUtil = Stopwatch.StartNew();
        Stopwatch swInterval = new Stopwatch();

        Console.Error.WriteLine(SplashString());
        if (args.Length > 0)
        {
            CommandLineOptions myArgs = ProcessCommandLine(args);
            if (myArgs.help)
            {
                Console.WriteLine(Resources.MumUtilHelp);
            }
            else
            {
                long refFileLength = new FileInfo(myArgs.fileList[0]).Length;

                swInterval.Restart();
                IEnumerable<ISequence> referenceSequences = ParseFastA(myArgs.fileList[0]);
                Sequence referenceSequence = referenceSequences.First() as Sequence;
                swInterval.Stop();
                if (myArgs.verbose)
                {
                    Console.Error.WriteLine();
                    Console.Error.WriteLine(" Processed Reference FastA file: {0}", Path.GetFullPath(myArgs.fileList[0]));
                    Console.Error.WriteLine(" Length of first Sequence: {0:#,000}", referenceSequence.Count);
                    Console.Error.WriteLine(" Read/Processing time: {0}", swInterval.Elapsed);
                    Console.Error.WriteLine(" File size : {0:#,000} bytes", refFileLength);
                }

                long queryFileLength = new FileInfo(myArgs.fileList[1]).Length;
                IEnumerable<ISequence> parsedQuerySequences = ParseFastA(myArgs.fileList[1]);

                IEnumerable<ISequence> querySequences = parsedQuerySequences;
                if (myArgs.reverseOnly)
                {
                    // convert to reverse complement sequences
                    querySequences = ReverseComplementSequenceList(parsedQuerySequences);
                }
                else if (myArgs.both)
                {
                    // add the reverse complement sequences along with query sequences.
                    querySequences = AddReverseComplementsToSequenceList(parsedQuerySequences);
                }

                // DISCUSSION:
                // Three possible outputs desired. Globally unique 'mum' (v1), unique in reference sequence (v2),
                // or get the maximum matches of length or greater.

                // Start the per-phase accumulators from zero for this run.
                mummerTime = new TimeSpan();
                writetime = new TimeSpan();

                long memoryAtStart = 0;
                long memoryAtEnd = 0;
                if (myArgs.verbose)
                {
                    // The overall stopwatch is paused so the forced collection is not counted.
                    swMumUtil.Stop();
                    memoryAtStart = GC.GetTotalMemory(true);
                    swMumUtil.Start();
                }

                swInterval.Restart();
                MultiWaySuffixTree suffixTree = new MultiWaySuffixTree(referenceSequence);
                swInterval.Stop();

                if (myArgs.verbose)
                {
                    swMumUtil.Stop();
                    memoryAtEnd = GC.GetTotalMemory(true);
                    swMumUtil.Start();
                }

                MUMmer mummer = new MUMmer(suffixTree);

                if (myArgs.verbose)
                {
                    Console.Error.WriteLine();
                    Console.Error.WriteLine("Suffix tree construction time : {0}", swInterval.Elapsed);
                    Console.Error.WriteLine("Memory consumed by Suffix tree : {0:#,000}", memoryAtEnd - memoryAtStart);
                    Console.Error.WriteLine("Total edges created : {0:#,000}", suffixTree.EdgesCount);
                    Console.Error.WriteLine("Memory per edge : {0:#,000.00} bytes", (((double)(memoryAtEnd - memoryAtStart)) / suffixTree.EdgesCount));
                    Console.Error.WriteLine();
                    Console.Error.WriteLine(" Processed Query FastA file: {0}", Path.GetFullPath(myArgs.fileList[1]));
                    Console.Error.WriteLine(" File Size : {0:#,000} bytes", queryFileLength);
                }

                mummer.LengthOfMUM = myArgs.length;
                mummer.NoAmbiguity = myArgs.noAmbiguity;

                // Pick the match finder for the requested output option. All three options
                // share the same per-query loop, so every option now records compute and
                // write times (previously /maxmatch left both at zero).
                string outputOption;
                Func<Sequence, IEnumerable<Match>> findMatches;
                if (myArgs.maxmatch)
                {
                    outputOption = "GetMumsMaxMatch()";
                    findMatches = qSeq => mummer.GetMatches(qSeq);
                }
                else if (myArgs.mum)
                {
                    // TODO: After implementing GetMatchesUniqueInBothReferenceAndQuery() in MUMmer,
                    // replace GetMatchesUniqueInReference() with it in the line below.
                    outputOption = "GetMumsMum()";
                    findMatches = qSeq => mummer.GetMatchesUniqueInReference(qSeq);
                }
                else if (myArgs.mumreference)
                {
                    outputOption = "GetMumsReference()";
                    findMatches = qSeq => mummer.GetMatchesUniqueInReference(qSeq);
                }
                else
                {
                    // cannot happen as argument processing already asserted one of the three options must be specified
                    Console.Error.WriteLine("\nError: one of /maxmatch, /mum, /mumreference options must be specified.");
                    Environment.Exit(-1);

                    // The compiler does not recognize Environment.Exit() as a no-return
                    // function; the throw satisfies definite assignment of the locals above.
                    throw new InvalidOperationException("Never hit this");
                }

                long querySeqCount;
                double sumofSeqLength;
                TimeSpan totalTimetakenToProcessQuerySequences = RunMatchesOverQueries(
                    querySequences,
                    findMatches,
                    referenceSequence,
                    myArgs,
                    out querySeqCount,
                    out sumofSeqLength);

                if (myArgs.verbose)
                {
                    if (myArgs.reverseOnly || myArgs.both)
                    {
                        Console.Error.WriteLine(" Read/Processing time : {0}", timeTakenToParseQuerySequences);
                        Console.Error.WriteLine(" Reverse Complement time : {0}", timeTakenToGetReverseComplement);
                        Console.Error.WriteLine(" Total time taken to Process reads: {0}", totalTimetakenToProcessQuerySequences);
                    }
                    else
                    {
                        Console.Error.WriteLine(" Read/Processing time : {0}", totalTimetakenToProcessQuerySequences);
                    }

                    // Avoid printing NaN when the query file contained no sequences.
                    double averageLength = querySeqCount == 0 ? 0 : sumofSeqLength / querySeqCount;

                    Console.Error.WriteLine();
                    Console.Error.WriteLine(" Number of query Sequences : {0:#,000}", querySeqCount);
                    Console.Error.WriteLine(" Average length of query Sequences: {0:#,000}", averageLength);
                    Console.Error.WriteLine();
                    Console.Error.WriteLine("Compute {0,20} time : {1}", outputOption, mummerTime);
                    Console.Error.WriteLine(" WriteMums() time : {0}", writetime);
                }

                swMumUtil.Stop();
                if (myArgs.verbose)
                {
                    Console.Error.WriteLine(" Total MumUtil Runtime : {0}", swMumUtil.Elapsed);
                }
            }
        }
        else
        {
            Console.WriteLine(Resources.MumUtilHelp);
        }
    }
    catch (Exception ex)
    {
        DisplayException(ex);
    }
}

/// <summary>
/// Runs <paramref name="findMatches"/> for every query sequence and writes the resulting
/// matches with <c>WriteMums</c>. Time spent obtaining each query sequence from the
/// enumeration is returned; time spent in the match finder and in <c>WriteMums</c> is
/// added to the <c>mummerTime</c> and <c>writetime</c> accumulators respectively.
/// </summary>
/// <param name="querySequences">Query sequences; each element is cast to <c>Sequence</c>.</param>
/// <param name="findMatches">Match finder invoked once per query sequence.</param>
/// <param name="referenceSequence">Reference sequence passed through to <c>WriteMums</c>.</param>
/// <param name="myArgs">Command line options passed through to <c>WriteMums</c>.</param>
/// <param name="querySeqCount">Receives the number of query sequences processed.</param>
/// <param name="sumOfSeqLength">Receives the sum of the <c>Count</c> of all query sequences.</param>
/// <returns>Accumulated time spent advancing the query enumeration to each processed sequence.</returns>
private static TimeSpan RunMatchesOverQueries(
    IEnumerable<ISequence> querySequences,
    Func<Sequence, IEnumerable<Match>> findMatches,
    Sequence referenceSequence,
    CommandLineOptions myArgs,
    out long querySeqCount,
    out double sumOfSeqLength)
{
    querySeqCount = 0;
    sumOfSeqLength = 0;
    TimeSpan timeToObtainQueries = new TimeSpan();

    Stopwatch watch = Stopwatch.StartNew();
    foreach (Sequence qSeq in querySequences)
    {
        // Stop the watch after each query sequence is obtained; if a reverse complement
        // option is set, this interval includes the reverse complement time as well.
        watch.Stop();
        timeToObtainQueries = timeToObtainQueries.Add(watch.Elapsed);

        watch.Restart();
        IEnumerable<Match> mums = findMatches(qSeq);
        watch.Stop();
        mummerTime = mummerTime.Add(watch.Elapsed);

        watch.Restart();
        WriteMums(mums, referenceSequence, qSeq, myArgs);
        watch.Stop();
        writetime = writetime.Add(watch.Elapsed);

        querySeqCount++;
        sumOfSeqLength += qSeq.Count;

        // Start the watch for the next query sequence.
        watch.Restart();
    }

    watch.Stop();
    return timeToObtainQueries;
}
/// <summary>
/// Runs the general find-matches suffix tree validation for a configuration node.
/// The reference sequences are joined into one sequence (each followed by a '+' byte),
/// a suffix tree is built over it, and each search sequence is matched against the tree.
/// </summary>
/// <param name="nodeName">Name of the configuration node the inputs are read from.</param>
/// <param name="isFilePath">
/// True when the node names FastA files to parse; false when it holds the sequences inline.
/// </param>
/// <param name="additionalParam">Selects the validation applied to the collected matches.</param>
void ValidateFindMatchSuffixGeneralTestCases(string nodeName, bool isFilePath, AdditionalParameters additionalParam)
{
    const byte Separator = (byte)'+';

    ISequence reference;
    var queries = new List<ISequence>();

    if (!isFilePath)
    {
        // Reference and search sequences come straight from the configuration file.
        string[] referenceTexts = this.utilityObj.xmlUtil.GetTextValues(nodeName, Constants.ReferenceSequencesNode);
        string[] queryTexts = this.utilityObj.xmlUtil.GetTextValues(nodeName, Constants.SearchSequencesNode);

        string alphabetName = this.utilityObj.xmlUtil.GetTextValue(nodeName, Constants.AlphabetNameNode);
        IAlphabet alphabet = Utility.GetAlphabet(alphabetName);

        List<ISequence> inlineReferences = referenceTexts
            .Select(text => (ISequence)new Sequence(alphabet, this.encodingObj.GetBytes(text)))
            .ToList();

        var joinedBytes = new List<byte>();
        foreach (ISequence part in inlineReferences)
        {
            joinedBytes.AddRange(part);
            joinedBytes.Add(Separator);
        }

        IAlphabet mummerAlphabet = inlineReferences.First().Alphabet.GetMummerAlphabet();
        reference = new Sequence(mummerAlphabet, joinedBytes.ToArray());

        queries.AddRange(
            queryTexts.Select(text => (ISequence)new Sequence(alphabet, this.encodingObj.GetBytes(text))));
    }
    else
    {
        // Reference sequences come from the FastA file named in the configuration.
        string referencePath = this.utilityObj.xmlUtil.GetTextValue(nodeName, Constants.FilePathNode);
        Assert.IsNotNull(referencePath);

        FastAParser fastaParser = new FastAParser();
        IEnumerable<ISequence> parsedReferences = fastaParser.Parse(referencePath);

        var joinedBytes = new List<byte>();
        foreach (ISequence part in parsedReferences)
        {
            joinedBytes.AddRange(part);
            joinedBytes.Add(Separator);
        }

        IAlphabet mummerAlphabet = parsedReferences.First().Alphabet.GetMummerAlphabet();
        reference = new Sequence(mummerAlphabet, joinedBytes.ToArray());

        // Query sequences come from a second FastA file.
        string queryPath = this.utilityObj.xmlUtil.GetTextValue(nodeName, Constants.SearchSequenceFilePathNode);
        Assert.IsNotNull(queryPath);

        queries.AddRange(fastaParser.Parse(queryPath));
    }

    string minMatchLengthText = this.utilityObj.xmlUtil.GetTextValue(nodeName, Constants.MUMLengthNode);

    // Build the suffix tree over the joined reference, then set the minimum match length.
    MultiWaySuffixTree tree = new MultiWaySuffixTree(reference as Sequence);
    tree.MinLengthOfMatch = long.Parse(minMatchLengthText, null);

    var matchesByQuery = new Dictionary<ISequence, IEnumerable<Match>>();
    for (int index = 0; index < queries.Count; index++)
    {
        ISequence query = queries[index];
        matchesByQuery.Add(query, tree.SearchMatchesUniqueInReference(query));
    }

    // Flatten the per-query results into a single list, in dictionary value order.
    List<Match> allMatches = matchesByQuery.Values.SelectMany(found => found).ToList();

    switch (additionalParam)
    {
        // Both supported actions validate the matches through the same routine.
        case AdditionalParameters.FindUniqueMatches:
        case AdditionalParameters.PerformClusterBuilder:
            Assert.IsTrue(this.ValidateUniqueMatches(allMatches, nodeName, additionalParam, isFilePath));
            break;
        default:
            break;
    }
}