/// <summary>
/// Reads the line addressed by the given sequence pointer from the input file
/// and parses it into an aligned sequence.
/// </summary>
/// <param name="pointer">
/// Sequence pointer whose IndexOffsets[0] is used as the seek position in the file.
/// </param>
/// <returns>
/// The parsed IAlignedSequence, or null when IndexOffsets[0] is not smaller
/// than IndexOffsets[1].
/// </returns>
/// <exception cref="ArgumentNullException">pointer is null.</exception>
/// <exception cref="NotSupportedException">No input file name has been set.</exception>
public IAlignedSequence ParseAlignedSequence(SequencePointer pointer)
{
    if (pointer == null)
    {
        throw new ArgumentNullException("pointer");
    }

    if (string.IsNullOrEmpty(_fileName))
    {
        throw new NotSupportedException(Resource.DataVirtualizationNeedsInputFile);
    }

    // A pointer whose start is not before its end addresses nothing.
    bool addressesData = pointer.IndexOffsets[0] < pointer.IndexOffsets[1];
    if (!addressesData)
    {
        return null;
    }

    // (Re)open the shared reader when it is missing or no longer readable.
    bool readerUsable = _mbfStreamReader != null && _mbfStreamReader.CanRead;
    if (!readerUsable)
    {
        _mbfStreamReader = new MBFStreamReader(_fileName);
    }

    _mbfStreamReader.Seek(pointer.IndexOffsets[0], SeekOrigin.Begin);
    string alignmentLine = _mbfStreamReader.ReadLine();

    return ParseSequence(alignmentLine, _isReadOnly);
}
/// <summary>
/// Assigns the public properties of FileVirtualQualitativeSequenceProvider and
/// verifies that the values read back match what was assigned.
/// </summary>
public void ValidateFVQSPProperties()
{
    const string successMessage = @"FVQSP Bvt : Successfully validated all the properties";

    // Keep a concretely typed reference so the parser can be disposed without a cast.
    FastQParser fastQParser = new FastQParser();
    IVirtualSequenceParser parserObj = fastQParser;

    try
    {
        FileVirtualQualitativeSequenceProvider provider =
            new FileVirtualQualitativeSequenceProvider(parserObj, GetSequencePointer())
            {
                BlockSize = 5,
                IsReadOnly = false,
                MaxNumberOfBlocks = 10,
                SequencePointerInstance = GetSequencePointer()
            };

        SequencePointer expectedPointer = GetSequencePointer();

        Assert.AreEqual(5, provider.BlockSize);
        Assert.AreEqual(10, provider.MaxNumberOfBlocks);
        Assert.IsFalse(provider.IsReadOnly);
        Assert.AreEqual(expectedPointer.AlphabetName, provider.SequencePointerInstance.AlphabetName);
        Assert.AreEqual(26, provider.Count);

        ApplicationLog.WriteLine(successMessage);
        Console.WriteLine(successMessage);
    }
    finally
    {
        fastQParser.Dispose();
    }
}
/// <summary>
/// Reads a run of raw symbol bytes from the input file, starting at the given
/// index relative to the start offset stored in the sequence pointer.
/// </summary>
/// <param name="startIndex">Zero-based index, relative to the sequence start, at which reading begins.</param>
/// <param name="count">Number of bytes to read.</param>
/// <param name="seqPointer">Sequence pointer whose IndexOffsets[0] is the sequence start in the file.</param>
/// <returns>The bytes returned by the stream reader for the requested range.</returns>
/// <exception cref="NotSupportedException">No input file name has been set.</exception>
/// <exception cref="ArgumentNullException">seqPointer is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">startIndex is negative or count is not positive.</exception>
public byte[] ParseRange(int startIndex, int count, SequencePointer seqPointer)
{
    if (string.IsNullOrEmpty(_fileName))
    {
        throw new NotSupportedException(Resource.DataVirtualizationNeedsInputFile);
    }

    if (seqPointer == null)
    {
        throw new ArgumentNullException("seqPointer");
    }

    if (0 > startIndex)
    {
        throw new ArgumentOutOfRangeException("startIndex");
    }

    if (0 >= count)
    {
        throw new ArgumentOutOfRangeException("count");
    }

    // (Re)open the shared reader when it is missing or no longer readable.
    bool readerUsable = _mbfStreamReader != null && _mbfStreamReader.CanRead;
    if (!readerUsable)
    {
        _mbfStreamReader = new MBFStreamReader(_fileName);
    }

    long absolutePosition = startIndex + seqPointer.IndexOffsets[0];
    return _mbfStreamReader.ReadBytes(absolutePosition, count);
}
/// <summary>
/// Walks the lines of a SAM file and writes one sequence pointer per line to
/// the sidecar file. Does nothing unless data virtualization is enabled and the
/// sidecar provider reports that its file exists.
/// </summary>
/// <param name="mbfReader">A reader for the sequence alignment text.</param>
private void ParseSequences(MBFStreamReader mbfReader)
{
    if (!IsDataVirtualizationEnabled || !_sidecarFileProvider.SidecarFileExists)
    {
        return;
    }

    try
    {
        // Stop at the end of the file or at the first line that begins with '@'.
        while (mbfReader.HasLines
            && !mbfReader.Line.StartsWith(@"@", StringComparison.OrdinalIgnoreCase))
        {
            SequencePointer linePointer = new SequencePointer();
            linePointer.AlphabetName = Alphabets.DNA.Name;

            // The pointer spans the current line: [line start, line start + line length].
            linePointer.IndexOffsets[0] = mbfReader.CurrentLineStartingIndex;
            linePointer.IndexOffsets[1] = mbfReader.CurrentLineStartingIndex + mbfReader.Line.Length;

            // Persist each pointer immediately rather than collecting them in memory.
            _sidecarFileProvider.WritePointer(linePointer);

            mbfReader.GoToNextLine();
            _lineCount++;
        }

        _sidecarFileProvider.Close();
    }
    catch (Exception)
    {
        // Any failure while indexing triggers sidecar cleanup; the exception
        // is not rethrown.
        _sidecarFileProvider.Cleanup();
    }
}
/// <summary>
/// Builds a DNA sequence pointer with no ID that starts at the given line.
/// </summary>
/// <param name="startLine">Value assigned to the pointer's StartingLine.</param>
/// <returns>The newly created sequence pointer.</returns>
static SequencePointer GetSequencePointer(int startLine)
{
    return new SequencePointer
    {
        AlphabetName = "DNA",
        StartingLine = startLine,
        Id = null
    };
}
/// <summary>
/// Builds a DNA sequence pointer with no ID for a BAM test.
/// </summary>
/// <param name="LineNumber">Value assigned to the pointer's StartingLine.</param>
/// <param name="startIndex">Value assigned to IndexOffsets[0].</param>
/// <param name="endIndex">Value assigned to IndexOffsets[1].</param>
/// <returns>The newly created sequence pointer.</returns>
private static SequencePointer GetBAMSequencePointer(int LineNumber, int startIndex, int endIndex)
{
    SequencePointer bamPointer = new SequencePointer
    {
        AlphabetName = "DNA",
        Id = null,
        StartingLine = LineNumber
    };

    bamPointer.IndexOffsets[0] = startIndex;
    bamPointer.IndexOffsets[1] = endIndex;

    return bamPointer;
}
/// <summary>
/// Builds the fixed DNA sequence pointer used by these tests
/// (offsets 104 to 1405, starting line 1).
/// </summary>
/// <returns>The newly created sequence pointer.</returns>
private static SequencePointer GetSequencePointer()
{
    SequencePointer fixedPointer = new SequencePointer
    {
        AlphabetName = "DNA",
        Id = "gi|186972394|gb|EU490707.1| Selenipedium aequinoctiale maturase K (matK) gene, partial cds; chloroplast",
        StartingLine = 1
    };

    fixedPointer.IndexOffsets[0] = 104;
    fixedPointer.IndexOffsets[1] = 1405;

    return fixedPointer;
}
/// <summary>
/// Builds the fixed DNA sequence pointer used by these tests: it starts at
/// offset 40 and ends 26 positions later, on starting line 1.
/// </summary>
/// <returns>The newly created sequence pointer.</returns>
private static SequencePointer GetSequencePointer()
{
    const int sequenceStart = 40;
    const int sequenceLength = 26;

    SequencePointer fixedPointer = new SequencePointer
    {
        AlphabetName = "DNA",
        Id = "SRR002012.1 Oct4:5:1:871:340 length=26",
        StartingLine = 1
    };

    fixedPointer.IndexOffsets[0] = sequenceStart;
    fixedPointer.IndexOffsets[1] = fixedPointer.IndexOffsets[0] + sequenceLength;

    return fixedPointer;
}
/// <summary>
/// Parses a range of sequence items starting from the specified index in the sequence.
/// </summary>
/// <param name="startIndex">The zero-based index at which to begin parsing.</param>
/// <param name="count">
/// The number of symbols to parse. When the range would reach or pass the end of
/// the sequence, it is clamped to the symbols remaining after startIndex.
/// </param>
/// <param name="seqPointer">The sequence pointer of that sequence.</param>
/// <returns>
/// A read-only sequence holding the parsed symbols, or null when startIndex lies
/// at or beyond the end offset recorded in seqPointer.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException">startIndex is negative or count is not positive.</exception>
/// <exception cref="ArgumentNullException">seqPointer is null.</exception>
public ISequence ParseRange(int startIndex, int count, SequencePointer seqPointer)
{
    if (0 > startIndex)
    {
        throw new ArgumentOutOfRangeException("startIndex");
    }

    if (0 >= count)
    {
        throw new ArgumentOutOfRangeException("count");
    }

    if (seqPointer == null)
    {
        throw new ArgumentNullException("seqPointer");
    }

    // if the start index exceeds the sequence boundary
    if ((long)startIndex + seqPointer.IndexOffsets[0] >= seqPointer.IndexOffsets[1])
    {
        return null;
    }

    IAlphabet alphabet = Alphabets.All.Single(A => A.Name.Equals(seqPointer.AlphabetName));
    Sequence sequence = new Sequence(alphabet) { IsReadOnly = false };

    if (_mbfStreamReader == null || !_mbfStreamReader.CanRead)
    {
        _mbfStreamReader = new MBFStreamReader(_fileName);
    }

    long filePosition = startIndex + seqPointer.IndexOffsets[0];
    int sequenceLength = (int)(seqPointer.IndexOffsets[1] - seqPointer.IndexOffsets[0]);

    // Widen before adding: count + startIndex in 32-bit arithmetic can wrap to a
    // negative value (e.g. a caller passing Int32.MaxValue as count), which would
    // skip the clamp and let the read run past the end of the sequence.
    if ((long)count + startIndex >= sequenceLength)
    {
        // Safe: the boundary check above guarantees startIndex < sequenceLength.
        count = sequenceLength - startIndex;
    }

    char[] buffer = _mbfStreamReader.ReadChars(filePosition, count);
    sequence.InsertRange(0, new string(buffer));

    // default for partial load
    sequence.IsReadOnly = true;

    return sequence;
}
/// <summary>
/// Measures and prints the elapsed time and CPU counter value for reading the
/// first, middle and last block of a sequence through ParseRange.
/// </summary>
/// <param name="parserObj">Fasta parser used to read each block.</param>
/// <param name="pointerObj">Sequence pointer of the sequence being read.</param>
/// <param name="seq">Sequence whose Count determines the middle and last block positions.</param>
/// <param name="seqCountToRead">Number of symbols per block, as an integer in text form.</param>
private void GetBlockPerfNumber(FastaParser parserObj,
    SequencePointer pointerObj, ISequence seq, string seqCountToRead)
{
    // Parse once, outside the timed regions, so the measurements cover only
    // ParseRange and not repeated string-to-int conversion.
    int blockLength = Int32.Parse(seqCountToRead);

    ReportBlockPerf("FirstBlock", parserObj, 0, blockLength, pointerObj);
    ReportBlockPerf("MiddleBlock", parserObj, seq.Count / 2, blockLength, pointerObj);
    ReportBlockPerf("LastBlock", parserObj, seq.Count - blockLength, blockLength, pointerObj);
}

/// <summary>
/// Times a single ParseRange call and prints its elapsed seconds and the next CPU counter value.
/// </summary>
/// <param name="blockName">Label that prefixes both output lines.</param>
/// <param name="parserObj">Fasta parser used to read the block.</param>
/// <param name="startIndex">Zero-based index at which the block starts.</param>
/// <param name="count">Number of symbols to read.</param>
/// <param name="pointerObj">Sequence pointer of the sequence being read.</param>
private void ReportBlockPerf(string blockName, FastaParser parserObj,
    int startIndex, int count, SequencePointer pointerObj)
{
    _watchObj.Reset();
    _watchObj.Start();
    parserObj.ParseRange(startIndex, count, pointerObj);
    _watchObj.Stop();

    Console.WriteLine(string.Format("{0} Perf Time : {1} Secs",
        blockName,
        TimeSpan.FromMilliseconds(_watchObj.ElapsedMilliseconds).TotalSeconds.ToString()));
    Console.WriteLine(string.Format("{0} CPU Utilization : {1}",
        blockName,
        _cpuCounterObj.NextValue().ToString()));
}
/// <summary>
/// Verifies that a provider built from a parser and a sequence pointer reports
/// the expected Count and exposes the pointer's Id.
/// </summary>
public void ValidateFVQSPVirtualSeqParserSeqPointerConstructor()
{
    const string successMessage =
        @"FVQSP Bvt : Successfully validated the constructor FileVirtualQualitativeSequenceProvider(IVirtualSequenceParser, SequencePointer)";

    FileVirtualQualitativeSequenceProvider provider = GetVirtualSequenceProvider();
    SequencePointer expectedPointer = GetSequencePointer();

    Assert.AreEqual(26, provider.Count);
    Assert.AreEqual(expectedPointer.Id, provider.SequencePointerInstance.Id);

    ApplicationLog.WriteLine(successMessage);
    Console.WriteLine(successMessage);
}
/// <summary>
/// Parses the configured FASTA file with data virtualization enforced, builds a
/// sequence pointer from the test configuration, and reports block-level and
/// sequence-level performance numbers for the first parsed sequence.
/// </summary>
public void PerformObjectModelPerf()
{
    string filePathObj = Utility._xmlUtil.GetTextValue(Constants.ObjectModelNodeName,
        Constants.FilePathNode);
    string seqCountToRead = Utility._xmlUtil.GetTextValue(Constants.ObjectModelNodeName,
        Constants.SequenceRangeToRead);

    Assert.IsNotNullOrEmpty(filePathObj);

    // The original also built a List<string> of input files that was never
    // read; that dead local has been removed.
    FastaParser parserObj = new FastaParser();
    parserObj.EnforceDataVirtualization = true;
    IList<ISequence> seqListObj = parserObj.Parse(filePathObj, false);

    // Describe the sequence to read from the values stored in the test configuration.
    SequencePointer pointerObj = new SequencePointer();
    pointerObj.AlphabetName = Utility._xmlUtil.GetTextValue(Constants.ObjectModelNodeName,
        Constants.AlphabetNode);
    pointerObj.Id = Utility._xmlUtil.GetTextValue(Constants.ObjectModelNodeName,
        Constants.SequenceIDNode);
    pointerObj.IndexOffsets[0] = int.Parse(
        Utility._xmlUtil.GetTextValue(Constants.ObjectModelNodeName, Constants.StartIndexNode));
    pointerObj.IndexOffsets[1] = int.Parse(
        Utility._xmlUtil.GetTextValue(Constants.ObjectModelNodeName, Constants.EndIndexNode));
    pointerObj.StartingLine = int.Parse(
        Utility._xmlUtil.GetTextValue(Constants.ObjectModelNodeName, Constants.StartLineNode));

    GetBlockPerfNumber(parserObj, pointerObj, seqListObj[0], seqCountToRead);
    GetSequencePerfNumber(seqListObj[0]);
}
/// <summary>
/// Parses a range of sequence items starting from the specified index in the sequence.
/// </summary>
/// <param name="startIndex">The zero-based index at which to begin parsing.</param>
/// <param name="count">The number of symbols to parse.</param>
/// <param name="seqPointer">The sequence pointer of that sequence.</param>
/// <returns>
/// A read-only sequence holding the parsed symbols, or null when startIndex lies
/// at or beyond the pointer's EndingIndex.
/// </returns>
/// <exception cref="NotSupportedException">No input file name has been set.</exception>
/// <exception cref="ArgumentOutOfRangeException">startIndex is negative or count is not positive.</exception>
/// <exception cref="ArgumentNullException">seqPointer is null.</exception>
public ISequence ParseRange(int startIndex, int count, SequencePointer seqPointer)
{
    if (string.IsNullOrEmpty(_fileName))
    {
        throw new NotSupportedException(Resource.DataVirtualizationNeedsInputFile);
    }

    if (startIndex < 0)
    {
        throw new ArgumentOutOfRangeException("startIndex");
    }

    if (count <= 0)
    {
        throw new ArgumentOutOfRangeException("count");
    }

    // Fail with a descriptive exception instead of a NullReferenceException
    // raised from inside the alphabet lookup lambda below.
    if (seqPointer == null)
    {
        throw new ArgumentNullException("seqPointer");
    }

    IAlphabet alphabet = Alphabets.All.Single(A => A.Name.Equals(seqPointer.AlphabetName));

    // Compute the absolute start in 64-bit arithmetic: narrowing StartingIndex
    // to int, or adding in int, can truncate or wrap and defeat the bounds check.
    long start = (long)seqPointer.StartingIndex + startIndex;
    if (start >= seqPointer.EndingIndex)
    {
        return null;
    }

    Sequence sequence = new Sequence(alphabet) { IsReadOnly = false };

    // Offset added to StartingIndex when addressing the file: one newline
    // length per StartingLine.
    int includesNewline = seqPointer.StartingLine * Environment.NewLine.Length;
    int len = (int)(seqPointer.EndingIndex - seqPointer.StartingIndex);

    using (BioTextReader bioReader = new BioTextReader(_fileName))
    {
        string str = bioReader.ReadBlock(startIndex, seqPointer.StartingIndex + includesNewline, count, len);
        sequence.InsertRange(0, str);
    }

    // default for partial load
    sequence.IsReadOnly = true;

    return sequence;
}
/// <summary>
/// Looks up the ID of the sequence addressed by the given pointer, stores it on
/// the pointer and returns it.
/// </summary>
/// <param name="pointer">
/// Sequence pointer of the sequence whose ID is wanted. Its Id property is
/// overwritten with the value read from the file.
/// </param>
/// <returns>The sequence ID read from the file.</returns>
/// <exception cref="ArgumentNullException">pointer is null.</exception>
public string GetSequenceID(SequencePointer pointer)
{
    if (pointer == null)
    {
        throw new ArgumentNullException("pointer");
    }

    // (Re)open the shared reader when it is missing or no longer readable.
    bool readerUsable = _mbfStreamReader != null && _mbfStreamReader.CanRead;
    if (!readerUsable)
    {
        _mbfStreamReader = new MBFStreamReader(_fileName);
    }

    // Step back from the sequence start by StartingLine positions and read the
    // line found there (presumably the header line; confirm against the parser
    // that fills StartingLine).
    long idLinePosition = pointer.IndexOffsets[0] - pointer.StartingLine;
    _mbfStreamReader.Seek(idLinePosition, SeekOrigin.Begin);
    _mbfStreamReader.ReadLine();

    pointer.Id = _mbfStreamReader.GetLineField(2);
    return pointer.Id;
}
/// <summary>
/// Parses a BAM file with data virtualization enforced, re-reads one aligned
/// sequence through a sequence pointer, and checks its query sequence against
/// the expected value from the test configuration.
/// </summary>
public void ValidateBAMParseAlignedSeqWithSeqPointer()
{
    // Expected result and pointer coordinates come from the XML test configuration.
    string expectedQuery = _utilityObj._xmlUtil.GetTextValue(
        Constants.BAMToSAMConversionNode, Constants.ExpectedSeqWithPointersNode);
    string bamFilePath = _utilityObj._xmlUtil.GetTextValue(
        Constants.BAMToSAMConversionNode, Constants.FilePathNode);
    string pointerLineText = _utilityObj._xmlUtil.GetTextValue(
        Constants.BAMToSAMConversionNode, Constants.LineNumberToPointNode);
    string pointerStartText = _utilityObj._xmlUtil.GetTextValue(
        Constants.BAMToSAMConversionNode, Constants.StartIndexNode);
    string pointerEndText = _utilityObj._xmlUtil.GetTextValue(
        Constants.BAMToSAMConversionNode, Constants.EndIndexNode);

    using (BAMParser bamParser = new BAMParser())
    {
        bamParser.EnforceDataVirtualization = true;

        SequenceAlignmentMap alignmentMap = bamParser.Parse(bamFilePath);
        Assert.IsNotNull(alignmentMap);

        // Build the pointer that addresses the aligned sequence under test.
        int pointerLine = Int32.Parse(pointerLineText, (IFormatProvider)null);
        int pointerStart = Int32.Parse(pointerStartText, (IFormatProvider)null);
        int pointerEnd = Int32.Parse(pointerEndText, (IFormatProvider)null);
        SequencePointer pointerObj = GetBAMSequencePointer(pointerLine, pointerStart, pointerEnd);

        // Read the aligned sequence back through the pointer.
        SAMAlignedSequence alignedSeq =
            (SAMAlignedSequence)bamParser.ParseAlignedSequence(pointerObj);

        Assert.AreEqual(expectedQuery, alignedSeq.QuerySequence.ToString());

        Console.WriteLine(string.Format((IFormatProvider)null,
            "BAM Parser BVT : Sequence alignment aligned seq {0} validate successfully",
            alignedSeq.Sequences[0].ToString()));
        ApplicationLog.WriteLine(string.Format((IFormatProvider)null,
            "BAM Parser BVT : Sequence alignment aligned seq validate successfully"));
    }
}
/// <summary>
/// Assigns every property of a SequencePointer and verifies that each value
/// reads back unchanged.
/// </summary>
public void ValidateSequencePointerProperties()
{
    const string successMessage =
        "Sequence Pointer Bvt : Successfully validated all the properties";

    SequencePointer pointer = new SequencePointer
    {
        AlphabetName = "Dna",
        Id = "PointerID",
        StartingLine = 1
    };
    pointer.IndexOffsets[0] = 1;
    pointer.IndexOffsets[1] = 10;

    Assert.AreEqual("Dna", pointer.AlphabetName);
    Assert.AreEqual("PointerID", pointer.Id);
    Assert.AreEqual(1, pointer.IndexOffsets[0]);
    Assert.AreEqual(10, pointer.IndexOffsets[1]);
    Assert.AreEqual(1, pointer.StartingLine);

    ApplicationLog.WriteLine(successMessage);
    Console.WriteLine(successMessage);
}
/// <summary>
/// Parses a range of symbols starting from the specified index in the sequence.
/// </summary>
/// <param name="startIndex">The zero-based index at which to begin parsing.</param>
/// <param name="count">
/// The number of symbols to parse. When the range would reach or pass the end of
/// the sequence, it is clamped to the symbols remaining after startIndex.
/// </param>
/// <param name="seqPointer">The sequence pointer of that sequence.</param>
/// <returns>
/// The bytes returned by the stream reader for the requested range, or null when
/// startIndex lies at or beyond the end offset recorded in seqPointer.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException">startIndex is negative or count is not positive.</exception>
/// <exception cref="ArgumentNullException">seqPointer is null.</exception>
public byte[] ParseRange(int startIndex, int count, SequencePointer seqPointer)
{
    if (0 > startIndex)
    {
        throw new ArgumentOutOfRangeException("startIndex");
    }

    if (0 >= count)
    {
        throw new ArgumentOutOfRangeException("count");
    }

    if (seqPointer == null)
    {
        throw new ArgumentNullException("seqPointer");
    }

    // if the start index exceeds the sequence boundary
    if ((long)startIndex + seqPointer.IndexOffsets[0] >= seqPointer.IndexOffsets[1])
    {
        return null;
    }

    if (_mbfStreamReader == null || !_mbfStreamReader.CanRead)
    {
        _mbfStreamReader = new MBFStreamReader(_fileName);
    }

    long filePosition = startIndex + seqPointer.IndexOffsets[0];
    int sequenceLength = (int)(seqPointer.IndexOffsets[1] - seqPointer.IndexOffsets[0]);

    // Widen before adding: count + startIndex in 32-bit arithmetic can wrap to a
    // negative value (e.g. a caller passing Int32.MaxValue as count), which would
    // skip the clamp and let the read run past the end of the sequence.
    if ((long)count + startIndex >= sequenceLength)
    {
        // Safe: the boundary check above guarantees startIndex < sequenceLength.
        count = sequenceLength - startIndex;
    }

    return _mbfStreamReader.ReadBytes(filePosition, count);
}
/// <summary>
/// Parses a range of sequence items starting from the specified index in the sequence.
/// </summary>
/// <param name="startIndex">The zero-based index at which to begin parsing.</param>
/// <param name="count">The number of symbols to parse.</param>
/// <param name="seqPointer">The sequence pointer of that sequence.</param>
/// <returns>
/// A read-only sequence holding the parsed symbols, or null when startIndex lies
/// at or beyond the pointer's EndingIndex.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException">startIndex is negative or count is not positive.</exception>
/// <exception cref="ArgumentNullException">seqPointer is null.</exception>
public ISequence ParseRange(int startIndex, int count, SequencePointer seqPointer)
{
    if (0 > startIndex)
    {
        throw new ArgumentOutOfRangeException("startIndex");
    }

    if (0 >= count)
    {
        throw new ArgumentOutOfRangeException("count");
    }

    // Fail with a descriptive exception instead of a NullReferenceException
    // raised from inside the alphabet lookup lambda below.
    if (seqPointer == null)
    {
        throw new ArgumentNullException("seqPointer");
    }

    IAlphabet alphabet = Alphabets.All.Single(A => A.Name.Equals(seqPointer.AlphabetName));

    // Compute the absolute start in 64-bit arithmetic: narrowing StartingIndex
    // to int, or adding in int, can truncate or wrap and defeat the bounds check.
    long start = (long)seqPointer.StartingIndex + startIndex;
    if (start >= seqPointer.EndingIndex)
    {
        return null;
    }

    Sequence sequence = new Sequence(alphabet);
    sequence.IsReadOnly = false;

    // Offset added to StartingIndex when addressing the file: one newline
    // length per StartingLine.
    int includesNewline = seqPointer.StartingLine * Environment.NewLine.Length;
    int len = (int)(seqPointer.EndingIndex - seqPointer.StartingIndex);

    using (BioTextReader bioReader = new BioTextReader(_fileName))
    {
        string sequenceString = bioReader.ReadBlock(startIndex, seqPointer.StartingIndex + includesNewline, count, len);
        sequence.InsertRange(0, sequenceString);
    }

    // default for partial load
    sequence.IsReadOnly = true;

    return sequence;
}
/// <summary>
/// Parses a SAM file with data virtualization enforced, re-reads one aligned
/// sequence through a sequence pointer (offsets 156 to 304), and checks its
/// query sequence against the expected value from the test configuration.
/// </summary>
public void ValidateSAMParseAlignedSeqWithSeqPointer()
{
    // Expected result, file path and pointer line come from the XML test configuration.
    string expectedQuery = Utility._xmlUtil.GetTextValue(
        Constants.SAMFileWithAllFieldsNode, Constants.ExpectedSeqWithPointersNode);
    string samFilePath = Utility._xmlUtil.GetTextValue(
        Constants.SAMFileWithAllFieldsNode, Constants.FilePathNode);
    string pointerLineText = Utility._xmlUtil.GetTextValue(
        Constants.SAMFileWithAllFieldsNode, Constants.LineNumberToPointNode);

    SAMParser samParser = new SAMParser();
    samParser.EnforceDataVirtualization = true;

    SequenceAlignmentMap alignmentMap = samParser.Parse(samFilePath);
    Assert.IsNotNull(alignmentMap);

    // Build the pointer that addresses the aligned sequence under test.
    int pointerLine = Int32.Parse(pointerLineText);
    SequencePointer pointerObj = GetSequencePointer(pointerLine);
    pointerObj.IndexOffsets[0] = 156;
    pointerObj.IndexOffsets[1] = 304;

    // Read the aligned sequence back through the pointer.
    SAMAlignedSequence alignedSeq =
        (SAMAlignedSequence)samParser.ParseAlignedSequence(pointerObj);

    Assert.AreEqual(expectedQuery, alignedSeq.QuerySequence.ToString());

    Console.WriteLine(string.Format(null,
        "SAM Parser BVT : Sequence alignment aligned seq {0} validate successfully",
        alignedSeq.Sequences[0].ToString()));
    ApplicationLog.WriteLine(string.Format(null,
        "SAM Parser BVT : Sequence alignment aligned seq validate successfully"));
}
/// <summary>
/// Parses a range of sequence items starting from the specified index in the sequence.
/// </summary>
/// <param name="startIndex">The zero-based index at which to begin parsing.</param>
/// <param name="count">The number of symbols to parse.</param>
/// <param name="seqPointer">The sequence pointer of the specified sequence.</param>
/// <returns>A read-only sequence holding the parsed symbols.</returns>
/// <exception cref="NotSupportedException">No input file name has been set.</exception>
/// <exception cref="ArgumentOutOfRangeException">startIndex is negative or count is not positive.</exception>
/// <exception cref="ArgumentNullException">seqPointer is null.</exception>
public ISequence ParseRange(int startIndex, int count, SequencePointer seqPointer)
{
    if (string.IsNullOrEmpty(_fileName))
    {
        throw new NotSupportedException(Resource.DataVirtualizationNeedsInputFile);
    }

    if (startIndex < 0)
    {
        throw new ArgumentOutOfRangeException("startIndex");
    }

    if (count <= 0)
    {
        throw new ArgumentOutOfRangeException("count");
    }

    // Fail with a descriptive exception instead of a NullReferenceException
    // raised from inside the alphabet lookup lambda below.
    if (seqPointer == null)
    {
        throw new ArgumentNullException("seqPointer");
    }

    IAlphabet alphabet = Alphabets.All.Single(A => A.Name.Equals(seqPointer.AlphabetName));
    Sequence sequence = new Sequence(alphabet) { IsReadOnly = false };

    // (Re)open the shared reader when it is missing or no longer readable.
    if (_mbfStreamReader == null || !_mbfStreamReader.CanRead)
    {
        _mbfStreamReader = new MBFStreamReader(_fileName);
    }

    long fileIndex = startIndex + seqPointer.IndexOffsets[0];
    char[] buffer = _mbfStreamReader.ReadChars(fileIndex, count);
    sequence.InsertRange(0, new string(buffer));

    // default for partial load
    sequence.IsReadOnly = true;

    return sequence;
}
/// <summary>
/// Finds the ID of the sequence addressed by the given pointer by scanning
/// backwards from the sequence start to the nearest '@' byte, stores the rest
/// of that line on the pointer and returns it.
/// </summary>
/// <param name="pointer">
/// Sequence pointer of the sequence whose ID is wanted. Its Id property is
/// overwritten with the value read from the file.
/// </param>
/// <returns>The sequence ID read from the file.</returns>
/// <exception cref="ArgumentNullException">pointer is null.</exception>
public string GetSequenceID(SequencePointer pointer)
{
    if (pointer == null)
    {
        throw new ArgumentNullException("pointer");
    }

    using (StreamReader sourceReader = new StreamReader(_fileName))
    {
        Stream source = sourceReader.BaseStream;

        // Offset added to the sequence start: one newline length per StartingLine.
        int newlineOffset = pointer.StartingLine * Environment.NewLine.Length;
        source.Seek(pointer.IndexOffsets[0] + newlineOffset, SeekOrigin.Begin);

        // Walk backwards one byte at a time (step back two, read one) until the
        // byte just read is '@'; the stream is then positioned right after it.
        do
        {
            source.Seek(-2, SeekOrigin.Current);
        }
        while (source.ReadByte() != '@');

        pointer.Id = sourceReader.ReadLine();
        return pointer.Id;
    }
}
/// <summary>
/// Reads one four-line FASTQ record (header, sequence, quality header, quality
/// scores) from the reader and turns it into a QualitativeSequence backed by a
/// file-virtual data provider.
/// </summary>
/// <param name="mbfReader">A reader for a biological sequence text.</param>
/// <param name="isReadOnly">
/// Value assigned to the IsReadOnly property of the resulting sequence.
/// </param>
/// <returns>A new QualitativeSequence instance containing parsed data.</returns>
/// <exception cref="FileFormatException">
/// Any of the four record lines is missing or malformed, the quality score
/// length differs from the sequence length, or no alphabet matches the sequence.
/// </exception>
private IQualitativeSequence ParseOneWithFastQFormat(MBFStreamReader mbfReader, bool isReadOnly)
{
    SequencePointer sequencePointer = new SequencePointer();

    // Line 1: header, must start with '@'.
    bool hasHeader = mbfReader.HasLines
        && mbfReader.Line.StartsWith("@", StringComparison.Ordinal);
    if (!hasHeader)
    {
        string invalidFileMessage = string.Format(
            CultureInfo.CurrentCulture, Resource.INVALID_INPUT_FILE, Name);
        Trace.Report(invalidFileMessage);
        throw new FileFormatException(invalidFileMessage);
    }

    string id = mbfReader.GetLineField(2).Trim();

    // Remember where the sequence data starts.
    sequencePointer.IndexOffsets[0] = mbfReader.Position;

    // Line 2: the sequence itself, must not be empty.
    mbfReader.GoToNextLine();
    if (!mbfReader.HasLines || string.IsNullOrEmpty(mbfReader.Line))
    {
        throw CreateFastQFormatException(Resource.FastQ_InvalidSequenceLine, id);
    }

    string sequenceLine = mbfReader.Line;

    // Remember where the sequence data ends.
    sequencePointer.IndexOffsets[1] = sequencePointer.IndexOffsets[0] + sequenceLine.Length;

    // Line 3: quality score header, must start with '+'.
    mbfReader.GoToNextLine();
    if (!mbfReader.HasLines || !mbfReader.Line.StartsWith("+", StringComparison.Ordinal))
    {
        throw CreateFastQFormatException(Resource.FastQ_InvalidQualityScoreHeaderLine, id);
    }

    // When the quality header repeats an ID, it has to match the record ID.
    string qualScoreId = mbfReader.GetLineField(2).Trim();
    if (!string.IsNullOrEmpty(qualScoreId) && !id.Equals(qualScoreId))
    {
        throw CreateFastQFormatException(Resource.FastQ_InvalidQualityScoreHeaderData, id);
    }

    // Line 4: quality scores, must not be empty.
    mbfReader.GoToNextLine();
    if (!mbfReader.HasLines || string.IsNullOrEmpty(mbfReader.Line))
    {
        throw CreateFastQFormatException(Resource.FastQ_EmptyQualityScoreLine, id);
    }

    string qualityLine = mbfReader.Line;
    byte[] qualScores = ASCIIEncoding.ASCII.GetBytes(qualityLine);

    // One quality score is required per sequence symbol.
    if (sequenceLine.Length != qualityLine.Length)
    {
        throw CreateFastQFormatException(Resource.FastQ_InvalidQualityScoresLength, id);
    }

    // Leave the reader on the line after this record.
    mbfReader.GoToNextLine();

    // Use the configured alphabet, or infer one from the sequence line.
    IAlphabet alphabet = Alphabet;
    if (alphabet == null)
    {
        alphabet = _commonSequenceParser.IdentifyAlphabet(alphabet, sequenceLine);
        if (alphabet == null)
        {
            throw CreateFastQFormatException(Resource.InvalidSymbolInString, sequenceLine);
        }
    }

    // Use the configured FASTQ format type unless auto-detection is enabled.
    FastQFormatType fastQType = AutoDetectFastQFormat
        ? IdentifyFastQFormatType(qualScores)
        : FastqType;

    QualitativeSequence sequence = Encoding == null
        ? new QualitativeSequence(alphabet, fastQType, sequenceLine, qualScores)
        : new QualitativeSequence(alphabet, fastQType, Encoding, sequenceLine, qualScores);

    sequence.ID = id;
    sequence.IsReadOnly = isReadOnly;

    sequencePointer.AlphabetName = sequence.Alphabet.Name;
    sequencePointer.Id = sequence.ID;
    _sequencePointers.Add(sequencePointer);

    // Attach a provider so the sequence data can be read from the file through the pointer.
    sequence.VirtualQualitativeSequenceProvider =
        new FileVirtualQualitativeSequenceProvider(this, sequencePointer)
        {
            BlockSize = _blockSize,
            MaxNumberOfBlocks = _maxNumberOfBlocks
        };

    return sequence;
}

/// <summary>
/// Formats a detail message, wraps it in the parser's standard I/O format error
/// message, reports it to the trace and returns the exception for the caller to throw.
/// </summary>
/// <param name="detailFormat">Format string of the detail message.</param>
/// <param name="detailArgument">Single argument substituted into detailFormat.</param>
/// <returns>A FileFormatException carrying the composed message.</returns>
private FileFormatException CreateFastQFormatException(string detailFormat, string detailArgument)
{
    string detail = string.Format(CultureInfo.CurrentCulture, detailFormat, detailArgument);
    string composed = string.Format(
        CultureInfo.CurrentCulture, Resource.IOFormatErrorMessage, Name, detail);
    Trace.Report(composed);
    return new FileFormatException(composed);
}
/// <summary>
/// Reads one FASTA record (a '>' header line followed by sequence lines) from
/// the reader. In full-load mode the symbols are loaded into the returned
/// sequence; otherwise only line counters and offsets are tracked and the
/// sequence is given a file-virtual data provider.
/// </summary>
/// <param name="bioReader">Bio text reader positioned on the record's header line.</param>
/// <param name="isReadOnly">
/// Value assigned to the IsReadOnly property of the resulting sequence.
/// </param>
/// <returns>A new Sequence instance containing parsed data.</returns>
/// <exception cref="ArgumentNullException">bioReader is null.</exception>
/// <exception cref="FileFormatException">
/// The current line does not start with '>' or a line contains symbols that
/// match no known alphabet.
/// </exception>
protected ISequence ParseOneWithSpecificFormat(BioTextReader bioReader, bool isReadOnly)
{
    if (bioReader == null)
    {
        throw new ArgumentNullException("bioReader");
    }

    string message;
    bool startsWithHeader = bioReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase);
    if (!startsWithHeader)
    {
        message = string.Format(
            CultureInfo.InvariantCulture, Resource.INVAILD_INPUT_FILE, Resource.FASTA_NAME);
        Trace.Report(message);
        throw new FileFormatException(message);
    }

    // Header line: field 2 holds the sequence ID.
    string sequenceId = bioReader.GetLineField(2).Trim();

    // When virtualizing, count the header line and start a pointer for this record.
    SequencePointer sequencePointer = null;
    if (_blockSize > FileLoadHelper.DefaultFullLoadBlockSize)
    {
        _lineCount++;
        _lineLength += bioReader.Line.Length;
        sequencePointer = new SequencePointer { StartingLine = _lineCount };
    }

    bioReader.GoToNextLine();

    // Use the configured alphabet, or infer one from the first sequence line.
    IAlphabet alphabet = Alphabet;
    if (alphabet == null)
    {
        alphabet = _commonSequenceParser.IdentifyAlphabet(alphabet, bioReader.Line);
        if (alphabet == null)
        {
            message = string.Format(
                CultureInfo.InvariantCulture, Resource.InvalidSymbolInString, bioReader.Line);
            Trace.Report(message);
            throw new FileFormatException(message);
        }
    }

    Sequence sequence = Encoding == null
        ? new Sequence(alphabet)
        : new Sequence(alphabet, Encoding, string.Empty) { IsReadOnly = false };

    bool startOffsetRecorded = false;
    sequence.ID = sequenceId;

    // Consume sequence lines until the next header or the end of the input.
    while (bioReader.HasLines
        && !bioReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
    {
        if (Alphabet == null)
        {
            // Re-identify per line; a later line may need a different alphabet.
            alphabet = _commonSequenceParser.IdentifyAlphabet(sequence.Alphabet, bioReader.Line);
            if (alphabet == null)
            {
                message = string.Format(
                    CultureInfo.InvariantCulture, Resource.InvalidSymbolInString, bioReader.Line);
                Trace.Report(message);
                throw new FileFormatException(message);
            }

            if (sequence.Alphabet != alphabet)
            {
                // Rebuild the sequence under the newly identified alphabet.
                Sequence rebuilt = new Sequence(alphabet, Encoding, sequence) { IsReadOnly = false };
                sequence.Clear();
                sequence = rebuilt;
            }
        }

        if (_blockSize <= 0)
        {
            // Full load: append the symbols of this line.
            sequence.InsertRange(sequence.Count, bioReader.Line);
        }
        else
        {
            // Virtualized: only track where the data begins and how far it extends.
            if (!startOffsetRecorded)
            {
                _sequenceBeginsAt = _lineLength;
                startOffsetRecorded = true;
            }

            _lineLength += bioReader.Line.Length;
            _lineCount++;
        }

        bioReader.GoToNextLine();
    }

    if (sequence.MoleculeType == MoleculeType.Invalid)
    {
        sequence.MoleculeType = CommonSequenceParser.GetMoleculeType(sequence.Alphabet);
    }

    sequence.IsReadOnly = isReadOnly;

    // Full load: the sequence already holds its data, no provider needed.
    if (_blockSize == FileLoadHelper.DefaultFullLoadBlockSize)
    {
        return sequence;
    }

    if (sequencePointer != null)
    {
        sequencePointer.AlphabetName = sequence.Alphabet.Name;
        sequencePointer.Id = sequence.ID;
        sequencePointer.StartingIndex = _sequenceBeginsAt;
        sequencePointer.EndingIndex = _lineLength;
        _sequencePointers.Add(sequencePointer);
    }

    _sequenceCount++;

    sequence.VirtualSequenceProvider = new FileVirtualSequenceProvider(this, sequencePointer)
    {
        BlockSize = _blockSize,
        MaxNumberOfBlocks = _maxNumberOfBlocks
    };

    return sequence;
}
/// <summary>
/// Parses a single FASTA sequence from a file using MBFStreamReader.
/// This method is only used in data virtualization scenarios.
/// No symbols are loaded into the returned sequence here; the method records the
/// sequence's start and end offsets in a SequencePointer, writes block indices
/// and the pointer to the sidecar file when a sidecar provider is present, and
/// attaches a FileVirtualSequenceProvider to the sequence.
/// </summary>
/// <param name="mbfReader">The MBFStreamReader of the file to be parsed.</param>
/// <param name="isReadOnly">
/// Flag to indicate whether the resulting sequence should be in read-only mode.
/// If this flag is set to true then the resulting sequence's IsReadOnly property
/// will be set to true, otherwise it will be set to false.
/// </param>
/// <returns>The parsed sequence.</returns>
/// <exception cref="ArgumentNullException">mbfReader is null.</exception>
/// <exception cref="FileFormatException">
/// The current line does not start with '>' or a line contains symbols that
/// match no known alphabet.
/// </exception>
protected ISequence ParseOneWithSpecificFormat(MBFStreamReader mbfReader, bool isReadOnly)
{
    SequencePointer sequencePointer = new SequencePointer();

    if (mbfReader == null)
    {
        throw new ArgumentNullException("mbfReader");
    }

    string message;

    // A FASTA record must begin with a '>' header line.
    if (!mbfReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
    {
        message = string.Format(CultureInfo.InvariantCulture, Resource.INVALID_INPUT_FILE, Resource.FASTA_NAME);
        Trace.Report(message);
        throw new FileFormatException(message);
    }

    // Process header line.
    Sequence sequence;
    string id = mbfReader.GetLineField(2).Trim();

    // save initial start and end indices
    // StartingLine stores the distance between the reader position and the start
    // of the header line (presumably the header length including its newline --
    // TODO confirm against MBFStreamReader.Position semantics).
    sequencePointer.StartingLine = (int)(mbfReader.Position - mbfReader.CurrentLineStartingIndex);
    sequencePointer.IndexOffsets[0] = mbfReader.Position;
    sequencePointer.IndexOffsets[1] = mbfReader.Position;

    mbfReader.GoToNextLine();

    // Use the configured alphabet, or identify one from the first sequence line.
    IAlphabet alphabet = Alphabet;
    if (alphabet == null)
    {
        alphabet = _commonSequenceParser.IdentifyAlphabet(alphabet, mbfReader.Line);
        if (alphabet == null)
        {
            message = string.Format(CultureInfo.InvariantCulture, Resource.InvalidSymbolInString, mbfReader.Line);
            Trace.Report(message);
            throw new FileFormatException(message);
        }
    }

    if (Encoding == null)
    {
        sequence = new Sequence(alphabet);
    }
    else
    {
        sequence = new Sequence(alphabet, Encoding, string.Empty) { IsReadOnly = false };
    }

    // Block-index bookkeeping, used only when a sidecar provider is present:
    //   currentBlockSize         - characters (symbols plus newline characters) counted into the current block
    //   symbolCount              - running symbol total; starts at -1, so it presumably ends up as the
    //                              zero-based index of the last symbol in a block (TODO confirm)
    //   prenewLineCharacterCount - newline characters carried over into the next block
    int currentBlockSize = 0;
    int symbolCount = -1;
    int newLineCharacterCount = mbfReader.NewLineCharacterCount;
    int prenewLineCharacterCount = 0;
    int lineLength = mbfReader.Line.Length;

    sequence.ID = id;

    // Consume sequence lines until the next header or the end of the input.
    while (mbfReader.HasLines && !mbfReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
    {
        // Extend the end offset by the symbols on this line (newlines are not added).
        sequencePointer.IndexOffsets[1] += mbfReader.Line.Length;

        if (Alphabet == null)
        {
            // Re-identify per line; a later line may need a different alphabet.
            alphabet = _commonSequenceParser.IdentifyAlphabet(sequence.Alphabet, mbfReader.Line);
            if (alphabet == null)
            {
                message = string.Format(CultureInfo.InvariantCulture, Resource.InvalidSymbolInString, mbfReader.Line);
                Trace.Report(message);
                throw new FileFormatException(message);
            }

            if (sequence.Alphabet != alphabet)
            {
                // Rebuild the sequence under the newly identified alphabet.
                Sequence seq = new Sequence(alphabet, Encoding, sequence) { IsReadOnly = false };
                sequence.Clear();
                sequence = seq;
            }
        }

        newLineCharacterCount = mbfReader.NewLineCharacterCount;
        lineLength = mbfReader.Line.Length;

        // Distribute this line's characters over blocks of _blockSize characters.
        // The loop is skipped entirely when there is no sidecar provider.
        while (lineLength != 0 && _sidecarFileProvider != null)
        {
            if (lineLength + currentBlockSize + newLineCharacterCount <= _blockSize)
            {
                // The whole line and its newline fit into the current block.
                symbolCount += lineLength;
                currentBlockSize += lineLength + newLineCharacterCount;
                lineLength = 0;
            }
            else
            {
                // Fill the current block with as much of the line as fits.
                symbolCount += _blockSize - currentBlockSize;
                lineLength = lineLength - (_blockSize - currentBlockSize);
                if (lineLength <= 0)
                {
                    // The line's symbols fit but its newline did not (lineLength is zero or
                    // negative here): correct symbolCount and carry the remaining newline
                    // characters over to the next block.
                    symbolCount += lineLength;
                    prenewLineCharacterCount = newLineCharacterCount + lineLength;
                    lineLength = 0;
                }

                currentBlockSize = _blockSize;
            }

            if (currentBlockSize == _blockSize)
            {
                // write to file.
                _sidecarFileProvider.WriteBlockIndex(symbolCount);

                // The next block starts with any carried-over newline characters.
                currentBlockSize = prenewLineCharacterCount;
                prenewLineCharacterCount = 0;
            }
        }

        mbfReader.GoToNextLine();
    }

    if (_sidecarFileProvider != null)
    {
        // Write a final index for a trailing partial block when the sequence spans
        // more than one block; otherwise write 0.
        if (sequencePointer.IndexOffsets[1] - sequencePointer.IndexOffsets[0] > _blockSize
            && currentBlockSize - newLineCharacterCount > 0)
        {
            _sidecarFileProvider.WriteBlockIndex(symbolCount);
        }
        else
        {
            _sidecarFileProvider.WriteBlockIndex(0);
        }
    }

    if (sequence.MoleculeType == MoleculeType.Invalid)
    {
        sequence.MoleculeType = CommonSequenceParser.GetMoleculeType(sequence.Alphabet);
    }

    sequence.IsReadOnly = isReadOnly;

    sequencePointer.AlphabetName = sequence.Alphabet.Name;
    sequencePointer.Id = sequence.ID;

    if (_sidecarFileProvider != null)
    {
        // Write each sequence pointer to the sidecar file immediately
        _sidecarFileProvider.WritePointer(sequencePointer);
    }

    // Attach a provider built from this parser and the pointer.
    FileVirtualSequenceProvider dataprovider =
        new FileVirtualSequenceProvider(this, sequencePointer)
        {
            BlockSize = _blockSize,
            MaxNumberOfBlocks = _maxNumberOfBlocks
        };
    sequence.VirtualSequenceProvider = dataprovider;

    return(sequence);
}