/// <summary>
/// Indexes the records of a SAM file: for every line up to (but not including) the next line
/// that starts with '@', a sequence pointer holding the line's start and end offsets is written
/// to the sidecar file. Only does work when data virtualization is enabled and the sidecar
/// file exists.
/// </summary>
/// <param name="mbfReader">A reader for the sequence alignment text.</param>
private void ParseSequences(MBFStreamReader mbfReader)
{
    // Nothing to do unless DV is enabled and there is a sidecar file to write into.
    if (!IsDataVirtualizationEnabled || !_sidecarFileProvider.SidecarFileExists)
    {
        return;
    }

    try
    {
        while (true)
        {
            // Stop at the end of the input or at the first line beginning with '@'.
            if (!mbfReader.HasLines
                || mbfReader.Line.StartsWith(@"@", StringComparison.OrdinalIgnoreCase))
            {
                break;
            }

            SequencePointer recordPointer = new SequencePointer();
            recordPointer.AlphabetName = Alphabets.DNA.Name;

            // [0] = offset where the current line starts, [1] = start offset plus line length.
            recordPointer.IndexOffsets[0] = mbfReader.CurrentLineStartingIndex;
            recordPointer.IndexOffsets[1] = mbfReader.CurrentLineStartingIndex + mbfReader.Line.Length;

            // Persist the pointer right away instead of collecting pointers in memory.
            _sidecarFileProvider.WritePointer(recordPointer);

            mbfReader.GoToNextLine();
            _lineCount++;
        }

        _sidecarFileProvider.Close();
    }
    catch (Exception)
    {
        // Any failure while indexing is suppressed; the sidecar provider is asked to clean up.
        _sidecarFileProvider.Cleanup();
    }
}
/// <summary>
/// Validates the properties of an MBFStreamReader opened (with the skip-blank-lines flag set)
/// on the file configured under the SimpleFastAStreamReader node, comparing them with the
/// expected values stored in the same node.
/// </summary>
public void ValidateMBFStreamReaderProperties()
{
    // Get values from xml
    string filePath = Utility._xmlUtil.GetTextValue(
        Constants.SimpleFastAStreamReaderNode, Constants.FilePathNode);
    string newLineCharsCount = Utility._xmlUtil.GetTextValue(
        Constants.SimpleFastAStreamReaderNode, Constants.NewLineCharacterCountNode);
    string pos = Utility._xmlUtil.GetTextValue(
        Constants.SimpleFastAStreamReaderNode, Constants.PositionNode);
    string startingIndex = Utility._xmlUtil.GetTextValue(
        Constants.SimpleFastAStreamReaderNode, Constants.CurrentLineStartingIndexNode);

    // 'using' guarantees the reader is disposed even when one of the assertions below fails.
    using (MBFStreamReader streamReader = new MBFStreamReader(filePath, true))
    {
        // Validate Properties
        Assert.IsTrue(streamReader.CanRead);
        Assert.IsTrue(streamReader.SkipBlankLines);
        Assert.IsTrue(streamReader.HasLines);
        Assert.AreEqual(newLineCharsCount, streamReader.NewLineCharacterCount.ToString());
        Assert.AreEqual(pos, streamReader.Position.ToString());
        Assert.AreEqual(startingIndex, streamReader.CurrentLineStartingIndex.ToString());

        Console.WriteLine("Validated the StreamReader properties successfully");
        ApplicationLog.WriteLine("Validated the StreamReader properties successfully");
    }
}
/// <summary>
/// Checks that GetLineField on the first line of the configured file returns the expected
/// substring.
/// </summary>
/// <param name="nodeName">Name of the xml node holding the test data for this test case.</param>
/// <param name="IsStartAndEndIndex">True to request the field between the start and end index;
/// false to request the field from the start index only.</param>
void ValidateSubString(string nodeName, bool IsStartAndEndIndex)
{
    // Test data from the xml configuration.
    string inputPath = _utilityObj._xmlUtil.GetTextValue(nodeName, Constants.FilePathNode);
    string expectedField = _utilityObj._xmlUtil.GetTextValue(nodeName, Constants.ExpectedString);
    string startText = _utilityObj._xmlUtil.GetTextValue(nodeName, Constants.StartIndexNode);
    string endText = _utilityObj._xmlUtil.GetTextValue(nodeName, Constants.EndIndexNode);

    using (MBFStreamReader reader = new MBFStreamReader(inputPath))
    {
        int start = Int32.Parse(startText, (IFormatProvider)null);

        // The end index is parsed only when it is actually needed.
        string actualField = IsStartAndEndIndex
            ? reader.GetLineField(start, Int32.Parse(endText, (IFormatProvider)null))
            : reader.GetLineField(start);

        // Validate sub string of a line.
        Assert.AreEqual(expectedField, actualField);
        Console.WriteLine("The expected substring is {0}", actualField);
        ApplicationLog.WriteLine("Validated the substring successfully");
    }
}
/// <summary>
/// Reads the line that starts at the pointer's first index offset and parses it into an
/// aligned sequence.
/// </summary>
/// <param name="pointer">
/// Sequence pointer whose IndexOffsets locate the sequence in the input file.
/// </param>
/// <returns>
/// The parsed IAlignedSequence, or null when the pointer's start offset is not smaller than
/// its end offset.
/// </returns>
/// <exception cref="ArgumentNullException">pointer is null.</exception>
/// <exception cref="NotSupportedException">No input file name is known to this parser.</exception>
public IAlignedSequence ParseAlignedSequence(SequencePointer pointer)
{
    if (pointer == null)
    {
        throw new ArgumentNullException("pointer");
    }

    if (string.IsNullOrEmpty(_fileName))
    {
        throw new NotSupportedException(Resource.DataVirtualizationNeedsInputFile);
    }

    // An empty or inverted offset range has nothing to parse.
    if (pointer.IndexOffsets[0] >= pointer.IndexOffsets[1])
    {
        return null;
    }

    // Re-open the file when there is no reader yet or the current one can no longer read.
    bool readerUsable = _mbfStreamReader != null && _mbfStreamReader.CanRead;
    if (!readerUsable)
    {
        _mbfStreamReader = new MBFStreamReader(_fileName);
    }

    _mbfStreamReader.Seek(pointer.IndexOffsets[0], SeekOrigin.Begin);
    string alignmentLine = _mbfStreamReader.ReadLine();

    return ParseSequence(alignmentLine, _isReadOnly);
}
/// <summary>
/// Verifies that an MBFStreamReader created from a file name exposes that file name and has
/// the file's first line available immediately after construction.
/// </summary>
public void TestMBFTextReaderConstructors()
{
    string testFileFullName = @"TestUtils\Fasta\uniprot-dutpase.fasta";

    // The reader's current line must match the first line read by a plain StreamReader.
    using (StreamReader plainReader = new StreamReader(testFileFullName))
    using (MBFStreamReader mbfReader = new MBFStreamReader(testFileFullName))
    {
        Assert.AreEqual(testFileFullName, mbfReader.FileName);
        Assert.AreEqual(plainReader.ReadLine(), mbfReader.Line);
    }

    // MBFStreamReader(string) should read first line and set the Filename property.
    using (MBFStreamReader mbfReader = new MBFStreamReader(testFileFullName))
    {
        Assert.AreEqual(testFileFullName, mbfReader.FileName);
    }

    // NOTE(review): the FileStream is opened but the reader below is still created from the
    // file name, not from the stream - confirm whether the stream constructor was intended.
    using (Stream fileStream = new FileStream(testFileFullName, FileMode.Open, FileAccess.Read))
    using (MBFStreamReader mbfReader = new MBFStreamReader(testFileFullName))
    {
        Assert.AreEqual(testFileFullName, mbfReader.FileName);
    }

    using (MBFStreamReader mbfReader = new MBFStreamReader(testFileFullName))
    {
        Assert.AreEqual(testFileFullName, mbfReader.FileName);
    }
}
/// <summary>
/// Validates the characters returned by MBFStreamReader.ReadChars against the reader's
/// current line.
/// </summary>
/// <param name="nodeName">Name of the xml node holding the test data for this test case.</param>
private void ValidateChars(string nodeName)
{
    // Get values from xml
    string filePath = Utility._xmlUtil.GetTextValue(
        nodeName, Constants.FilePathNode);
    string startIndex = Utility._xmlUtil.GetTextValue(
        nodeName, Constants.CharsStartIndexNode);
    string count = Utility._xmlUtil.GetTextValue(
        nodeName, Constants.CharsCountNode);

    // 'using' guarantees the reader is disposed even when an assertion below fails.
    using (MBFStreamReader streamReader = new MBFStreamReader(filePath))
    {
        string currentLine = streamReader.Line;
        char[] charsArray = streamReader.ReadChars(Int32.Parse(startIndex),
            Int32.Parse(count));

        // Validate array: each character read is compared with the character at the same
        // index of the current line.
        for (int i = 0; i < charsArray.Length; i++)
        {
            Assert.AreEqual(currentLine[i], charsArray[i]);
            Console.WriteLine("Validated the char {0} successfully", charsArray[i]);
            ApplicationLog.WriteLine("Validated the char successfully");
        }
    }
}
/// <summary>
/// Reads a run of bytes from the input file, starting startIndex positions after the
/// sequence's first index offset.
/// </summary>
/// <param name="startIndex">The zero-based index, relative to the sequence start, at which to begin.</param>
/// <param name="count">The number of symbols to read; must be greater than zero.</param>
/// <param name="seqPointer">The sequence pointer of the sequence to read from.</param>
/// <returns>The bytes returned by the underlying reader for the requested range.</returns>
/// <exception cref="NotSupportedException">No input file name is known to this parser.</exception>
/// <exception cref="ArgumentNullException">seqPointer is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">startIndex is negative or count is not positive.</exception>
public byte[] ParseRange(int startIndex, int count, SequencePointer seqPointer)
{
    if (string.IsNullOrEmpty(_fileName))
    {
        throw new NotSupportedException(Resource.DataVirtualizationNeedsInputFile);
    }

    if (seqPointer == null)
    {
        throw new ArgumentNullException("seqPointer");
    }

    if (0 > startIndex)
    {
        throw new ArgumentOutOfRangeException("startIndex");
    }

    if (0 >= count)
    {
        throw new ArgumentOutOfRangeException("count");
    }

    // Re-open the file when there is no reader yet or the current one can no longer read.
    bool readerUsable = _mbfStreamReader != null && _mbfStreamReader.CanRead;
    if (!readerUsable)
    {
        _mbfStreamReader = new MBFStreamReader(_fileName);
    }

    // Translate the sequence-relative index into a position in the file.
    long absoluteOffset = startIndex + seqPointer.IndexOffsets[0];
    return _mbfStreamReader.ReadBytes(absoluteOffset, count);
}
/// <summary>
/// Reads characters through MBFStreamReader.ReadChars and checks each of them against the
/// character at the same index of the reader's current line.
/// </summary>
/// <param name="nodeName">Name of the xml node holding the test data for this test case.</param>
void ValidateChars(string nodeName)
{
    // Test data from the xml configuration.
    string inputPath = _utilityObj._xmlUtil.GetTextValue(nodeName, Constants.FilePathNode);
    string startText = _utilityObj._xmlUtil.GetTextValue(nodeName, Constants.CharsStartIndexNode);
    string countText = _utilityObj._xmlUtil.GetTextValue(nodeName, Constants.CharsCountNode);

    using (MBFStreamReader reader = new MBFStreamReader(inputPath))
    {
        string firstLine = reader.Line;
        int start = Int32.Parse(startText, (IFormatProvider)null);
        int length = Int32.Parse(countText, (IFormatProvider)null);
        char[] readChars = reader.ReadChars(start, length);

        // Validate array.
        int position = 0;
        foreach (char actual in readChars)
        {
            Assert.AreEqual(firstLine[position], actual);
            Console.WriteLine("Validated the char {0} successfully", actual);
            ApplicationLog.WriteLine("Validated the char successfully");
            position++;
        }
    }
}
/// <summary>
/// Validates that an MBFStreamReader, created in the way selected by inputType, yields the
/// expected lines in order.
/// </summary>
/// <param name="nodeName">Name of the xml node holding the test data for this test case.</param>
/// <param name="inputType">Selects which MBFStreamReader constructor is exercised
/// (file name or stream, with or without the skip-blank-lines flag).</param>
void ValidateMBFStreamReader(string nodeName,
    StreamReaderInputType inputType)
{
    // Get values from xml
    string filePath = _utilityObj._xmlUtil.GetTextValue(
        nodeName, Constants.FilePathNode);
    string[] expectedOutput = _utilityObj._xmlUtil.GetTextValues(
        nodeName, Constants.ExpectedLinesNode);

    // The backing stream (stream-based cases only) must stay open for as long as the reader
    // is used, so it is disposed in the finally block rather than right after construction.
    Stream stream = null;
    MBFStreamReader streamReader = null;
    try
    {
        // Read Fasta file.
        switch (inputType)
        {
            case StreamReaderInputType.FileName:
                streamReader = new MBFStreamReader(filePath);
                break;
            case StreamReaderInputType.FileNameWithSkipBlankLines:
                streamReader = new MBFStreamReader(filePath, true);
                break;
            case StreamReaderInputType.Stream:
                stream = new FileStream(filePath, FileMode.Open, FileAccess.ReadWrite);
                streamReader = new MBFStreamReader(stream);
                break;
            case StreamReaderInputType.StreamWithSkipBlankLines:
                stream = new FileStream(filePath, FileMode.Open, FileAccess.ReadWrite);
                streamReader = new MBFStreamReader(stream, true);
                break;
            default:
                // Fail with a clear message instead of a NullReferenceException further down.
                throw new ArgumentOutOfRangeException("inputType");
        }

        for (int i = 0; i < expectedOutput.Length; i++)
        {
            Assert.AreEqual(expectedOutput[i], streamReader.Line);
            Console.WriteLine("Validated the line {0} successfully", streamReader.Line);
            ApplicationLog.WriteLine("Validated the MBF StreamReader successfully");

            // Move to next line
            streamReader.GoToNextLine();
        }
    }
    finally
    {
        if (streamReader != null)
        {
            streamReader.Dispose();
        }

        // Disposing an already-disposed stream is a no-op, so this is safe whether or not
        // the reader closed the stream itself.
        if (stream != null)
        {
            stream.Dispose();
        }
    }
}
/// <summary>
/// Parses the input file with data virtualization and returns a virtual list of qualitative
/// sequences backed by a sidecar file. An existing valid sidecar is reused; otherwise the file
/// is parsed and a new sidecar is created from the collected sequence pointers.
/// </summary>
/// <param name="isReadOnly">Flag to indicate whether the sequences returned should be set to readonly or not.</param>
/// <returns>
/// The virtual sequence list, or null when the new sidecar file does not exist or an
/// exception occurs while parsing or creating the sidecar.
/// </returns>
private VirtualQualitativeSequenceList ParseWithDV(bool isReadOnly)
{
    SidecarFileProvider sidecarFileProvider = new SidecarFileProvider(_fileName);
    sidecarFileProvider.Close();

    // Reuse the sidecar when a valid one already exists.
    if (sidecarFileProvider.IsSidecarValid)
    {
        VirtualQualitativeSequenceList existingList =
            new VirtualQualitativeSequenceList(sidecarFileProvider, this, sidecarFileProvider.Count);
        existingList.CreateSequenceAsReadOnly = isReadOnly;
        return existingList;
    }

    // Otherwise build a new sidecar by parsing the whole file.
    using (sidecarFileProvider = new SidecarFileProvider(_fileName, true))
    using (_mbfStreamReader = new MBFStreamReader(_fileName))
    {
        if (!sidecarFileProvider.SidecarFileExists)
        {
            return null;
        }

        try
        {
            // Parsing records a sequence pointer per sequence; the parsed objects are discarded.
            while (_mbfStreamReader.HasLines)
            {
                ParseOne(_mbfStreamReader, isReadOnly);
            }

            // Create sidecar
            sidecarFileProvider.CreateSidecarFile(_mbfStreamReader.FileName, _sequencePointers);

            VirtualQualitativeSequenceList createdList =
                new VirtualQualitativeSequenceList(sidecarFileProvider, this, _sequencePointers.Count);
            createdList.CreateSequenceAsReadOnly = isReadOnly;

            _sequencePointers.Clear();
            return createdList;
        }
        catch (Exception)
        {
            // Any failure is suppressed; the provider is asked to clean up and null is returned.
            sidecarFileProvider.Cleanup();
        }
    }

    return null;
}
/// <summary>
/// Parses the sequence alignment text contained in a file.
/// </summary>
/// <param name="fileName">Name of the file to parse.</param>
/// <param name="isReadOnly">
/// Flag to indicate whether the resulting sequences in the sequence alignment should be in
/// readonly mode or not. It is forwarded to the non-virtualized parse; the data virtualization
/// path in this method does not use it.
/// </param>
/// <returns>SequenceAlignmentMap object.</returns>
/// <exception cref="ArgumentNullException">fileName is null, empty or white space.</exception>
public SequenceAlignmentMap Parse(string fileName, bool isReadOnly)
{
    if (string.IsNullOrWhiteSpace(fileName))
    {
        throw new ArgumentNullException("fileName");
    }

    _fileName = fileName;

    // Decide whether data virtualization has to be enforced: either the file reaches the
    // configured size threshold (a threshold of 0 disables the size check) or DV is enforced
    // explicitly.
    FileInfo fileInfo = new FileInfo(_fileName);
    _enforceDataVirtualizationByFileSize = EnforceDataVirtualizationByFileSize * FileLoadHelper.KBytes;
    bool reachesSizeThreshold = _enforceDataVirtualizationByFileSize != 0
        && fileInfo.Length >= _enforceDataVirtualizationByFileSize;
    if (reachesSizeThreshold || _isDataVirtualizationEnforced)
    {
        EnforceDataVirtualization = true;
    }

    if (IsDataVirtualizationEnabled)
    {
        using (MBFStreamReader streamReader = new MBFStreamReader(fileName))
        {
            SAMAlignmentHeader header = ParserSAMHeader(streamReader);

            _sidecarFileProvider = new SidecarFileProvider(fileName);

            // if a valid sidecar does not exist then recreate it
            if (_sidecarFileProvider.SidecarFileExists && !_sidecarFileProvider.IsSidecarValid)
            {
                ParseSequences(streamReader);
            }

            if (_sidecarFileProvider.IsSidecarValid)
            {
                VirtualAlignedSequenceList<SAMAlignedSequence> queries =
                    new VirtualAlignedSequenceList<SAMAlignedSequence>(_sidecarFileProvider, this, _sidecarFileProvider.Count);
                return new SequenceAlignmentMap(header, queries);
            }
        }
    }

    // No usable sidecar (or DV disabled): parse the whole file through the text reader.
    using (MBFTextReader textReader = new MBFTextReader(fileName))
    {
        return Parse(textReader, isReadOnly);
    }
}
/// <summary>
/// Exercises the core members of MBFStreamReader (line access, line fields, moving to the
/// next line, seeking and character reads) and compares the results with a plain StreamReader
/// opened on the same file.
/// </summary>
public void TestMBFTextReaderCoreFunctionality()
{
    string testFileFullName = @"TestUtils\Fasta\5_sequences.fasta";

    using (StreamReader streamReader = new StreamReader(testFileFullName))
    using (MBFStreamReader mbfReader = new MBFStreamReader(testFileFullName))
    {
        // Test line access members.
        Assert.IsTrue(mbfReader.HasLines);

        // Test line reads
        string streamLine = streamReader.ReadLine();
        Assert.AreEqual(streamLine, mbfReader.Line);

        // Test getting of line fields
        Assert.AreEqual(streamLine.Substring(26, 10), mbfReader.GetLineField(27, 36));
        Assert.AreEqual(streamLine.Substring(14), mbfReader.GetLineField(15));

        // Test moving to next line
        mbfReader.GoToNextLine();
        Assert.AreEqual(streamReader.ReadLine(), mbfReader.Line);

        char[] streamBuffer = new char[10];
        char[] bioBuffer;

        // Test seeking to a position in the stream
        streamReader.DiscardBufferedData();
        streamReader.BaseStream.Seek(100, SeekOrigin.Begin);
        mbfReader.Seek(100, SeekOrigin.Begin);
        Assert.AreEqual(streamReader.BaseStream.Position, mbfReader.Position);

        // Test character reading
        streamReader.ReadBlock(streamBuffer, 0, 10);
        bioBuffer = mbfReader.ReadChars(100, 10);

        // Compare every character, not just the first one, and stay inside the buffers.
        Assert.AreEqual(streamBuffer.Length, bioBuffer.Length);
        for (int i = 0; i < streamBuffer.Length; i++)
        {
            Assert.AreEqual(streamBuffer[i], bioBuffer[i]);
        }
    }
}
/// <summary>
/// Parses a single FastQ entry from a MBFStreamReader.
/// </summary>
/// <param name="mbfReader">MBFStreamReader instance for a biological sequence text.</param>
/// <param name="isReadOnly">
/// Flag to indicate whether the resulting QualitativeSequence should be in readonly mode or not.
/// It is passed on unchanged to the FastQ-format parsing routine.
/// </param>
/// <returns>The parsed IQualitativeSequence object.</returns>
/// <exception cref="FileFormatException">The reader has no lines to parse.</exception>
private IQualitativeSequence ParseOne(MBFStreamReader mbfReader, bool isReadOnly)
{
    if (mbfReader.HasLines)
    {
        // do the actual parsing
        return ParseOneWithFastQFormat(mbfReader, isReadOnly);
    }

    // no empty files allowed
    string noTextMessage = string.Format(
        CultureInfo.CurrentCulture,
        Resource.IOFormatErrorMessage,
        Name,
        Resource.IONoTextToParse);
    Trace.Report(noTextMessage);
    throw new FileFormatException(noTextMessage);
}
/// <summary>
/// Parses a range of sequence items starting from the specified index in the sequence.
/// The requested count is clamped so that the read never goes past the sequence's end offset.
/// </summary>
/// <param name="startIndex">The zero-based index at which to begin parsing.</param>
/// <param name="count">The number of symbols to parse; must be greater than zero.</param>
/// <param name="seqPointer">The sequence pointer of that sequence.</param>
/// <returns>
/// A read-only sequence holding the parsed symbols, or null when startIndex lies at or beyond
/// the end of the sequence.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException">startIndex is negative or count is not positive.</exception>
/// <exception cref="ArgumentNullException">seqPointer is null.</exception>
public ISequence ParseRange(int startIndex, int count, SequencePointer seqPointer)
{
    if (0 > startIndex)
    {
        throw new ArgumentOutOfRangeException("startIndex");
    }

    if (0 >= count)
    {
        throw new ArgumentOutOfRangeException("count");
    }

    if (seqPointer == null)
    {
        throw new ArgumentNullException("seqPointer");
    }

    // if the start index exceeds the sequence boundary
    if ((long)startIndex + seqPointer.IndexOffsets[0] >= seqPointer.IndexOffsets[1])
    {
        return null;
    }

    IAlphabet alphabet = Alphabets.All.Single(a => a.Name.Equals(seqPointer.AlphabetName));
    Sequence sequence = new Sequence(alphabet) { IsReadOnly = false };

    // Re-open the file when there is no reader yet or the current one can no longer read.
    if (_mbfStreamReader == null || !_mbfStreamReader.CanRead)
    {
        _mbfStreamReader = new MBFStreamReader(_fileName);
    }

    long filePosition = startIndex + seqPointer.IndexOffsets[0];

    // Symbols left between startIndex and the end of the sequence. The boundary check above
    // guarantees this is positive. Comparing count against it (instead of computing
    // count + startIndex in 32-bit arithmetic) cannot overflow, so a very large count such as
    // Int32.MaxValue is still clamped correctly.
    long remaining = seqPointer.IndexOffsets[1] - seqPointer.IndexOffsets[0] - startIndex;
    if (count >= remaining)
    {
        // Safe cast: remaining <= count here, and count is an int.
        count = (int)remaining;
    }

    char[] buffer = _mbfStreamReader.ReadChars(filePosition, count);
    sequence.InsertRange(0, new string(buffer));

    // default for partial load
    sequence.IsReadOnly = true;

    return sequence;
}
/// <summary>
/// Parses SAM alignment header from specified MBFStreamReader.
/// Consecutive lines starting with '@' are consumed; "CO" lines are stored as comments and
/// every other record type is stored as a record field with its tags. The static header
/// length counter is reset and then increased by the length of each header line.
/// </summary>
/// <param name="mbfReader">MBF stream reader positioned at the start of the header.</param>
/// <returns>
/// The parsed header; it contains no records or comments when the current line does not start
/// with '@'.
/// </returns>
/// <exception cref="FormatException">The parsed header reports a validation message.</exception>
private static SAMAlignmentHeader ParseSAMHeader(MBFStreamReader mbfReader)
{
    _headerLength = 0;
    SAMAlignmentHeader samHeader = new SAMAlignmentHeader();

    if (mbfReader.HasLines && mbfReader.Line.StartsWith(@"@", StringComparison.OrdinalIgnoreCase))
    {
        while (mbfReader.HasLines && mbfReader.Line.StartsWith(@"@", StringComparison.OrdinalIgnoreCase))
        {
            _headerLength += mbfReader.Line.Length;

            string[] tokens = mbfReader.Line.Split(tabDelim, StringSplitOptions.RemoveEmptyEntries);

            // The first token is '@' followed by the record type code.
            string recordTypecode = tokens[0].Substring(1);

            // Validate the header format.
            ValidateHeaderLineFormat(mbfReader.Line);

            if (string.Compare(recordTypecode, "CO", StringComparison.OrdinalIgnoreCase) != 0)
            {
                SAMRecordField headerLine = new SAMRecordField(recordTypecode);

                // Each remaining token is split into a two-character tag name (characters 0-1)
                // and a value starting at character 3; character 2 is skipped.
                for (int i = 1; i < tokens.Length; i++)
                {
                    string tagToken = tokens[i];
                    string tagName = tagToken.Substring(0, 2);
                    headerLine.Tags.Add(new SAMRecordFieldTag(tagName, tagToken.Substring(3)));
                }

                samHeader.RecordFields.Add(headerLine);
            }
            else
            {
                // Comment line: keep everything after the first four characters.
                samHeader.Comments.Add(mbfReader.Line.Substring(4));
            }

            mbfReader.GoToNextLine();
        }

        // The header is validated only when at least one header line was present.
        string message = samHeader.IsValid();
        if (!string.IsNullOrEmpty(message))
        {
            throw new FormatException(message);
        }
    }

    return samHeader;
}
/// <summary>
/// Parses one sequence from a MBFStreamReader and remembers the reader's file name.
/// This method is only used in data virtualization scenarios.
/// </summary>
/// <param name="mbfReader">The MBFStreamReader of the file to be parsed.</param>
/// <param name="isReadOnly">Indicates whether the parsed sequence is read-only.</param>
/// <returns>The parsed sequence.</returns>
/// <exception cref="InvalidOperationException">The reader has no lines to parse.</exception>
private ISequence ParseOne(MBFStreamReader mbfReader, bool isReadOnly)
{
    // The file name is recorded first, before the reader is checked for content.
    _fileName = mbfReader.FileName;

    if (mbfReader.HasLines)
    {
        // do the actual parsing
        return ParseOneWithSpecificFormat(mbfReader, isReadOnly);
    }

    // no empty files allowed
    string noTextMessage = Resource.Parser_NoTextErrorMessage;
    Trace.Report(noTextMessage);
    throw new InvalidOperationException(noTextMessage);
}
/// <summary>
/// Parses a single biological sequence from a file. The file is read through an
/// MBFStreamReader when data virtualization is enabled and through an MBFTextReader otherwise.
/// </summary>
/// <param name="filename">The name of a biological sequence file.</param>
/// <param name="isReadOnly">
/// Flag to indicate whether the resulting QualitativeSequence should be in read-only mode or not.
/// It is passed on unchanged to the reader-based ParseOne overload.
/// </param>
/// <returns>The parsed IQualitativeSequence object.</returns>
public IQualitativeSequence ParseOne(string filename, bool isReadOnly)
{
    if (!IsDataVirtualizationEnabled)
    {
        using (MBFTextReader textReader = new MBFTextReader(filename))
        {
            return ParseOne(textReader, isReadOnly);
        }
    }

    using (MBFStreamReader streamReader = new MBFStreamReader(filename))
    {
        return ParseOne(streamReader, isReadOnly);
    }
}
/// <summary>
/// Validates that, after seeking to the configured position from the beginning of the file,
/// MBFStreamReader.Position reports that same position.
/// </summary>
public void ValidatePosition()
{
    string filePath = Utility._xmlUtil.GetTextValue(
        Constants.SimpleFastAStreamReaderNode, Constants.FilePathNode);
    string pos = Utility._xmlUtil.GetTextValue(
        Constants.SimpleFastAStreamReaderNode, Constants.PositionNode);

    // 'using' releases the reader, which the test previously never disposed.
    using (MBFStreamReader reader = new MBFStreamReader(filePath))
    {
        // Set position at the beginning.
        reader.Seek(Int32.Parse(pos), SeekOrigin.Begin);

        // Validate the set position
        Assert.AreEqual(pos, reader.Position.ToString());

        Console.WriteLine("Validate the position successfulyy");
        ApplicationLog.WriteLine("Validated the position successfully");
    }
}
/// <summary>
/// Reads the sequence ID for the given sequence pointer from the input file, stores it in the
/// pointer's Id property and returns it.
/// </summary>
/// <param name="pointer">
/// A sequence pointer representing the sequence whose ID is to be retrieved.
/// </param>
/// <returns>The value of the pointer's Id property after it has been set.</returns>
/// <exception cref="ArgumentNullException">pointer is null.</exception>
public string GetSequenceID(SequencePointer pointer)
{
    if (pointer == null)
    {
        throw new ArgumentNullException("pointer");
    }

    // Re-open the file when there is no reader yet or the current one can no longer read.
    bool readerUsable = _mbfStreamReader != null && _mbfStreamReader.CanRead;
    if (!readerUsable)
    {
        _mbfStreamReader = new MBFStreamReader(_fileName);
    }

    // Look back from the sequence starting index to the line that carries the ID.
    // NOTE(review): StartingLine is used here as an offset to subtract - confirm its meaning.
    var idLineOffset = pointer.IndexOffsets[0] - pointer.StartingLine;
    _mbfStreamReader.Seek(idLineOffset, SeekOrigin.Begin);
    _mbfStreamReader.ReadLine();

    // The ID is taken as the line field starting at position 2.
    pointer.Id = _mbfStreamReader.GetLineField(2);
    return pointer.Id;
}
/// <summary>
/// Parses a range of symbols starting from the specified index in the sequence.
/// The requested count is clamped so that the read never goes past the sequence's end offset.
/// </summary>
/// <param name="startIndex">The zero-based index at which to begin parsing.</param>
/// <param name="count">The number of symbols to parse; must be greater than zero.</param>
/// <param name="seqPointer">The sequence pointer of that sequence.</param>
/// <returns>
/// The bytes returned by the underlying reader for the range, or null when startIndex lies at
/// or beyond the end of the sequence.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException">startIndex is negative or count is not positive.</exception>
/// <exception cref="ArgumentNullException">seqPointer is null.</exception>
public byte[] ParseRange(int startIndex, int count, SequencePointer seqPointer)
{
    if (0 > startIndex)
    {
        throw new ArgumentOutOfRangeException("startIndex");
    }

    if (0 >= count)
    {
        throw new ArgumentOutOfRangeException("count");
    }

    if (seqPointer == null)
    {
        throw new ArgumentNullException("seqPointer");
    }

    // if the start index exceeds the sequence boundary
    if ((long)startIndex + seqPointer.IndexOffsets[0] >= seqPointer.IndexOffsets[1])
    {
        return null;
    }

    // Re-open the file when there is no reader yet or the current one can no longer read.
    if (_mbfStreamReader == null || !_mbfStreamReader.CanRead)
    {
        _mbfStreamReader = new MBFStreamReader(_fileName);
    }

    long filePosition = startIndex + seqPointer.IndexOffsets[0];

    // Symbols left between startIndex and the end of the sequence. The boundary check above
    // guarantees this is positive. Comparing count against it (instead of computing
    // count + startIndex in 32-bit arithmetic) cannot overflow, so a very large count such as
    // Int32.MaxValue is still clamped correctly.
    long remaining = seqPointer.IndexOffsets[1] - seqPointer.IndexOffsets[0] - startIndex;
    if (count >= remaining)
    {
        // Safe cast: remaining <= count here, and count is an int.
        count = (int)remaining;
    }

    return _mbfStreamReader.ReadBytes(filePosition, count);
}
/// <summary>
/// Parses a range of sequence items starting from the specified index in the sequence.
/// </summary>
/// <param name="startIndex">The zero-based index at which to begin parsing.</param>
/// <param name="count">The number of symbols to parse; must be greater than zero.</param>
/// <param name="seqPointer">The sequence pointer of the specified sequence.</param>
/// <returns>A read-only sequence holding the parsed symbols.</returns>
/// <exception cref="NotSupportedException">No input file name is known to this parser.</exception>
/// <exception cref="ArgumentNullException">seqPointer is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">startIndex is negative or count is not positive.</exception>
public ISequence ParseRange(int startIndex, int count, SequencePointer seqPointer)
{
    if (string.IsNullOrEmpty(_fileName))
    {
        throw new NotSupportedException(Resource.DataVirtualizationNeedsInputFile);
    }

    // Reject a missing pointer up front; without this check the failure would surface as a
    // NullReferenceException from inside the alphabet lookup below.
    if (seqPointer == null)
    {
        throw new ArgumentNullException("seqPointer");
    }

    if (startIndex < 0)
    {
        throw new ArgumentOutOfRangeException("startIndex");
    }

    if (count <= 0)
    {
        throw new ArgumentOutOfRangeException("count");
    }

    IAlphabet alphabet = Alphabets.All.Single(a => a.Name.Equals(seqPointer.AlphabetName));
    Sequence sequence = new Sequence(alphabet) { IsReadOnly = false };

    // Re-open the file when there is no reader yet or the current one can no longer read.
    if (_mbfStreamReader == null || !_mbfStreamReader.CanRead)
    {
        _mbfStreamReader = new MBFStreamReader(_fileName);
    }

    // Translate the sequence-relative index into a position in the file.
    long fileIndex = startIndex + seqPointer.IndexOffsets[0];
    char[] buffer = _mbfStreamReader.ReadChars(fileIndex, count);
    sequence.InsertRange(0, new string(buffer));

    // default for partial load
    sequence.IsReadOnly = true;

    return sequence;
}
/// <summary>
/// Validates that GetLineField on the first line of the configured file returns the expected
/// substring.
/// </summary>
/// <param name="nodeName">Name of the xml node holding the test data for this test case.</param>
/// <param name="IsStartAndEndIndex">True if validating from start to end index substring,
/// else false</param>
private void ValidateSubString(string nodeName, bool IsStartAndEndIndex)
{
    // Get values from xml
    string filePath = Utility._xmlUtil.GetTextValue(
        nodeName, Constants.FilePathNode);
    string expectedString = Utility._xmlUtil.GetTextValue(
        nodeName, Constants.ExpectedString);
    string startIndex = Utility._xmlUtil.GetTextValue(
        nodeName, Constants.StartIndexNode);
    string endIndex = Utility._xmlUtil.GetTextValue(
        nodeName, Constants.EndIndexNode);
    string subString = string.Empty;

    // 'using' guarantees the reader is disposed even when the assertion below fails.
    using (MBFStreamReader streamReader = new MBFStreamReader(filePath))
    {
        if (IsStartAndEndIndex)
        {
            subString = streamReader.GetLineField(Int32.Parse(startIndex),
                Int32.Parse(endIndex));
        }
        else
        {
            subString = streamReader.GetLineField(Int32.Parse(startIndex));
        }

        // Validate sub string of a line.
        Assert.AreEqual(expectedString, subString);
        Console.WriteLine("The expected substring is {0}", subString);
        ApplicationLog.WriteLine("Validated the substring successfully");
    }
}
/// <summary>
/// Parses one four-line FASTQ entry ('@' header, sequence, '+' header, quality scores) from the
/// reader into a QualitativeSequence. A sequence pointer for the entry is recorded in the
/// parser's pointer list and a file-backed virtual data provider is attached to the result.
/// </summary>
/// <param name="mbfReader">A reader for a biological sequence text.</param>
/// <param name="isReadOnly">
/// Flag to indicate whether the resulting QualitativeSequence should be in readonly mode or not.
/// Its value is assigned to the resulting sequence's IsReadOnly property.
/// </param>
/// <returns>A new QualitativeSequence instance containing parsed data.</returns>
/// <exception cref="FileFormatException">
/// Any of the four lines is missing or malformed, the quality score count differs from the
/// sequence length, or no alphabet could be identified for the sequence.
/// </exception>
private IQualitativeSequence ParseOneWithFastQFormat(MBFStreamReader mbfReader, bool isReadOnly)
{
    SequencePointer pointer = new SequencePointer();
    string errorMessage;
    string detail;

    // Line 1: must start with '@'.
    if (!mbfReader.HasLines || !mbfReader.Line.StartsWith("@", StringComparison.Ordinal))
    {
        errorMessage = string.Format(CultureInfo.CurrentCulture, Resource.INVALID_INPUT_FILE, Name);
        Trace.Report(errorMessage);
        throw new FileFormatException(errorMessage);
    }

    // The ID is the header line field starting at position 2, trimmed.
    string id = mbfReader.GetLineField(2).Trim();

    // save sequence starting index (reader position while still on the header line)
    pointer.IndexOffsets[0] = mbfReader.Position;

    // Line 2: the sequence itself; must be present and non-empty.
    mbfReader.GoToNextLine();
    if (!mbfReader.HasLines || string.IsNullOrEmpty(mbfReader.Line))
    {
        detail = string.Format(CultureInfo.CurrentCulture, Resource.FastQ_InvalidSequenceLine, id);
        errorMessage = string.Format(CultureInfo.CurrentCulture, Resource.IOFormatErrorMessage, Name, detail);
        Trace.Report(errorMessage);
        throw new FileFormatException(errorMessage);
    }

    string sequenceLine = mbfReader.Line;

    // save sequence ending index
    pointer.IndexOffsets[1] = pointer.IndexOffsets[0] + sequenceLine.Length;

    // Line 3: must start with '+'.
    mbfReader.GoToNextLine();
    if (!mbfReader.HasLines || !mbfReader.Line.StartsWith("+", StringComparison.Ordinal))
    {
        detail = string.Format(CultureInfo.CurrentCulture, Resource.FastQ_InvalidQualityScoreHeaderLine, id);
        errorMessage = string.Format(CultureInfo.CurrentCulture, Resource.IOFormatErrorMessage, Name, detail);
        Trace.Report(errorMessage);
        throw new FileFormatException(errorMessage);
    }

    // When the '+' line repeats an ID, it has to match the ID from the '@' line.
    string qualScoreId = mbfReader.GetLineField(2).Trim();
    bool idMismatch = !string.IsNullOrEmpty(qualScoreId) && !id.Equals(qualScoreId);
    if (idMismatch)
    {
        detail = string.Format(CultureInfo.CurrentCulture, Resource.FastQ_InvalidQualityScoreHeaderData, id);
        errorMessage = string.Format(CultureInfo.CurrentCulture, Resource.IOFormatErrorMessage, Name, detail);
        Trace.Report(errorMessage);
        throw new FileFormatException(errorMessage);
    }

    // Line 4: the quality scores; must be present and non-empty.
    mbfReader.GoToNextLine();
    if (!mbfReader.HasLines || string.IsNullOrEmpty(mbfReader.Line))
    {
        detail = string.Format(CultureInfo.CurrentCulture, Resource.FastQ_EmptyQualityScoreLine, id);
        errorMessage = string.Format(CultureInfo.CurrentCulture, Resource.IOFormatErrorMessage, Name, detail);
        Trace.Report(errorMessage);
        throw new FileFormatException(errorMessage);
    }

    string qualityLine = mbfReader.Line;
    byte[] qualScores = ASCIIEncoding.ASCII.GetBytes(qualityLine);

    // One quality score is required per sequence symbol.
    if (sequenceLine.Length != qualityLine.Length)
    {
        detail = string.Format(CultureInfo.CurrentCulture, Resource.FastQ_InvalidQualityScoresLength, id);
        errorMessage = string.Format(CultureInfo.CurrentCulture, Resource.IOFormatErrorMessage, Name, detail);
        Trace.Report(errorMessage);
        throw new FileFormatException(errorMessage);
    }

    // Move past this entry so that the reader is positioned on the next one.
    mbfReader.GoToNextLine();

    // Identify alphabet if it is not specified.
    IAlphabet alphabet = Alphabet;
    if (alphabet == null)
    {
        alphabet = _commonSequenceParser.IdentifyAlphabet(alphabet, sequenceLine);
        if (alphabet == null)
        {
            detail = string.Format(CultureInfo.CurrentCulture, Resource.InvalidSymbolInString, sequenceLine);
            errorMessage = string.Format(CultureInfo.CurrentCulture, Resource.IOFormatErrorMessage, Name, detail);
            Trace.Report(errorMessage);
            throw new FileFormatException(errorMessage);
        }
    }

    // Identify fastq format type if AutoDetectFastQFormat property is set to true.
    FastQFormatType fastQType = FastqType;
    if (AutoDetectFastQFormat)
    {
        fastQType = IdentifyFastQFormatType(qualScores);
    }

    // Use the encoding-aware constructor only when an encoding has been configured.
    QualitativeSequence sequence = Encoding == null
        ? new QualitativeSequence(alphabet, fastQType, sequenceLine, qualScores)
        : new QualitativeSequence(alphabet, fastQType, Encoding, sequenceLine, qualScores);

    sequence.ID = id;
    sequence.IsReadOnly = isReadOnly;

    // Complete the pointer from the constructed sequence and record it.
    pointer.AlphabetName = sequence.Alphabet.Name;
    pointer.Id = sequence.ID;
    _sequencePointers.Add(pointer);

    FileVirtualQualitativeSequenceProvider dataProvider =
        new FileVirtualQualitativeSequenceProvider(this, pointer);
    dataProvider.BlockSize = _blockSize;
    dataProvider.MaxNumberOfBlocks = _maxNumberOfBlocks;
    sequence.VirtualQualitativeSequenceProvider = dataProvider;

    return sequence;
}
/// <summary>
/// Format-specific single-sequence parse: delegates to the FASTQ parsing routine and returns
/// its result as an ISequence.
/// </summary>
/// <param name="mbfReader">A reader for a biological sequence text.</param>
/// <param name="isReadOnly">
/// Flag to indicate whether the resulting sequence should be in readonly mode or not.
/// It is passed on unchanged to the FASTQ parsing routine.
/// </param>
/// <returns>The sequence produced by the FASTQ parsing routine.</returns>
protected ISequence ParseOneWithSpecificFormat(MBFStreamReader mbfReader, bool isReadOnly)
{
    ISequence parsed = ParseOneWithFastQFormat(mbfReader, isReadOnly);
    return parsed;
}
/// <summary>
/// Parses all biological sequences contained in a file.
/// </summary>
/// <remarks>
/// The parser first resets itself to the full-load block configuration. If a file name
/// is supplied and either the file length reaches the configured threshold
/// (<c>EnforceDataVirtualizationByFileSize * FileLoadHelper.KBytes</c>, when non-zero) or
/// data virtualization is enforced, the default virtualized block settings are used instead.
/// When <c>IsDataVirtualizationEnabled</c> is true, an already valid sidecar file is reused;
/// otherwise a new sidecar is created and filled by calling <c>ParseOne</c> until the reader
/// has no more lines. Any exception raised while filling the sidecar triggers
/// <c>Cleanup()</c> on the sidecar provider and the method continues with the
/// non-virtualized parse below.
/// </remarks>
/// <param name="filename">The name of a biological sequence file.</param>
/// <param name="isReadOnly">
/// True to create the resulting sequences in read-only mode; false otherwise.
/// </param>
/// <returns>The list of parsed ISequence objects.</returns>
public IList<ISequence> Parse(string filename, bool isReadOnly)
{
    // Begin with the "load everything" configuration; it may be overridden just below.
    _blockSize = FileLoadHelper.DefaultFullLoadBlockSize;
    _maxNumberOfBlocks = 0;

    // Decide whether data virtualization (DV) is needed for this file.
    // Note: when filename is null, _fileName keeps whatever value it already had.
    if (filename != null)
    {
        _fileName = filename;
        FileInfo sourceFile = new FileInfo(_fileName);
        _enforceDataVirtualizationByFileSize = EnforceDataVirtualizationByFileSize * FileLoadHelper.KBytes;

        // The file length is only queried when a non-zero threshold is configured.
        bool reachesSizeThreshold =
            _enforceDataVirtualizationByFileSize != 0 &&
            sourceFile.Length >= _enforceDataVirtualizationByFileSize;

        if (reachesSizeThreshold || _isDataVirtualizationEnforced)
        {
            _blockSize = FileLoadHelper.DefaultBlockSize;
            _maxNumberOfBlocks = FileLoadHelper.DefaultMaxNumberOfBlocks;
        }
    }

    if (IsDataVirtualizationEnabled)
    {
        // Probe for an existing sidecar file.
        _sidecarFileProvider = new SidecarFileProvider(_fileName);
        _sidecarFileProvider.Close();

        if (_sidecarFileProvider.IsSidecarValid)
        {
            // A valid sidecar already exists: hand back a virtual list built on it.
            VirtualSequenceList existingList =
                new VirtualSequenceList(_sidecarFileProvider, this, _sidecarFileProvider.Count);
            existingList.CreateSequenceAsReadOnly = isReadOnly;
            return existingList;
        }

        // No valid sidecar: create a fresh one.
        _sidecarFileProvider = new SidecarFileProvider(_fileName, true);
        if (_sidecarFileProvider.SidecarFileExists)
        {
            using (_mbfStreamReader = new MBFStreamReader(_fileName))
            {
                try
                {
                    // The return value of ParseOne is intentionally discarded here;
                    // per the original note, the list is maintained through sequence pointers.
                    while (_mbfStreamReader.HasLines)
                    {
                        ParseOne(_mbfStreamReader, isReadOnly);
                    }

                    _sidecarFileProvider.Close();

                    return new VirtualSequenceList(_sidecarFileProvider, this, _sidecarFileProvider.Count)
                    {
                        CreateSequenceAsReadOnly = isReadOnly
                    };
                }
                catch (Exception)
                {
                    // Building the sidecar failed: discard it and fall through
                    // to the non-virtualized parse.
                    _sidecarFileProvider.Cleanup();
                }
            }
        }
    }

    // Non-DV parsing.
    using (MBFTextReader textReader = new MBFTextReader(filename))
    {
        return Parse(textReader, isReadOnly);
    }
}
/// <summary>
/// Reads one FASTA record from an MBFStreamReader and records where it lives in the file.
/// Used in data virtualization scenarios: besides building the sequence object, it fills a
/// SequencePointer and, when a sidecar provider is present, writes block indices and the
/// pointer to it.
/// </summary>
/// <param name="mbfReader">The MBFStreamReader of the file to be parsed; its current line must start with "&gt;".</param>
/// <param name="isReadOnly">
/// Value assigned to the IsReadOnly property of the returned sequence.
/// </param>
/// <returns>The parsed sequence, with a FileVirtualSequenceProvider attached.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="mbfReader"/> is null.</exception>
/// <exception cref="FileFormatException">
/// Thrown when the current line is not a FASTA header, or when no alphabet can be
/// identified for a line.
/// </exception>
protected ISequence ParseOneWithSpecificFormat(MBFStreamReader mbfReader, bool isReadOnly)
{
    SequencePointer pointer = new SequencePointer();

    if (mbfReader == null)
    {
        throw new ArgumentNullException("mbfReader");
    }

    string errorText;

    bool onHeaderLine = mbfReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase);
    if (!onHeaderLine)
    {
        errorText = string.Format(
            CultureInfo.InvariantCulture,
            Resource.INVALID_INPUT_FILE,
            Resource.FASTA_NAME);
        Trace.Report(errorText);
        throw new FileFormatException(errorText);
    }

    // Header line: take the identifier from line field 2.
    Sequence parsed;
    string recordId = mbfReader.GetLineField(2).Trim();

    // Remember where the record begins; start and end offsets are initially the same.
    pointer.StartingLine = (int)(mbfReader.Position - mbfReader.CurrentLineStartingIndex);
    pointer.IndexOffsets[0] = mbfReader.Position;
    pointer.IndexOffsets[1] = mbfReader.Position;

    mbfReader.GoToNextLine();

    // Use the configured alphabet, or detect one from the first data line.
    IAlphabet detectedAlphabet = Alphabet;
    if (detectedAlphabet == null)
    {
        detectedAlphabet = _commonSequenceParser.IdentifyAlphabet(detectedAlphabet, mbfReader.Line);
        if (detectedAlphabet == null)
        {
            errorText = string.Format(
                CultureInfo.InvariantCulture,
                Resource.InvalidSymbolInString,
                mbfReader.Line);
            Trace.Report(errorText);
            throw new FileFormatException(errorText);
        }
    }

    if (Encoding != null)
    {
        parsed = new Sequence(detectedAlphabet, Encoding, string.Empty);
        parsed.IsReadOnly = false;
    }
    else
    {
        parsed = new Sequence(detectedAlphabet);
    }

    // Bookkeeping for the block index written to the sidecar file.
    int currentBlockSize = 0;       // characters accounted for in the block being filled
    int symbolCount = -1;           // starts at -1; presumably a zero-based symbol index - confirm
    int newLineCharacterCount = mbfReader.NewLineCharacterCount;
    int carriedNewLineChars = 0;    // carried over into the next block after a block is written
    int remainingInLine = mbfReader.Line.Length;
    parsed.ID = recordId;

    // Consume data lines until the next header or the end of the input.
    while (mbfReader.HasLines && !mbfReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
    {
        pointer.IndexOffsets[1] += mbfReader.Line.Length;

        if (Alphabet == null)
        {
            // No fixed alphabet: re-identify using this line and the sequence's current alphabet.
            detectedAlphabet = _commonSequenceParser.IdentifyAlphabet(parsed.Alphabet, mbfReader.Line);
            if (detectedAlphabet == null)
            {
                errorText = string.Format(
                    CultureInfo.InvariantCulture,
                    Resource.InvalidSymbolInString,
                    mbfReader.Line);
                Trace.Report(errorText);
                throw new FileFormatException(errorText);
            }

            if (parsed.Alphabet != detectedAlphabet)
            {
                // The alphabet changed: rebuild the sequence on the new alphabet.
                Sequence rebuilt = new Sequence(detectedAlphabet, Encoding, parsed);
                rebuilt.IsReadOnly = false;
                parsed.Clear();
                parsed = rebuilt;
            }
        }

        newLineCharacterCount = mbfReader.NewLineCharacterCount;
        remainingInLine = mbfReader.Line.Length;

        // Distribute this line over blocks; only done when a sidecar provider exists.
        while (remainingInLine != 0 && _sidecarFileProvider != null)
        {
            if (remainingInLine + currentBlockSize + newLineCharacterCount <= _blockSize)
            {
                // The rest of the line plus its newline characters fits in the current block.
                symbolCount += remainingInLine;
                currentBlockSize += remainingInLine + newLineCharacterCount;
                remainingInLine = 0;
            }
            else
            {
                // Fill up what is left of the current block.
                int roomInBlock = _blockSize - currentBlockSize;
                symbolCount += roomInBlock;
                remainingInLine -= roomInBlock;

                if (remainingInLine <= 0)
                {
                    // remainingInLine is zero or negative here: undo the over-count and
                    // carry the rest of the newline characters (presumably the part that
                    // spills into the next block - confirm).
                    symbolCount += remainingInLine;
                    carriedNewLineChars = newLineCharacterCount + remainingInLine;
                    remainingInLine = 0;
                }

                currentBlockSize = _blockSize;
            }

            if (currentBlockSize == _blockSize)
            {
                // Block is full: write its index and start the next block.
                _sidecarFileProvider.WriteBlockIndex(symbolCount);
                currentBlockSize = carriedNewLineChars;
                carriedNewLineChars = 0;
            }
        }

        mbfReader.GoToNextLine();
    }

    if (_sidecarFileProvider != null)
    {
        // Final index: the symbol count when the record spans more than one block and
        // the last block holds more than newline characters; otherwise 0.
        int finalBlockIndex =
            (pointer.IndexOffsets[1] - pointer.IndexOffsets[0] > _blockSize &&
             currentBlockSize - newLineCharacterCount > 0)
                ? symbolCount
                : 0;
        _sidecarFileProvider.WriteBlockIndex(finalBlockIndex);
    }

    if (parsed.MoleculeType == MoleculeType.Invalid)
    {
        parsed.MoleculeType = CommonSequenceParser.GetMoleculeType(parsed.Alphabet);
    }

    parsed.IsReadOnly = isReadOnly;

    pointer.AlphabetName = parsed.Alphabet.Name;
    pointer.Id = parsed.ID;

    if (_sidecarFileProvider != null)
    {
        // Persist the pointer for this record right away.
        _sidecarFileProvider.WritePointer(pointer);
    }

    FileVirtualSequenceProvider virtualProvider = new FileVirtualSequenceProvider(this, pointer);
    virtualProvider.BlockSize = _blockSize;
    virtualProvider.MaxNumberOfBlocks = _maxNumberOfBlocks;
    parsed.VirtualSequenceProvider = virtualProvider;

    return parsed;
}
/// <summary>
/// Parses the sequence alignment text stored in a file.
/// </summary>
/// <remarks>
/// Data virtualization is switched on (<c>EnforceDataVirtualization = true</c>) when the file
/// length reaches the configured threshold
/// (<c>EnforceDataVirtualizationByFileSize * FileLoadHelper.KBytes</c>, when non-zero) or when
/// it is already enforced. If <c>IsDataVirtualizationEnabled</c> is true, the header is parsed,
/// a header-less file is sanity-checked by parsing its current line, and the sidecar file is
/// (re)built via <c>ParseSequences</c> when it exists but is not valid. If the sidecar is valid
/// afterwards, a virtualized alignment map is returned; in every other case the file is parsed
/// through an MBFTextReader.
/// </remarks>
/// <param name="fileName">file name.</param>
/// <param name="isReadOnly">
/// Forwarded to the MBFTextReader-based parse; indicates whether the sequences in the
/// resulting alignment should be read-only. It is not used on the virtualized path.
/// </param>
/// <returns>SequenceAlignmentMap object.</returns>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="fileName"/> is null, empty or consists only of white space.
/// </exception>
/// <exception cref="FileFormatException">
/// Thrown when a header-less file fails the single-sequence check with an
/// IndexOutOfRangeException.
/// </exception>
public SequenceAlignmentMap Parse(string fileName, bool isReadOnly)
{
    if (string.IsNullOrWhiteSpace(fileName))
    {
        throw new ArgumentNullException("fileName");
    }

    _fileName = fileName;

    // Decide whether data virtualization (DV) is required.
    FileInfo sourceFile = new FileInfo(_fileName);
    _enforceDataVirtualizationByFileSize = EnforceDataVirtualizationByFileSize * FileLoadHelper.KBytes;

    // The file length is only queried when a non-zero threshold is configured.
    bool reachesSizeThreshold =
        _enforceDataVirtualizationByFileSize != 0 &&
        sourceFile.Length >= _enforceDataVirtualizationByFileSize;

    if (reachesSizeThreshold || _isDataVirtualizationEnforced)
    {
        EnforceDataVirtualization = true;
    }

    if (IsDataVirtualizationEnabled)
    {
        using (MBFStreamReader streamReader = new MBFStreamReader(fileName))
        {
            SAMAlignmentHeader samHeader = ParseSAMHeader(streamReader);

            bool headerIsEmpty = samHeader.Comments.Count == 0 && samHeader.RecordFields.Count == 0;
            if (headerIsEmpty)
            {
                try
                {
                    // Without any header content, check that this is a SAM file
                    // by parsing a single sequence from the current line.
                    ParseSequence(streamReader.Line, true, Alphabet, Encoding, RefSequences);
                }
                catch (IndexOutOfRangeException)
                {
                    throw new FileFormatException(Resource.SAM_InvalidInputFile);
                }
            }

            _sidecarFileProvider = new SidecarFileProvider(fileName);

            // A sidecar file exists but is not valid: rebuild it from the remaining lines.
            if (_sidecarFileProvider.SidecarFileExists && !_sidecarFileProvider.IsSidecarValid)
            {
                ParseSequences(streamReader);
            }

            if (_sidecarFileProvider.IsSidecarValid)
            {
                VirtualAlignedSequenceList<SAMAlignedSequence> virtualQueries =
                    new VirtualAlignedSequenceList<SAMAlignedSequence>(
                        _sidecarFileProvider,
                        this,
                        _sidecarFileProvider.Count);

                return new SequenceAlignmentMap(samHeader, virtualQueries);
            }
        }
    }

    // DV is not enabled, or no valid sidecar is available: parse through the text reader.
    using (MBFTextReader textReader = new MBFTextReader(fileName))
    {
        return Parse(textReader, isReadOnly);
    }
}