/// <summary>
/// Indexes the sequence lines of a SAM file for data virtualization.
/// For each line read, a SequencePointer carrying the line's start and end
/// offsets is written to the sidecar file; no sequence data is built here.
/// </summary>
/// <remarks>
/// The method does nothing unless data virtualization is enabled and the sidecar
/// provider reports SidecarFileExists. Reading stops when the reader has no more
/// lines or when a line beginning with "@" is reached. An exception raised while
/// indexing is not propagated; the sidecar provider's Cleanup method is called instead.
/// </remarks>
/// <param name="mbfReader">A reader for the sequence alignment text.</param>
private void ParseSequences(MBFStreamReader mbfReader)
{
    // Pointers are only recorded in the data virtualization scenario.
    if (!IsDataVirtualizationEnabled || !_sidecarFileProvider.SidecarFileExists)
    {
        return;
    }

    try
    {
        while (mbfReader.HasLines)
        {
            // A line starting with "@" ends the run of lines handled here.
            if (mbfReader.Line.StartsWith("@", StringComparison.OrdinalIgnoreCase))
            {
                break;
            }

            SequencePointer pointer = new SequencePointer();
            pointer.AlphabetName = Alphabets.DNA.Name;

            // Offsets of the first character of the line and of the position just past its text.
            var lineStart = mbfReader.CurrentLineStartingIndex;
            pointer.IndexOffsets[0] = lineStart;
            pointer.IndexOffsets[1] = lineStart + mbfReader.Line.Length;

            // Persist the pointer right away instead of collecting pointers in memory.
            _sidecarFileProvider.WritePointer(pointer);

            mbfReader.GoToNextLine();
            _lineCount++;
        }

        _sidecarFileProvider.Close();
    }
    catch (Exception)
    {
        // Best effort: a failure while indexing is swallowed and the sidecar is cleaned up.
        _sidecarFileProvider.Cleanup();
    }
}
/// <summary>
/// Reads one FASTA record from the reader's current position and returns it as a
/// sequence backed by a FileVirtualSequenceProvider.
/// This method is only used in data virtualization scenarios.
/// </summary>
/// <remarks>
/// While the record's lines are scanned, the end offset of the sequence pointer is
/// advanced by each line's length. When a sidecar provider is available, block indexes
/// and finally the sequence pointer itself are written to it.
/// </remarks>
/// <param name="mbfReader">The MBFStreamReader of the file to be parsed.</param>
/// <param name="isReadOnly">
/// Value assigned to the IsReadOnly property of the resulting sequence
/// once parsing has finished.
/// </param>
/// <returns>The parsed sequence.</returns>
/// <exception cref="ArgumentNullException">Thrown when mbfReader is null.</exception>
/// <exception cref="FileFormatException">
/// Thrown when the current line does not start with ">" or when no alphabet can be
/// identified for a line of the record.
/// </exception>
protected ISequence ParseOneWithSpecificFormat(MBFStreamReader mbfReader, bool isReadOnly)
{
    SequencePointer pointer = new SequencePointer();

    if (mbfReader == null)
    {
        throw new ArgumentNullException("mbfReader");
    }

    // A record has to open with a ">" header line.
    if (!mbfReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
    {
        string invalidFile = string.Format(
            CultureInfo.InvariantCulture,
            Resource.INVALID_INPUT_FILE,
            Resource.FASTA_NAME);
        Trace.Report(invalidFile);
        throw new FileFormatException(invalidFile);
    }

    // Header line: the sequence ID comes from line field 2.
    string id = mbfReader.GetLineField(2).Trim();

    // Both offsets start at the reader's current position; the end offset grows
    // as the lines of the record are consumed below.
    pointer.StartingLine = (int)(mbfReader.Position - mbfReader.CurrentLineStartingIndex);
    pointer.IndexOffsets[0] = mbfReader.Position;
    pointer.IndexOffsets[1] = mbfReader.Position;

    mbfReader.GoToNextLine();

    // Use the configured alphabet when there is one, otherwise detect it from the first line.
    IAlphabet alphabet = Alphabet;
    if (alphabet == null)
    {
        alphabet = _commonSequenceParser.IdentifyAlphabet(alphabet, mbfReader.Line);
        if (alphabet == null)
        {
            string invalidSymbol = string.Format(
                CultureInfo.InvariantCulture,
                Resource.InvalidSymbolInString,
                mbfReader.Line);
            Trace.Report(invalidSymbol);
            throw new FileFormatException(invalidSymbol);
        }
    }

    Sequence sequence;
    if (Encoding != null)
    {
        sequence = new Sequence(alphabet, Encoding, string.Empty);
        sequence.IsReadOnly = false;
    }
    else
    {
        sequence = new Sequence(alphabet);
    }

    // Block bookkeeping used for the sidecar block indexes.
    int blockFill = 0;          // characters accounted to the block currently being filled
    int symbolIndex = -1;       // running symbol counter; starts at -1 (presumably so it ends up
                                // as a zero-based index of the last symbol - confirm)
    int newLineLength = mbfReader.NewLineCharacterCount;
    int carryOver = 0;          // new line characters that spill into the next block
    int remaining = mbfReader.Line.Length;

    sequence.ID = id;

    while (mbfReader.HasLines && !mbfReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
    {
        pointer.IndexOffsets[1] += mbfReader.Line.Length;

        // With no fixed alphabet, every line may change the detected alphabet.
        if (Alphabet == null)
        {
            alphabet = _commonSequenceParser.IdentifyAlphabet(sequence.Alphabet, mbfReader.Line);
            if (alphabet == null)
            {
                string invalidSymbol = string.Format(
                    CultureInfo.InvariantCulture,
                    Resource.InvalidSymbolInString,
                    mbfReader.Line);
                Trace.Report(invalidSymbol);
                throw new FileFormatException(invalidSymbol);
            }

            if (sequence.Alphabet != alphabet)
            {
                // Rebuild the sequence under the newly identified alphabet.
                Sequence converted = new Sequence(alphabet, Encoding, sequence);
                converted.IsReadOnly = false;
                sequence.Clear();
                sequence = converted;
            }
        }

        newLineLength = mbfReader.NewLineCharacterCount;
        remaining = mbfReader.Line.Length;

        // Distribute the line over blocks; only done when a sidecar provider exists.
        while (remaining != 0 && _sidecarFileProvider != null)
        {
            if (remaining + blockFill + newLineLength > _blockSize)
            {
                // The line (plus its new line characters) does not fit: fill the block up.
                int room = _blockSize - blockFill;
                symbolIndex += room;
                remaining -= room;

                if (remaining <= 0)
                {
                    // The symbols fit but the new line characters did not; undo the
                    // over-count and remember how many characters move to the next block.
                    symbolIndex += remaining;
                    carryOver = newLineLength + remaining;
                    remaining = 0;
                }

                blockFill = _blockSize;
            }
            else
            {
                // The whole remainder of the line fits into the current block.
                symbolIndex += remaining;
                blockFill += remaining + newLineLength;
                remaining = 0;
            }

            if (blockFill == _blockSize)
            {
                // Block is full: record its index and start the next block.
                _sidecarFileProvider.WriteBlockIndex(symbolIndex);
                blockFill = carryOver;
                carryOver = 0;
            }
        }

        mbfReader.GoToNextLine();
    }

    if (_sidecarFileProvider != null)
    {
        // Trailing index: the symbol counter when the record is longer than one block
        // and the last block holds more than new line characters, otherwise 0.
        bool longerThanBlock = pointer.IndexOffsets[1] - pointer.IndexOffsets[0] > _blockSize;
        bool hasPartialBlock = blockFill - newLineLength > 0;
        _sidecarFileProvider.WriteBlockIndex(longerThanBlock && hasPartialBlock ? symbolIndex : 0);
    }

    if (sequence.MoleculeType == MoleculeType.Invalid)
    {
        sequence.MoleculeType = CommonSequenceParser.GetMoleculeType(sequence.Alphabet);
    }

    sequence.IsReadOnly = isReadOnly;

    pointer.AlphabetName = sequence.Alphabet.Name;
    pointer.Id = sequence.ID;

    if (_sidecarFileProvider != null)
    {
        // Persist the pointer right away instead of collecting pointers in memory.
        _sidecarFileProvider.WritePointer(pointer);
    }

    FileVirtualSequenceProvider provider = new FileVirtualSequenceProvider(this, pointer);
    provider.BlockSize = _blockSize;
    provider.MaxNumberOfBlocks = _maxNumberOfBlocks;
    sequence.VirtualSequenceProvider = provider;

    return sequence;
}