Ejemplo n.º 1
0
        /// <summary>
        /// Parses a single FASTA sequence from a file using MBFStreamReader.
        /// This method is only used in data virtualization scenarios.
        /// </summary>
        /// <remarks>
        /// The reader must be positioned on a header line (a line starting with '&gt;').
        /// The method walks over the data lines that follow, up to the next header line
        /// or the end of the input, and leaves the reader there.
        /// No code in this method appends the line contents to the returned sequence;
        /// instead it fills a SequencePointer (id, alphabet name, offsets), optionally
        /// writes block indices and the pointer to the sidecar file provider, and attaches
        /// a FileVirtualSequenceProvider to the sequence.
        /// </remarks>
        /// <param name="mbfReader">The MBFStreamReader of the file to be parsed.</param>
        /// <param name="isReadOnly">
        /// Flag to indicate whether the resulting sequence should be in read-only mode.
        /// If this flag is set to true then the resulting sequence's IsReadOnly property
        /// will be set to true, otherwise it will be set to false.
        /// </param>
        /// <returns>The parsed sequence.</returns>
        /// <exception cref="ArgumentNullException">
        /// Thrown when <paramref name="mbfReader"/> is null.
        /// </exception>
        /// <exception cref="FileFormatException">
        /// Thrown when the reader's current line does not start with '&gt;', or when
        /// no alphabet can be identified for a data line (only attempted when the
        /// parser's Alphabet property is null).
        /// </exception>
        protected ISequence ParseOneWithSpecificFormat(MBFStreamReader mbfReader, bool isReadOnly)
        {
            // Describes this sequence for the sidecar file and the virtual sequence
            // provider; it is filled in progressively below.
            SequencePointer sequencePointer = new SequencePointer();

            if (mbfReader == null)
            {
                throw new ArgumentNullException("mbfReader");
            }

            string message;

            // A FASTA record must begin with a '>' header line.
            if (!mbfReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
            {
                message = string.Format(CultureInfo.InvariantCulture,
                                        Resource.INVALID_INPUT_FILE,
                                        Resource.FASTA_NAME);
                Trace.Report(message);
                throw new FileFormatException(message);
            }

            // Process header line.
            Sequence sequence;
            // NOTE(review): GetLineField(2) presumably returns the header text after the
            // '>' marker — confirm against MBFStreamReader.GetLineField.
            string   id = mbfReader.GetLineField(2).Trim();

            // save initial start and end indices
            // NOTE(review): despite its name, StartingLine receives the difference between
            // the reader position and the current line's starting index, not a line
            // number — confirm this is what consumers of SequencePointer expect.
            sequencePointer.StartingLine    = (int)(mbfReader.Position - mbfReader.CurrentLineStartingIndex);
            // Both offsets start at the reader position taken while still on the header
            // line; IndexOffsets[1] is then advanced by the length of every data line.
            sequencePointer.IndexOffsets[0] = mbfReader.Position;
            sequencePointer.IndexOffsets[1] = mbfReader.Position;

            // Move from the header line to the first data line.
            mbfReader.GoToNextLine();

            // Use the parser's configured alphabet when there is one; otherwise try to
            // identify it from the first data line and fail if that is not possible.
            IAlphabet alphabet = Alphabet;

            if (alphabet == null)
            {
                alphabet = _commonSequenceParser.IdentifyAlphabet(alphabet, mbfReader.Line);

                if (alphabet == null)
                {
                    message = string.Format(CultureInfo.InvariantCulture,
                                            Resource.InvalidSymbolInString,
                                            mbfReader.Line);
                    Trace.Report(message);
                    throw new FileFormatException(message);
                }
            }

            // The constructor used depends on whether an Encoding has been configured.
            if (Encoding == null)
            {
                sequence = new Sequence(alphabet);
            }
            else
            {
                sequence = new Sequence(alphabet, Encoding, string.Empty)
                {
                    IsReadOnly = false
                };
            }

            // Block-index bookkeeping (only updated when a sidecar file provider exists):
            //   currentBlockSize         - characters (symbols plus newline characters)
            //                              accounted to the block currently being filled.
            //   symbolCount              - running value handed to WriteBlockIndex; it grows
            //                              by the number of symbols consumed. It starts at -1,
            //                              presumably so that it is the zero-based index of the
            //                              last symbol in a block — TODO confirm.
            //   newLineCharacterCount    - newline character count reported by the reader for
            //                              the most recently processed line; also read after
            //                              the loop.
            //   prenewLineCharacterCount - newline characters that did not fit in the block
            //                              just closed and are carried into the next one.
            //   lineLength               - symbols of the current line not yet assigned to a
            //                              block. The value set here is overwritten before the
            //                              block loop reads it.
            int currentBlockSize         = 0;
            int symbolCount              = -1;
            int newLineCharacterCount    = mbfReader.NewLineCharacterCount;
            int prenewLineCharacterCount = 0;
            int lineLength = mbfReader.Line.Length;

            sequence.ID = id;

            // Consume data lines until the next header line or the end of the input.
            while (mbfReader.HasLines && !mbfReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
            {
                // Only Line.Length is added here; NewLineCharacterCount is not.
                sequencePointer.IndexOffsets[1] += mbfReader.Line.Length;
                if (Alphabet == null)
                {
                    // No fixed alphabet: re-identify it for every line, starting from the
                    // alphabet the sequence currently has.
                    alphabet = _commonSequenceParser.IdentifyAlphabet(sequence.Alphabet, mbfReader.Line);

                    if (alphabet == null)
                    {
                        message = string.Format(CultureInfo.InvariantCulture,
                                                Resource.InvalidSymbolInString,
                                                mbfReader.Line);
                        Trace.Report(message);
                        throw new FileFormatException(message);
                    }

                    // The identified alphabet changed: build a replacement sequence from the
                    // existing one with the new alphabet and clear the old instance.
                    if (sequence.Alphabet != alphabet)
                    {
                        Sequence seq = new Sequence(alphabet, Encoding, sequence)
                        {
                            IsReadOnly = false
                        };
                        sequence.Clear();
                        sequence = seq;
                    }
                }

                newLineCharacterCount = mbfReader.NewLineCharacterCount;
                lineLength            = mbfReader.Line.Length;

                // Distribute the current line over blocks of _blockSize characters.
                // Skipped entirely when there is no sidecar file provider.
                while (lineLength != 0 && _sidecarFileProvider != null)
                {
                    if (lineLength + currentBlockSize + newLineCharacterCount <= _blockSize)
                    {
                        // The whole remainder of the line, including its newline characters,
                        // fits in the current block.
                        symbolCount      += lineLength;
                        currentBlockSize += lineLength + newLineCharacterCount;
                        lineLength        = 0;
                    }
                    else
                    {
                        // The line (with its newline characters) crosses the block boundary:
                        // first assume the rest of the block is filled with symbols.
                        symbolCount += _blockSize - currentBlockSize;
                        lineLength   = lineLength - (_blockSize - currentBlockSize);
                        if (lineLength <= 0)
                        {
                            // All symbols fit and only newline characters crossed the boundary.
                            // lineLength is zero or negative here, so this takes back the
                            // symbols over-counted above and records how many newline
                            // characters belong to the next block.
                            symbolCount += lineLength;
                            prenewLineCharacterCount = newLineCharacterCount + lineLength;
                            lineLength = 0;
                        }

                        currentBlockSize = _blockSize;
                    }

                    if (currentBlockSize == _blockSize)
                    {
                        // write to file.
                        // The next block starts out holding any carried-over newline characters.
                        _sidecarFileProvider.WriteBlockIndex(symbolCount);
                        currentBlockSize         = prenewLineCharacterCount;
                        prenewLineCharacterCount = 0;
                    }
                }

                mbfReader.GoToNextLine();
            }

            if (_sidecarFileProvider != null)
            {
                // Final block index: written as symbolCount when the accumulated line lengths
                // exceed one block and the trailing partial block holds more than the last
                // line's newline characters; otherwise 0 is written.
                // NOTE(review): the meaning of a 0 block index is defined by the sidecar
                // file format, which is not visible here — confirm.
                if (sequencePointer.IndexOffsets[1] - sequencePointer.IndexOffsets[0] > _blockSize &&
                    currentBlockSize - newLineCharacterCount > 0)
                {
                    _sidecarFileProvider.WriteBlockIndex(symbolCount);
                }
                else
                {
                    _sidecarFileProvider.WriteBlockIndex(0);
                }
            }

            // Derive the molecule type from the alphabet when the sequence reports Invalid.
            if (sequence.MoleculeType == MoleculeType.Invalid)
            {
                sequence.MoleculeType = CommonSequenceParser.GetMoleculeType(sequence.Alphabet);
            }

            // Apply the caller's requested read-only state.
            sequence.IsReadOnly = isReadOnly;

            sequencePointer.AlphabetName = sequence.Alphabet.Name;
            sequencePointer.Id           = sequence.ID;

            if (_sidecarFileProvider != null)
            {
                // Write each sequence pointer to the sidecar file immediately
                _sidecarFileProvider.WritePointer(sequencePointer);
            }

            // Attach a virtual provider built from this parser and the pointer filled above.
            FileVirtualSequenceProvider dataprovider = new FileVirtualSequenceProvider(this, sequencePointer)
            {
                BlockSize         = _blockSize,
                MaxNumberOfBlocks = _maxNumberOfBlocks
            };

            sequence.VirtualSequenceProvider = dataprovider;
            return(sequence);
        }