Beispiel #1
0
        /// <summary>
        /// Reads one four-line FASTQ record (header, sequence, quality header, quality scores)
        /// from the reader and builds a QualitativeSequence from it.
        /// </summary>
        /// <param name="bioReader">
        /// Reader whose current line is expected to be the '@' header line of the record.
        /// On success the reader is advanced past the quality score line.
        /// </param>
        /// <param name="isReadOnly">
        /// Value assigned to the IsReadOnly property of the returned sequence.
        /// </param>
        /// <returns>The QualitativeSequence built from the record.</returns>
        /// <exception cref="FileFormatException">
        /// Thrown when any of the four lines is missing or malformed, when the sequence and
        /// quality score lines differ in length, or when no alphabet can be identified.
        /// </exception>
        private IQualitativeSequence ParseOneWithFastQFormat(BioTextReader bioReader, bool isReadOnly)
        {
            // Line 1: the record header must begin with '@'.
            bool startsWithHeader = bioReader.HasLines && bioReader.Line.StartsWith("@", StringComparison.Ordinal);
            if (!startsWithHeader)
            {
                string invalidFile = string.Format(CultureInfo.CurrentCulture, Resource.INVAILD_INPUT_FILE, this.Name);
                Trace.Report(invalidFile);
                throw new FileFormatException(invalidFile);
            }

            string id = bioReader.GetLineField(2).Trim();

            // The running character count after the header marks where the sequence data starts.
            _numberOfCharactersParsed += bioReader.Line.Length;
            SequencePointer pointer = new SequencePointer
            {
                StartingIndex = _numberOfCharactersParsed,
                StartingLine  = bioReader.LineNumber
            };

            // Line 2: the sequence symbols; must be present and non-empty.
            bioReader.GoToNextLine();
            if (!bioReader.HasLines || string.IsNullOrEmpty(bioReader.Line))
            {
                throw CreateFastQFormatException(Resource.FastQ_InvalidSequenceLine, id);
            }

            string sequenceLine = bioReader.Line;
            _numberOfCharactersParsed += sequenceLine.Length;
            pointer.EndingIndex = _numberOfCharactersParsed;

            // Line 3: the quality score header must begin with '+'.
            bioReader.GoToNextLine();
            if (!bioReader.HasLines || !bioReader.Line.StartsWith("+", StringComparison.Ordinal))
            {
                throw CreateFastQFormatException(Resource.FastQ_InvalidQualityScoreHeaderLine, id);
            }

            _numberOfCharactersParsed += bioReader.Line.Length;

            // The id after '+' is optional, but when present it has to repeat the record id.
            string qualScoreId = bioReader.GetLineField(2).Trim();
            if (!string.IsNullOrEmpty(qualScoreId) && !id.Equals(qualScoreId))
            {
                throw CreateFastQFormatException(Resource.FastQ_InvalidQualityScoreHeaderData, id);
            }

            // Line 4: the quality scores; must be present and non-empty.
            bioReader.GoToNextLine();
            if (!bioReader.HasLines || string.IsNullOrEmpty(bioReader.Line))
            {
                throw CreateFastQFormatException(Resource.FastQ_EmptyQualityScoreLine, id);
            }

            string qualityLine = bioReader.Line;
            _numberOfCharactersParsed += qualityLine.Length;
            byte[] qualScores = ASCIIEncoding.ASCII.GetBytes(qualityLine);

            // There has to be exactly one quality score per sequence symbol.
            if (sequenceLine.Length != qualityLine.Length)
            {
                throw CreateFastQFormatException(Resource.FastQ_InvalidQualityScoresLength, id);
            }

            // Leave the reader on the line that follows this record.
            bioReader.GoToNextLine();

            // Use the configured alphabet, or detect one from the sequence symbols.
            IAlphabet alphabet = Alphabet;
            if (alphabet == null)
            {
                alphabet = IdentifyAlphabet(alphabet, sequenceLine);
                if (alphabet == null)
                {
                    throw CreateFastQFormatException(Resource.InvalidSymbolInString, sequenceLine);
                }
            }

            // Use the configured FASTQ variant unless auto detection is switched on.
            FastQFormatType fastQType = FastqType;
            if (AutoDetectFastQFormat)
            {
                fastQType = IdentifyFastQFormatType(qualScores);
            }

            QualitativeSequence sequence = Encoding == null
                ? new QualitativeSequence(alphabet, fastQType, sequenceLine, qualScores)
                : new QualitativeSequence(alphabet, fastQType, Encoding, sequenceLine, qualScores);

            sequence.ID         = id;
            sequence.IsReadOnly = isReadOnly;

            // Anything other than a full load records the pointer and attaches a
            // file-backed provider to the sequence.
            if (_blockSize != FileLoadHelper.DefaultFullLoadBlockSize)
            {
                pointer.AlphabetName = sequence.Alphabet.Name;
                pointer.Id           = sequence.ID;
                _sequencePointers.Add(pointer);

                sequence.VirtualQualitativeSequenceProvider = new FileVirtualQualitativeSequenceProvider(this, pointer)
                {
                    BlockSize         = _blockSize,
                    MaxNumberOfBlocks = _maxNumberOfBlocks
                };
            }

            return sequence;
        }

        /// <summary>
        /// Formats a FASTQ parse error, passes the message to Trace.Report and returns
        /// the exception for the caller to throw.
        /// </summary>
        /// <param name="detailFormat">Format string describing the specific problem.</param>
        /// <param name="detailArgument">Argument substituted into <paramref name="detailFormat"/>.</param>
        /// <returns>A FileFormatException carrying the full message.</returns>
        private FileFormatException CreateFastQFormatException(string detailFormat, string detailArgument)
        {
            string detail  = string.Format(CultureInfo.CurrentCulture, detailFormat, detailArgument);
            string message = string.Format(CultureInfo.CurrentCulture, Resource.IOFormatErrorMessage, Name, detail);
            Trace.Report(message);
            return new FileFormatException(message);
        }
Beispiel #2
0
        // Processes headers, which are a type of comment.
        //
        // Consumes consecutive lines that start with the comment mark. Lines that also start
        // with the header mark are interpreted as directives (GFF-VERSION, SOURCE-VERSION, DATE,
        // TYPE, DNA/RNA/PROTEIN, SEQUENCE-REGION); every other comment line is skipped.
        // On return the reader is positioned on the first line that is not a comment,
        // or has run out of lines.
        //
        // Throws NotSupportedException for a GFF version other than "2" and FormatException
        // for a directive with too few fields, an unparsable date, an invalid molecule type
        // or a malformed embedded sequence block.
        private void ParseHeaders(BioTextReader bioReader)
        {
            while (bioReader.HasLines && bioReader.Line.StartsWith(_commentMark, StringComparison.Ordinal))
            {
                // process headers, but ignore other comments
                if (bioReader.Line.StartsWith(_headerMark, StringComparison.Ordinal))
                {
                    string[] fields = bioReader.GetLineField(3).Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                    // A header mark with nothing after it has no directive name; the empty
                    // string matches no case below, so such a line is skipped like a comment.
                    string directive = fields.Length > 0 ? fields[0].ToUpperInvariant() : string.Empty;

                    Sequence specificSeq;

                    switch (directive)
                    {
                    case "GFF-VERSION":
                        if (fields.Length > 1 && fields[1] != "2")
                        {
                            string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Properties.Resource.GffUnsupportedVersion,
                                bioReader.LocationString);
                            Trace.Report(message);
                            throw new NotSupportedException(message);
                        }
                        // don't store this
                        break;

                    case "SOURCE-VERSION":
                        EnsureHeaderFieldCount(fields, 3, bioReader);
                        _commonSeq.Metadata["source"]  = fields[1];
                        _commonSeq.Metadata["version"] = fields[2];
                        break;

                    case "DATE":
                        EnsureHeaderFieldCount(fields, 2, bioReader);

                        // The date comes from a data file, so parse it independently of the
                        // culture the process happens to run under.
                        DateTime date;
                        if (!DateTime.TryParse(fields[1], CultureInfo.InvariantCulture, DateTimeStyles.None, out date))
                        {
                            string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Properties.Resource.ParserInvalidDate,
                                bioReader.LocationString);
                            Trace.Report(message);
                            throw new FormatException(message);
                        }

                        _commonSeq.Metadata["date"] = date;
                        break;

                    case "TYPE":
                        EnsureHeaderFieldCount(fields, 2, bioReader);

                        if (fields.Length == 2)
                        {
                            // No sequence name given: the type applies to the common sequence.
                            _commonSeq.MoleculeType = GetMoleculeType(fields[1]);
                            if (_commonSeq.MoleculeType == MoleculeType.Invalid)
                            {
                                string message = String.Format(
                                    CultureInfo.CurrentCulture,
                                    Properties.Resource.InvalidType,
                                    bioReader.LocationString);
                                Trace.Report(message);
                                throw new FormatException(message);
                            }
                        }
                        else
                        {
                            // A third field names the sequence the type applies to.
                            specificSeq = GetSpecificSequence(fields[2], GetMoleculeType(fields[1]), bioReader);

                            if (specificSeq.MoleculeType == MoleculeType.Invalid)
                            {
                                string message = String.Format(
                                    CultureInfo.CurrentCulture,
                                    Properties.Resource.InvalidType,
                                    bioReader.LocationString);
                                Trace.Report(message);
                                throw new FormatException(message);
                            }
                        }
                        break;

                    case "DNA":
                    case "RNA":
                    case "PROTEIN":
                        EnsureHeaderFieldCount(fields, 2, bioReader);

                        specificSeq = GetSpecificSequence(fields[1], GetMoleculeType(fields[0]), bioReader);
                        bioReader.GoToNextLine();

                        // The block ends at "##end-" followed by the directive exactly as it was written.
                        string endMarker = "##end-" + fields[0];

                        while (bioReader.HasLines && bioReader.Line != endMarker)
                        {
                            // Every line inside the block must itself be a header line.
                            if (!bioReader.Line.StartsWith(_headerMark, StringComparison.Ordinal))
                            {
                                string message = String.Format(
                                    CultureInfo.CurrentCulture,
                                    Properties.Resource.GffInvalidSequence,
                                    bioReader.LocationString);
                                Trace.Report(message);
                                throw new FormatException(message);
                            }

                            specificSeq.InsertRange(specificSeq.Count, bioReader.GetLineField(3));

                            bioReader.GoToNextLine();
                        }

                        break;

                    case "SEQUENCE-REGION":
                        EnsureHeaderFieldCount(fields, 4, bioReader);
                        specificSeq = GetSpecificSequence(fields[1], MoleculeType.Invalid, bioReader);
                        specificSeq.Metadata["start"] = fields[2];
                        specificSeq.Metadata["end"]   = fields[3];
                        break;
                    }
                }

                bioReader.GoToNextLine();
            }
        }

        // Throws a FormatException when a header directive has fewer fields than the code
        // handling it is about to index.
        // NOTE(review): the message is a literal because no matching resource string is
        // visible from here; move it into the resource file if one should exist.
        private static void EnsureHeaderFieldCount(string[] fields, int requiredCount, BioTextReader bioReader)
        {
            if (fields.Length >= requiredCount)
            {
                return;
            }

            string message = String.Format(
                CultureInfo.CurrentCulture,
                "Invalid header at {0}: expected at least {1} fields but found {2}.",
                bioReader.LocationString,
                requiredCount,
                fields.Length);
            Trace.Report(message);
            throw new FormatException(message);
        }
Beispiel #3
0
        /// <summary>
        /// Parses a single FASTA text from a reader into a sequence.
        /// </summary>
        /// <param name="bioReader">
        /// Reader whose current line is expected to be the '&gt;' header line of the record.
        /// On return the reader is on the next header line or has run out of lines.
        /// </param>
        /// <param name="isReadOnly">
        /// Value assigned to the IsReadOnly property of the returned sequence once
        /// parsing has finished.
        /// </param>
        /// <returns>A new Sequence instance containing parsed data.</returns>
        /// <exception cref="ArgumentNullException">Thrown when bioReader is null.</exception>
        /// <exception cref="FileFormatException">
        /// Thrown when the reader has no lines, when the current line does not start with '&gt;',
        /// or when no alphabet can be identified for a sequence line.
        /// </exception>
        protected ISequence ParseOneWithSpecificFormat(BioTextReader bioReader, bool isReadOnly)
        {
            SequencePointer sequencePointer = null;

            if (bioReader == null)
            {
                throw new ArgumentNullException("bioReader");
            }

            string message;

            // Check HasLines before touching Line, so that an exhausted reader is reported
            // as a format error rather than failing inside the Line access.
            if (!bioReader.HasLines || !bioReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
            {
                message = string.Format(CultureInfo.InvariantCulture,
                                        Resource.INVAILD_INPUT_FILE,
                                        Resource.FASTA_NAME);
                Trace.Report(message);
                throw new FileFormatException(message);
            }

            // Process header line.
            Sequence sequence;
            string   id = bioReader.GetLineField(2).Trim();

            // NOTE(review): this method tests _blockSize in three different ways
            // (> default here, <= 0 in the loop, == default before returning) — confirm
            // they agree for every block size callers are allowed to set.
            if (_blockSize > FileLoadHelper.DefaultFullLoadBlockSize)
            {
                // Count the header line and remember which line this record starts on.
                _lineCount++;
                _lineLength    += bioReader.Line.Length;
                sequencePointer = new SequencePointer {
                    StartingLine = _lineCount
                };
            }

            bioReader.GoToNextLine();

            IAlphabet alphabet = Alphabet;

            // No alphabet configured: detect one from the first sequence line.
            // NOTE(review): Line is read here without a HasLines check; confirm what
            // BioTextReader.Line yields when the header was the last line of the input.
            if (alphabet == null)
            {
                alphabet = _commonSequenceParser.IdentifyAlphabet(alphabet, bioReader.Line);

                if (alphabet == null)
                {
                    message = string.Format(CultureInfo.InvariantCulture,
                                            Resource.InvalidSymbolInString,
                                            bioReader.Line);
                    Trace.Report(message);
                    throw new FileFormatException(message);
                }
            }

            if (Encoding == null)
            {
                sequence = new Sequence(alphabet);
            }
            else
            {
                sequence = new Sequence(alphabet, Encoding, string.Empty)
                {
                    IsReadOnly = false
                };
            }

            // Becomes true once the start offset of this record's data has been recorded.
            bool sameSequence = false;

            sequence.ID = id;

            // Consume sequence lines up to the next header line or the end of the input.
            while (bioReader.HasLines && !bioReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
            {
                if (Alphabet == null)
                {
                    // Re-run detection on every line, seeded with the alphabet chosen so far.
                    alphabet = _commonSequenceParser.IdentifyAlphabet(sequence.Alphabet, bioReader.Line);

                    if (alphabet == null)
                    {
                        message = string.Format(CultureInfo.InvariantCulture,
                                                Resource.InvalidSymbolInString,
                                                bioReader.Line);
                        Trace.Report(message);
                        throw new FileFormatException(message);
                    }

                    // The detected alphabet changed: rebuild the sequence with the new one.
                    if (sequence.Alphabet != alphabet)
                    {
                        Sequence seq = new Sequence(alphabet, Encoding, sequence)
                        {
                            IsReadOnly = false
                        };
                        sequence.Clear();
                        sequence = seq;
                    }
                }


                // full load
                if (_blockSize <= 0)
                {
                    sequence.InsertRange(sequence.Count, bioReader.Line);
                }
                else
                {
                    // Block-wise load: only track offsets, the symbols are not stored here.
                    if (!sameSequence)
                    {
                        _sequenceBeginsAt = _lineLength;
                        sameSequence      = true;
                    }

                    _lineLength += bioReader.Line.Length;
                    _lineCount++;
                }

                bioReader.GoToNextLine();
            }

            if (sequence.MoleculeType == MoleculeType.Invalid)
            {
                sequence.MoleculeType = CommonSequenceParser.GetMoleculeType(sequence.Alphabet);
            }
            sequence.IsReadOnly = isReadOnly;

            // full load
            if (_blockSize == FileLoadHelper.DefaultFullLoadBlockSize)
            {
                return(sequence);
            }

            if (sequencePointer != null)
            {
                sequencePointer.AlphabetName = sequence.Alphabet.Name;
                sequencePointer.Id           = sequence.ID;

                sequencePointer.StartingIndex = _sequenceBeginsAt;
                sequencePointer.EndingIndex   = _lineLength;
                _sequencePointers.Add(sequencePointer);
            }
            _sequenceCount++;

            // NOTE(review): sequencePointer can still be null here when _blockSize is below
            // the default full-load value — confirm the provider accepts a null pointer.
            FileVirtualSequenceProvider dataprovider = new FileVirtualSequenceProvider(this, sequencePointer)
            {
                BlockSize         = _blockSize,
                MaxNumberOfBlocks = _maxNumberOfBlocks
            };

            sequence.VirtualSequenceProvider = dataprovider;
            return(sequence);
        }
Beispiel #4
0
        // LOCUS is the first line in a GenBank record.
        //
        // Parses the LOCUS line under the reader into a GenBankLocusInfo, stores it on the
        // GenBankMetadata found in sequence.Metadata, sets the sequence's ID and molecule type,
        // and advances the reader to the next line. When the molecule type implies a different
        // alphabet than the sequence currently has, "sequence" is replaced by a new Sequence
        // built with that alphabet (hence the ref parameter).
        //
        // Throws InvalidDataException for a line with too few tokens, an unknown sequence
        // unit, strand type or topology, or an alphabet that contradicts the configured one;
        // InvalidOperationException for a non-numeric length; FormatException for an
        // unparsable date or an invalid molecule type.
        private void ParseLocus(BioTextReader bioReader, ref Sequence sequence)
        {
            GenBankLocusInfo locusInfo = new GenBankLocusInfo();

            // GenBank spec recommends token rather than position-based parsing, but this
            // is only partially possible without making extra assumptions about the presence
            // of optional fields.
            string[] tokens = bioReader.LineData.Split(new char[] { ' ' },
                                                       StringSplitOptions.RemoveEmptyEntries);

            // Name, length and unit are read by token index below, so all three must exist.
            if (tokens.Length < 3)
            {
                throw new InvalidDataException(
                    ReportLocusError(Properties.Resource.ParserInvalidLocus, bioReader.Line));
            }

            sequence.ID    = tokens[0];
            locusInfo.Name = tokens[0];

            // The length comes from a data file, so parse it independently of the current culture.
            int sequenceLength;

            if (!int.TryParse(tokens[1], NumberStyles.Integer, CultureInfo.InvariantCulture, out sequenceLength))
            {
                throw new InvalidOperationException(
                    ReportLocusError(Properties.Resource.ParserInvalidLocus, bioReader.Line));
            }
            locusInfo.SequenceLength = sequenceLength;

            string seqType = tokens[2];

            if (seqType != "bp" && seqType != "aa")
            {
                throw new InvalidDataException(
                    ReportLocusError(Properties.Resource.ParserInvalidLocus, bioReader.Line));
            }

            // Determine format version and parse the remaining fields by position.
            string strandType;
            string strandTopology;
            string division;
            string rawDate;
            string molType = string.Empty;

            // The older layout carries the unit ("bp"/"aa") in columns 31-32.
            if (Helper.StringHasMatch(bioReader.GetLineField(31, 32), "bp", "aa"))
            {
                // older format
                strandType     = bioReader.GetLineField(34, 36).Trim();
                strandTopology = bioReader.GetLineField(43, 52).Trim();
                division       = bioReader.GetLineField(53, 56).Trim();
                rawDate        = bioReader.GetLineField(63).Trim();

                // molecule type field is not used for amino acid chains
                if (seqType != "aa")
                {
                    molType = bioReader.GetLineField(37, 42).Trim();
                }
            }
            else
            {
                // newer format
                strandType     = bioReader.GetLineField(45, 47).Trim();
                strandTopology = bioReader.GetLineField(56, 63).Trim();
                division       = bioReader.GetLineField(65, 67).Trim();
                rawDate        = bioReader.GetLineField(69).Trim();

                // molecule type field is not used for amino acid chains
                if (seqType != "aa")
                {
                    molType = bioReader.GetLineField(48, 53).Trim();
                }
            }

            // process strand type
            if (!Helper.StringHasMatch(strandType, string.Empty, "ss-", "ds-", "ms-"))
            {
                throw new InvalidDataException(
                    ReportLocusError(Properties.Resource.ParserInvalidLocus, bioReader.Line));
            }
            locusInfo.Strand = Helper.GetStrandType(strandType);

            // process strand topology
            if (!Helper.StringHasMatch(strandTopology, string.Empty, "linear", "circular"))
            {
                throw new InvalidDataException(
                    ReportLocusError(Properties.Resource.ParserInvalidStrand, strandTopology));
            }

            locusInfo.StrandTopology = Helper.GetStrandTopology(strandTopology);

            // process division; an unrecognised or empty code falls back to None
            try
            {
                locusInfo.DivisionCode = (SequenceDivisionCode)Enum.Parse(typeof(SequenceDivisionCode), division);
            }
            catch (ArgumentException)
            {
                locusInfo.DivisionCode = SequenceDivisionCode.None;
            }

            // process date; parsed with the invariant culture so that the result does not
            // depend on the culture the process runs under
            DateTime date;

            if (!DateTime.TryParse(rawDate, CultureInfo.InvariantCulture, DateTimeStyles.None, out date))
            {
                throw new FormatException(
                    ReportLocusError(Properties.Resource.ParserInvalidDate, rawDate));
            }

            locusInfo.Date         = date;
            locusInfo.SequenceType = seqType;

            // process sequence type and molecule type
            MoleculeType moleculeType;

            if (seqType == "aa")
            {
                moleculeType = MoleculeType.Protein;
            }
            else
            {
                moleculeType = GetMoleculeType(molType);

                if (moleculeType == MoleculeType.Invalid)
                {
                    throw new FormatException(
                        ReportLocusError(Properties.Resource.ParserInvalidLocus, bioReader.Line));
                }
            }

            IAlphabet alphabet = GetAlphabet(moleculeType);

            if (alphabet != sequence.Alphabet)
            {
                // An explicitly configured alphabet must not be contradicted by the file.
                if (Alphabet != null && Alphabet != alphabet)
                {
                    string message = Properties.Resource.ParserIncorrectAlphabet;
                    Trace.Report(message);
                    throw new InvalidDataException(message);
                }
                sequence            = new Sequence(alphabet, Encoding, sequence);
                sequence.IsReadOnly = false;
            }

            sequence.MoleculeType  = moleculeType;
            locusInfo.MoleculeType = moleculeType;
            GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

            metadata.Locus = locusInfo;
            bioReader.GoToNextLine();
        }

        // Formats a LOCUS parse error with the current culture, passes it to Trace.Report
        // and returns the message so the caller can throw the exception type it needs.
        private static string ReportLocusError(string format, string argument)
        {
            string message = String.Format(CultureInfo.CurrentCulture, format, argument);
            Trace.Report(message);
            return message;
        }