Ejemplo n.º 1
0
        /// <summary>
        /// Walks an MBFTextReader through the test file and checks its core members:
        /// line/header/data access, line-number tracking, changing the data indent,
        /// blank header/data detection, section skipping and end-of-file recognition.
        /// </summary>
        public void TestMBFTextReaderCoreFunctionality()
        {
            using (MBFTextReader mbfReader = new MBFTextReader(testFileFullName))
            {
                // Test line access members.
                Assert.IsTrue(mbfReader.HasLines);
                Assert.AreEqual("LOCUS       SCU49845     5028 bp    DNA             PLN       21-JUN-1999",
                                mbfReader.Line);
                Assert.IsTrue(mbfReader.LineHasHeader);
                Assert.AreEqual("LOCUS", mbfReader.LineHeader);
                Assert.IsTrue(mbfReader.LineHasData);
                Assert.AreEqual("SCU49845     5028 bp    DNA             PLN       21-JUN-1999",
                                mbfReader.LineData);

                // Columns are 1-based and inclusive: 38-41 covers the tail of "DNA" plus two spaces.
                Assert.AreEqual("NA  ", mbfReader.GetLineField(38, 41));

                // Test reading lines and line number tracking.
                // The loop index mirrors the line number: five advances from line 1 end on line 6
                // (the next loop below continues counting from 6).
                for (int i = 1; i < 6; i++)
                {
                    mbfReader.GoToNextLine();
                }
                Assert.AreEqual(6, mbfReader.LineNumber);
                Assert.AreEqual("KEYWORDS", mbfReader.LineHeader);

                // Test switching line indent.
                mbfReader.DataIndent = 2;
                Assert.AreEqual("KE", mbfReader.LineHeader);
                Assert.AreEqual("YWORDS    .", mbfReader.LineData);

                // Test recognition of blank header and data.
                for (int i = 6; i < 8; i++)
                {
                    mbfReader.GoToNextLine();
                }
                Assert.IsFalse(mbfReader.LineHasHeader); // line starts with 2 spaces
                Assert.IsTrue(mbfReader.LineHasData);
                mbfReader.DataIndent = 37;               // the line length
                Assert.IsTrue(mbfReader.LineHasHeader);
                Assert.IsFalse(mbfReader.LineHasData);
                mbfReader.DataIndent = 12; // back to standard line length

                // Test skipping sections and EOF recognition.
                mbfReader.SkipToNextSection(); // ref 1
                mbfReader.SkipToNextSection(); // ref 2
                mbfReader.SkipToNextSection(); // features
                mbfReader.SkipToNextSection(); // origin
                mbfReader.SkipToNextSection(); // "//"
                Assert.IsTrue(mbfReader.HasLines);
                mbfReader.GoToNextLine();      // EOF

                // Once the reader has moved past the final line it must report no more lines.
                Assert.IsFalse(mbfReader.HasLines);
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Reads one four-line FASTQ record (header, sequence, quality header, quality scores)
        /// from the reader and builds a QualitativeSequence from it.
        /// </summary>
        /// <param name="mbfReader">Reader positioned on the '@' header line of the record.</param>
        /// <param name="isReadOnly">
        /// Value assigned to the IsReadOnly property of the returned sequence.
        /// </param>
        /// <returns>A new QualitativeSequence instance holding the parsed record.</returns>
        private IQualitativeSequence ParseOneWithFastQFormat(MBFTextReader mbfReader, bool isReadOnly)
        {
            // Line 1: the record must open with an '@' header line.
            bool startsWithHeader = mbfReader.HasLines && mbfReader.Line.StartsWith("@", StringComparison.Ordinal);
            if (!startsWithHeader)
            {
                string invalidFileMessage = string.Format(CultureInfo.CurrentCulture, Resource.INVALID_INPUT_FILE, Name);
                Trace.Report(invalidFileMessage);
                throw new FileFormatException(invalidFileMessage);
            }

            // The identifier is the header text from the second column onward.
            string id = mbfReader.GetLineField(2).Trim();

            // Line 2: the sequence symbols, which must be present and non-empty.
            mbfReader.GoToNextLine();
            if (!mbfReader.HasLines || string.IsNullOrEmpty(mbfReader.Line))
            {
                throw CreateFastQRecordException(Resource.FastQ_InvalidSequenceLine, id);
            }

            string symbols = mbfReader.Line;

            // Line 3: a '+' line, optionally repeating the identifier.
            mbfReader.GoToNextLine();
            if (!mbfReader.HasLines || !mbfReader.Line.StartsWith("+", StringComparison.Ordinal))
            {
                throw CreateFastQRecordException(Resource.FastQ_InvalidQualityScoreHeaderLine, id);
            }

            string repeatedId = mbfReader.GetLineField(2).Trim();
            if (repeatedId.Length != 0 && !id.Equals(repeatedId))
            {
                // When the identifier is repeated it has to match the one on the header line.
                throw CreateFastQRecordException(Resource.FastQ_InvalidQualityScoreHeaderData, id);
            }

            // Line 4: the quality scores, which must be present and non-empty.
            mbfReader.GoToNextLine();
            if (!mbfReader.HasLines || string.IsNullOrEmpty(mbfReader.Line))
            {
                throw CreateFastQRecordException(Resource.FastQ_EmptyQualityScoreLine, id);
            }

            string qualityLine = mbfReader.Line;
            byte[] qualScores  = ASCIIEncoding.ASCII.GetBytes(qualityLine);

            // There must be exactly one quality score per sequence symbol.
            if (symbols.Length != qualityLine.Length)
            {
                throw CreateFastQRecordException(Resource.FastQ_InvalidQualityScoresLength, id);
            }

            // Step past the record so the reader is ready for whatever follows it.
            mbfReader.GoToNextLine();

            // Use the configured alphabet, or work one out from the symbols when none is set.
            IAlphabet alphabet = Alphabet;
            if (alphabet == null)
            {
                alphabet = _commonSequenceParser.IdentifyAlphabet(alphabet, symbols);
                if (alphabet == null)
                {
                    throw CreateFastQRecordException(Resource.InvalidSymbolInString, symbols);
                }
            }

            // Use the configured format type unless auto-detection from the scores is switched on.
            FastQFormatType fastQType = FastqType;
            if (AutoDetectFastQFormat)
            {
                fastQType = IdentifyFastQFormatType(qualScores);
            }

            QualitativeSequence parsed = Encoding == null
                ? new QualitativeSequence(alphabet, fastQType, symbols, qualScores)
                : new QualitativeSequence(alphabet, fastQType, Encoding, symbols, qualScores);

            parsed.ID         = id;
            parsed.IsReadOnly = isReadOnly;
            return parsed;
        }

        // Formats a record-level error inside the IOFormatErrorMessage template, reports it
        // through Trace and returns the FileFormatException for the caller to throw.
        private FileFormatException CreateFastQRecordException(string detailFormat, string argument)
        {
            string detail  = string.Format(CultureInfo.CurrentCulture, detailFormat, argument);
            string message = string.Format(CultureInfo.CurrentCulture, Resource.IOFormatErrorMessage, Name, detail);
            Trace.Report(message);
            return new FileFormatException(message);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Parses a single FASTA sequence from a file using MBFTextReader.
        /// This method is used in non-data virtualization scenarios.
        /// </summary>
        /// <param name="mbfReader">The MBFTextReader of the file to be parsed.</param>
        /// <param name="isReadOnly">
        /// Flag to indicate whether the resulting sequence should be in read-only mode.
        /// If this flag is set to true then the resulting sequence's IsReadOnly property
        /// will be set to true, otherwise it will be set to false.
        /// </param>
        /// <returns>The parsed sequence.</returns>
        /// <exception cref="ArgumentNullException">mbfReader is null.</exception>
        /// <exception cref="FileFormatException">
        /// The reader has no line to read, the current line does not start with '&gt;',
        /// or no alphabet could be identified for a sequence line.
        /// </exception>
        protected ISequence ParseOneWithSpecificFormat(MBFTextReader mbfReader, bool isReadOnly)
        {
            if (mbfReader == null)
            {
                throw new ArgumentNullException("mbfReader");
            }

            string message;

            // The record must start on a '>' header line. HasLines is tested first so that an
            // exhausted reader is reported as an invalid file instead of failing on the Line access.
            if (!mbfReader.HasLines || !mbfReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
            {
                message = string.Format(CultureInfo.InvariantCulture,
                                        Resource.INVALID_INPUT_FILE,
                                        Resource.FASTA_NAME);
                Trace.Report(message);
                throw new FileFormatException(message);
            }

            // Process header line: the ID is the header text from the second column onward.
            Sequence sequence;
            string   id = mbfReader.GetLineField(2).Trim();

            mbfReader.GoToNextLine();

            IAlphabet alphabet = Alphabet;

            // No alphabet configured: identify one from the first sequence line.
            // NOTE(review): Line is read here without a HasLines check, so a header that is the
            // last line of the input is not handled explicitly - confirm the intended behaviour.
            if (alphabet == null)
            {
                alphabet = _commonSequenceParser.IdentifyAlphabet(alphabet, mbfReader.Line);

                if (alphabet == null)
                {
                    message = string.Format(CultureInfo.InvariantCulture,
                                            Resource.InvalidSymbolInString,
                                            mbfReader.Line);
                    Trace.Report(message);
                    throw new FileFormatException(message);
                }
            }

            if (Encoding == null)
            {
                sequence = new Sequence(alphabet);
            }
            else
            {
                sequence = new Sequence(alphabet, Encoding, string.Empty)
                {
                    IsReadOnly = false
                };
            }

            sequence.ID = id;

            // Append every line up to the next '>' header or the end of the input.
            while (mbfReader.HasLines && !mbfReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
            {
                // With no configured alphabet, re-identify it for each line, starting from the
                // alphabet currently in use.
                if (Alphabet == null)
                {
                    alphabet = _commonSequenceParser.IdentifyAlphabet(sequence.Alphabet, mbfReader.Line);

                    if (alphabet == null)
                    {
                        message = string.Format(CultureInfo.InvariantCulture,
                                                Resource.InvalidSymbolInString,
                                                mbfReader.Line);
                        Trace.Report(message);
                        throw new FileFormatException(message);
                    }

                    // The identified alphabet changed: rebuild the sequence under the new
                    // alphabet from what has been read so far, then continue with the rebuilt one.
                    if (sequence.Alphabet != alphabet)
                    {
                        Sequence seq = new Sequence(alphabet, Encoding, sequence)
                        {
                            IsReadOnly = false
                        };
                        sequence.Clear();
                        sequence = seq;
                    }
                }

                sequence.InsertRange(sequence.Count, mbfReader.Line);
                mbfReader.GoToNextLine();
            }

            // Derive the molecule type from the alphabet when it has not been set.
            if (sequence.MoleculeType == MoleculeType.Invalid)
            {
                sequence.MoleculeType = CommonSequenceParser.GetMoleculeType(sequence.Alphabet);
            }

            sequence.IsReadOnly = isReadOnly;
            return(sequence);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Processes headers, which are a type of comment. Consumes the run of lines that begin
        /// (after leading whitespace) with the comment mark. Lines starting with the header mark
        /// are interpreted as header directives and recorded on the common sequence or on a
        /// per-sequence object; all other comment lines are gathered into numbered comment
        /// sections in the common sequence's metadata.
        /// </summary>
        /// <param name="mbfReader">Reader positioned on the first line to examine.</param>
        /// <exception cref="NotSupportedException">The version header gives a version other than "2".</exception>
        /// <exception cref="FormatException">
        /// A date or type header is missing its value or has an invalid one, or a line inside an
        /// embedded sequence block does not start with the header mark.
        /// </exception>
        private void ParseHeaders(MBFTextReader mbfReader)
        {
            string comments      = string.Empty;
            int    commentsCount = 1;

            while (mbfReader.HasLines && mbfReader.Line.TrimStart().StartsWith(_commentMark, StringComparison.Ordinal))
            {
                Sequence specificSeq = null;

                // process headers, but ignore other comments
                if (mbfReader.Line.StartsWith(_headerMark, StringComparison.Ordinal))
                {
                    string[] fields = mbfReader.GetLineField(3).Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                    // Add if any comments: a header closes the comment section gathered so far.
                    if (!string.IsNullOrEmpty(comments))
                    {
                        _commonSeq.Metadata[_commentSectionKey + commentsCount.ToString(CultureInfo.InvariantCulture)] = comments;
                        comments = string.Empty;
                        commentsCount++;
                    }

                    // A line holding only the header mark yields no fields; give it an empty key so
                    // it falls through the switch like any unrecognised header instead of indexing
                    // into an empty array.
                    string headerKey = fields.Length > 0 ? fields[0].ToUpperInvariant() : string.Empty;

                    switch (headerKey)
                    {
                    case _gffVersionKey:
                        // A version header without a value is tolerated; nothing is stored for it.
                        if (fields.Length > 1)
                        {
                            if (fields[1] != "2")
                            {
                                string message = String.Format(
                                    CultureInfo.CurrentCulture,
                                    Properties.Resource.GffUnsupportedVersion,
                                    mbfReader.LocationString);
                                Trace.Report(message);
                                throw new NotSupportedException(message);
                            }

                            // Store "GFF-VERSION" to get keep the order of comments/headers.
                            _commonSeq.Metadata[_gffVersionKey] = fields[1];
                        }

                        break;

                    case _sourceVersionKey:

                        // NOTE(review): assumes the line carries both a source and a version field;
                        // a shorter line raises IndexOutOfRangeException.
                        MetadataListItem <string> sourceVersion = new MetadataListItem <string>(_sourceVersionKey, string.Empty);
                        sourceVersion.SubItems.Add(_sourceKey, fields[1]);
                        sourceVersion.SubItems.Add(_versionKey, fields[2]);

                        _commonSeq.Metadata[_sourceVersionKey] = sourceVersion;

                        break;

                    case _dateKey:
                        DateTime date;

                        // A missing date value is reported the same way as an unparsable one.
                        if (fields.Length < 2 || !DateTime.TryParse(fields[1], out date))
                        {
                            string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Properties.Resource.ParserInvalidDate,
                                mbfReader.LocationString);
                            Trace.Report(message);
                            throw new FormatException(message);
                        }

                        _commonSeq.Metadata[_dateLowerCaseKey] = date;
                        break;

                    case _typeKey:
                        // A type header with no value cannot name a molecule type.
                        if (fields.Length < 2)
                        {
                            string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Properties.Resource.InvalidType,
                                mbfReader.LocationString);
                            Trace.Report(message);
                            throw new FormatException(message);
                        }

                        if (fields.Length == 2)
                        {
                            // Type only: applies to the common sequence.
                            _commonSeq.MoleculeType = GetMoleculeType(fields[1]);
                            if (_commonSeq.MoleculeType == MoleculeType.Invalid)
                            {
                                string message = String.Format(
                                    CultureInfo.CurrentCulture,
                                    Properties.Resource.InvalidType,
                                    mbfReader.LocationString);
                                Trace.Report(message);
                                throw new FormatException(message);
                            }

                            // Store "TYPE" to get keep the order of comments/headers.
                            _commonSeq.Metadata[_typeKey] = fields[1];
                        }
                        else
                        {
                            // Type followed by a sequence id: applies to that specific sequence.
                            specificSeq = GetSpecificSequence(fields[2], GetMoleculeType(fields[1]), mbfReader, false);

                            if (specificSeq.MoleculeType == MoleculeType.Invalid)
                            {
                                string message = String.Format(
                                    CultureInfo.CurrentCulture,
                                    Properties.Resource.InvalidType,
                                    mbfReader.LocationString);
                                Trace.Report(message);
                                throw new FormatException(message);
                            }

                            // Store "TYPE" to get keep the order of comments/headers.
                            // Store seq id as value.
                            _commonSeq.Metadata[_multiTypeKey + fields[2]] = fields[2];
                        }
                        break;

                    case "DNA":
                    case "RNA":
                    case "PROTEIN":
                        // Embedded sequence data: fields[0] is the molecule type, fields[1] the sequence id.
                        // NOTE(review): a line without a sequence id raises IndexOutOfRangeException.
                        specificSeq = GetSpecificSequence(fields[1], GetMoleculeType(fields[0]), mbfReader, false);
                        mbfReader.GoToNextLine();

                        // Store seq id as value.
                        _commonSeq.Metadata[_multiSeqDataKey + fields[1]] = fields[1];

                        // Read data lines up to the end marker for this molecule type; each one must
                        // itself start with the header mark.
                        while (mbfReader.HasLines && mbfReader.Line != _seqDataEnd + fields[0])
                        {
                            if (!mbfReader.Line.StartsWith(_headerMark, StringComparison.Ordinal))
                            {
                                string message = String.Format(
                                    CultureInfo.CurrentCulture,
                                    Properties.Resource.GffInvalidSequence,
                                    mbfReader.LocationString);
                                Trace.Report(message);
                                throw new FormatException(message);
                            }

                            specificSeq.InsertRange(specificSeq.Count, mbfReader.GetLineField(3));

                            mbfReader.GoToNextLine();
                        }

                        break;

                    case _seqRegKey:

                        // NOTE(review): assumes the line carries an id, a start and an end field;
                        // a shorter line raises IndexOutOfRangeException.
                        specificSeq = GetSpecificSequence(fields[1], MoleculeType.Invalid, mbfReader, false);
                        specificSeq.Metadata["start"] = fields[2];
                        specificSeq.Metadata["end"]   = fields[3];

                        // Store seq id as value.
                        _commonSeq.Metadata[_multiSeqRegKey + fields[1]] = fields[1];
                        break;
                    }
                }
                else
                {
                    // Plain comment: append it to the section being gathered, one line per comment.
                    comments = string.IsNullOrEmpty(comments) ? mbfReader.Line : comments + Environment.NewLine + mbfReader.Line;
                }

                mbfReader.GoToNextLine();
            }

            // Store whatever comment section is still pending when the comment run ends.
            if (!string.IsNullOrEmpty(comments))
            {
                _commonSeq.Metadata[_commentSectionKey + commentsCount.ToString(CultureInfo.InvariantCulture)] = comments;
            }
        }