public void TestMBFTextReaderCoreFunctionality()
{
    // Walks an MBFTextReader over the file named by testFileFullName and checks, in order:
    // line/header/data access on the first line, line number tracking, the effect of
    // changing DataIndent, blank header/data detection, section skipping and end-of-file
    // detection. The assertions depend on the exact content of that test file.
    using (MBFTextReader mbfReader = new MBFTextReader(testFileFullName))
    {
        // Test line access members.
        Assert.IsTrue(mbfReader.HasLines);
        Assert.AreEqual("LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999", mbfReader.Line);
        Assert.IsTrue(mbfReader.LineHasHeader);
        Assert.AreEqual("LOCUS", mbfReader.LineHeader);
        Assert.IsTrue(mbfReader.LineHasData);
        Assert.AreEqual("SCU49845 5028 bp DNA PLN 21-JUN-1999", mbfReader.LineData);
        Assert.AreEqual("NA ", mbfReader.GetLineField(38, 41));

        // Test reading lines and line number tracking.
        // Five advances from the first line; the next loop below is indexed from 6 on the
        // same basis, so the reader is expected to report line 6 here.
        for (int i = 1; i < 6; i++)
        {
            mbfReader.GoToNextLine();
        }

        Assert.AreEqual(6, mbfReader.LineNumber);
        Assert.AreEqual("KEYWORDS", mbfReader.LineHeader);

        // Test switching line indent.
        mbfReader.DataIndent = 2;
        Assert.AreEqual("KE", mbfReader.LineHeader);
        Assert.AreEqual("YWORDS .", mbfReader.LineData);

        // Test recognition of blank header and data.
        for (int i = 6; i < 8; i++)
        {
            mbfReader.GoToNextLine();
        }

        Assert.IsFalse(mbfReader.LineHasHeader); // line starts with 2 spaces
        Assert.IsTrue(mbfReader.LineHasData);
        mbfReader.DataIndent = 37; // the line length
        Assert.IsTrue(mbfReader.LineHasHeader);
        Assert.IsFalse(mbfReader.LineHasData);
        mbfReader.DataIndent = 12; // back to standard line length

        // Test skipping sections and EOF recognition.
        mbfReader.SkipToNextSection(); // ref 1
        mbfReader.SkipToNextSection(); // ref 2
        mbfReader.SkipToNextSection(); // features
        mbfReader.SkipToNextSection(); // origin
        mbfReader.SkipToNextSection(); // "//"
        Assert.IsTrue(mbfReader.HasLines);

        // Stepping past the terminating "//" line reaches the end of the file, so the
        // reader must now report that no lines remain.
        mbfReader.GoToNextLine(); // EOF
        Assert.IsFalse(mbfReader.HasLines);
    }
}
/// <summary>
/// Reads one FASTQ record (header, sequence, quality header and quality score lines)
/// from the reader and turns it into a QualitativeSequence.
/// </summary>
/// <param name="mbfReader">Reader whose current line is expected to be the '@' header line.</param>
/// <param name="isReadOnly">Value assigned to the IsReadOnly property of the returned sequence.</param>
/// <returns>A new QualitativeSequence holding the parsed symbols and quality scores.</returns>
private IQualitativeSequence ParseOneWithFastQFormat(MBFTextReader mbfReader, bool isReadOnly)
{
    // Line 1: the record header, which has to begin with '@'.
    bool headerFound = mbfReader.HasLines && mbfReader.Line.StartsWith("@", StringComparison.Ordinal);
    if (!headerFound)
    {
        string invalidFile = string.Format(CultureInfo.CurrentCulture, Resource.INVALID_INPUT_FILE, Name);
        Trace.Report(invalidFile);
        throw new FileFormatException(invalidFile);
    }

    // Everything after the leading marker character is the sequence ID.
    string id = mbfReader.GetLineField(2).Trim();

    // Line 2: the sequence symbols; must be present and non-empty.
    mbfReader.GoToNextLine();
    bool sequenceMissing = !mbfReader.HasLines || string.IsNullOrEmpty(mbfReader.Line);
    if (sequenceMissing)
    {
        throw BuildFastQFormatError(Resource.FastQ_InvalidSequenceLine, id);
    }

    string sequenceLine = mbfReader.Line;

    // Line 3: the quality score header, which has to begin with '+'.
    mbfReader.GoToNextLine();
    bool qualityHeaderFound = mbfReader.HasLines && mbfReader.Line.StartsWith("+", StringComparison.Ordinal);
    if (!qualityHeaderFound)
    {
        throw BuildFastQFormatError(Resource.FastQ_InvalidQualityScoreHeaderLine, id);
    }

    // The ID may be repeated after '+'; when it is, it has to match the header ID.
    string qualityId = mbfReader.GetLineField(2).Trim();
    if (qualityId.Length > 0 && !id.Equals(qualityId))
    {
        throw BuildFastQFormatError(Resource.FastQ_InvalidQualityScoreHeaderData, id);
    }

    // Line 4: the quality scores; must be present and non-empty.
    mbfReader.GoToNextLine();
    bool qualityMissing = !mbfReader.HasLines || string.IsNullOrEmpty(mbfReader.Line);
    if (qualityMissing)
    {
        throw BuildFastQFormatError(Resource.FastQ_EmptyQualityScoreLine, id);
    }

    string qualityLine = mbfReader.Line;
    byte[] qualScores = ASCIIEncoding.ASCII.GetBytes(qualityLine);

    // One quality score character is required per sequence symbol.
    if (sequenceLine.Length != qualityLine.Length)
    {
        throw BuildFastQFormatError(Resource.FastQ_InvalidQualityScoresLength, id);
    }

    // Leave the reader on the line that follows this record.
    mbfReader.GoToNextLine();

    // Use the configured alphabet, or identify one from the sequence line when none is set.
    IAlphabet alphabet = Alphabet;
    if (alphabet == null)
    {
        alphabet = _commonSequenceParser.IdentifyAlphabet(alphabet, sequenceLine);
        if (alphabet == null)
        {
            throw BuildFastQFormatError(Resource.InvalidSymbolInString, sequenceLine);
        }
    }

    // Use the configured format type unless auto-detection from the scores is enabled.
    FastQFormatType fastQType = FastqType;
    if (AutoDetectFastQFormat)
    {
        fastQType = IdentifyFastQFormatType(qualScores);
    }

    QualitativeSequence sequence = Encoding == null
        ? new QualitativeSequence(alphabet, fastQType, sequenceLine, qualScores)
        : new QualitativeSequence(alphabet, fastQType, Encoding, sequenceLine, qualScores);

    sequence.ID = id;
    sequence.IsReadOnly = isReadOnly;
    return sequence;
}

/// <summary>
/// Builds the exception for a malformed FASTQ record: formats the detail text with the
/// given argument, wraps it in the IOFormatErrorMessage template together with Name,
/// reports the final message through Trace and returns the exception for the caller to throw.
/// </summary>
/// <param name="detailFormat">Format string describing the specific problem.</param>
/// <param name="detailArgument">Value substituted into <paramref name="detailFormat"/>.</param>
/// <returns>A FileFormatException carrying the reported message.</returns>
private FileFormatException BuildFastQFormatError(string detailFormat, string detailArgument)
{
    string detail = string.Format(CultureInfo.CurrentCulture, detailFormat, detailArgument);
    string message = string.Format(CultureInfo.CurrentCulture, Resource.IOFormatErrorMessage, Name, detail);
    Trace.Report(message);
    return new FileFormatException(message);
}
/// <summary>
/// Parses a single FASTA sequence from a file using MBFTextReader.
/// This method is used in non-data virtualization scenarios.
/// </summary>
/// <param name="mbfReader">The MBFTextReader of the file to be parsed.</param>
/// <param name="isReadOnly">
/// Flag to indicate whether the resulting sequence should be in read-only mode.
/// If this flag is set to true then the resulting sequence's IsReadOnly property
/// will be set to true, otherwise it will be set to false.
/// </param>
/// <returns>The parsed sequence.</returns>
/// <exception cref="ArgumentNullException"><paramref name="mbfReader"/> is null.</exception>
/// <exception cref="FileFormatException">
/// The reader has no lines, the current line does not start with '>', no alphabet is
/// configured and there is no sequence line to identify one from, or a sequence line
/// contains symbols for which no alphabet can be identified.
/// </exception>
protected ISequence ParseOneWithSpecificFormat(MBFTextReader mbfReader, bool isReadOnly)
{
    if (mbfReader == null)
    {
        throw new ArgumentNullException("mbfReader");
    }

    string message;

    // Check HasLines before reading Line so that empty input is reported as an invalid
    // file rather than failing while dereferencing the current line.
    if (!mbfReader.HasLines || !mbfReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
    {
        message = string.Format(CultureInfo.InvariantCulture, Resource.INVALID_INPUT_FILE, Resource.FASTA_NAME);
        Trace.Report(message);
        throw new FileFormatException(message);
    }

    // Process header line: everything after the leading '>' is the sequence ID.
    Sequence sequence;
    string id = mbfReader.GetLineField(2).Trim();
    mbfReader.GoToNextLine();

    IAlphabet alphabet = Alphabet;
    if (alphabet == null)
    {
        // No alphabet was configured, so one has to be identified from the first
        // sequence line. A header that is the last line of the input leaves nothing
        // to identify it from.
        if (!mbfReader.HasLines)
        {
            message = string.Format(CultureInfo.InvariantCulture, Resource.INVALID_INPUT_FILE, Resource.FASTA_NAME);
            Trace.Report(message);
            throw new FileFormatException(message);
        }

        alphabet = _commonSequenceParser.IdentifyAlphabet(alphabet, mbfReader.Line);
        if (alphabet == null)
        {
            message = string.Format(CultureInfo.InvariantCulture, Resource.InvalidSymbolInString, mbfReader.Line);
            Trace.Report(message);
            throw new FileFormatException(message);
        }
    }

    if (Encoding == null)
    {
        sequence = new Sequence(alphabet);
    }
    else
    {
        sequence = new Sequence(alphabet, Encoding, string.Empty) { IsReadOnly = false };
    }

    sequence.ID = id;

    // Append sequence lines until the next header line or the end of the input.
    while (mbfReader.HasLines && !mbfReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
    {
        if (Alphabet == null)
        {
            // With no configured alphabet, re-run identification for every line starting
            // from the sequence's current alphabet.
            alphabet = _commonSequenceParser.IdentifyAlphabet(sequence.Alphabet, mbfReader.Line);
            if (alphabet == null)
            {
                message = string.Format(CultureInfo.InvariantCulture, Resource.InvalidSymbolInString, mbfReader.Line);
                Trace.Report(message);
                throw new FileFormatException(message);
            }

            // A different alphabet was identified: rebuild the sequence under it from the
            // symbols collected so far and discard the old instance.
            if (sequence.Alphabet != alphabet)
            {
                Sequence seq = new Sequence(alphabet, Encoding, sequence) { IsReadOnly = false };
                sequence.Clear();
                sequence = seq;
            }
        }

        sequence.InsertRange(sequence.Count, mbfReader.Line);
        mbfReader.GoToNextLine();
    }

    // Derive the molecule type from the alphabet when the sequence does not carry one.
    if (sequence.MoleculeType == MoleculeType.Invalid)
    {
        sequence.MoleculeType = CommonSequenceParser.GetMoleculeType(sequence.Alphabet);
    }

    // Applied last because the sequence had to stay writable while lines were inserted.
    sequence.IsReadOnly = isReadOnly;
    return sequence;
}
// Processes headers, which are a type of comment.
//
// Consumes consecutive lines that, once leading whitespace is trimmed, start with
// _commentMark. Lines starting with _headerMark are split into space-separated fields and
// dispatched on the upper-cased first field; their values are recorded on _commonSeq or on
// the sequence returned by GetSpecificSequence. Other comment lines are accumulated and
// stored in _commonSeq.Metadata under _commentSectionKey plus a running number.
//
// Throws NotSupportedException for a version header whose value is not "2", and
// FormatException for a missing or unparsable date, a missing or invalid type, or a
// sequence data line that does not start with _headerMark.
private void ParseHeaders(MBFTextReader mbfReader)
{
    string comments = string.Empty;
    int commentsCount = 1;

    while (mbfReader.HasLines && mbfReader.Line.TrimStart().StartsWith(_commentMark, StringComparison.Ordinal))
    {
        Sequence specificSeq = null;

        // process headers, but ignore other comments
        if (mbfReader.Line.StartsWith(_headerMark, StringComparison.Ordinal))
        {
            string[] fields = mbfReader.GetLineField(3).Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

            // Flush the comments collected so far, so that they are stored before this header.
            if (!string.IsNullOrEmpty(comments))
            {
                _commonSeq.Metadata[_commentSectionKey + commentsCount.ToString(CultureInfo.InvariantCulture)] = comments;
                comments = string.Empty;
                commentsCount++;
            }

            // A header mark followed only by blanks yields no fields and therefore no
            // keyword to dispatch on; such a line is skipped.
            // NOTE(review): the source-version, sequence-region and DNA/RNA/PROTEIN cases
            // below still index fields without checking its length.
            if (fields.Length > 0)
            {
                switch (fields[0].ToUpperInvariant())
                {
                    case _gffVersionKey:
                        // The version value is optional here; only an explicit value other
                        // than "2" is rejected.
                        string version = fields.Length > 1 ? fields[1] : string.Empty;
                        if (fields.Length > 1 && version != "2")
                        {
                            string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Properties.Resource.GffUnsupportedVersion,
                                mbfReader.LocationString);
                            Trace.Report(message);
                            throw new NotSupportedException(message);
                        }

                        // Store "GFF-VERSION" to keep the order of comments/headers.
                        _commonSeq.Metadata[_gffVersionKey] = version;
                        break;

                    case _sourceVersionKey:
                        MetadataListItem<string> sourceVersion = new MetadataListItem<string>(_sourceVersionKey, string.Empty);
                        sourceVersion.SubItems.Add(_sourceKey, fields[1]);
                        sourceVersion.SubItems.Add(_versionKey, fields[2]);
                        _commonSeq.Metadata[_sourceVersionKey] = sourceVersion;
                        break;

                    case _dateKey:
                        // A date header without a value is reported the same way as an
                        // unparsable one.
                        DateTime date;
                        if (fields.Length < 2 || !DateTime.TryParse(fields[1], out date))
                        {
                            string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Properties.Resource.ParserInvalidDate,
                                mbfReader.LocationString);
                            Trace.Report(message);
                            throw new FormatException(message);
                        }

                        _commonSeq.Metadata[_dateLowerCaseKey] = date;
                        break;

                    case _typeKey:
                        // A type header needs at least the type value itself.
                        if (fields.Length < 2)
                        {
                            string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Properties.Resource.InvalidType,
                                mbfReader.LocationString);
                            Trace.Report(message);
                            throw new FormatException(message);
                        }

                        if (fields.Length == 2)
                        {
                            // Type only: applies to the common sequence.
                            _commonSeq.MoleculeType = GetMoleculeType(fields[1]);
                            if (_commonSeq.MoleculeType == MoleculeType.Invalid)
                            {
                                string message = String.Format(
                                    CultureInfo.CurrentCulture,
                                    Properties.Resource.InvalidType,
                                    mbfReader.LocationString);
                                Trace.Report(message);
                                throw new FormatException(message);
                            }

                            // Store "TYPE" to keep the order of comments/headers.
                            _commonSeq.Metadata[_typeKey] = fields[1];
                        }
                        else
                        {
                            // Type followed by a sequence id: applies to that sequence.
                            specificSeq = GetSpecificSequence(fields[2], GetMoleculeType(fields[1]), mbfReader, false);
                            if (specificSeq.MoleculeType == MoleculeType.Invalid)
                            {
                                string message = String.Format(
                                    CultureInfo.CurrentCulture,
                                    Properties.Resource.InvalidType,
                                    mbfReader.LocationString);
                                Trace.Report(message);
                                throw new FormatException(message);
                            }

                            // Store "TYPE" to keep the order of comments/headers.
                            // Store seq id as value.
                            _commonSeq.Metadata[_multiTypeKey + fields[2]] = fields[2];
                        }

                        break;

                    case "DNA":
                    case "RNA":
                    case "PROTEIN":
                        specificSeq = GetSpecificSequence(fields[1], GetMoleculeType(fields[0]), mbfReader, false);
                        mbfReader.GoToNextLine();

                        // Store seq id as value.
                        _commonSeq.Metadata[_multiSeqDataKey + fields[1]] = fields[1];

                        // Read sequence data lines until the end marker for this block
                        // (_seqDataEnd followed by the keyword as written in the file).
                        while (mbfReader.HasLines && mbfReader.Line != _seqDataEnd + fields[0])
                        {
                            if (!mbfReader.Line.StartsWith(_headerMark, StringComparison.Ordinal))
                            {
                                string message = String.Format(
                                    CultureInfo.CurrentCulture,
                                    Properties.Resource.GffInvalidSequence,
                                    mbfReader.LocationString);
                                Trace.Report(message);
                                throw new FormatException(message);
                            }

                            specificSeq.InsertRange(specificSeq.Count, mbfReader.GetLineField(3));
                            mbfReader.GoToNextLine();
                        }

                        break;

                    case _seqRegKey:
                        specificSeq = GetSpecificSequence(fields[1], MoleculeType.Invalid, mbfReader, false);
                        specificSeq.Metadata["start"] = fields[2];
                        specificSeq.Metadata["end"] = fields[3];

                        // Store seq id as value.
                        _commonSeq.Metadata[_multiSeqRegKey + fields[1]] = fields[1];
                        break;
                }
            }
        }
        else
        {
            // Plain comment: accumulate it, one line per comment line.
            comments = string.IsNullOrEmpty(comments) ? mbfReader.Line : comments + Environment.NewLine + mbfReader.Line;
        }

        mbfReader.GoToNextLine();
    }

    // Store any comments that were not followed by another header.
    if (!string.IsNullOrEmpty(comments))
    {
        _commonSeq.Metadata[_commentSectionKey + commentsCount.ToString(CultureInfo.InvariantCulture)] = comments;
    }
}