/// <summary>
/// Parses the Nexus header: verifies the "#NEXUS" signature line, steps over an optional
/// bracketed title comment, advances one more line and then turns off blank-line skipping
/// on the reader.
/// </summary>
/// <param name="mbfReader">A reader for a biological sequence text.</param>
/// <exception cref="InvalidDataException">
/// The current line does not start with "#NEXUS".
/// </exception>
private void ParseHeader(MBFTextReader mbfReader)
{
    if (!mbfReader.Line.StartsWith("#NEXUS", StringComparison.OrdinalIgnoreCase))
    {
        string message = string.Format(CultureInfo.CurrentCulture, Resource.INVALID_INPUT_FILE, this.Name);
        throw new InvalidDataException(message);
    }

    // Move past the signature line.
    mbfReader.GoToNextLine();

    // Title of Alignment: an optional comment that opens with '[' and closes with ']'.
    // The closing bracket may sit on the opening line itself, so the current line is
    // tested before advancing; the scan also stops if the text runs out.
    if (mbfReader.HasLines &&
        mbfReader.Line.Trim().StartsWith("[", StringComparison.OrdinalIgnoreCase))
    {
        while (mbfReader.HasLines &&
               !mbfReader.Line.Trim().EndsWith("]", StringComparison.OrdinalIgnoreCase))
        {
            mbfReader.GoToNextLine();
        }
    }

    // Step over the line the scan stopped on (the original advances here unconditionally).
    if (mbfReader.HasLines)
    {
        mbfReader.GoToNextLine();
    }

    // Now that we're at the first block, one or more blank lines are the block separators,
    // which we'll need.
    mbfReader.SkipBlankLines = false;
}
/// <summary>
/// Parses the ORIGIN section of a GenBank record: stores the optional text on the ORIGIN
/// line in the metadata, gathers the sequence symbols from the space-prefixed lines that
/// follow, and appends them to the sequence.
/// </summary>
/// <param name="mbfReader">Reader positioned on the ORIGIN line.</param>
/// <param name="metadata">GenBank metadata that receives the ORIGIN text, if any.</param>
/// <param name="sequence">
/// Sequence the symbols are appended to. It is replaced by a new instance when the parser
/// has no alphabet configured and the alphabet identified from the data differs from the
/// sequence's current one.
/// </param>
/// <exception cref="InvalidDataException">
/// No alphabet could be identified for the sequence data.
/// </exception>
private void ParseOrigin(MBFTextReader mbfReader, GenBankMetadata metadata, ref Sequence sequence)
{
    // The origin line can contain optional data; don't put empty string into metadata.
    if (!String.IsNullOrEmpty(mbfReader.LineData))
    {
        metadata.Origin = mbfReader.LineData;
    }

    mbfReader.GoToNextLine();

    var sequenceBuilder = new StringBuilder();

    // Sequence lines start with a space. An empty line ends the section instead of
    // faulting on Line[0] (ParseFeatures applies the same emptiness guard).
    while (mbfReader.HasLines
        && !String.IsNullOrEmpty(mbfReader.Line)
        && mbfReader.Line[0] == ' ')
    {
        // Using a regex is too slow. The first 10 characters are skipped, then groups of
        // up to 10 symbols are copied, each group starting 11 columns after the last.
        string line = mbfReader.Line;
        int len = line.Length;
        for (int k = 10; k < len; k += 11)
        {
            sequenceBuilder.Append(line, k, Math.Min(10, len - k));
        }

        mbfReader.GoToNextLine();
    }

    string sequenceString = sequenceBuilder.ToString().Trim();
    if (string.IsNullOrEmpty(sequenceString))
    {
        return;
    }

    // The alphabet is validated once for the whole sequence rather than per line.
    if (Alphabet == null)
    {
        IAlphabet alphabet = null;
        alphabet = IdentifyAlphabet(alphabet, sequenceString);
        if (alphabet == null)
        {
            string message = String.Format(
                CultureInfo.CurrentCulture,
                Resource.InvalidSymbolInString,
                mbfReader.Line);
            Trace.Report(message);

            // InvalidDataException matches what the other GenBank section parsers throw.
            throw new InvalidDataException(message);
        }

        if (sequence.Alphabet != alphabet)
        {
            Sequence seq = new Sequence(alphabet, Encoding, sequence)
            {
                MoleculeType = sequence.MoleculeType,
                IsReadOnly = false
            };
            sequence.Clear();
            sequence = seq;
        }
    }

    sequence.InsertRange(sequence.Count, sequenceString);
}
public void TestMBFTextReaderCoreFunctionality()
{
    // Exercises MBFTextReader against the GenBank test file: line/header/data access,
    // line-number tracking, changing the data indent, blank header/data detection,
    // section skipping and end-of-file detection.
    using (MBFTextReader mbfReader = new MBFTextReader(testFileFullName))
    {
        // Test line access members.
        Assert.IsTrue(mbfReader.HasLines);
        Assert.AreEqual("LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999", mbfReader.Line);
        Assert.IsTrue(mbfReader.LineHasHeader);
        Assert.AreEqual("LOCUS", mbfReader.LineHeader);
        Assert.IsTrue(mbfReader.LineHasData);
        Assert.AreEqual("SCU49845 5028 bp DNA PLN 21-JUN-1999", mbfReader.LineData);
        Assert.AreEqual("NA ", mbfReader.GetLineField(38, 41));

        // Test reading lines and line number tracking.
        // Five advances from line 1 put the reader on line 6.
        for (int i = 1; i < 6; i++)
        {
            mbfReader.GoToNextLine();
        }

        Assert.AreEqual(6, mbfReader.LineNumber);
        Assert.AreEqual("KEYWORDS", mbfReader.LineHeader);

        // Test switching line indent.
        mbfReader.DataIndent = 2;
        Assert.AreEqual("KE", mbfReader.LineHeader);
        Assert.AreEqual("YWORDS .", mbfReader.LineData);

        // Test recognition of blank header and data.
        for (int i = 6; i < 8; i++)
        {
            mbfReader.GoToNextLine();
        }

        Assert.IsFalse(mbfReader.LineHasHeader); // line starts with 2 spaces
        Assert.IsTrue(mbfReader.LineHasData);
        mbfReader.DataIndent = 37; // the line length
        Assert.IsTrue(mbfReader.LineHasHeader);
        Assert.IsFalse(mbfReader.LineHasData);
        mbfReader.DataIndent = 12; // back to standard line length

        // Test skipping sections and EOF recognition.
        mbfReader.SkipToNextSection(); // ref 1
        mbfReader.SkipToNextSection(); // ref 2
        mbfReader.SkipToNextSection(); // features
        mbfReader.SkipToNextSection(); // origin
        mbfReader.SkipToNextSection(); // "//"
        Assert.IsTrue(mbfReader.HasLines);
        mbfReader.GoToNextLine(); // EOF

        // Past the last line the reader must report that nothing is left.
        Assert.IsFalse(mbfReader.HasLines);
    }
}
// Reads the tail of a GenBank record: the optional BASE COUNT line, the ORIGIN section
// with the sequence data, and an optional CONTIG entry. Stops after consuming the "//"
// terminator line; any other header is reported and rejected with InvalidDataException.
private void ParseSequence(MBFTextReader mbfReader, ref Sequence sequence)
{
    GenBankMetadata genBankData = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

    // set data indent for sequence headers
    mbfReader.DataIndent = _dataIndent;

    while (mbfReader.HasLines)
    {
        if (mbfReader.Line.StartsWith("//", StringComparison.Ordinal))
        {
            // end of sequence record
            mbfReader.GoToNextLine();
            return;
        }

        string header = mbfReader.LineHeader;

        if (header == "BASE COUNT")
        {
            // The BASE COUNT linetype is obsolete and was removed from the GenBank
            // flatfile format in October 2003. But if it is present, we will use it.
            // We get the untrimmed version since it starts with a right justified column.
            genBankData.BaseCount = mbfReader.Line.Substring(_dataIndent);
            mbfReader.GoToNextLine();
        }
        else if (header == "ORIGIN")
        {
            // ParseOrigin validates the alphabet once, after the whole sequence has been
            // gathered, instead of once per line.
            ParseOrigin(mbfReader, genBankData, ref sequence);
        }
        else if (header == "CONTIG")
        {
            // don't go to next line; current line still needs to be processed
            genBankData.Contig = ParseMultiLineData(mbfReader, Environment.NewLine);
        }
        else
        {
            string unexpected = String.Format(
                CultureInfo.CurrentCulture,
                Properties.Resource.ParserUnexpectedLineInSequence,
                mbfReader.Line);
            Trace.Report(unexpected);
            throw new InvalidDataException(unexpected);
        }
    }
}
// Collects the data of a header block that may continue over several lines. Starts with
// the data of the current line, then appends the data of every following line that has
// no header, each preceded by lineBreakSubstitution. On return the reader is positioned
// on the first line that was not consumed.
private static string ParseMultiLineData(MBFTextReader mbfReader, string lineBreakSubstitution)
{
    string combined = mbfReader.LineData;

    for (mbfReader.GoToNextLine();
         mbfReader.HasLines && !mbfReader.LineHasHeader;
         mbfReader.GoToNextLine())
    {
        combined = combined + lineBreakSubstitution + mbfReader.LineData;
    }

    return combined;
}
/// <summary>
/// Parses every sequence alignment available from a MBFTextReader.
/// </summary>
/// <remarks>
/// This method should be overridden by any parsers that need to process file-scope
/// metadata that applies to all of the sequences in the file.
/// </remarks>
/// <param name="mbfReader">A reader for a biological sequence text.</param>
/// <param name="isReadOnly">
/// Flag passed on to the per-alignment parser to indicate whether the resulting sequences
/// should be in read-only mode.
/// </param>
/// <returns>The list of parsed ISequenceAlignment objects.</returns>
/// <exception cref="ArgumentNullException">The reader is null.</exception>
/// <exception cref="InvalidDataException">The reader has no lines to parse.</exception>
protected virtual IList<ISequenceAlignment> Parse(MBFTextReader mbfReader, bool isReadOnly)
{
    if (mbfReader == null)
    {
        throw new ArgumentNullException("mbfReader");
    }

    // no empty files allowed
    if (!mbfReader.HasLines)
    {
        throw new InvalidDataException(Properties.Resource.IONoTextToParse);
    }

    List<ISequenceAlignment> parsed = new List<ISequenceAlignment>();

    while (mbfReader.HasLines)
    {
        if (mbfReader.Line.Trim().Length == 0)
        {
            // Lines holding nothing but whitespace are stepped over between alignments.
            mbfReader.GoToNextLine();
        }
        else
        {
            parsed.Add(ParseOneWithSpecificFormat(mbfReader, isReadOnly));
        }
    }

    return parsed;
}
/// <summary>
/// Parses the GenBank LOCUS line with the token based parser, which tolerates documents
/// that do not follow the standard exactly, and copies the result onto the sequence.
/// </summary>
/// <param name="mbfReader">Reader positioned on the LOCUS line; advanced by one line.</param>
/// <param name="sequence">
/// Sequence that receives the ID, molecule type and locus metadata. It is replaced by a
/// new instance when the alphabet derived from the locus differs from its current one.
/// </param>
/// <exception cref="InvalidDataException">
/// The parser has an alphabet configured that differs from the one derived from the locus.
/// </exception>
private void ParseLocusByTokens(MBFTextReader mbfReader, ref Sequence sequence)
{
    var locus = new GenBankLocusTokenParser().Parse(mbfReader.LineData);
    IAlphabet locusAlphabet = GetAlphabet(locus.MoleculeType);

    if (locusAlphabet != sequence.Alphabet)
    {
        bool conflictsWithConfigured = Alphabet != null && Alphabet != locusAlphabet;
        if (conflictsWithConfigured)
        {
            Trace.Report(Resource.ParserIncorrectAlphabet);
            throw new InvalidDataException(Resource.ParserIncorrectAlphabet);
        }

        sequence = new Sequence(locusAlphabet, Encoding, sequence) { IsReadOnly = false };
    }

    sequence.ID = locus.Name;
    sequence.MoleculeType = locus.MoleculeType;
    ((GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey]).Locus = locus;

    mbfReader.GoToNextLine();
}
/// <summary>
/// Parses aligned sequences from a SAM text, one per line, and adds them to the alignment
/// map. Parsing stops at the end of the text or at the first line that starts with "@".
/// </summary>
/// <param name="seqAlignment">SequenceAlignmentMap object that receives the sequences.</param>
/// <param name="mbfReader">A reader for the sequence alignment text.</param>
/// <param name="isReadOnly">
/// Flag passed on to the per-line parser to indicate whether the sequences in the
/// resulting sequence alignment should be in read-only mode.
/// </param>
private void ParseSequences(SequenceAlignmentMap seqAlignment, MBFTextReader mbfReader, bool isReadOnly)
{
    while (mbfReader.HasLines)
    {
        if (mbfReader.Line.StartsWith(@"@", StringComparison.OrdinalIgnoreCase))
        {
            break;
        }

        SAMAlignedSequence parsedLine = ParseSequence(mbfReader, isReadOnly);
        seqAlignment.QuerySequences.Add(parsedLine);

        mbfReader.GoToNextLine();
    }
}
/// <summary>
/// Parses SAM alignment header from specified MBFTextReader.
/// Consumes the consecutive lines that start with "@"; on return the reader is positioned
/// on the first line after them. The total length of the consumed lines is accumulated in
/// the header-length field.
/// </summary>
/// <param name="mbfReader">MBF text reader.</param>
/// <returns>
/// The parsed header; it has no record fields or comments when the current line is not a
/// header line.
/// </returns>
/// <exception cref="ArgumentNullException">The reader is null.</exception>
/// <exception cref="FormatException">
/// Header lines were read and the header's own validity check returned a message.
/// </exception>
public static SAMAlignmentHeader ParseSAMHeader(MBFTextReader mbfReader)
{
    if (mbfReader == null)
    {
        throw new ArgumentNullException("mbfReader");
    }

    _headerLength = 0;
    SAMAlignmentHeader samHeader = new SAMAlignmentHeader();

    if (mbfReader.HasLines && mbfReader.Line.StartsWith(@"@", StringComparison.OrdinalIgnoreCase))
    {
        while (mbfReader.HasLines && mbfReader.Line.StartsWith(@"@", StringComparison.OrdinalIgnoreCase))
        {
            string line = mbfReader.Line;
            _headerLength += line.Length;

            // Validate the header format before any field is sliced out of the line.
            ValidateHeaderLineFormat(line);

            string[] tokens = line.Split(tabDelim, StringSplitOptions.RemoveEmptyEntries);

            // The record type code is the first token without its leading '@'.
            string recordTypecode = tokens[0].Substring(1);

            if (string.Compare(recordTypecode, "CO", StringComparison.OrdinalIgnoreCase) != 0)
            {
                SAMRecordField headerLine = new SAMRecordField(recordTypecode);

                // Each remaining token is split as: two-character tag name, one separator
                // character, then the value.
                for (int i = 1; i < tokens.Length; i++)
                {
                    string tagToken = tokens[i];
                    string tagName = tagToken.Substring(0, 2);
                    headerLine.Tags.Add(new SAMRecordFieldTag(tagName, tagToken.Substring(3)));
                }

                samHeader.RecordFields.Add(headerLine);
            }
            else
            {
                // The comment text starts at index 4 of the line. A line too short to
                // carry any text is recorded as an empty comment instead of faulting
                // in Substring.
                samHeader.Comments.Add(line.Length >= 4 ? line.Substring(4) : string.Empty);
            }

            mbfReader.GoToNextLine();
        }

        string message = samHeader.IsValid();
        if (!string.IsNullOrEmpty(message))
        {
            throw new FormatException(message);
        }
    }

    return samHeader;
}
/// <summary>
/// Read XML BLAST data from the reader, and build one or more BlastResult objects.
/// The text is split into separate XML documents at each line that starts with
/// "&lt;?xml version"; lines that start with "RPS-BLAST" are skipped.
/// </summary>
/// <param name="reader">The text source</param>
/// <returns>A list of BLAST result objects, one per XML document found.</returns>
/// <exception cref="ArgumentNullException">The reader is null.</exception>
/// <exception cref="FormatException">No records were found in the text.</exception>
public IList<BlastResult> Parse(TextReader reader)
{
    if (reader == null)
    {
        throw new ArgumentNullException("reader");
    }

    List<BlastResult> records = new List<BlastResult>();
    StringBuilder sb = new StringBuilder();

    using (MBFTextReader mbfReader = new MBFTextReader(reader))
    {
        mbfReader.SkipBlankLines = false;
        while (mbfReader.HasLines)
        {
            if (mbfReader.Line.StartsWith("RPS-BLAST", StringComparison.OrdinalIgnoreCase))
            {
                mbfReader.GoToNextLine();
                continue;
            }

            // A new XML declaration closes the document collected so far. The buffer is
            // tested rather than the line number, so that a declaration preceded only by
            // skipped lines does not hand an empty document to ParseXML.
            if (mbfReader.Line.StartsWith("<?xml version", StringComparison.OrdinalIgnoreCase)
                && sb.Length > 0)
            {
                records.Add(ParseXML(sb));
                sb = new StringBuilder();
            }

            sb.AppendLine(mbfReader.Line);
            mbfReader.GoToNextLine();
        }
    }

    // Whatever is still buffered is the last document.
    if (sb.Length > 0)
    {
        records.Add(ParseXML(sb));
    }

    if (records.Count == 0)
    {
        string message = Properties.Resource.BlastNoRecords;
        Trace.Report(message);
        throw new FormatException(message);
    }

    return records;
}
/// <summary>
/// Parses a SequenceAlignmentMap using a MBFTextReader. The reader's file name is
/// remembered, leading whitespace-only lines are skipped, and the format-specific parser
/// is run once from the first line that has content.
/// </summary>
/// <param name="mbfReader">A reader for a sequence alignment text.</param>
/// <param name="isReadOnly">
/// Flag passed on to the format-specific parser to indicate whether sequences in the
/// resulting sequence alignment should be in read-only mode.
/// </param>
/// <returns>
/// The parsed SequenceAlignmentMap, or null when the reader has no line with content.
/// </returns>
private SequenceAlignmentMap Parse(MBFTextReader mbfReader, bool isReadOnly)
{
    _fileName = mbfReader.FileName;

    // Step over whitespace-only lines in front of the content.
    while (mbfReader.HasLines && mbfReader.Line.Trim().Length == 0)
    {
        mbfReader.GoToNextLine();
    }

    if (!mbfReader.HasLines)
    {
        return null;
    }

    return ParseOneWithSpecificFormat(mbfReader, isReadOnly);
}
/// <summary>
/// Lazily yields the aligned sequences read from the specified SAM reader, one per line.
/// When a filter is applied, only the sequences accepted by the filter are yielded.
/// </summary>
/// <param name="textReader">Reader over the SAM text, positioned on an alignment line.</param>
/// <returns>The aligned sequences to display.</returns>
private IEnumerable<SAMAlignedSequence> GetAlignedSequence(MBFTextReader textReader)
{
    bool filterActive = IsFilterApplied();

    for (; textReader.HasLines; textReader.GoToNextLine())
    {
        SAMAlignedSequence current = SAMParser.ParseSequence(textReader, false);

        // Without a filter every sequence is shown; with one, the filter decides.
        if (!filterActive || Filter(current))
        {
            yield return current;
        }
    }
}
// Parses the consecutive feature lines for one sequence.
// Lines that start with the header mark are skipped. Reading stops at the end of the text
// or at the first feature whose sequence name differs from that of the first feature read
// (that line is left for the next call). Numeric fields are validated with the invariant
// culture, so the result does not depend on the machine's regional settings.
// Throws InvalidDataException for a wrong field count or an invalid field value, and
// InvalidOperationException when no feature was read at all.
private void ParseFeatures(MBFTextReader mbfReader)
{
    // The non-comment lines contain features, which are each stored as MetadataListItems.
    // The fields of each feature are referred to as sub-items. For GFF, these have
    // unique keys, but for compatibility with our internal representation of features from
    // GenBank format, each sub-item is a list of strings, rather than a simple string.
    List<MetadataListItem<List<string>>> featureList = null;
    Sequence specificSeq = null;

    while (mbfReader.HasLines)
    {
        if (mbfReader.Line.StartsWith(_headerMark, StringComparison.Ordinal))
        {
            // ignore comments
            mbfReader.GoToNextLine();
        }
        else
        {
            // fields are tab-delimited
            string[] featureFields = mbfReader.Line.Split(new char[] { '\t' }, StringSplitOptions.RemoveEmptyEntries);
            if (featureFields.Length < _minFieldsPerFeature ||
                featureFields.Length > _maxFieldsPerFeature)
            {
                string message = string.Format(CultureInfo.CurrentCulture, Resource.INVALID_INPUT_FILE, this.Name);
                throw new InvalidDataException(message);
            }

            // The featureFields array should now contain the following fields:
            //      featureFields[0]: sequence name
            //      featureFields[1]: source
            //      featureFields[2]: feature name
            //      featureFields[3]: start
            //      featureFields[4]: end
            //      featureFields[5]: score
            //      featureFields[6]: strand
            //      featureFields[7]: frame
            //      featureFields[8]: attributes (optional)

            // Process sequence name.
            if (specificSeq == null)
            {
                specificSeq = GetSpecificSequence(featureFields[0], MoleculeType.Invalid, mbfReader);

                // Retrieve features list, or add empty features list to metadata if this
                // is the first feature.
                if (specificSeq.Metadata.ContainsKey("features"))
                {
                    featureList = specificSeq.Metadata["features"] as List<MetadataListItem<List<string>>>;
                }
                else
                {
                    featureList = new List<MetadataListItem<List<string>>>();
                    specificSeq.Metadata["features"] = featureList;
                }
            }
            else if (specificSeq.DisplayID != featureFields[0])
            {
                // don't go to next line; current line still needs to be processed
                break;
            }

            // use feature name as key; attributes field is stored as free text
            string attributes = (featureFields.Length == 9 ? featureFields[8] : string.Empty);
            MetadataListItem<List<string>> feature = new MetadataListItem<List<string>>(featureFields[2], attributes);

            // source
            feature.SubItems.Add(_sourceKey, new List<string> { featureFields[1] });

            // start is an int
            int ignoreMe;
            if (!int.TryParse(featureFields[3], NumberStyles.Integer, CultureInfo.InvariantCulture, out ignoreMe))
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.GffInvalidField,
                    "start",
                    featureFields[3]);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }

            feature.SubItems.Add("start", new List<string> { featureFields[3] });

            // end is an int
            if (!int.TryParse(featureFields[4], NumberStyles.Integer, CultureInfo.InvariantCulture, out ignoreMe))
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.GffInvalidField,
                    "end",
                    featureFields[4]);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }

            feature.SubItems.Add("end", new List<string> { featureFields[4] });

            // score is a double, or a dot as a space holder
            if (featureFields[5] != ".")
            {
                // Parsed with the invariant culture so that the '.' decimal separator is
                // accepted whatever the current culture is.
                double ignoreMeToo;
                if (!double.TryParse(
                        featureFields[5],
                        NumberStyles.Float | NumberStyles.AllowThousands,
                        CultureInfo.InvariantCulture,
                        out ignoreMeToo))
                {
                    string message = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.GffInvalidField,
                        "score",
                        featureFields[5]);
                    Trace.Report(message);
                    throw new InvalidDataException(message);
                }

                feature.SubItems.Add("score", new List<string> { featureFields[5] });
            }

            // strand is + or -, or a dot as a space holder
            if (featureFields[6] != ".")
            {
                if (featureFields[6] != "+" && featureFields[6] != "-")
                {
                    string message = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.GffInvalidField,
                        "strand",
                        featureFields[6]);
                    Trace.Report(message);
                    throw new InvalidDataException(message);
                }

                feature.SubItems.Add("strand", new List<string> { featureFields[6] });
            }

            // frame is an int, or a dot as a space holder
            if (featureFields[7] != ".")
            {
                if (!int.TryParse(featureFields[7], NumberStyles.Integer, CultureInfo.InvariantCulture, out ignoreMe))
                {
                    string message = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.GffInvalidField,
                        "frame",
                        featureFields[7]);
                    Trace.Report(message);
                    throw new InvalidDataException(message);
                }

                feature.SubItems.Add("frame", new List<string> { featureFields[7] });
            }

            // done with that one
            featureList.Add(feature);
            mbfReader.GoToNextLine();
        }
    }

    // A feature file with no features? May it never be.
    if (featureList == null)
    {
        string message = Properties.Resource.GFFNoFeatures;
        Trace.Report(message);
        throw new InvalidOperationException(message);
    }

    // if any seqs are left in _sequencesInHeader add it to _sequences
    if (_sequencesInHeader.Count > 0)
    {
        _sequences.AddRange(_sequencesInHeader);
        _sequencesInHeader.Clear();
    }
}
// Processes headers, which are a type of comment.
// Consumes the leading run of comment lines. Lines that start with the header mark are
// interpreted (version, source version, date, type, embedded sequence data, sequence
// region) and recorded on the common sequence or on the sequence they name; unrecognized
// headers are ignored. Other comment lines are gathered into numbered comment sections in
// the common sequence's metadata so that their order relative to the headers is kept.
// Throws NotSupportedException for a GFF version other than "2" and FormatException for
// an invalid date, molecule type or embedded sequence line.
private void ParseHeaders(MBFTextReader mbfReader)
{
    string comments = string.Empty;
    int commentsCount = 1;

    while (mbfReader.HasLines && mbfReader.Line.TrimStart().StartsWith(_commentMark, StringComparison.Ordinal))
    {
        Sequence specificSeq = null;

        // process headers, but ignore other comments
        if (mbfReader.Line.StartsWith(_headerMark, StringComparison.Ordinal))
        {
            string[] fields = mbfReader.GetLineField(3).Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

            // Add if any comments.
            if (!string.IsNullOrEmpty(comments))
            {
                _commonSeq.Metadata[_commentSectionKey + commentsCount.ToString(CultureInfo.InvariantCulture)] = comments;
                comments = string.Empty;
                commentsCount++;
            }

            // A header mark with no text after it yields no fields; it is treated like
            // any other unrecognized header and ignored.
            string headerKey = fields.Length > 0 ? fields[0].ToUpperInvariant() : string.Empty;

            switch (headerKey)
            {
                case _gffVersionKey:
                    // The version value is only checked and stored when it is present.
                    if (fields.Length > 1)
                    {
                        if (fields[1] != "2")
                        {
                            string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Properties.Resource.GffUnsupportedVersion,
                                mbfReader.LocationString);
                            Trace.Report(message);
                            throw new NotSupportedException(message);
                        }

                        // Store "GFF-VERSION" to get keep the order of comments/headers.
                        _commonSeq.Metadata[_gffVersionKey] = fields[1];
                    }

                    break;

                case _sourceVersionKey:
                    MetadataListItem<string> sourceVersion = new MetadataListItem<string>(_sourceVersionKey, string.Empty);
                    sourceVersion.SubItems.Add(_sourceKey, fields[1]);
                    sourceVersion.SubItems.Add(_versionKey, fields[2]);
                    _commonSeq.Metadata[_sourceVersionKey] = sourceVersion;
                    break;

                case _dateKey:
                    // Parsed with the invariant culture so the outcome does not depend on
                    // the machine's regional settings.
                    DateTime date;
                    if (!DateTime.TryParse(fields[1], CultureInfo.InvariantCulture, DateTimeStyles.None, out date))
                    {
                        string message = String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.ParserInvalidDate,
                            mbfReader.LocationString);
                        Trace.Report(message);
                        throw new FormatException(message);
                    }

                    _commonSeq.Metadata[_dateLowerCaseKey] = date;
                    break;

                case _typeKey:
                    if (fields.Length == 2)
                    {
                        // Type without a sequence name applies to the common sequence.
                        _commonSeq.MoleculeType = GetMoleculeType(fields[1]);
                        if (_commonSeq.MoleculeType == MoleculeType.Invalid)
                        {
                            string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Properties.Resource.InvalidType,
                                mbfReader.LocationString);
                            Trace.Report(message);
                            throw new FormatException(message);
                        }

                        // Store "TYPE" to get keep the order of comments/headers.
                        _commonSeq.Metadata[_typeKey] = fields[1];
                    }
                    else
                    {
                        specificSeq = GetSpecificSequence(fields[2], GetMoleculeType(fields[1]), mbfReader, false);
                        if (specificSeq.MoleculeType == MoleculeType.Invalid)
                        {
                            string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Properties.Resource.InvalidType,
                                mbfReader.LocationString);
                            Trace.Report(message);
                            throw new FormatException(message);
                        }

                        // Store "TYPE" to get keep the order of comments/headers.
                        // Store seq id as value.
                        _commonSeq.Metadata[_multiTypeKey + fields[2]] = fields[2];
                    }

                    break;

                case "DNA":
                case "RNA":
                case "PROTEIN":
                    specificSeq = GetSpecificSequence(fields[1], GetMoleculeType(fields[0]), mbfReader, false);
                    mbfReader.GoToNextLine();

                    // Store seq id as value.
                    _commonSeq.Metadata[_multiSeqDataKey + fields[1]] = fields[1];

                    // Every line up to the matching end marker must be a header line
                    // carrying sequence data.
                    while (mbfReader.HasLines && mbfReader.Line != _seqDataEnd + fields[0])
                    {
                        if (!mbfReader.Line.StartsWith(_headerMark, StringComparison.Ordinal))
                        {
                            string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Properties.Resource.GffInvalidSequence,
                                mbfReader.LocationString);
                            Trace.Report(message);
                            throw new FormatException(message);
                        }

                        specificSeq.InsertRange(specificSeq.Count, mbfReader.GetLineField(3));
                        mbfReader.GoToNextLine();
                    }

                    break;

                case _seqRegKey:
                    specificSeq = GetSpecificSequence(fields[1], MoleculeType.Invalid, mbfReader, false);
                    specificSeq.Metadata["start"] = fields[2];
                    specificSeq.Metadata["end"] = fields[3];

                    // Store seq id as value.
                    _commonSeq.Metadata[_multiSeqRegKey + fields[1]] = fields[1];
                    break;
            }
        }
        else
        {
            comments = string.IsNullOrEmpty(comments) ? mbfReader.Line : comments + Environment.NewLine + mbfReader.Line;
        }

        mbfReader.GoToNextLine();
    }

    // Flush the comments gathered after the last header.
    if (!string.IsNullOrEmpty(comments))
    {
        _commonSeq.Metadata[_commentSectionKey + commentsCount.ToString(CultureInfo.InvariantCulture)] = comments;
        comments = string.Empty;
    }
}
// Parses the FEATURES section of a GenBank record. Each feature consists of a key, a
// location (possibly spread over several lines) and a list of qualifiers. The parsed
// features are stored in the sequence's GenBank metadata when at least one was read.
// Reading stops at the first non-empty line that does not start with a space; that line
// is left for the caller.
// Throws InvalidOperationException when no location builder is available, and
// InvalidDataException for an indented line without a feature key.
private void ParseFeatures(MBFTextReader mbfReader, ref Sequence sequence)
{
    ILocationBuilder locBuilder = LocationBuilder;
    if (locBuilder == null)
    {
        throw new InvalidOperationException(Resource.NullLocationBuild);
    }

    // set data indent for features
    mbfReader.DataIndent = _featureDataIndent;

    // The sub-items of a feature are referred to as qualifiers. These do not have unique
    // keys, so they are stored as lists in the SubItems dictionary.
    SequenceFeatures features = new SequenceFeatures();
    IList<FeatureItem> featureList = features.All;

    while (mbfReader.HasLines)
    {
        // Empty lines and the FEATURES header line itself carry no feature.
        if (String.IsNullOrEmpty(mbfReader.Line) || mbfReader.LineHeader == "FEATURES")
        {
            mbfReader.GoToNextLine();
            continue;
        }

        if (mbfReader.Line[0] != ' ')
        {
            // start of non-feature text
            break;
        }

        if (!mbfReader.LineHasHeader)
        {
            string message = Properties.Resource.GenbankEmptyFeature;
            Trace.Report(message);
            throw new InvalidDataException(message);
        }

        // check for multi-line location string: following lines that have data but no
        // header and do not start a qualifier ('/') are appended to the location as-is
        string featureKey = mbfReader.LineHeader;
        string location = mbfReader.LineData;
        mbfReader.GoToNextLine();
        while (mbfReader.HasLines && !mbfReader.LineHasHeader &&
               mbfReader.LineHasData && !mbfReader.LineData.StartsWith("/", StringComparison.Ordinal))
        {
            location += mbfReader.LineData;
            mbfReader.GoToNextLine();
        }

        // create the feature item from its key and parsed location
        FeatureItem feature = new FeatureItem(featureKey, locBuilder.GetLocation(location));

        // process the list of qualifiers, which are each in the form of
        //      /key="value"
        // qualifierKey/qualifierValue hold the qualifier currently being assembled; it is
        // added to the feature when the next qualifier starts or the list ends.
        string qualifierKey = string.Empty;
        string qualifierValue = string.Empty;
        while (mbfReader.HasLines)
        {
            if (!mbfReader.LineHasHeader && mbfReader.LineHasData)
            {
                // '/' marks the start of a new qualifier; any other data line continues
                // the value of the current one
                if (mbfReader.LineData.StartsWith("/", StringComparison.Ordinal))
                {
                    // new qualifier; save previous if this isn't the first
                    if (!String.IsNullOrEmpty(qualifierKey))
                    {
                        AddQualifierToFeature(feature, qualifierKey, qualifierValue);
                    }

                    // set the key and value of this qualifier
                    int equalsIndex = mbfReader.LineData.IndexOf('=');
                    if (equalsIndex < 0)
                    {
                        // no value, just key (this is allowed, see NC_005213.gbk)
                        qualifierKey = mbfReader.LineData.Substring(1);
                        qualifierValue = string.Empty;
                    }
                    else if (equalsIndex > 0)
                    {
                        // key is the text between '/' and '='; value is everything after '='
                        qualifierKey = mbfReader.LineData.Substring(1, equalsIndex - 1);
                        qualifierValue = mbfReader.LineData.Substring(equalsIndex + 1);
                    }
                    else
                    {
                        // NOTE(review): this branch cannot be reached - the line data starts
                        // with '/', so '=' is never at index 0. A line such as "/=value"
                        // takes the branch above with an empty key, and that qualifier is
                        // then never added. Confirm whether such lines should be rejected.
                        string message = String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.GenbankInvalidFeature,
                            mbfReader.Line);
                        Trace.Report(message);
                        throw new InvalidDataException(message);
                    }
                }
                else
                {
                    // Continuation of previous line; "note" gets a line break, and
                    // everything else except "translation" and "transl_except" gets a
                    // space to separate words.
                    if (qualifierKey == "note")
                    {
                        qualifierValue += Environment.NewLine;
                    }
                    else if (qualifierKey != "translation" && qualifierKey != "transl_except")
                    {
                        qualifierValue += " ";
                    }

                    qualifierValue += mbfReader.LineData;
                }

                mbfReader.GoToNextLine();
            }
            else if (mbfReader.Line.StartsWith("\t", StringComparison.Ordinal))
            {
                // this seems to be data corruption; but BioPerl test set includes
                // (old, 2003) NT_021877.gbk which has this problem, so we
                // handle it
                ApplicationLog.WriteLine("WARN: nonstandard line format at line {0}: '{1}'", mbfReader.LineNumber, mbfReader.Line);
                qualifierValue += " " + mbfReader.Line.Trim();
                mbfReader.GoToNextLine();
            }
            else
            {
                // anything else ends this feature's qualifier list
                break;
            }
        }

        // add last qualifier
        if (!String.IsNullOrEmpty(qualifierKey))
        {
            AddQualifierToFeature(feature, qualifierKey, qualifierValue);
        }

        // still add feature, even if it has no qualifiers
        featureList.Add(StandardFeatureMap.GetStandardFeatureItem(feature));
    }

    // The metadata is only touched when at least one feature was read.
    if (featureList.Count > 0)
    {
        ((GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey]).Features = features;
    }
}
/// <summary>
/// Parses a single FASTA sequence from a file using MBFTextReader.
/// This method is used in non-data virtualization scenarios.
/// The reader must be positioned on a header line (one starting with ">"); on return it
/// is positioned on the next header line or at the end of the text.
/// </summary>
/// <param name="mbfReader">The MBFTextReader of the file to be parsed.</param>
/// <param name="isReadOnly">
/// Value assigned to the IsReadOnly property of the resulting sequence once all of its
/// symbols have been read.
/// </param>
/// <returns>The parsed sequence.</returns>
/// <exception cref="ArgumentNullException">The reader is null.</exception>
/// <exception cref="FileFormatException">
/// The current line is not a FASTA header, or no alphabet could be identified for a
/// sequence line.
/// </exception>
protected ISequence ParseOneWithSpecificFormat(MBFTextReader mbfReader, bool isReadOnly)
{
    if (mbfReader == null)
    {
        throw new ArgumentNullException("mbfReader");
    }

    if (!mbfReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
    {
        string headerError = string.Format(
            CultureInfo.InvariantCulture,
            Resource.INVALID_INPUT_FILE,
            Resource.FASTA_NAME);
        Trace.Report(headerError);
        throw new FileFormatException(headerError);
    }

    // Process header line: the ID is the header text after the leading marker.
    string sequenceId = mbfReader.GetLineField(2).Trim();
    mbfReader.GoToNextLine();

    // Without a configured alphabet, the first sequence line decides the starting one.
    IAlphabet current = Alphabet;
    if (current == null)
    {
        current = _commonSequenceParser.IdentifyAlphabet(current, mbfReader.Line);
        if (current == null)
        {
            string symbolError = string.Format(
                CultureInfo.InvariantCulture,
                Resource.InvalidSymbolInString,
                mbfReader.Line);
            Trace.Report(symbolError);
            throw new FileFormatException(symbolError);
        }
    }

    Sequence result = Encoding == null
        ? new Sequence(current)
        : new Sequence(current, Encoding, string.Empty) { IsReadOnly = false };
    result.ID = sequenceId;

    while (mbfReader.HasLines)
    {
        if (mbfReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
        {
            // next record starts here
            break;
        }

        if (Alphabet == null)
        {
            // Re-identify the alphabet for every line; when it changes, the symbols read
            // so far are carried over into a sequence built on the new alphabet.
            current = _commonSequenceParser.IdentifyAlphabet(result.Alphabet, mbfReader.Line);
            if (current == null)
            {
                string symbolError = string.Format(
                    CultureInfo.InvariantCulture,
                    Resource.InvalidSymbolInString,
                    mbfReader.Line);
                Trace.Report(symbolError);
                throw new FileFormatException(symbolError);
            }

            if (result.Alphabet != current)
            {
                Sequence upgraded = new Sequence(current, Encoding, result) { IsReadOnly = false };
                result.Clear();
                result = upgraded;
            }
        }

        result.InsertRange(result.Count, mbfReader.Line);
        mbfReader.GoToNextLine();
    }

    if (result.MoleculeType == MoleculeType.Invalid)
    {
        result.MoleculeType = CommonSequenceParser.GetMoleculeType(result.Alphabet);
    }

    result.IsReadOnly = isReadOnly;
    return result;
}
// Parses the consecutive REFERENCE entries of a GenBank record and appends them to the
// reference list of the sequence's GenBank metadata. Each entry starts with a REFERENCE
// line (number plus an optional parenthesized location) followed by indented sub-fields.
// Reading stops at the first line that is neither; that line is left for the caller.
// Throws InvalidDataException for a malformed REFERENCE line, an unknown sub-field or a
// sub-field that appears before any REFERENCE line, and InvalidOperationException when
// the reference number cannot be converted to an int.
private static void ParseReferences(MBFTextReader mbfReader, ref Sequence sequence)
{
    GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
    IList<CitationReference> referenceList = metadata.References;
    CitationReference reference = null;

    while (mbfReader.HasLines)
    {
        if (mbfReader.LineHeader == "REFERENCE")
        {
            // add previous reference
            if (reference != null)
            {
                referenceList.Add(reference);
                reference = null;
            }

            // check for start/end e.g. (bases 1 to 118), or prose notes
            Match m = Regex.Match(mbfReader.LineData, @"^(?<number>\d+)(\s+\((?<location>.*)\))?");
            if (!m.Success)
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParserReferenceError,
                    mbfReader.LineData);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }

            string number = m.Groups["number"].Value;
            string location = m.Groups["location"].Value;

            int outValue;
            if (!int.TryParse(number, out outValue))
            {
                // The exception type is unchanged; it now carries the offending line.
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParserReferenceError,
                    mbfReader.LineData);
                Trace.Report(message);
                throw new InvalidOperationException(message);
            }

            // create new reference
            reference = new CitationReference();
            reference.Number = outValue;
            reference.Location = location;
            mbfReader.GoToNextLine();
        }
        else if (mbfReader.Line.StartsWith(" ", StringComparison.Ordinal))
        {
            if (reference == null)
            {
                // A sub-field with no REFERENCE line in front of it has nothing to
                // attach to; report it instead of failing on a null reference.
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParserInvalidReferenceField,
                    mbfReader.LineHeader);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }

            switch (mbfReader.LineHeader)
            {
                // all the following are extracted the same way - possibly multiline
                case "AUTHORS":
                    reference.Authors = ParseMultiLineData(mbfReader, " ");
                    break;
                case "CONSRTM":
                    reference.Consortiums = ParseMultiLineData(mbfReader, " ");
                    break;
                case "TITLE":
                    reference.Title = ParseMultiLineData(mbfReader, " ");
                    break;
                case "JOURNAL":
                    reference.Journal = ParseMultiLineData(mbfReader, " ");
                    break;
                case "REMARK":
                    reference.Remarks = ParseMultiLineData(mbfReader, " ");
                    break;
                case "MEDLINE":
                    reference.Medline = ParseMultiLineData(mbfReader, " ");
                    break;
                case "PUBMED":
                    reference.PubMed = ParseMultiLineData(mbfReader, " ");
                    break;
                default:
                    string message = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ParserInvalidReferenceField,
                        mbfReader.LineHeader);
                    Trace.Report(message);
                    throw new InvalidDataException(message);
            }
        }
        else
        {
            // don't go to next line; current line still needs to be processed
            break;
        }
    }

    // add last reference - done after the loop so that it is also kept when the text
    // ends right after the reference section
    if (reference != null)
    {
        referenceList.Add(reference);
    }
}
/// <summary>
/// Parses the SOURCE header and its indented ORGANISM sub-field and stores the
/// result as a new <c>SequenceSource</c> in the sequence's GenBank metadata.
/// Parsing stops at the first line that is neither a SOURCE header nor an
/// indented line; that line stays current so the caller can process it.
/// </summary>
/// <param name="mbfReader">A reader for a biological sequence text.</param>
/// <param name="sequence">Sequence whose GenBank metadata receives the source information.</param>
/// <exception cref="InvalidDataException">
/// An indented sub-field other than ORGANISM follows the SOURCE header.
/// </exception>
private static void ParseSource(MBFTextReader mbfReader, ref Sequence sequence)
{
    StringBuilder source = new StringBuilder();
    StringBuilder organism = new StringBuilder();
    StringBuilder classLevels = new StringBuilder();

    while (mbfReader.HasLines)
    {
        if (mbfReader.LineHeader == "SOURCE")
        {
            // data can be multiline. spec says last line must end with period
            // (note: this doesn't apply unless multiline)
            bool lastDotted = true;

            // A new SOURCE header replaces whatever was collected before.
            source.Length = 0;
            source.Append(mbfReader.LineData);
            mbfReader.GoToNextLine();

            while (mbfReader.HasLines && !mbfReader.LineHasHeader)
            {
                source.Append(" ").Append(mbfReader.LineData);
                lastDotted = source.ToString().EndsWith(".", StringComparison.Ordinal);
                mbfReader.GoToNextLine();
            }

            if (!lastDotted && Trace.Want(Trace.SeqWarnings))
            {
                Trace.Report("GenBank.ParseSource", Properties.Resource.OutOfSpec, source.ToString());
            }

            // don't go to next line; current line still needs to be processed
        }
        else if (mbfReader.Line.StartsWith(" ", StringComparison.Ordinal))
        {
            // StartsWith (instead of indexing Line[0]) keeps an empty line from
            // raising IndexOutOfRangeException; such a line simply ends the section.
            if (mbfReader.LineHeader != "ORGANISM")
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParserInvalidSourceField,
                    mbfReader.LineHeader);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }

            // this also can be multiline
            organism.Length = 0;
            organism.Append(mbfReader.LineData);
            mbfReader.GoToNextLine();

            while (mbfReader.HasLines && !mbfReader.LineHasHeader)
            {
                // Continuation lines ending in ';' or '.' are collected as class
                // levels; any other continuation line extends the organism name.
                if (mbfReader.Line.EndsWith(";", StringComparison.Ordinal) ||
                    mbfReader.Line.EndsWith(".", StringComparison.Ordinal))
                {
                    if (classLevels.Length > 0)
                    {
                        classLevels.Append(" ");
                    }

                    classLevels.Append(mbfReader.LineData);
                }
                else
                {
                    organism.Append(" ").Append(mbfReader.LineData);
                }

                mbfReader.GoToNextLine();
            }

            // don't go to next line; current line still needs to be processed
        }
        else
        {
            // don't go to next line; current line still needs to be processed
            break;
        }
    }

    GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
    metadata.Source = new SequenceSource();
    metadata.Source.CommonName = source.ToString();

    string organismName = organism.ToString();
    if (!string.IsNullOrEmpty(organismName))
    {
        // Everything before the first space is the genus, the remainder the species.
        int index = organismName.IndexOf(" ", StringComparison.Ordinal);
        if (index > 0)
        {
            metadata.Source.Organism.Genus = organismName.Substring(0, index);
            metadata.Source.Organism.Species = organismName.Substring(index + 1);
        }
        else
        {
            metadata.Source.Organism.Genus = organismName;
        }
    }

    metadata.Source.Organism.ClassLevels = classLevels.ToString();
}
/// <summary>
/// Parses a single Phylip text from a reader into a sequence alignment.
/// 1. First line has the count of taxa and the length of each sequence
/// 2. Sequences
///     a. First ten characters are the ID
///     b. Sequence itself
/// </summary>
/// <param name="mbfReader">A reader for a biological sequence text.</param>
/// <param name="isReadOnly">
/// Flag to indicate whether the resulting sequence alignment should be in readonly mode or not.
/// If this flag is set to true then the resulting sequence's isReadOnly property
/// will be set to true, otherwise it will be set to false.</param>
/// <returns>A new Sequence Alignment instance containing parsed data.</returns>
/// <exception cref="ArgumentNullException"><paramref name="mbfReader"/> is null.</exception>
/// <exception cref="InvalidDataException">
/// The header line, a sequence line, the alphabet, the sequence count or a
/// sequence length does not match what the format requires.
/// </exception>
protected ISequenceAlignment ParseOneWithSpecificFormat(MBFTextReader mbfReader, bool isReadOnly)
{
    if (mbfReader == null)
    {
        throw new ArgumentNullException("mbfReader");
    }

    string message = string.Empty;

    // Parse first line
    IList<string> tokens = mbfReader.Line.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
    if (2 != tokens.Count)
    {
        message = string.Format(CultureInfo.CurrentCulture, Resource.INVALID_INPUT_FILE, this.Name);
        throw new InvalidDataException(message);
    }

    bool isFirstBlock = true;
    IList<Sequence> data = new List<Sequence>();
    IAlphabet alignmentAlphabet = null;

    int sequenceCount = Int32.Parse(tokens[0], CultureInfo.InvariantCulture);
    int sequenceLength = Int32.Parse(tokens[1], CultureInfo.InvariantCulture);

    mbfReader.GoToNextLine();  // Skip blank lines until we get to the first block.

    // Now that we're at the first block, one or more blank lines are the block separators, which we'll need.
    mbfReader.SkipBlankLines = false;

    while (mbfReader.HasLines)
    {
        if (string.IsNullOrEmpty(mbfReader.Line.Trim()))
        {
            mbfReader.GoToNextLine();
            continue;
        }

        // A data line although the header announced no sequences. Without this
        // check the loop below would never advance the reader and the outer
        // loop would spin forever on the same line.
        if (sequenceCount <= 0)
        {
            throw new InvalidDataException(Properties.Resource.SequenceCountMismatch);
        }

        // Stop early when the input is truncated inside a block; the count and
        // length validations after the loop then report the problem.
        for (int index = 0; index < sequenceCount && mbfReader.HasLines; index++)
        {
            Sequence sequence;

            if (isFirstBlock)
            {
                // First 10 characters are sequence ID, remaining is the first block of sequence
                // Note that both may contain whitespace, and there may be no whitespace between them.
                if (mbfReader.Line.Length <= 10)
                {
                    message = string.Format(CultureInfo.CurrentCulture, Resource.INVALID_INPUT_FILE, this.Name);
                    throw new InvalidDataException(message);
                }

                string id = mbfReader.Line.Substring(0, 10).Trim();
                string sequenceString = Util.Helper.StringRemoveWhitespace(mbfReader.Line.Substring(10));

                IAlphabet alphabet = Alphabet;
                if (null == alphabet)
                {
                    alphabet = _basicParser.IdentifyAlphabet(alphabet, sequenceString);

                    if (null == alphabet)
                    {
                        message = string.Format(
                            CultureInfo.InvariantCulture,
                            Resource.InvalidSymbolInString,
                            sequenceString);
                        throw new InvalidDataException(message);
                    }

                    // Every sequence of the alignment must resolve to the same alphabet.
                    if (null == alignmentAlphabet)
                    {
                        alignmentAlphabet = alphabet;
                    }
                    else if (alignmentAlphabet != alphabet)
                    {
                        message = Properties.Resource.SequenceAlphabetMismatch;
                        throw new InvalidDataException(message);
                    }
                }

                if (Encoding == null)
                {
                    sequence = new Sequence(alphabet, sequenceString);
                }
                else
                {
                    sequence = new Sequence(alphabet, Encoding, sequenceString);
                }

                sequence.ID = id;

                // Keep the sequence writable until all blocks have been appended.
                sequence.IsReadOnly = false;
                data.Add(sequence);
            }
            else
            {
                // Later blocks carry no IDs; lines appear in the order of the first block.
                sequence = data[index];
                sequence.InsertRange(sequence.Count, Util.Helper.StringRemoveWhitespace(mbfReader.Line));
            }

            mbfReader.GoToNextLine();
        }

        // Reset the first block flag
        isFirstBlock = false;
    }

    // Validate the count of sequences
    if (sequenceCount != data.Count)
    {
        throw new InvalidDataException(Properties.Resource.SequenceCountMismatch);
    }

    SequenceAlignment sequenceAlignment = new SequenceAlignment();
    sequenceAlignment.AlignedSequences.Add(new AlignedSequence());

    foreach (Sequence dataSequence in data)
    {
        dataSequence.IsReadOnly = isReadOnly;

        // Validate the length of each sequence
        if (sequenceLength != dataSequence.Count)
        {
            throw new InvalidDataException(Properties.Resource.SequenceLengthMismatch);
        }

        sequenceAlignment.AlignedSequences[0].Sequences.Add(dataSequence);
    }

    return sequenceAlignment;
}
/// <summary>
/// Parses everything before the features section: reads header lines until a
/// FEATURES, BASE COUNT, ORIGIN or CONTIG header is reached and stores the
/// parsed values in the sequence's GenBank metadata.
/// </summary>
/// <param name="mbfReader">A reader for a biological sequence text.</param>
/// <param name="sequence">
/// Sequence whose GenBank metadata is filled in. It is handed by reference to
/// the section parsers, so the metadata object is looked up again after each
/// of those calls.
/// </param>
/// <exception cref="InvalidDataException">
/// A second LOCUS line, a malformed PRIMARY line or an unknown header was
/// found, or no LOCUS line was found at all.
/// </exception>
private void ParseHeaders(MBFTextReader mbfReader, ref Sequence sequence)
{
    GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
    string data = string.Empty;
    string[] tokens = null;

    // set data indent for headers
    mbfReader.DataIndent = _dataIndent;

    // only allow one locus line
    bool haveParsedLocus = false;

    // parse until we hit the features or sequence section
    bool haveFinishedHeaders = false;

    while (mbfReader.HasLines && !haveFinishedHeaders)
    {
        switch (mbfReader.LineHeader)
        {
            case "LOCUS":
                if (haveParsedLocus)
                {
                    string message = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ParserSecondLocus,
                        mbfReader.LocationString);
                    Trace.Report(message);
                    throw new InvalidDataException(message);
                }

                ParseLocusByTokens(mbfReader, ref sequence);
                metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                haveParsedLocus = true;

                // don't go to next line; current line still needs to be processed
                break;

            case "VERSION":
                tokens = mbfReader.LineData.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
                metadata.Version = new GenBankVersion();

                // first token contains accession and version
                if (tokens.Length > 0)
                {
                    Match versionMatch = Regex.Match(tokens[0], @"^(?<accession>\w+)\.(?<version>\d+)$");
                    if (versionMatch.Success)
                    {
                        metadata.Version.Version = versionMatch.Groups["version"].Value;

                        // The first token in the data from the accession line is referred to as
                        // the primary accession number, and should be the one used here in the
                        // version line.
                        string versionLineAccession = versionMatch.Groups["accession"].Value;
                        if (metadata.Accession == null)
                        {
                            ApplicationLog.WriteLine("WARN: VERSION processed before ACCESSION");
                        }
                        else
                        {
                            if (!versionLineAccession.Equals(metadata.Accession.Primary))
                            {
                                ApplicationLog.WriteLine("WARN: VERSION tag doesn't match ACCESSION");
                            }
                            else
                            {
                                metadata.Version.Accession = metadata.Accession.Primary;
                            }
                        }
                    }
                }

                // second token contains primary ID. It is optional: a VERSION line
                // that only carries "accession.version" must not fail the parse.
                if (tokens.Length > 1)
                {
                    Match giMatch = Regex.Match(tokens[1], @"^GI:(?<primaryID>.*)");
                    if (giMatch.Success)
                    {
                        metadata.Version.GINumber = giMatch.Groups["primaryID"].Value;
                    }
                }

                mbfReader.GoToNextLine();
                break;

            case "PROJECT":
                tokens = mbfReader.LineData.Split(':');
                if (tokens.Length == 2)
                {
                    metadata.Project = new ProjectIdentifier();
                    metadata.Project.Name = tokens[0];
                    tokens = tokens[1].Split(',');
                    for (int i = 0; i < tokens.Length; i++)
                    {
                        metadata.Project.Numbers.Add(tokens[i]);
                    }
                }
                else
                {
                    ApplicationLog.WriteLine("WARN: unexpected PROJECT header: " + mbfReader.Line);
                }

                mbfReader.GoToNextLine();
                break;

            case "SOURCE":
                ParseSource(mbfReader, ref sequence);
                metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

                // don't go to next line; current line still needs to be processed
                break;

            case "REFERENCE":
                ParseReferences(mbfReader, ref sequence);   // can encounter more than one
                metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

                // don't go to next line; current line still needs to be processed
                break;

            case "COMMENT":
                ParseComments(mbfReader, ref sequence);   // can encounter more than one
                metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

                // don't go to next line; current line still needs to be processed
                break;

            case "PRIMARY":
                // This header is followed by sequence info in a table format that could be
                // stored in a custom object.  The first line contains column headers.
                // For now, just validate the presence of the headers, and save the data
                // as a string.
                tokens = mbfReader.LineData.Split("\t ".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);

                // The header line must consist of exactly four column headers.
                if (tokens.Length != 4)
                {
                    string message = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ParserPrimaryLineError,
                        mbfReader.Line);
                    Trace.Report(message);
                    throw new InvalidDataException(message);
                }

                string primaryData = ParseMultiLineData(mbfReader, Environment.NewLine);
                metadata.Primary = primaryData;

                // don't go to next line; current line still needs to be processed
                break;

            // all the following are extracted the same way - possibly multiline
            case "DEFINITION":
                metadata.Definition = ParseMultiLineData(mbfReader, " ");
                break;

            case "ACCESSION":
                data = ParseMultiLineData(mbfReader, " ");
                metadata.Accession = new GenBankAccession();
                string[] accessions = data.Split(' ');

                // First entry is the primary accession, all others are secondary.
                metadata.Accession.Primary = accessions[0];
                for (int i = 1; i < accessions.Length; i++)
                {
                    metadata.Accession.Secondary.Add(accessions[i]);
                }

                break;

            case "DBLINK":
                tokens = mbfReader.LineData.Split(':');
                if (tokens.Length == 2)
                {
                    metadata.DBLink = new CrossReferenceLink();
                    if (string.Compare(
                            tokens[0],
                            CrossReferenceType.Project.ToString(),
                            StringComparison.OrdinalIgnoreCase) == 0)
                    {
                        metadata.DBLink.Type = CrossReferenceType.Project;
                    }
                    else
                    {
                        metadata.DBLink.Type = CrossReferenceType.TraceAssemblyArchive;
                    }

                    tokens = tokens[1].Split(',');
                    for (int i = 0; i < tokens.Length; i++)
                    {
                        metadata.DBLink.Numbers.Add(tokens[i]);
                    }
                }
                else
                {
                    ApplicationLog.WriteLine("WARN: unexpected DBLINK header: " + mbfReader.Line);
                }

                mbfReader.GoToNextLine();
                break;

            case "DBSOURCE":
                metadata.DBSource = ParseMultiLineData(mbfReader, " ");
                break;

            case "KEYWORDS":
                metadata.Keywords = ParseMultiLineData(mbfReader, " ");
                break;

            case "SEGMENT":
                data = ParseMultiLineData(mbfReader, " ");

                // Expected form is "<current> of <count>". Split on the word "of"
                // itself rather than on the individual characters 'o' and 'f'.
                tokens = data.Split(new string[] { "of" }, StringSplitOptions.RemoveEmptyEntries);
                int outvalue;

                // ParseMultiLineData has already moved the reader past the SEGMENT
                // data, so the warnings report the parsed data, not the current line.
                if (tokens.Length == 2)
                {
                    metadata.Segment = new SequenceSegment();
                    if (int.TryParse(tokens[0].Trim(), out outvalue))
                    {
                        metadata.Segment.Current = outvalue;
                    }
                    else
                    {
                        ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + data);
                    }

                    if (int.TryParse(tokens[1].Trim(), out outvalue))
                    {
                        metadata.Segment.Count = outvalue;
                    }
                    else
                    {
                        ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + data);
                    }
                }
                else
                {
                    ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + data);
                }

                break;

            // all the following indicate sections beyond the headers parsed by this method
            case "FEATURES":
            case "BASE COUNT":
            case "ORIGIN":
            case "CONTIG":
                haveFinishedHeaders = true;
                break;

            default:
                ApplicationLog.WriteLine(
                    ToString() + "WARN: unknown {0} -> {1}",
                    mbfReader.LineHeader,
                    mbfReader.LineData);
                string errMessage = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParseHeaderError,
                    mbfReader.LineHeader);
                Trace.Report(errMessage);
                throw new InvalidDataException(errMessage);
        }
    }

    // check for required features
    if (!haveParsedLocus)
    {
        string message = string.Format(CultureInfo.CurrentCulture, Resource.INVALID_INPUT_FILE, this.Name);
        Trace.Report(message);
        throw new InvalidDataException(message);
    }
}
/// <summary>
/// Reads a characters block: picks up the declared sequence length from the
/// DIMENSIONS statement, skips the FORMAT statement and gathers the MATRIX
/// rows that belong to one of the given sequence IDs.
/// </summary>
/// <param name="mbfReader">A reader for a biological sequence text.</param>
/// <param name="IDs">List of sequence IDs</param>
/// <returns>Sequence text per sequence ID; rows repeating an ID are concatenated.</returns>
/// <exception cref="FormatException">
/// A collected sequence does not have the length declared by "nchar=".
/// </exception>
private static Dictionary<string, string> ParseCharacterBlock(MBFTextReader mbfReader, IList<string> IDs)
{
    var sequenceById = new Dictionary<string, string>();
    int declaredLength = 0;
    bool insideBlock = true;

    while (mbfReader.HasLines && insideBlock)
    {
        mbfReader.GoToNextLine();
        IList<string> tokens = GetTokens(mbfReader.Line);

        if (string.Equals("DIMENSIONS", tokens[0], StringComparison.OrdinalIgnoreCase))
        {
            // Blank out the keyword so only the settings remain to be scanned.
            tokens[0] = string.Empty;

            // The statement may span several lines; it ends with a ';'.
            bool statementClosed;
            do
            {
                foreach (string token in tokens)
                {
                    string setting = token.Trim(';');
                    if (!string.IsNullOrEmpty(setting)
                        && setting.StartsWith("nchar=", StringComparison.OrdinalIgnoreCase))
                    {
                        // Length of sequence
                        declaredLength = Int32.Parse(setting.Substring(6), CultureInfo.InvariantCulture);
                    }
                }

                statementClosed = mbfReader.Line.Trim().EndsWith(";", StringComparison.OrdinalIgnoreCase);
                if (!statementClosed)
                {
                    mbfReader.GoToNextLine();
                    tokens = GetTokens(mbfReader.Line);
                }
            }
            while (!statementClosed && mbfReader.HasLines);
        }
        else if (string.Equals("FORMAT", tokens[0], StringComparison.OrdinalIgnoreCase))
        {
            tokens[0] = string.Empty;

            // The format settings ("missing", "gap", "matchchar", data type) are
            // not evaluated; just move on to the line that closes the statement.
            while (!mbfReader.Line.Trim().EndsWith(";", StringComparison.OrdinalIgnoreCase))
            {
                mbfReader.GoToNextLine();
                tokens = GetTokens(mbfReader.Line);
                if (!mbfReader.HasLines)
                {
                    break;
                }
            }
        }

        // Not chained to the tests above: after a multi-line statement "tokens"
        // holds the tokens of that statement's last line, which is examined here.
        string leadingToken = tokens[0];
        if (string.Equals("MATRIX", leadingToken, StringComparison.OrdinalIgnoreCase))
        {
            tokens[0] = string.Empty;

            // "If available" ignore the data in square brackets []
            while (mbfReader.HasLines
                && mbfReader.Line.StartsWith("[", StringComparison.OrdinalIgnoreCase))
            {
                mbfReader.GoToNextLine();
            }

            // Here are the alignment sequences
            while (mbfReader.HasLines)
            {
                mbfReader.GoToNextLine();
                if (string.IsNullOrEmpty(mbfReader.Line.Trim()))
                {
                    continue;
                }

                tokens = GetTokens(mbfReader.Line);
                string rowId = tokens[0];

                // A token starting with ';' closes the matrix and the block.
                if (rowId.StartsWith(";", StringComparison.OrdinalIgnoreCase))
                {
                    insideBlock = false;
                    break;
                }

                if (IDs.Contains(rowId))
                {
                    string segment = tokens[1];
                    string collected;
                    sequenceById[rowId] = sequenceById.TryGetValue(rowId, out collected)
                        ? string.Concat(collected, segment)
                        : segment;
                }
            }
        }
        else if (leadingToken.StartsWith(";", StringComparison.OrdinalIgnoreCase))
        {
            insideBlock = false;
        }
    }

    // Read the end line "end;"
    mbfReader.GoToNextLine();

    // Validate the length of sequence
    foreach (string collectedSequence in sequenceById.Values)
    {
        if (collectedSequence.Length != declaredLength)
        {
            throw new FormatException(Properties.Resource.SequenceLengthMismatch);
        }
    }

    return sequenceById;
}
/// <summary>
/// Parses a single ClustalW text from a reader into a sequence alignment.
/// </summary>
/// <param name="mbfReader">A reader for a biological sequence text.</param>
/// <param name="isReadOnly">
/// Flag to indicate whether the resulting sequence alignment should be in readonly mode or not.
/// If this flag is set to true then the resulting sequence's isReadOnly property
/// will be set to true, otherwise it will be set to false.</param>
/// <returns>A new Sequence Alignment instance containing parsed data.</returns>
/// <exception cref="ArgumentNullException"><paramref name="mbfReader"/> is null.</exception>
/// <exception cref="InvalidDataException">
/// The text does not start with "CLUSTAL", a data line has fewer than two
/// tokens, the alphabet cannot be determined or differs between sequences,
/// or a later block names a sequence that the first block did not contain.
/// </exception>
protected ISequenceAlignment ParseOneWithSpecificFormat(MBFTextReader mbfReader, bool isReadOnly)
{
    if (mbfReader == null)
    {
        throw new ArgumentNullException("mbfReader");
    }

    string message = string.Empty;

    if (!mbfReader.Line.StartsWith("CLUSTAL", StringComparison.OrdinalIgnoreCase))
    {
        message = string.Format(CultureInfo.CurrentCulture, Resource.INVALID_INPUT_FILE, this.Name);
        throw new InvalidDataException(message);
    }

    mbfReader.GoToNextLine();  // Skip blank lines until we get to the first block.

    // Now that we're at the first block, one or more blank lines are the block separators, which we'll need.
    mbfReader.SkipBlankLines = false;

    Dictionary<string, ISequence> mapIdToSequence = new Dictionary<string, ISequence>();
    IAlphabet alignmentAlphabet = null;
    bool isFirstBlock = true;
    bool inBlock = false;

    while (mbfReader.HasLines)
    {
        // Blank line or consensus line signals end of block.
        if (String.IsNullOrEmpty(mbfReader.Line) ||
            Helper.ContainsOnly(mbfReader.Line, '*', ' ', '.', '+', ':'))
        {
            if (inBlock)
            {
                // Blank line signifies end of block
                inBlock = false;
                isFirstBlock = false;
            }
        }
        else
        {
            // It's a data line in a block.
            // Lines begin with sequence id, then the sequence segment, and optionally a number, which we will ignore
            string[] tokens = mbfReader.Line.Split((char[])null, StringSplitOptions.RemoveEmptyEntries); // (char[])null uses whitespace delimiters

            // A data line needs at least an id and a segment. Report anything
            // else as invalid input rather than an IndexOutOfRangeException.
            if (tokens.Length < 2)
            {
                message = string.Format(CultureInfo.CurrentCulture, Resource.INVALID_INPUT_FILE, this.Name);
                throw new InvalidDataException(message);
            }

            string id = tokens[0];
            string data = tokens[1].ToUpper(CultureInfo.InvariantCulture);
            Sequence sequence = null;
            IAlphabet alphabet = Alphabet;

            inBlock = true;
            if (isFirstBlock)
            {
                if (null == alphabet)
                {
                    alphabet = _basicParser.IdentifyAlphabet(alphabet, data);

                    if (null == alphabet)
                    {
                        message = string.Format(
                            CultureInfo.InvariantCulture,
                            Resource.InvalidSymbolInString,
                            data);
                        throw new InvalidDataException(message);
                    }

                    // Every sequence of the alignment must resolve to the same alphabet.
                    if (null == alignmentAlphabet)
                    {
                        alignmentAlphabet = alphabet;
                    }
                    else if (alignmentAlphabet != alphabet)
                    {
                        message = string.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.SequenceAlphabetMismatch);
                        throw new InvalidDataException(message);
                    }
                }

                if (Encoding == null)
                {
                    sequence = new Sequence(alphabet, data);
                }
                else
                {
                    sequence = new Sequence(alphabet, Encoding, data);
                }

                sequence.ID = id;

                // Keep the sequence writable until all blocks have been appended.
                sequence.IsReadOnly = false;
                mapIdToSequence.Add(id, sequence);
            }
            else
            {
                // Later blocks may only extend sequences introduced by the first block.
                ISequence knownSequence;
                if (!mapIdToSequence.TryGetValue(id, out knownSequence))
                {
                    message = string.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ClustalUnknownSequence,
                        id);
                    throw new InvalidDataException(message);
                }

                sequence = (Sequence)knownSequence;
                sequence.InsertRange(sequence.Count, data);
            }
        }

        mbfReader.GoToNextLine();
    }

    SequenceAlignment sequenceAlignment = new SequenceAlignment();
    sequenceAlignment.AlignedSequences.Add(new AlignedSequence());

    foreach (Sequence alignmentSequence in mapIdToSequence.Values)
    {
        alignmentSequence.IsReadOnly = isReadOnly;
        sequenceAlignment.AlignedSequences[0].Sequences.Add(alignmentSequence);
    }

    return sequenceAlignment;
}
/// <summary>
/// Parses a single Nexus text from a reader into a sequence alignment.
/// </summary>
/// <param name="mbfReader">A reader for a biological sequence text.</param>
/// <param name="isReadOnly">
/// Flag to indicate whether the resulting sequence alignment should be in readonly mode or not.
/// If this flag is set to true then the resulting sequence's isReadOnly property
/// will be set to true, otherwise it will be set to false.</param>
/// <returns>A new Sequence Alignment instance containing parsed data.</returns>
/// <exception cref="ArgumentNullException"><paramref name="mbfReader"/> is null.</exception>
/// <exception cref="InvalidDataException">
/// A characters block appears before any taxa block, a taxon has no sequence
/// data, or the alphabet cannot be determined or differs between sequences.
/// </exception>
protected ISequenceAlignment ParseOneWithSpecificFormat(MBFTextReader mbfReader, bool isReadOnly)
{
    if (mbfReader == null)
    {
        throw new ArgumentNullException("mbfReader");
    }

    ParseHeader(mbfReader);

    string message = string.Empty;
    ISequenceAlignment sequenceAlignment = new SequenceAlignment();
    sequenceAlignment.AlignedSequences.Add(new AlignedSequence());

    // Sequence IDs announced by the taxa block; null until that block was parsed.
    IList<string> ids = null;
    bool isInBlock = true;

    if (mbfReader.Line.StartsWith("begin", StringComparison.OrdinalIgnoreCase))
    {
        while (mbfReader.HasLines && isInBlock)
        {
            if (string.IsNullOrEmpty(mbfReader.Line.Trim()))
            {
                mbfReader.GoToNextLine();
                continue;
            }

            // The block name is the second token of the line, e.g. "begin taxa;".
            string blockName = GetTokens(mbfReader.Line)[1];

            switch (blockName.ToUpper(CultureInfo.InvariantCulture))
            {
                case "TAXA":
                case "TAXA;":
                    // This block contains the count of sequence & title of each sequence
                    ids = (IList<string>)ParseTaxaBlock(mbfReader);
                    break;

                case "CHARACTERS":
                case "CHARACTERS;":
                    // The sequences can only be assigned to IDs the taxa block declared.
                    // Report a missing taxa block as invalid input instead of failing
                    // with a NullReferenceException further down.
                    if (ids == null)
                    {
                        message = string.Format(CultureInfo.CurrentCulture, Resource.INVALID_INPUT_FILE, this.Name);
                        throw new InvalidDataException(message);
                    }

                    // Block contains sequences
                    Dictionary<string, string> dataSet = ParseCharacterBlock(mbfReader, ids);

                    IAlphabet alignmentAlphabet = null;

                    foreach (string ID in ids)
                    {
                        IAlphabet alphabet = Alphabet;
                        Sequence sequence = null;

                        // A declared taxon without any matrix row is invalid input.
                        string data;
                        if (!dataSet.TryGetValue(ID, out data))
                        {
                            message = string.Format(CultureInfo.CurrentCulture, Resource.INVALID_INPUT_FILE, this.Name);
                            throw new InvalidDataException(message);
                        }

                        if (null == alphabet)
                        {
                            alphabet = _basicParser.IdentifyAlphabet(alphabet, data);

                            if (null == alphabet)
                            {
                                message = string.Format(
                                    CultureInfo.InvariantCulture,
                                    Resource.InvalidSymbolInString,
                                    data);
                                throw new InvalidDataException(message);
                            }

                            // Every sequence of the alignment must resolve to the same alphabet.
                            if (null == alignmentAlphabet)
                            {
                                alignmentAlphabet = alphabet;
                            }
                            else if (alignmentAlphabet != alphabet)
                            {
                                message = string.Format(
                                    CultureInfo.InvariantCulture,
                                    Properties.Resource.SequenceAlphabetMismatch);
                                throw new InvalidDataException(message);
                            }
                        }

                        if (Encoding == null)
                        {
                            sequence = new Sequence(alphabet, data);
                        }
                        else
                        {
                            sequence = new Sequence(alphabet, Encoding, data);
                        }

                        // Assign the ID before the read-only flag so the sequence is
                        // never modified after it may have been made read-only.
                        sequence.ID = ID;
                        sequence.IsReadOnly = isReadOnly;
                        sequenceAlignment.AlignedSequences[0].Sequences.Add(sequence);
                    }

                    break;

                case "END":
                case "END;":
                    // Have reached the end of block
                    isInBlock = false;
                    break;

                default:
                    // skip this block
                    while (mbfReader.HasLines)
                    {
                        mbfReader.GoToNextLine();
                        if (0 == string.Compare(mbfReader.Line, "end;", StringComparison.OrdinalIgnoreCase))
                        {
                            break;
                        }
                    }

                    break;
            }

            mbfReader.GoToNextLine();
        }
    }

    return sequenceAlignment;
}
/// <summary>
/// Reads one four-line FASTQ record (header, sequence, quality header, quality
/// scores) from the reader and turns it into a QualitativeSequence.
/// </summary>
/// <param name="mbfReader">A reader for a biological sequence text.</param>
/// <param name="isReadOnly">
/// Value assigned to the IsReadOnly property of the resulting QualitativeSequence.
/// </param>
/// <returns>A new QualitativeSequence instance containing parsed data.</returns>
/// <exception cref="FileFormatException">
/// One of the four lines is missing or malformed, the quality header names a
/// different ID, the score count differs from the sequence length, or no
/// alphabet fits the sequence.
/// </exception>
private IQualitativeSequence ParseOneWithFastQFormat(MBFTextReader mbfReader, bool isReadOnly)
{
    string error;
    string detail;

    // Line 1: header, must begin with '@'.
    if (!(mbfReader.HasLines && mbfReader.Line.StartsWith("@", StringComparison.Ordinal)))
    {
        error = string.Format(CultureInfo.CurrentCulture, Resource.INVALID_INPUT_FILE, Name);
        Trace.Report(error);
        throw new FileFormatException(error);
    }

    // Everything after the '@' marker is the sequence ID.
    string id = mbfReader.GetLineField(2).Trim();

    // Line 2: the sequence, must not be empty.
    mbfReader.GoToNextLine();
    if (!(mbfReader.HasLines && !string.IsNullOrEmpty(mbfReader.Line)))
    {
        detail = string.Format(CultureInfo.CurrentCulture, Resource.FastQ_InvalidSequenceLine, id);
        error = string.Format(CultureInfo.CurrentCulture, Resource.IOFormatErrorMessage, Name, detail);
        Trace.Report(error);
        throw new FileFormatException(error);
    }

    string sequenceLine = mbfReader.Line;

    // Line 3: quality score header, must begin with '+'.
    mbfReader.GoToNextLine();
    if (!(mbfReader.HasLines && mbfReader.Line.StartsWith("+", StringComparison.Ordinal)))
    {
        detail = string.Format(CultureInfo.CurrentCulture, Resource.FastQ_InvalidQualityScoreHeaderLine, id);
        error = string.Format(CultureInfo.CurrentCulture, Resource.IOFormatErrorMessage, Name, detail);
        Trace.Report(error);
        throw new FileFormatException(error);
    }

    // The ID may be repeated after the '+'; when present it has to match.
    string repeatedId = mbfReader.GetLineField(2).Trim();
    if (!string.IsNullOrEmpty(repeatedId) && !id.Equals(repeatedId))
    {
        detail = string.Format(CultureInfo.CurrentCulture, Resource.FastQ_InvalidQualityScoreHeaderData, id);
        error = string.Format(CultureInfo.CurrentCulture, Resource.IOFormatErrorMessage, Name, detail);
        Trace.Report(error);
        throw new FileFormatException(error);
    }

    // Line 4: the quality scores, must not be empty.
    mbfReader.GoToNextLine();
    if (!(mbfReader.HasLines && !string.IsNullOrEmpty(mbfReader.Line)))
    {
        detail = string.Format(CultureInfo.CurrentCulture, Resource.FastQ_EmptyQualityScoreLine, id);
        error = string.Format(CultureInfo.CurrentCulture, Resource.IOFormatErrorMessage, Name, detail);
        Trace.Report(error);
        throw new FileFormatException(error);
    }

    string qualityLine = mbfReader.Line;
    byte[] qualScores = ASCIIEncoding.ASCII.GetBytes(qualityLine);

    // One quality score is required per sequence symbol.
    if (sequenceLine.Length != qualityLine.Length)
    {
        detail = string.Format(CultureInfo.CurrentCulture, Resource.FastQ_InvalidQualityScoresLength, id);
        error = string.Format(CultureInfo.CurrentCulture, Resource.IOFormatErrorMessage, Name, detail);
        Trace.Report(error);
        throw new FileFormatException(error);
    }

    // Move past this record.
    mbfReader.GoToNextLine();

    // Fall back to detecting the alphabet from the sequence when none is configured.
    IAlphabet alphabet = Alphabet;
    if (alphabet == null)
    {
        alphabet = _commonSequenceParser.IdentifyAlphabet(alphabet, sequenceLine);

        if (alphabet == null)
        {
            detail = string.Format(CultureInfo.CurrentCulture, Resource.InvalidSymbolInString, sequenceLine);
            error = string.Format(CultureInfo.CurrentCulture, Resource.IOFormatErrorMessage, Name, detail);
            Trace.Report(error);
            throw new FileFormatException(error);
        }
    }

    // The configured format type is used unless auto detection is switched on.
    FastQFormatType formatType = FastqType;
    if (AutoDetectFastQFormat)
    {
        formatType = IdentifyFastQFormatType(qualScores);
    }

    QualitativeSequence parsed = Encoding == null
        ? new QualitativeSequence(alphabet, formatType, sequenceLine, qualScores)
        : new QualitativeSequence(alphabet, formatType, Encoding, sequenceLine, qualScores);

    parsed.ID = id;
    parsed.IsReadOnly = isReadOnly;

    return parsed;
}
/// <summary>
/// Reads a taxa block: takes the announced sequence count from the DIMENSIONS
/// statement ("ntax=") and the sequence titles from the TAXLABELS statement.
/// </summary>
/// <param name="mbfReader">A reader for a biological sequence text.</param>
/// <returns>List of sequence IDs</returns>
/// <exception cref="InvalidDataException">
/// The number of labels read differs from the announced "ntax" value.
/// </exception>
private static IList<string> ParseTaxaBlock(MBFTextReader mbfReader)
{
    IList<string> labels = new List<string>();
    int announcedCount = 0;
    bool insideBlock = true;

    while (mbfReader.HasLines && insideBlock)
    {
        mbfReader.GoToNextLine();
        IList<string> tokens = GetTokens(mbfReader.Line);

        // Set once the line closing the current statement (ends with ';') was handled.
        bool statementClosed;

        switch (tokens[0].ToUpper(CultureInfo.InvariantCulture))
        {
            case "DIMENSIONS":
                // Blank out the keyword so only the settings remain to be scanned.
                tokens[0] = string.Empty;

                do
                {
                    foreach (string token in tokens)
                    {
                        string setting = token.Trim(';');
                        if (!string.IsNullOrEmpty(setting)
                            && setting.StartsWith("ntax=", StringComparison.OrdinalIgnoreCase))
                        {
                            // Read count of sequence
                            announcedCount = Int32.Parse(setting.Substring(5), CultureInfo.InvariantCulture);
                        }
                    }

                    statementClosed = mbfReader.Line.Trim().EndsWith(";", StringComparison.OrdinalIgnoreCase);
                    if (!statementClosed)
                    {
                        mbfReader.GoToNextLine();
                        tokens = GetTokens(mbfReader.Line);
                    }
                }
                while (!statementClosed && mbfReader.HasLines);

                break;

            case "TAXLABELS":
            case "TAXLABELS;":
                // Blank out the keyword so it is not mistaken for a label.
                tokens[0] = string.Empty;

                do
                {
                    foreach (string token in tokens)
                    {
                        string label = token.Trim(';');
                        if (!string.IsNullOrEmpty(label))
                        {
                            // Read IDs of sequence
                            labels.Add(label);
                        }
                    }

                    statementClosed = mbfReader.Line.Trim().EndsWith(";", StringComparison.OrdinalIgnoreCase);
                    if (!statementClosed)
                    {
                        mbfReader.GoToNextLine();
                        tokens = GetTokens(mbfReader.Line);
                    }
                }
                while (!statementClosed && mbfReader.HasLines);

                break;

            case "END":
            case "END;":
                // Have reached the end of taxa block
                insideBlock = false;
                break;
        }
    }

    // Read the end line "end;"
    mbfReader.GoToNextLine();

    // Validate the count
    if (announcedCount != labels.Count)
    {
        throw new InvalidDataException(Properties.Resource.NtaxMismatch);
    }

    return labels;
}