// Emits every header section that precedes the features section, in a fixed order.
// Nothing is written when the sequence carries no GenBank metadata.
private void WriteHeaders(ISequence sequence, TextWriter writer)
{
    GenBankMetadata genBankData = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
    if (genBankData == null)
    {
        return;
    }

    WriteLocus(sequence, writer);
    WriteHeaderSection("DEFINITION", genBankData.Definition, writer);

    if (genBankData.Accession != null)
    {
        WriteHeaderSection("ACCESSION", Helper.GetGenBankAccession(genBankData.Accession), writer);

        // VERSION is built from the primary accession, so it is only written
        // when an accession is present.
        if (genBankData.Version != null)
        {
            string versionLine = genBankData.Accession.Primary + "." + genBankData.Version.Version;
            if (!string.IsNullOrEmpty(genBankData.Version.GINumber))
            {
                versionLine += " GI:" + genBankData.Version.GINumber;
            }

            if (versionLine.Length > 0)
            {
                WriteHeaderSection("VERSION", versionLine, writer);
            }
        }
    }

    if (genBankData.Project != null)
    {
        WriteHeaderSection("PROJECT", Helper.GetProjectIdentifier(genBankData.Project), writer);
    }

    if (genBankData.DBLink != null)
    {
        WriteHeaderSection("DBLINK", Helper.GetCrossReferenceLink(genBankData.DBLink), writer);
    }

    WriteHeaderSection("DBSOURCE", genBankData.DBSource, writer);
    WriteHeaderSection("KEYWORDS", genBankData.Keywords, writer);

    if (genBankData.Segment != null)
    {
        WriteHeaderSection("SEGMENT", Helper.GetSequenceSegment(genBankData.Segment), writer);
    }

    WriteSource(genBankData, writer);
    WriteReferences(genBankData, writer);
    WriteComments(genBankData, writer);
    WriteHeaderSection("PRIMARY", genBankData.Primary, writer);
}
// Reads the trailing part of a record: optional BASE COUNT and CONTIG sections and
// the ORIGIN section with the sequence data. Stops after consuming the "//" line
// that terminates the record, or when the reader has no more lines.
// Throws InvalidDataException for any other line header.
private void ParseSequence(MBFTextReader mbfReader, ref Sequence sequence)
{
    GenBankMetadata genBankData = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

    // set data indent for sequence headers
    mbfReader.DataIndent = _dataIndent;

    while (mbfReader.HasLines)
    {
        if (mbfReader.Line.StartsWith("//", StringComparison.Ordinal))
        {
            // end of sequence record
            mbfReader.GoToNextLine();
            return;
        }

        string header = mbfReader.LineHeader;
        if (header == "BASE COUNT")
        {
            // The BASE COUNT linetype is obsolete and was removed from the GenBank
            // flatfile format in October 2003, but it is kept when present. The
            // untrimmed text is taken because it starts with a right justified column.
            genBankData.BaseCount = mbfReader.Line.Substring(_dataIndent);
            mbfReader.GoToNextLine();
        }
        else if (header == "ORIGIN")
        {
            // Change Note: the alphabet is validated after the sequence is parsed,
            // not on every line, to keep large sequences fast.
            ParseOrigin(mbfReader, genBankData, ref sequence);
        }
        else if (header == "CONTIG")
        {
            // don't go to next line; current line still needs to be processed
            genBankData.Contig = ParseMultiLineData(mbfReader, Environment.NewLine);
        }
        else
        {
            string errorText = String.Format(
                CultureInfo.CurrentCulture,
                Properties.Resource.ParserUnexpectedLineInSequence,
                mbfReader.Line);
            Trace.Report(errorText);
            throw new InvalidDataException(errorText);
        }
    }
}
// Writes the FEATURES table: one entry per feature (key plus location string),
// followed by one line per qualifier value.
// Throws InvalidOperationException when no LocationBuilder is configured.
private void WriteFeatures(ISequence sequence, TextWriter writer)
{
    ILocationBuilder builder = LocationBuilder;
    if (builder == null)
    {
        throw new InvalidOperationException(Resource.NullLocationBuild);
    }

    GenBankMetadata genBankData = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
    if (genBankData == null || genBankData.Features == null)
    {
        return;
    }

    WriteFeatureSection("FEATURES", "Location/Qualifiers", writer);

    // features are written in the order they appear in the list
    foreach (FeatureItem item in genBankData.Features.All)
    {
        WriteFeatureSection(
            _featureHeaderIndentString + item.Key,
            builder.GetLocationString(item.Location),
            writer);

        // Qualifier keys are not unique, so each key maps to a list of values
        // and every value gets its own line.
        foreach (KeyValuePair<string, List<string>> qualifierGroup in item.Qualifiers)
        {
            foreach (string value in qualifierGroup.Value)
            {
                string line = value == string.Empty
                    ? "/" + qualifierGroup.Key
                    : "/" + qualifierGroup.Key + "=" + value;

                // use a blank header; the qualifier key is part of the data
                WriteFeatureSection(string.Empty, line, writer);
            }
        }
    }
}
// Writes one REFERENCE section, with its sub-sections, for every citation in the metadata.
// Does nothing when the metadata has no reference list.
private void WriteReferences(GenBankMetadata metadata, TextWriter writer)
{
    if (metadata.References == null)
    {
        return;
    }

    foreach (CitationReference citation in metadata.References)
    {
        // First line: the reference number, padded and followed by the
        // parenthesised location when one is present.
        string numberText = citation.Number.ToString();
        string firstLine = string.IsNullOrEmpty(citation.Location)
            ? numberText
            : numberText.PadRight(3) + "(" + citation.Location + ")";

        WriteHeaderSection("REFERENCE", firstLine, writer);
        WriteHeaderSection(" AUTHORS", citation.Authors, writer);
        WriteHeaderSection(" CONSRTM", citation.Consortiums, writer);
        WriteHeaderSection(" TITLE", citation.Title, writer);
        WriteHeaderSection(" JOURNAL", citation.Journal, writer);
        WriteHeaderSection(" MEDLINE", citation.Medline, writer);
        WriteHeaderSection(" PUBMED", citation.PubMed, writer);
        WriteHeaderSection(" REMARK", citation.Remarks, writer);
    }
}
// Parses the SOURCE section and its ORGANISM sub-section, then stores the result in
// the sequence's GenBank metadata as a new SequenceSource.
//   - SOURCE text (possibly multiline) becomes Source.CommonName.
//   - The ORGANISM text is split at the first space into Genus and Species.
//   - ORGANISM continuation lines ending in ';' or '.' are collected into ClassLevels.
// Parsing stops, without consuming the line, at the first line that is neither a
// SOURCE header nor an indented sub-header.
// Throws InvalidDataException when an indented sub-header other than ORGANISM is found.
private static void ParseSource(MBFTextReader mbfReader, ref Sequence sequence)
{
    string source = string.Empty;
    string organism = string.Empty;
    string classLevels = string.Empty;

    while (mbfReader.HasLines)
    {
        if (mbfReader.LineHeader == "SOURCE")
        {
            // data can be multiline. spec says last line must end with period
            // (note: this doesn't apply unless multiline)
            bool lastDotted = true;
            source = mbfReader.LineData;
            mbfReader.GoToNextLine();
            while (mbfReader.HasLines && !mbfReader.LineHasHeader)
            {
                source += " " + mbfReader.LineData;
                lastDotted = source.EndsWith(".", StringComparison.Ordinal);
                mbfReader.GoToNextLine();
            }

            if (!lastDotted && Trace.Want(Trace.SeqWarnings))
            {
                Trace.Report("GenBank.ParseSource", Properties.Resource.OutOfSpec, source);
            }

            // don't go to next line; current line still needs to be processed
        }
        else if (mbfReader.Line.StartsWith(" ", StringComparison.Ordinal))
        {
            // StartsWith (not Line[0]) so that an empty line ends the section
            // without an IndexOutOfRangeException.
            if (mbfReader.LineHeader != "ORGANISM")
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParserInvalidSourceField,
                    mbfReader.LineHeader);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }

            // this also can be multiline
            organism = mbfReader.LineData;
            mbfReader.GoToNextLine();
            while (mbfReader.HasLines && !mbfReader.LineHasHeader)
            {
                // Continuation lines terminated by ';' or '.' are treated as
                // class levels; any other line extends the organism name.
                if (mbfReader.Line.EndsWith(";", StringComparison.Ordinal) ||
                    mbfReader.Line.EndsWith(".", StringComparison.Ordinal))
                {
                    if (!String.IsNullOrEmpty(classLevels))
                    {
                        classLevels += " ";
                    }

                    classLevels += mbfReader.LineData;
                }
                else
                {
                    organism += " " + mbfReader.LineData;
                }

                mbfReader.GoToNextLine();
            }

            // don't go to next line; current line still needs to be processed
        }
        else
        {
            // don't go to next line; current line still needs to be processed
            break;
        }
    }

    GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
    metadata.Source = new SequenceSource();
    metadata.Source.CommonName = source;

    if (!string.IsNullOrEmpty(organism))
    {
        int index = organism.IndexOf(" ", StringComparison.Ordinal);
        if (index > 0)
        {
            // Everything before the first space is the genus, the rest is the species.
            metadata.Source.Organism.Genus = organism.Substring(0, index);
            metadata.Source.Organism.Species = organism.Substring(index + 1);
        }
        else
        {
            metadata.Source.Organism.Genus = organism;
        }
    }

    metadata.Source.Organism.ClassLevels = classLevels;
}
// Parses one or more consecutive REFERENCE sections and appends each parsed
// CitationReference to the References list of the sequence's GenBank metadata.
// A REFERENCE line must start with the reference number, optionally followed by a
// parenthesised location, e.g. "1 (bases 1 to 118)". Indented sub-headers
// (AUTHORS, CONSRTM, TITLE, JOURNAL, REMARK, MEDLINE, PUBMED) fill in the current reference.
// Parsing stops, without consuming the line, at the first line that belongs to
// neither a REFERENCE header nor an indented sub-header.
// Throws InvalidDataException for a malformed REFERENCE line or an unknown sub-header,
// and InvalidOperationException when the reference number cannot be parsed as an int.
private static void ParseReferences(MBFTextReader mbfReader, ref Sequence sequence)
{
    GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
    IList<CitationReference> referenceList = metadata.References;
    CitationReference reference = null;

    while (mbfReader.HasLines)
    {
        if (mbfReader.LineHeader == "REFERENCE")
        {
            // a new reference starts, so the previous one is complete
            if (reference != null)
            {
                referenceList.Add(reference);
            }

            // check for start/end e.g. (bases 1 to 118), or prose notes
            Match m = Regex.Match(mbfReader.LineData, @"^(?<number>\d+)(\s+\((?<location>.*)\))?");
            if (!m.Success)
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParserReferenceError,
                    mbfReader.LineData);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }

            // create new reference
            string number = m.Groups["number"].Value;
            string location = m.Groups["location"].Value;
            reference = new CitationReference();
            int outValue;
            if (!int.TryParse(number, out outValue))
            {
                throw new InvalidOperationException();
            }

            reference.Number = outValue;
            reference.Location = location;
            mbfReader.GoToNextLine();
        }
        else if (mbfReader.Line.StartsWith(" ", StringComparison.Ordinal))
        {
            switch (mbfReader.LineHeader)
            {
                // all the following are extracted the same way - possibly multiline
                case "AUTHORS":
                    reference.Authors = ParseMultiLineData(mbfReader, " ");
                    break;
                case "CONSRTM":
                    reference.Consortiums = ParseMultiLineData(mbfReader, " ");
                    break;
                case "TITLE":
                    reference.Title = ParseMultiLineData(mbfReader, " ");
                    break;
                case "JOURNAL":
                    reference.Journal = ParseMultiLineData(mbfReader, " ");
                    break;
                case "REMARK":
                    reference.Remarks = ParseMultiLineData(mbfReader, " ");
                    break;
                case "MEDLINE":
                    reference.Medline = ParseMultiLineData(mbfReader, " ");
                    break;
                case "PUBMED":
                    reference.PubMed = ParseMultiLineData(mbfReader, " ");
                    break;
                default:
                    string message = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ParserInvalidReferenceField,
                        mbfReader.LineHeader);
                    Trace.Report(message);
                    throw new InvalidDataException(message);
            }
        }
        else
        {
            // don't go to next line; current line still needs to be processed
            break;
        }
    }

    // Add the last reference here, not inside the loop, so it is kept both when
    // a following section ends the loop and when the reader simply runs out of lines.
    if (reference != null)
    {
        referenceList.Add(reference);
    }
}
// Parses everything before the features section and stores it in the sequence's
// GenBank metadata. Lines are consumed until a FEATURES, BASE COUNT, ORIGIN or
// CONTIG header is reached (that line is left for the caller) or the reader runs out.
// Throws InvalidDataException when a second LOCUS line is found, when the PRIMARY
// line does not have four column headers, when an unknown header is encountered,
// or when no LOCUS line was parsed at all.
private void ParseHeaders(MBFTextReader mbfReader, ref Sequence sequence)
{
    GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
    string data = string.Empty;
    string[] tokens = null;

    // set data indent for headers
    mbfReader.DataIndent = _dataIndent;

    // only allow one locus line
    bool haveParsedLocus = false;

    // parse until we hit the features or sequence section
    bool haveFinishedHeaders = false;

    while (mbfReader.HasLines && !haveFinishedHeaders)
    {
        switch (mbfReader.LineHeader)
        {
            case "LOCUS":
                if (haveParsedLocus)
                {
                    string message = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ParserSecondLocus,
                        mbfReader.LocationString);
                    Trace.Report(message);
                    throw new InvalidDataException(message);
                }

                ParseLocusByTokens(mbfReader, ref sequence);

                // re-read the metadata: sequence is passed by ref to the helper
                metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                haveParsedLocus = true;

                // don't go to next line; current line still needs to be processed
                break;

            case "VERSION":
                tokens = mbfReader.LineData.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
                metadata.Version = new GenBankVersion();

                // first token contains accession and version
                if (tokens.Length > 0)
                {
                    Match m = Regex.Match(tokens[0], @"^(?<accession>\w+)\.(?<version>\d+)$");
                    if (m.Success)
                    {
                        metadata.Version.Version = m.Groups["version"].Value;

                        // The first token in the data from the accession line is referred to as
                        // the primary accession number, and should be the one used here in the
                        // version line.
                        string versionLineAccession = m.Groups["accession"].Value;
                        if (metadata.Accession == null)
                        {
                            ApplicationLog.WriteLine("WARN: VERSION processed before ACCESSION");
                        }
                        else
                        {
                            if (!versionLineAccession.Equals(metadata.Accession.Primary))
                            {
                                ApplicationLog.WriteLine("WARN: VERSION tag doesn't match ACCESSION");
                            }
                            else
                            {
                                metadata.Version.Accession = metadata.Accession.Primary;
                            }
                        }
                    }
                }

                // The second token, when present, contains the primary ID ("GI:...").
                // It is optional: a VERSION line carrying only "accession.version"
                // must not fail with an index error.
                if (tokens.Length > 1)
                {
                    Match giMatch = Regex.Match(tokens[1], @"^GI:(?<primaryID>.*)");
                    if (giMatch.Success)
                    {
                        metadata.Version.GINumber = giMatch.Groups["primaryID"].Value;
                    }
                }

                mbfReader.GoToNextLine();
                break;

            case "PROJECT":
                tokens = mbfReader.LineData.Split(':');
                if (tokens.Length == 2)
                {
                    metadata.Project = new ProjectIdentifier();
                    metadata.Project.Name = tokens[0];
                    tokens = tokens[1].Split(',');
                    for (int i = 0; i < tokens.Length; i++)
                    {
                        metadata.Project.Numbers.Add(tokens[i]);
                    }
                }
                else
                {
                    ApplicationLog.WriteLine("WARN: unexpected PROJECT header: " + mbfReader.Line);
                }

                mbfReader.GoToNextLine();
                break;

            case "SOURCE":
                ParseSource(mbfReader, ref sequence);
                metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

                // don't go to next line; current line still needs to be processed
                break;

            case "REFERENCE":
                ParseReferences(mbfReader, ref sequence); // can encounter more than one
                metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

                // don't go to next line; current line still needs to be processed
                break;

            case "COMMENT":
                ParseComments(mbfReader, ref sequence); // can encounter more than one
                metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

                // don't go to next line; current line still needs to be processed
                break;

            case "PRIMARY":
                // This header is followed by sequence info in a table format that could be
                // stored in a custom object. The first line contains column headers.
                // For now, just validate the presence of the headers, and save the data
                // as a string.
                tokens = mbfReader.LineData.Split("\t ".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);

                // The first line must contain exactly four column headers.
                if (tokens.Length != 4)
                {
                    string message = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ParserPrimaryLineError,
                        mbfReader.Line);
                    Trace.Report(message);
                    throw new InvalidDataException(message);
                }

                string primaryData = ParseMultiLineData(mbfReader, Environment.NewLine);
                metadata.Primary = primaryData;

                // don't go to next line; current line still needs to be processed
                break;

            // all the following are extracted the same way - possibly multiline
            case "DEFINITION":
                metadata.Definition = ParseMultiLineData(mbfReader, " ");
                break;

            case "ACCESSION":
                data = ParseMultiLineData(mbfReader, " ");
                metadata.Accession = new GenBankAccession();
                string[] accessions = data.Split(' ');

                // the first accession is the primary one, all others are secondary
                metadata.Accession.Primary = accessions[0];
                for (int i = 1; i < accessions.Length; i++)
                {
                    metadata.Accession.Secondary.Add(accessions[i]);
                }

                break;

            case "DBLINK":
                tokens = mbfReader.LineData.Split(':');
                if (tokens.Length == 2)
                {
                    metadata.DBLink = new CrossReferenceLink();
                    if (string.Compare(tokens[0], CrossReferenceType.Project.ToString(), StringComparison.OrdinalIgnoreCase) == 0)
                    {
                        metadata.DBLink.Type = CrossReferenceType.Project;
                    }
                    else
                    {
                        metadata.DBLink.Type = CrossReferenceType.TraceAssemblyArchive;
                    }

                    tokens = tokens[1].Split(',');
                    for (int i = 0; i < tokens.Length; i++)
                    {
                        metadata.DBLink.Numbers.Add(tokens[i]);
                    }
                }
                else
                {
                    ApplicationLog.WriteLine("WARN: unexpected DBLINK header: " + mbfReader.Line);
                }

                mbfReader.GoToNextLine();
                break;

            case "DBSOURCE":
                metadata.DBSource = ParseMultiLineData(mbfReader, " ");
                break;

            case "KEYWORDS":
                metadata.Keywords = ParseMultiLineData(mbfReader, " ");
                break;

            case "SEGMENT":
                data = ParseMultiLineData(mbfReader, " ");

                // Expected form is "<current> of <count>". Split on the word "of",
                // not on the individual characters 'o' and 'f'.
                tokens = data.Split(new string[] { "of" }, StringSplitOptions.RemoveEmptyEntries);
                int outvalue;
                if (tokens.Length == 2)
                {
                    metadata.Segment = new SequenceSegment();
                    if (int.TryParse(tokens[0].Trim(), out outvalue))
                    {
                        metadata.Segment.Current = outvalue;
                    }
                    else
                    {
                        // The reader has already moved past the SEGMENT data, so the
                        // parsed data is reported, not the reader's current line.
                        ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + data);
                    }

                    if (int.TryParse(tokens[1].Trim(), out outvalue))
                    {
                        metadata.Segment.Count = outvalue;
                    }
                    else
                    {
                        ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + data);
                    }
                }
                else
                {
                    ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + data);
                }

                break;

            // all the following indicate sections beyond the headers parsed by this method
            case "FEATURES":
            case "BASE COUNT":
            case "ORIGIN":
            case "CONTIG":
                haveFinishedHeaders = true;
                break;

            default:
                ApplicationLog.WriteLine(ToString() + "WARN: unknown {0} -> {1}", mbfReader.LineHeader, mbfReader.LineData);
                string errMessage = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParseHeaderError,
                    mbfReader.LineHeader);
                Trace.Report(errMessage);
                throw new InvalidDataException(errMessage);
        }
    }

    // check for required features
    if (!haveParsedLocus)
    {
        string message = string.Format(CultureInfo.CurrentCulture, Resource.INVALID_INPUT_FILE, this.Name);
        Trace.Report(message);
        throw new InvalidDataException(message);
    }
}
// Writes the LOCUS line: sequence ID, length, unit ("aa" or "bp"), strand type,
// molecule type, strand topology, division code and date.
// Strand, topology, division and date come from the metadata's locus info when it
// is available; otherwise they are left empty and the current date is used.
private void WriteLocus(ISequence sequence, TextWriter writer)
{
    // determine molecule and sequence type
    GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
    GenBankLocusInfo locusInfo = null;
    if (metadata != null)
    {
        locusInfo = metadata.Locus;
    }

    string molType = sequence.MoleculeType.ToString();
    string seqType;
    if (sequence.MoleculeType != MoleculeType.Invalid)
    {
        if (molType == MoleculeType.Protein.ToString())
        {
            seqType = "aa";
            molType = string.Empty; // protein files don't use molecule type
        }
        else
        {
            seqType = "bp";
        }
    }
    else
    {
        // No valid molecule type on the sequence: fall back to the alphabet.
        if (sequence.Alphabet == Alphabets.Protein)
        {
            seqType = "aa";
            molType = string.Empty; // protein files don't use molecule type
        }
        else
        {
            seqType = "bp";
            if (sequence.Alphabet == Alphabets.DNA)
            {
                molType = MoleculeType.DNA.ToString();
            }
            else
            {
                molType = MoleculeType.RNA.ToString();
            }
        }
    }

    // retrieve metadata fields
    string strandType = string.Empty;
    string strandTopology = string.Empty;
    string division = string.Empty;
    DateTime date = DateTime.Now;
    if (locusInfo != null)
    {
        strandType = Helper.GetStrandType(locusInfo.Strand);
        strandTopology = Helper.GetStrandTopology(locusInfo.StrandTopology);
        if (locusInfo.DivisionCode != SequenceDivisionCode.None)
        {
            division = locusInfo.DivisionCode.ToString();
        }

        date = locusInfo.Date;
    }

    // Format the date with the invariant culture so the month abbreviation is
    // always the English one (e.g. "01-JAN-2010"), whatever the current culture is.
    string dateText = date.ToString("dd-MMM-yyyy", CultureInfo.InvariantCulture).ToUpperInvariant();

    writer.WriteLine(
        "{0,-12}{1,-16} {2,11} {3} {4,3}{5,-6} {6,-8} {7,3} {8}",
        "LOCUS",
        sequence.ID,
        sequence.Count,
        seqType,
        strandType,
        molType,
        strandTopology,
        division,
        dateText);
}