/// <summary>
/// Copy constructor used by the clone method: populates this instance from another one.
/// </summary>
/// <param name="other">GenBankMetadata instance to clone.</param>
private GenBankMetadata(GenBankMetadata other)
{
    // Members copied by plain assignment.
    Definition = other.Definition;
    DbSource = other.DbSource;
    Keywords = other.Keywords;
    Primary = other.Primary;
    BaseCount = other.BaseCount;
    Origin = other.Origin;
    Contig = other.Contig;

    // Members that are cloned, and only when the source actually has them.
    if (other.Locus != null)
    {
        Locus = other.Locus.Clone();
    }

    if (other.Accession != null)
    {
        Accession = other.Accession.Clone();
    }

    if (other.Version != null)
    {
        Version = other.Version.Clone();
    }

    if (other.Project != null)
    {
        Project = other.Project.Clone();
    }

    if (other.Segment != null)
    {
        Segment = other.Segment.Clone();
    }

    if (other.Source != null)
    {
        Source = other.Source.Clone();
    }

    if (other.Features != null)
    {
        Features = other.Features.Clone();
    }

    // The link list itself is new, but its elements are the same instances as in the source.
    if (other.DbLinks != null)
    {
        DbLinks = other.DbLinks.ToList();
    }

    // Every citation is cloned into a fresh list.
    var clonedReferences = new List<CitationReference>();
    foreach (CitationReference citation in other.References)
    {
        clonedReferences.Add(citation.Clone());
    }

    References = clonedReferences;
    Comments = new List<string>(other.Comments);
}
// Writes every header section that precedes the features section.
// Nothing is written when the sequence has no GenBank metadata entry value.
private void WriteHeaders(ISequence sequence, TextWriter txtWriter)
{
    GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
    if (metadata == null)
    {
        return;
    }

    WriteLocus(sequence, txtWriter);
    WriteHeaderSection("DEFINITION", metadata.Definition, txtWriter);

    if (metadata.Accession != null)
    {
        WriteHeaderSection("ACCESSION", Helper.GetGenBankAccession(metadata.Accession), txtWriter);

        // VERSION is only emitted together with an accession, because its text
        // starts with the primary accession.
        if (metadata.Version != null)
        {
            string versionText = metadata.Accession.Primary + "." + metadata.Version.Version;
            bool hasGiNumber = !string.IsNullOrEmpty(metadata.Version.GiNumber);
            if (hasGiNumber)
            {
                versionText += " GI:" + metadata.Version.GiNumber;
            }

            // versionText always contains at least the "." separator, so it is never empty.
            WriteHeaderSection("VERSION", versionText, txtWriter);
        }
    }

    if (metadata.Project != null)
    {
        WriteHeaderSection("PROJECT", Helper.GetProjectIdentifier(metadata.Project), txtWriter);
    }

    bool hasDbLinks = metadata.DbLinks != null && metadata.DbLinks.Count > 0;
    if (hasDbLinks)
    {
        WriteHeaderSection("DBLINK", Helper.GetCrossReferenceLink(metadata.DbLinks), txtWriter);
    }

    WriteHeaderSection("DBSOURCE", metadata.DbSource, txtWriter);
    WriteHeaderSection("KEYWORDS", metadata.Keywords, txtWriter);

    if (metadata.Segment != null)
    {
        WriteHeaderSection("SEGMENT", Helper.GetSequenceSegment(metadata.Segment), txtWriter);
    }

    WriteSource(metadata, txtWriter);
    WriteReferences(metadata, txtWriter);
    WriteComments(metadata, txtWriter);
    WriteHeaderSection("PRIMARY", metadata.Primary, txtWriter);
}
/// <summary>
/// Writes the FEATURES section: a section header, then one entry per feature
/// (key plus location string) followed by one line per qualifier value.
/// </summary>
/// <param name="sequence">Sequence whose GenBank metadata holds the features.</param>
/// <param name="txtWriter">Writer receiving the formatted output.</param>
/// <exception cref="InvalidOperationException">
/// Thrown when no location builder is configured.
/// </exception>
private void WriteFeatures(ISequence sequence, TextWriter txtWriter)
{
    ILocationBuilder locBuilder = LocationBuilder;
    if (locBuilder == null)
    {
        throw new InvalidOperationException(Properties.Resource.NullLocationBuild);
    }

    GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
    if (metadata == null || metadata.Features == null)
    {
        return;
    }

    const string Quote = "\"";

    WriteFeatureSection("FEATURES", "Location/Qualifiers", txtWriter);

    // write the features in the order they were put in the list
    foreach (FeatureItem feature in metadata.Features.All)
    {
        WriteFeatureSection(
            FeatureHeaderIndentString + feature.Key,
            locBuilder.GetLocationString(feature.Location),
            txtWriter);

        // The sub-items of a feature are referred to as qualifiers. Keys are not
        // unique per value, so each key maps to a list of values.
        foreach (KeyValuePair<string, List<string>> qualifierList in feature.Qualifiers)
        {
            foreach (string qualifierValue in qualifierList.Value)
            {
                string data = "/" + qualifierList.Key;

                // A null or empty value produces a bare "/key" line.
                if (!string.IsNullOrEmpty(qualifierValue))
                {
                    data += "=";

                    // Quote detection is a literal character test, so compare ordinally
                    // instead of relying on the current culture's collation rules.
                    if (!qualifierValue.StartsWith(Quote, StringComparison.Ordinal))
                    {
                        data += Quote;
                    }

                    data += qualifierValue;

                    if (!qualifierValue.EndsWith(Quote, StringComparison.Ordinal))
                    {
                        data += Quote;
                    }
                }

                // use a blank header; the qualifier key is part of the data
                WriteFeatureSection(string.Empty, data, txtWriter);
            }
        }
    }
}
/// <summary>
/// Parses the tail of a GenBank record: the optional BASE COUNT line, the ORIGIN
/// section and the CONTIG section. Stops after the "//" record terminator or when
/// there are no more lines.
/// </summary>
/// <param name="line">Current line; replaced as parsing advances.</param>
/// <param name="sequence">The sequence whose GenBank metadata receives the parsed values.</param>
/// <param name="stream">The stream reader.</param>
/// <exception cref="InvalidDataException">
/// Thrown when a line starts with a header other than BASE COUNT, ORIGIN or CONTIG.
/// </exception>
private void ParseSequence(ref string line, ref Sequence sequence, StreamReader stream)
{
    GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

    while (line != null)
    {
        bool endOfRecord = line.StartsWith("//", StringComparison.Ordinal);
        if (endOfRecord)
        {
            // Step past the terminator and stop.
            line = GoToNextLine(line, stream);
            break;
        }

        // The header is read using the data indent for sequence headers.
        string header = GetLineHeader(line, DataIndent);

        if (header == "BASE COUNT")
        {
            // The BASE COUNT linetype is obsolete (removed from the GenBank flat-file
            // format in October 2003) but is still honoured when present. The untrimmed
            // remainder is kept because it starts with a right-justified column.
            metadata.BaseCount = line.Substring(DataIndent);
            line = GoToNextLine(line, stream);
        }
        else if (header == "ORIGIN")
        {
            // Delegated entirely to ParseOrigin, which receives the current line by reference.
            ParseOrigin(ref line, metadata, stream);
        }
        else if (header == "CONTIG")
        {
            // No explicit advance here: the line left behind by ParseMultiLineData
            // still has to be processed by the next loop iteration.
            metadata.Contig = ParseMultiLineData(ref line, Environment.NewLine, DataIndent, stream);
        }
        else
        {
            string message = String.Format(
                CultureInfo.CurrentCulture,
                Properties.Resource.ParserUnexpectedLineInSequence,
                line);
            Trace.Report(message);
            throw new InvalidDataException(message);
        }
    }
}
/// <summary>
/// Derives a probable sequence name from the species, common name and definition
/// fields of the metadata.
/// </summary>
/// <param name="metadata">
/// The metadata.
/// </param>
/// <returns>
/// Supposed name as <see cref="string"/>.
/// </returns>
/// <exception cref="Exception">
/// Thrown if neither the common name nor the species contains the other.
/// </exception>
public static string ExtractSequenceName(GenBankMetadata metadata)
{
    string species = metadata.Source.Organism.Species.GetLargestRepeatingSubstring();
    string commonName = metadata.Source.CommonName;

    // Strip the known boilerplate endings from the definition, in this fixed order.
    string definition = metadata.Definition;
    definition = definition.TrimEnd(", complete genome.");
    definition = definition.TrimEnd(", complete sequence.");
    definition = definition.TrimEnd(", complete CDS.");
    definition = definition.TrimEnd(", complete cds.");
    definition = definition.TrimEnd(", genome.");

    // Choose whichever of common name / species contains the other; the common name wins ties.
    string baseName;
    if (commonName.Contains(species))
    {
        baseName = commonName;
    }
    else if (species.Contains(commonName))
    {
        baseName = species;
    }
    else
    {
        throw new Exception("Sequences names are not equal. CommonName = " + commonName +
                            ", Species = " + species +
                            ", Definition = " + definition);
    }

    // Return the more specific of base name and definition, or both when unrelated.
    if (definition.Contains(baseName))
    {
        return definition;
    }

    if (baseName.Contains(definition))
    {
        return baseName;
    }

    return baseName + " | " + definition;
}
// Writes one REFERENCE header section per citation, followed by its sub-sections.
private void WriteReferences(GenBankMetadata metadata, TextWriter txtWriter)
{
    if (metadata.References == null)
    {
        return;
    }

    foreach (CitationReference citation in metadata.References)
    {
        // First line: the reference number, optionally followed by "(location)".
        string number = citation.Number.ToString(CultureInfo.InvariantCulture);
        string firstLine = string.IsNullOrEmpty(citation.Location)
            ? number
            : number.PadRight(3) + "(" + citation.Location + ")";

        WriteHeaderSection("REFERENCE", firstLine, txtWriter);
        WriteHeaderSection(" AUTHORS", citation.Authors, txtWriter);
        WriteHeaderSection(" CONSRTM", citation.Consortiums, txtWriter);
        WriteHeaderSection(" TITLE", citation.Title, txtWriter);
        WriteHeaderSection(" JOURNAL", citation.Journal, txtWriter);
        WriteHeaderSection(" MEDLINE", citation.Medline, txtWriter);
        WriteHeaderSection(" PUBMED", citation.PubMed, txtWriter);
        WriteHeaderSection(" REMARK", citation.Remarks, txtWriter);
    }
}
/// <summary>
/// Classifies a sequence by searching its definition text for known keywords.
/// </summary>
/// <param name="metadata">
/// The metadata.
/// </param>
/// <returns>
/// The matching <c>Aliases.Feature</c> value: mitochondrion, chloroplast, plasmid or
/// plastid (checked in that order), or the full-genome value when none matches.
/// </returns>
public int ExtractSequenceFeature(GenBankMetadata metadata)
{
    // Lower-case with the invariant culture: culture-sensitive lowering (e.g. Turkish,
    // where 'I' becomes dotless 'ı') would make the ASCII keyword checks below miss.
    string name = metadata.Definition.ToLowerInvariant();

    if (name.Contains("mitochondrion"))
    {
        return Aliases.Feature.MitochondrionGenome;
    }

    if (name.Contains("chloroplast"))
    {
        return Aliases.Feature.ChloroplastGenome;
    }

    if (name.Contains("plasmid"))
    {
        return Aliases.Feature.Plasmid;
    }

    if (name.Contains("plastid"))
    {
        return Aliases.Feature.Plastid;
    }

    return Aliases.Feature.FullGenome;
}
/// <summary>
/// Validates the first TransitPeptide feature of the metadata against the expected
/// values stored under the given XML node, then checks that qualifiers set on newly
/// constructed TransitPeptide instances read back unchanged.
/// </summary>
/// <param name="nodeName">XML node name</param>
/// <param name="genMetadata">GenBank Metadata</param>
private void ValidateGenBankTrnsitPeptideFeature(string nodeName, GenBankMetadata genMetadata)
{
    // Get expected values from the XML node.
    string expectedLocation = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.Location);
    string expectedAllele = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.AlleleNode);
    string featureCount = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.QualifierCount);
    string expectedDbReference = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.DbReferenceNode);
    string geneSymbol = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.GeneSymbol);
    string expectedCitation = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.CitationNode);
    string expectedExperiment = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.ExperimentNode);
    string expectedFunction = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.FunctionNode);
    string expectedGeneSynonym = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.GeneSynonymNode);
    string expectedInference = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.InferenceNode);
    string expectedLabel = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.LabelNode);
    string expectedLocusTag = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.LocusTagNode);
    string expectedNote = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.Note);
    string expectedOldLocusTag = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.OldLocusTagNode);
    string expectedMap = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.GenbankMapNode);

    List<TransitPeptide> transitPeptideFeatureList = genMetadata.Features.TransitPeptides;
    var locBuilder = new LocationBuilder();

    // Create a copy of the first transit peptide feature.
    TransitPeptide cloneTransit = transitPeptideFeatureList[0].Clone();

    // Validate transit peptide qualifiers. Expected values are passed first so that
    // assertion failure messages label expected and actual correctly.
    Assert.AreEqual(featureCount, transitPeptideFeatureList.Count.ToString((IFormatProvider)null));
    Assert.AreEqual(geneSymbol, cloneTransit.GeneSymbol);
    Assert.AreEqual(expectedDbReference, cloneTransit.DatabaseCrossReference[0]);
    Assert.AreEqual(expectedAllele, transitPeptideFeatureList[0].Allele);
    Assert.AreEqual(expectedCitation, transitPeptideFeatureList[0].Citation[0]);
    Assert.AreEqual(expectedExperiment, transitPeptideFeatureList[0].Experiment[0]);
    Assert.AreEqual(expectedMap, transitPeptideFeatureList[0].GenomicMapPosition);
    Assert.AreEqual(expectedGeneSynonym, transitPeptideFeatureList[0].GeneSynonym[0]);
    Assert.AreEqual(expectedInference, transitPeptideFeatureList[0].Inference[0]);
    Assert.AreEqual(expectedLabel, transitPeptideFeatureList[0].Label);
    Assert.AreEqual(
        expectedLocation,
        locBuilder.GetLocationString(genMetadata.Features.TransitPeptides[0].Location));
    Assert.AreEqual(expectedNote, transitPeptideFeatureList[0].Note[0]);
    Assert.AreEqual(expectedOldLocusTag, transitPeptideFeatureList[0].OldLocusTag[0]);
    Assert.AreEqual(expectedLocusTag, transitPeptideFeatureList[0].LocusTag[0]);
    Assert.AreEqual(expectedFunction, transitPeptideFeatureList[0].Function[0]);

    // Create new TransitPeptide instances through both constructors.
    var tPeptide = new TransitPeptide(expectedLocation);
    var tPeptideWithILoc = new TransitPeptide(
        genMetadata.Features.TransitPeptides[0].Location);

    // Set qualifiers and validate that they read back unchanged.
    tPeptide.Allele = expectedAllele;
    tPeptide.GeneSymbol = geneSymbol;
    tPeptideWithILoc.GenomicMapPosition = expectedMap;
    Assert.AreEqual(geneSymbol, tPeptide.GeneSymbol);
    Assert.AreEqual(expectedAllele, tPeptide.Allele);
    Assert.AreEqual(expectedMap, tPeptideWithILoc.GenomicMapPosition);
}
/// <summary>
/// Parses one reference entry: the header row (number and optional location) and the
/// sub-key rows that follow it, then appends the resulting citation to the metadata.
/// </summary>
/// <param name="metadata">Metadata object</param>
/// <param name="cellRange">Range of cells</param>
/// <param name="rowIndex">Current index of row</param>
/// <returns>Index of the first row that does not belong to this reference.</returns>
/// <exception cref="FormatException">
/// Thrown when the range has fewer than three columns, a row has neither key nor
/// sub-key, or the reference number cannot be parsed.
/// </exception>
private static int ParseReference(GenBankMetadata metadata, object[,] cellRange, int rowIndex)
{
    string message;

    object headerValueCell = cellRange[rowIndex, ValueColumnIndex];
    string value = headerValueCell != null ? headerValueCell.ToString() : string.Empty;
    rowIndex++;

    CitationReference reference = new CitationReference();
    if (!string.IsNullOrWhiteSpace(value))
    {
        // check for start/end e.g. (bases 1 to 118), or prose notes
        Match m = Regex.Match(value, @"^(?<number>\d+)(\s+\((?<location>.*)\))?");
        if (m.Success)
        {
            int number;
            if (!int.TryParse(m.Groups["number"].Value, out number))
            {
                message = String.Format(
                    CultureInfo.InvariantCulture,
                    Resources.UnrecognizedGenBankMetadataFormat,
                    REFERENCE);
                throw new FormatException(message);
            }

            reference.Number = number;
            reference.Location = m.Groups["location"].Value;
        }
    }

    while (rowIndex < cellRange.GetLength(0))
    {
        if (3 > cellRange.GetLength(1))
        {
            message = String.Format(
                CultureInfo.InvariantCulture,
                Resources.UnrecognizedGenBankMetadataFormat,
                REFERENCE);
            throw new FormatException(message);
        }

        // A non-blank key cell marks the start of the next top-level entry.
        object keyCell = cellRange[rowIndex, KeyColumnIndex];
        if (keyCell != null && !string.IsNullOrWhiteSpace(keyCell.ToString()))
        {
            break;
        }

        object subKeyCell = cellRange[rowIndex, SubKeyColumnIndex];
        if (subKeyCell == null || string.IsNullOrWhiteSpace(subKeyCell.ToString()))
        {
            message = String.Format(
                CultureInfo.InvariantCulture,
                Resources.UnrecognizedGenBankMetadataFormat,
                REFERENCE);
            throw new FormatException(message);
        }

        string subKey = subKeyCell.ToString().ToUpperInvariant();
        object valueCell = cellRange[rowIndex, ValueColumnIndex];
        value = valueCell != null ? valueCell.ToString() : string.Empty;

        // Rows with a blank value are skipped, but the row index must still advance;
        // skipping the increment here would re-read the same row forever.
        if (!string.IsNullOrWhiteSpace(value))
        {
            switch (subKey)
            {
                case REFERENCE_AUTHORS:
                    reference.Authors = value;
                    break;
                case REFERENCE_CONSORTIUMS:
                    reference.Consortiums = value;
                    break;
                case REFERENCE_JOURNAL:
                    reference.Journal = value;
                    break;
                case REFERENCE_MEDLINE:
                    reference.Medline = value;
                    break;
                case REFERENCE_PUBMED:
                    reference.PubMed = value;
                    break;
                case REFERENCE_REMARK:
                    reference.Remarks = value;
                    break;
                case REFERENCE_TITLE:
                    reference.Title = value;
                    break;
            }
        }

        rowIndex++;
    }

    metadata.References.Add(reference);
    return rowIndex;
}
/// <summary>
/// Parses the locus sub-key rows that follow the locus header row and stores the
/// values on <c>metadata.Locus</c>, creating it when the first sub-key row is read.
/// </summary>
/// <param name="metadata">Metadata object</param>
/// <param name="cellRange">Range of cells</param>
/// <param name="rowIndex">Current index of row</param>
/// <returns>Index of the first row that does not belong to the locus entry.</returns>
/// <exception cref="FormatException">
/// Thrown when the range has fewer than three columns, a row has neither key nor
/// sub-key, the sub-key is not a known locus field, or a field value cannot be parsed.
/// </exception>
private static int ParseLocus(GenBankMetadata metadata, object[,] cellRange, int rowIndex)
{
    rowIndex++;

    while (rowIndex < cellRange.GetLength(0))
    {
        if (3 > cellRange.GetLength(1))
        {
            throw CreateLocusFormatException(LOCUS);
        }

        // A non-blank key cell marks the start of the next top-level entry.
        object keyCell = cellRange[rowIndex, KeyColumnIndex];
        if (keyCell != null && !string.IsNullOrWhiteSpace(keyCell.ToString()))
        {
            break;
        }

        object subKeyCell = cellRange[rowIndex, SubKeyColumnIndex];
        if (subKeyCell == null || string.IsNullOrWhiteSpace(subKeyCell.ToString()))
        {
            throw CreateLocusFormatException(LOCUS);
        }

        if (metadata.Locus == null)
        {
            metadata.Locus = new GenBankLocusInfo();
        }

        string subKey = subKeyCell.ToString().ToUpperInvariant();
        object valueCell = cellRange[rowIndex, ValueColumnIndex];
        string value = valueCell != null ? valueCell.ToString() : string.Empty;
        bool hasValue = !string.IsNullOrWhiteSpace(value);

        switch (subKey)
        {
            case LOCUS_NAME:
                metadata.Locus.Name = value;
                break;

            case LOCUS_SEQLEN:
                // A blank length is tolerated; a non-blank one must be a valid integer.
                // TryParse keeps the failure consistent with the other locus fields
                // (a FormatException naming the field) instead of a raw parse exception.
                if (hasValue)
                {
                    int sequenceLength;
                    if (!int.TryParse(value, out sequenceLength))
                    {
                        throw CreateLocusFormatException(LOCUS_SEQLEN);
                    }

                    metadata.Locus.SequenceLength = sequenceLength;
                }

                break;

            case LOCUS_SEQTYPE:
                metadata.Locus.SequenceType = value;
                break;

            case LOCUS_MOLTYPE:
                MoleculeType moleculeType = MoleculeType.NA;
                if (!hasValue || !Enum.TryParse<MoleculeType>(value, true, out moleculeType))
                {
                    throw CreateLocusFormatException(LOCUS_MOLTYPE);
                }

                metadata.Locus.MoleculeType = moleculeType;
                break;

            case LOCUS_STRANTTOPOLOGY:
                SequenceStrandTopology strandTopology = SequenceStrandTopology.None;
                if (!hasValue || !Enum.TryParse<SequenceStrandTopology>(value, true, out strandTopology))
                {
                    throw CreateLocusFormatException(LOCUS_STRANTTOPOLOGY);
                }

                metadata.Locus.StrandTopology = strandTopology;
                break;

            case LOCUS_STRANDTYPE:
                SequenceStrandType strandType = SequenceStrandType.None;
                if (!hasValue || !Enum.TryParse<SequenceStrandType>(value, true, out strandType))
                {
                    throw CreateLocusFormatException(LOCUS_STRANDTYPE);
                }

                metadata.Locus.Strand = strandType;
                break;

            case LOCUS_DIVISIONCODE:
                SequenceDivisionCode divisionCode = SequenceDivisionCode.None;
                if (!hasValue || !Enum.TryParse<SequenceDivisionCode>(value, true, out divisionCode))
                {
                    throw CreateLocusFormatException(LOCUS_DIVISIONCODE);
                }

                metadata.Locus.DivisionCode = divisionCode;
                break;

            case LOCUS_DATE:
                DateTime date;
                if (!hasValue || !DateTime.TryParse(value, out date))
                {
                    throw CreateLocusFormatException(LOCUS_DATE);
                }

                metadata.Locus.Date = date;
                break;

            default:
                throw CreateLocusFormatException(LOCUS);
        }

        rowIndex++;
    }

    return rowIndex;
}

/// <summary>
/// Builds the FormatException reported for an unrecognized or unparsable locus field.
/// </summary>
/// <param name="fieldName">Name inserted into the resource message.</param>
/// <returns>The exception to throw.</returns>
private static FormatException CreateLocusFormatException(string fieldName)
{
    string message = String.Format(
        CultureInfo.InvariantCulture,
        Resources.UnrecognizedGenBankMetadataFormat,
        fieldName);
    return new FormatException(message);
}
/// <summary>
/// Validates the first Long Terminal Repeat (LTR) feature of the metadata against the
/// expected values stored under the given XML node, then checks that qualifiers set on
/// newly constructed LongTerminalRepeat instances read back unchanged.
/// </summary>
/// <param name="nodeName">XML node name</param>
/// <param name="genMetadata">GenBank Metadata</param>
private void ValidateGenBankLTRFeature(string nodeName, GenBankMetadata genMetadata)
{
    // Get expected values from the XML node.
    string expectedLocation = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.Location);
    string expectedAllele = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.AlleleNode);
    string featureCount = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.QualifierCount);
    string expectedDbReference = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.DbReferenceNode);
    string geneSymbol = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.GeneSymbol);
    string expectedCitation = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.CitationNode);
    string expectedExperiment = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.ExperimentNode);
    string expectedFunction = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.FunctionNode);
    string expectedGeneSynonym = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.GeneSynonymNode);
    string expectedInference = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.InferenceNode);
    string expectedLabel = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.LabelNode);
    string expectedLocusTag = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.LocusTagNode);
    string expectedNote = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.Note);
    string expectedOldLocusTag = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.OldLocusTagNode);
    string expectedMap = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.GenbankMapNode);

    List<LongTerminalRepeat> ltrFeatureList = genMetadata.Features.LongTerminalRepeats;
    var locBuilder = new LocationBuilder();

    // Create a copy of the first Long Terminal Repeat feature.
    LongTerminalRepeat cloneLTR = ltrFeatureList[0].Clone();

    // Validate Long Terminal Repeat qualifiers. Expected values are passed first so
    // that assertion failure messages label expected and actual correctly.
    Assert.AreEqual(featureCount, ltrFeatureList.Count.ToString((IFormatProvider)null));
    Assert.AreEqual(geneSymbol, cloneLTR.GeneSymbol);
    Assert.AreEqual(expectedDbReference, cloneLTR.DatabaseCrossReference[0]);
    Assert.AreEqual(expectedAllele, ltrFeatureList[0].Allele);
    Assert.AreEqual(expectedCitation, ltrFeatureList[0].Citation[0]);
    Assert.AreEqual(expectedExperiment, ltrFeatureList[0].Experiment[0]);
    Assert.AreEqual(expectedMap, ltrFeatureList[0].GenomicMapPosition);
    Assert.AreEqual(expectedGeneSynonym, ltrFeatureList[0].GeneSynonym[0]);
    Assert.AreEqual(expectedInference, ltrFeatureList[0].Inference[0]);
    Assert.AreEqual(expectedLabel, ltrFeatureList[0].Label);
    Assert.AreEqual(
        expectedLocation,
        locBuilder.GetLocationString(genMetadata.Features.LongTerminalRepeats[0].Location));
    Assert.AreEqual(expectedNote, ltrFeatureList[0].Note[0]);
    Assert.AreEqual(expectedOldLocusTag, ltrFeatureList[0].OldLocusTag[0]);
    Assert.AreEqual(expectedLocusTag, ltrFeatureList[0].LocusTag[0]);
    Assert.AreEqual(expectedFunction, ltrFeatureList[0].Function[0]);
    Assert.IsTrue(string.IsNullOrEmpty(ltrFeatureList[0].StandardName));

    // Create new LTR instances through both constructors.
    var ltr = new LongTerminalRepeat(expectedLocation);
    var ltrWithILoc = new LongTerminalRepeat(
        genMetadata.Features.LongTerminalRepeats[0].Location);

    // Set qualifiers and validate that they read back unchanged.
    ltr.Allele = expectedAllele;
    ltr.GeneSymbol = geneSymbol;
    ltrWithILoc.GenomicMapPosition = expectedMap;
    Assert.AreEqual(geneSymbol, ltr.GeneSymbol);
    Assert.AreEqual(expectedAllele, ltr.Allele);
    Assert.AreEqual(expectedMap, ltrWithILoc.GenomicMapPosition);
}
/// <summary>
/// Validates the first StemLoop feature of the metadata against the expected values
/// stored under the given XML node, then checks that qualifiers set on newly
/// constructed StemLoop instances read back unchanged.
/// </summary>
/// <param name="nodeName">XML node name</param>
/// <param name="genMetadata">GenBank Metadata</param>
private void ValidateGenBankStemLoopFeature(string nodeName, GenBankMetadata genMetadata)
{
    // Get expected values from the XML node.
    string expectedLocation = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.Location);
    string expectedAllele = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.AlleleNode);
    string featureCount = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.QualifierCount);
    string expectedDbReference = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.DbReferenceNode);
    string geneSymbol = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.GeneSymbol);
    string expectedCitation = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.CitationNode);
    string expectedExperiment = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.ExperimentNode);
    string expectedFunction = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.FunctionNode);
    string expectedGeneSynonym = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.GeneSynonymNode);
    string expectedInference = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.InferenceNode);
    string expectedLabel = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.LabelNode);
    string expectedLocusTag = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.LocusTagNode);
    string expectedNote = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.Note);
    string expectedOldLocusTag = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.OldLocusTagNode);
    string expectedMap = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.GenbankMapNode);

    List<StemLoop> sLoopFeatureList = genMetadata.Features.StemLoops;
    var locBuilder = new LocationBuilder();

    // Create a copy of the first StemLoop feature.
    StemLoop cloneSLoop = sLoopFeatureList[0].Clone();

    // Validate StemLoop qualifiers. Expected values are passed first so that
    // assertion failure messages label expected and actual correctly.
    Assert.AreEqual(featureCount, sLoopFeatureList.Count.ToString((IFormatProvider)null));
    Assert.AreEqual(geneSymbol, cloneSLoop.GeneSymbol);
    Assert.AreEqual(expectedDbReference, cloneSLoop.DatabaseCrossReference[0]);
    Assert.AreEqual(expectedAllele, sLoopFeatureList[0].Allele);
    Assert.AreEqual(expectedCitation, sLoopFeatureList[0].Citation[0]);
    Assert.AreEqual(expectedExperiment, sLoopFeatureList[0].Experiment[0]);
    Assert.AreEqual(expectedMap, sLoopFeatureList[0].GenomicMapPosition);
    Assert.AreEqual(expectedGeneSynonym, sLoopFeatureList[0].GeneSynonym[0]);
    Assert.AreEqual(expectedInference, sLoopFeatureList[0].Inference[0]);
    Assert.AreEqual(expectedLabel, sLoopFeatureList[0].Label);
    Assert.AreEqual(
        expectedLocation,
        locBuilder.GetLocationString(genMetadata.Features.StemLoops[0].Location));
    Assert.AreEqual(expectedNote, sLoopFeatureList[0].Note[0]);
    Assert.AreEqual(expectedOldLocusTag, sLoopFeatureList[0].OldLocusTag[0]);
    Assert.AreEqual(expectedLocusTag, sLoopFeatureList[0].LocusTag[0]);
    Assert.AreEqual(expectedFunction, sLoopFeatureList[0].Function[0]);
    Assert.IsTrue(string.IsNullOrEmpty(sLoopFeatureList[0].Operon));
    Assert.IsTrue(string.IsNullOrEmpty(sLoopFeatureList[0].StandardName));

    // Create new StemLoop instances through both constructors.
    var stemLoop = new StemLoop(expectedLocation);
    var stemLoopWithILoc = new StemLoop(
        genMetadata.Features.StemLoops[0].Location);

    // Set qualifiers and validate that they read back unchanged.
    stemLoop.Allele = expectedAllele;
    stemLoop.GeneSymbol = geneSymbol;
    stemLoopWithILoc.GenomicMapPosition = expectedMap;
    Assert.AreEqual(geneSymbol, stemLoop.GeneSymbol);
    Assert.AreEqual(expectedAllele, stemLoop.Allele);
    Assert.AreEqual(expectedMap, stemLoopWithILoc.GenomicMapPosition);
}
/// <summary>
/// Writes the LOCUS line for a sequence. Values come from the locus info in the
/// GenBank metadata when available; otherwise they are derived from the sequence's
/// alphabet, with blank strand/topology/division fields and the current date.
/// </summary>
/// <param name="sequence">Sequence being written.</param>
/// <param name="txtWriter">Writer receiving the LOCUS line.</param>
private static void WriteLocus(ISequence sequence, TextWriter txtWriter)
{
    // determine molecule and sequence type
    GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
    GenBankLocusInfo locusInfo = null;
    string molType = sequence.Alphabet.Name;
    if (metadata != null)
    {
        locusInfo = metadata.Locus;

        // Metadata may exist without locus info; in that case keep the
        // alphabet-derived molecule type instead of dereferencing null.
        if (locusInfo != null)
        {
            molType = locusInfo.MoleculeType.ToString();
        }
    }

    string seqType;
    if (sequence.Alphabet.Name != null)
    {
        if (molType == Alphabets.Protein.Name)
        {
            seqType = "aa";
            molType = string.Empty; // protein files don't use molecule type
        }
        else
        {
            seqType = "bp";
        }
    }
    else
    {
        // Unnamed alphabet: fall back to comparing the alphabet instances themselves.
        if (sequence.Alphabet == Alphabets.Protein)
        {
            seqType = "aa";
            molType = string.Empty; // protein files don't use molecule type
        }
        else
        {
            seqType = "bp";
            if (sequence.Alphabet == Alphabets.DNA)
            {
                molType = Alphabets.DNA.Name;
            }
            else
            {
                molType = Alphabets.RNA.Name;
            }
        }
    }

    // retrieve metadata fields
    string strandType = string.Empty;
    string strandTopology = string.Empty;
    string division = string.Empty;
    DateTime date = DateTime.Now;
    if (locusInfo != null)
    {
        strandType = Helper.GetStrandType(locusInfo.Strand);
        strandTopology = Helper.GetStrandTopology(locusInfo.StrandTopology);
        if (locusInfo.DivisionCode != SequenceDivisionCode.None)
        {
            division = locusInfo.DivisionCode.ToString();
        }

        date = locusInfo.Date;
    }

    txtWriter.WriteLine(
        "{0,-12}{1,-16} {2,11} {3} {4,3}{5,-6} {6,-8} {7,3} {8}",
        "LOCUS",
        sequence.ID,
        sequence.Count,
        seqType,
        strandType,
        molType,
        strandTopology,
        division,
        date.ToString("dd-MMM-yyyy", CultureInfo.InvariantCulture).ToUpperInvariant());
}
/// <summary>
/// Flattens GenBank metadata into a two-dimensional string array: a metadata header
/// row, the metadata entries, and then (when features exist) a features header row
/// followed by each feature and its qualifiers.
/// </summary>
/// <param name="metadata">GenBank Metadata</param>
/// <returns>string array of metadata</returns>
public static string[,] GenBankMetadataToRange(GenBankMetadata metadata)
{
    List<string[]> rows = new List<string[]>();

    // Metadata header row.
    rows.Add(new[] { Properties.Resources.GenbankMetadataHeader });

    if (metadata.Locus != null)
    {
        rows.Add(new[] { Properties.Resources.GenbankMetadataLocus });
        AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataName, metadata.Locus.Name);
        AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataSeqLength, metadata.Locus.SequenceLength.ToString());
        AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataSeqType, metadata.Locus.SequenceType);
        AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataStrandType, Helper.GetStrandType(metadata.Locus.Strand));
        AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataMoleculeType, metadata.Locus.MoleculeType.ToString());
        AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataStrandTopology, Helper.GetStrandTopology(metadata.Locus.StrandTopology));
        AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataDivisionCode, metadata.Locus.DivisionCode.ToString());
        AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataDate, metadata.Locus.Date.ToString("dd-MMM-yyyy").ToUpper());
    }

    if (!string.IsNullOrWhiteSpace(metadata.Definition))
    {
        AddNameValuePair(rows, 0, Properties.Resources.GenbankMetadataDefinition, "", metadata.Definition);
    }

    if (metadata.Accession != null)
    {
        // Each secondary accession is prefixed with a space; a null entry
        // contributes only the space.
        StringBuilder secondaryAccessions = new StringBuilder();
        foreach (string secondary in metadata.Accession.Secondary)
        {
            secondaryAccessions.Append(" ").Append(secondary);
        }

        AddNameValuePair(
            rows,
            0,
            Properties.Resources.GenbankMetadataAccession,
            "",
            metadata.Accession.Primary + secondaryAccessions.ToString());
    }

    if (metadata.DbLinks != null)
    {
        foreach (var dbLink in metadata.DbLinks)
        {
            // Every number is followed by a comma, including the last one.
            StringBuilder numbers = new StringBuilder();
            foreach (string number in dbLink.Numbers)
            {
                numbers.Append(number).Append(",");
            }

            AddNameValuePair(
                rows,
                0,
                Properties.Resources.GenbankMetadataDBLink,
                "",
                dbLink.Type.ToString() + ":" + numbers.ToString());
        }
    }

    if (!string.IsNullOrWhiteSpace(metadata.DbSource))
    {
        AddNameValuePair(rows, 0, Properties.Resources.GenbankMetadataDBSource, "", metadata.DbSource);
    }

    if (metadata.Version != null)
    {
        string accessionPart = metadata.Version.Accession ?? string.Empty;
        string versionPart = metadata.Version.Version ?? string.Empty;
        string giPart = metadata.Version.GiNumber ?? string.Empty;
        AddNameValuePair(
            rows,
            0,
            Properties.Resources.GenbankMetadataVersion,
            "",
            accessionPart + "." + versionPart + " " + Properties.Resources.GenbankMetadataGI + giPart);
    }

    if (metadata.Segment != null)
    {
        AddNameValuePair(
            rows,
            0,
            Properties.Resources.GenbankMetadataSegment,
            "",
            metadata.Segment.Current + " of " + metadata.Segment.Count);
    }

    // Keywords are always emitted, even when blank.
    AddNameValuePair(rows, 0, Properties.Resources.GenbankMetadataKeywords, "", metadata.Keywords);

    if (metadata.Source != null)
    {
        AddNameValuePair(rows, 0, Properties.Resources.GenbankMetadataSource, "", metadata.Source.CommonName ?? string.Empty);

        string genus = metadata.Source.Organism.Genus ?? string.Empty;
        string species = metadata.Source.Organism.Species ?? string.Empty;
        AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataOrganism, genus + " " + species);
        AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataClassLevels, metadata.Source.Organism.ClassLevels ?? string.Empty);
    }

    foreach (CitationReference citation in metadata.References)
    {
        AddNameValuePair(
            rows,
            0,
            Properties.Resources.GenbankMetadataReference,
            "",
            citation.Number.ToString() + " (" + citation.Location + ")");
        AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataAuthors, citation.Authors);
        AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataTitle, citation.Title);
        AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataJournal, citation.Journal);
        AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataConsortiums, citation.Consortiums);
        AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataMedLine, citation.Medline);
        AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataPubMed, citation.PubMed);
        AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataRemarks, citation.Remarks);
    }

    if (!string.IsNullOrWhiteSpace(metadata.Primary))
    {
        AddNameValuePair(rows, 0, Properties.Resources.GenbankMetadataPrimary, "", metadata.Primary);
    }

    if (metadata.Comments != null && metadata.Comments.Count > 0)
    {
        // All comments go into a single value, separated by line breaks.
        string allComments = string.Join(Environment.NewLine, metadata.Comments);
        AddNameValuePair(rows, 0, Properties.Resources.GenbankMetadataComment, "", allComments);
    }

    if (metadata.Features != null)
    {
        // Features header row.
        rows.Add(new[] { Properties.Resources.GenbankFeaturesHeader });

        foreach (FeatureItem featureItem in metadata.Features.All)
        {
            LocationBuilder locBuilder = new LocationBuilder();

            // Feature row: key followed by its location string.
            rows.Add(new[] { featureItem.Key, locBuilder.GetLocationString(featureItem.Location) });

            // One row per qualifier value, listed under the qualifier's key.
            foreach (KeyValuePair<string, List<string>> qualifier in featureItem.Qualifiers)
            {
                foreach (string qualifierValue in qualifier.Value)
                {
                    AddNameValuePair(rows, 1, qualifier.Key, qualifierValue);
                }
            }
        }
    }

    if (!string.IsNullOrWhiteSpace(metadata.BaseCount))
    {
        AddNameValuePair(rows, 0, Properties.Resources.GenbankMetadataBaseCount, "", metadata.BaseCount);
    }

    return ConvertToArray(rows);
}
/// <summary>
/// Validate GenBank CDS features: compares the qualifiers of the first coding
/// sequence feature with the expected values read from the given XML node, then
/// builds CDS features through both constructors and checks the qualifiers set on them.
/// </summary>
/// <param name="nodeName">XML node name</param>
/// <param name="genMetadata">GenBank Metadata</param>
private void ValidateGenBankCDSFeatures(string nodeName, GenBankMetadata genMetadata)
{
    // Get Values from XML node.
    string expectedLocation = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.Location);
    string expectedAllele = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.AlleleNode);
    string featureCount = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.QualifierCount);
    string expectedDbReference = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.DbReferenceNode);
    string geneSymbol = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.GeneSymbol);
    string expectedCitation = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.CitationNode);
    string expectedExperiment = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.ExperimentNode);
    string expectedInference = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.InferenceNode);
    string expectedLabel = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.LabelNode);
    string expectedNote = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.Note);
    string expectedMap = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.GenbankMapNode);
    string expectedTranslation = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.GenbankTranslationNode);
    string expectedCodonStart = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.CodonStartNode);

    List<CodingSequence> codingSequenceFeatureList = genMetadata.Features.CodingSequences;
    var locBuilder = new LocationBuilder();

    // Create a copy of the Coding Sequence feature.
    CodingSequence firstCds = codingSequenceFeatureList[0];
    CodingSequence cloneCDS = firstCds.Clone();

    // Validate Coding Sequence qualifiers.
    // Assert.AreEqual takes (expected, actual); keeping that order makes failure
    // messages report the XML value as "expected" and the parsed value as "actual".
    Assert.AreEqual(featureCount, codingSequenceFeatureList.Count.ToString((IFormatProvider)null));
    Assert.AreEqual(expectedDbReference, cloneCDS.DatabaseCrossReference[0]);
    Assert.AreEqual(geneSymbol, cloneCDS.GeneSymbol);
    Assert.AreEqual(expectedAllele, firstCds.Allele);
    Assert.AreEqual(expectedCitation, firstCds.Citation[0]);
    Assert.AreEqual(expectedExperiment, firstCds.Experiment[0]);
    Assert.AreEqual(expectedMap, firstCds.GenomicMapPosition);
    Assert.AreEqual(expectedInference, firstCds.Inference[0]);
    Assert.AreEqual(expectedLabel, firstCds.Label);
    Assert.AreEqual(
        expectedLocation,
        locBuilder.GetLocationString(genMetadata.Features.CodingSequences[0].Location));
    Assert.AreEqual(expectedNote, firstCds.Note[0]);
    Assert.AreEqual(expectedCodonStart, firstCds.CodonStart[0]);
    Assert.AreEqual(expectedTranslation, firstCds.Translation);
    Assert.IsFalse(string.IsNullOrEmpty(firstCds.Codon.ToString()));
    Assert.IsTrue(string.IsNullOrEmpty(firstCds.EnzymeCommissionNumber));
    Assert.IsTrue(string.IsNullOrEmpty(firstCds.Number));
    Assert.IsTrue(string.IsNullOrEmpty(firstCds.Operon));
    Assert.IsFalse(firstCds.Pseudo);
    Assert.IsFalse(firstCds.RibosomalSlippage);
    Assert.IsTrue(string.IsNullOrEmpty(firstCds.StandardName));
    Assert.IsFalse(string.IsNullOrEmpty(firstCds.TranslationalExcept.ToString()));
    Assert.IsTrue(string.IsNullOrEmpty(firstCds.TranslationTable));
    Assert.IsFalse(firstCds.TransSplicing);
    Assert.IsTrue(string.IsNullOrEmpty(firstCds.Exception));

    // Create new CDS features using both constructors (location string and location object).
    var cds = new CodingSequence(expectedLocation);
    var cdsWithLoc = new CodingSequence(genMetadata.Features.CodingSequences[0].Location);
    Sequence seq = cds.GetTranslation();
    Assert.IsNotNull(seq);

    // Set and validate qualifiers.
    cds.Allele = expectedAllele;
    cdsWithLoc.GeneSymbol = geneSymbol;
    cdsWithLoc.GenomicMapPosition = expectedMap;
    Assert.AreEqual(expectedMap, cdsWithLoc.GenomicMapPosition);
    Assert.AreEqual(expectedAllele, cds.Allele);
    Assert.AreEqual(geneSymbol, cdsWithLoc.GeneSymbol);
}
/// <summary>
/// Validate GenBank Non Coding RNA features: compares the first non-coding RNA
/// feature with the expected values read from the given XML node, then builds
/// non-coding RNA features through both constructors and checks the class set on them.
/// </summary>
/// <param name="nodeName">XML node name</param>
/// <param name="genMetadata">GenBank Metadata</param>
private void ValidateGenBankNonCodingRNA(string nodeName, GenBankMetadata genMetadata)
{
    // Get Values from XML node.
    string expectedLocation = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.Location);
    string featureCount = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.QualifierCount);
    string expectedLabel = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.LabelNode);
    string expectedNonCodingRnaClass = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.NonCodingRnaClassNode);

    List<NonCodingRna> nonCodingRNAFeatureList = genMetadata.Features.NonCodingRNAs;
    var locBuilder = new LocationBuilder();

    // Create a copy of Non coding RNA feature.
    NonCodingRna cloneNonCodingRNA = nonCodingRNAFeatureList[0].Clone();

    // Validate Non Coding RNA Region qualifiers.
    // Assert.AreEqual takes (expected, actual); keeping that order makes failure
    // messages report the XML value as "expected" and the parsed value as "actual".
    Assert.AreEqual(featureCount, nonCodingRNAFeatureList.Count.ToString((IFormatProvider)null));
    Assert.AreEqual(expectedNonCodingRnaClass, nonCodingRNAFeatureList[0].NonCodingRnaClass);
    Assert.AreEqual(expectedLabel, cloneNonCodingRNA.Label);
    Assert.AreEqual(
        expectedLocation,
        locBuilder.GetLocationString(genMetadata.Features.NonCodingRNAs[0].Location));

    // Create non Coding RNA features using both constructors (location object and location string).
    var nRNA = new NonCodingRna(genMetadata.Features.NonCodingRNAs[0].Location);
    var nRNAWithLocation = new NonCodingRna(expectedLocation);

    // Set properties
    nRNA.NonCodingRnaClass = expectedNonCodingRnaClass;
    nRNAWithLocation.NonCodingRnaClass = expectedNonCodingRnaClass;

    // Validate created nRNA.
    Assert.AreEqual(expectedNonCodingRnaClass, nRNA.NonCodingRnaClass);
    Assert.AreEqual(expectedNonCodingRnaClass, nRNAWithLocation.NonCodingRnaClass);
}
/// <summary>
/// Validate GenBank RibosomeBindingSite features: compares the qualifiers of the
/// first ribosome binding site feature with the expected values read from the given
/// XML node, then builds features through both constructors and checks the
/// qualifiers set on them.
/// </summary>
/// <param name="nodeName">XML node name</param>
/// <param name="genMetadata">GenBank Metadata</param>
private void ValidateGenBankRibosomeBindingSite(string nodeName, GenBankMetadata genMetadata)
{
    // Get Values from XML node.
    string expectedLocation = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.Location);
    string expectedAllele = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.AlleleNode);
    string featureCount = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.QualifierCount);
    string expectedDbReference = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.DbReferenceNode);
    string geneSymbol = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.GeneSymbol);
    string expectedCitation = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.CitationNode);
    string expectedExperiment = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.ExperimentNode);
    string expectedInference = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.InferenceNode);
    string expectedLabel = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.LabelNode);
    string expectedNote = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.Note);
    string expectedMap = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.GenbankMapNode);

    List<RibosomeBindingSite> ribosomeSite = genMetadata.Features.RibosomeBindingSites;

    // Create a copy of the RibosomeBindingSite feature.
    RibosomeBindingSite firstSite = ribosomeSite[0];
    RibosomeBindingSite cloneRibosomeSite = firstSite.Clone();
    var locBuilder = new LocationBuilder();

    // Validate RibosomeBindingSite qualifiers.
    // Assert.AreEqual takes (expected, actual); keeping that order makes failure
    // messages report the XML value as "expected" and the parsed value as "actual".
    Assert.AreEqual(featureCount, ribosomeSite.Count.ToString((IFormatProvider)null));
    Assert.AreEqual(expectedDbReference, cloneRibosomeSite.DatabaseCrossReference[0]);
    Assert.AreEqual(geneSymbol, cloneRibosomeSite.GeneSymbol);
    Assert.AreEqual(expectedAllele, firstSite.Allele);
    Assert.AreEqual(expectedCitation, firstSite.Citation[0]);
    Assert.AreEqual(expectedExperiment, firstSite.Experiment[0]);
    Assert.AreEqual(expectedInference, firstSite.Inference[0]);
    Assert.AreEqual(expectedLabel, firstSite.Label);
    Assert.AreEqual(
        expectedLocation,
        locBuilder.GetLocationString(genMetadata.Features.RibosomeBindingSites[0].Location));
    Assert.AreEqual(expectedNote, firstSite.Note[0]);
    Assert.AreEqual(expectedMap, firstSite.GenomicMapPosition);
    Assert.IsNotNull(firstSite.OldLocusTag[0]);
    Assert.IsNotNull(firstSite.LocusTag[0]);
    Assert.IsNotNull(firstSite.StandardName);

    // Create new RibosomeBindingSite features using both constructors
    // (location string and location object).
    var ribosomeBindingSite = new RibosomeBindingSite(expectedLocation);
    var ribosomeBindingSiteLoc = new RibosomeBindingSite(
        genMetadata.Features.RibosomeBindingSites[0].Location);

    // Set and validate qualifiers.
    ribosomeBindingSite.Allele = expectedAllele;
    ribosomeBindingSiteLoc.GeneSymbol = geneSymbol;
    ribosomeBindingSiteLoc.GenomicMapPosition = expectedMap;
    Assert.AreEqual(expectedMap, ribosomeBindingSiteLoc.GenomicMapPosition);
    Assert.AreEqual(expectedAllele, ribosomeBindingSite.Allele);
    Assert.AreEqual(geneSymbol, ribosomeBindingSiteLoc.GeneSymbol);
}
/// <summary>
/// Validate GenBank UnsureSequenceRegion features: compares the qualifiers of the
/// first unsure sequence region feature with the expected values read from the
/// given XML node, then builds features through both constructors and checks the
/// qualifiers set on them.
/// </summary>
/// <param name="nodeName">XML node name</param>
/// <param name="genMetadata">GenBank Metadata</param>
private void ValidateGenBankUnsureSequenceRegion(string nodeName, GenBankMetadata genMetadata)
{
    // Get Values from XML node.
    string expectedLocation = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.Location);
    string expectedAllele = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.AlleleNode);
    string featureCount = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.QualifierCount);
    string expectedDbReference = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.DbReferenceNode);
    string geneSymbol = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.GeneSymbol);
    string expectedCitation = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.CitationNode);
    string expectedExperiment = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.ExperimentNode);
    string expectedInference = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.InferenceNode);
    string expectedLabel = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.LabelNode);
    string expectedNote = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.Note);
    string expectedMap = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.GenbankMapNode);

    List<UnsureSequenceRegion> unsureSeqRegionFeatureList = genMetadata.Features.UnsureSequenceRegions;

    // Create a copy of Unsure Seq Region feature.
    UnsureSequenceRegion firstRegion = unsureSeqRegionFeatureList[0];
    UnsureSequenceRegion cloneUnSureSeqRegion = firstRegion.Clone();
    var locBuilder = new LocationBuilder();

    // Validate Unsure Seq Region qualifiers.
    // Assert.AreEqual takes (expected, actual); keeping that order makes failure
    // messages report the XML value as "expected" and the parsed value as "actual".
    Assert.AreEqual(featureCount, unsureSeqRegionFeatureList.Count.ToString((IFormatProvider)null));
    Assert.AreEqual(expectedDbReference, cloneUnSureSeqRegion.DatabaseCrossReference[0]);
    Assert.AreEqual(geneSymbol, cloneUnSureSeqRegion.GeneSymbol);
    Assert.AreEqual(expectedAllele, firstRegion.Allele);
    Assert.AreEqual(expectedCitation, firstRegion.Citation[0]);
    Assert.AreEqual(expectedExperiment, firstRegion.Experiment[0]);
    Assert.AreEqual(expectedMap, firstRegion.GenomicMapPosition);
    Assert.AreEqual(expectedInference, firstRegion.Inference[0]);
    Assert.AreEqual(expectedLabel, firstRegion.Label);
    Assert.AreEqual(
        expectedLocation,
        locBuilder.GetLocationString(genMetadata.Features.UnsureSequenceRegions[0].Location));
    Assert.AreEqual(expectedNote, firstRegion.Note[0]);
    Assert.IsFalse(string.IsNullOrEmpty(firstRegion.Compare.ToString()));
    Assert.IsTrue(string.IsNullOrEmpty(firstRegion.Replace));

    // Create new Unsure features using both constructors (location string and location object).
    var unsureRegion = new UnsureSequenceRegion(expectedLocation);
    var unsureRegionWithLoc = new UnsureSequenceRegion(
        genMetadata.Features.UnsureSequenceRegions[0].Location);

    // Set and validate qualifiers.
    unsureRegion.Allele = expectedAllele;
    unsureRegionWithLoc.GeneSymbol = geneSymbol;
    unsureRegionWithLoc.GenomicMapPosition = expectedMap;
    Assert.AreEqual(expectedMap, unsureRegionWithLoc.GenomicMapPosition);
    Assert.AreEqual(expectedAllele, unsureRegion.Allele);
    Assert.AreEqual(geneSymbol, unsureRegionWithLoc.GeneSymbol);
}
/// <summary>
/// Validate GenBank Operon features: compares the qualifiers of the first operon
/// region feature with the expected values read from the given XML node, then
/// builds operon features through both constructors and checks the qualifiers
/// set on them.
/// </summary>
/// <param name="nodeName">XML node name</param>
/// <param name="genMetadata">GenBank Metadata</param>
private void ValidateGenBankOperon(string nodeName, GenBankMetadata genMetadata)
{
    // Get Values from XML node.
    string expectedLocation = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.Location);
    string expectedAllele = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.AlleleNode);
    string featureCount = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.QualifierCount);
    string expectedDbReference = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.DbReferenceNode);
    string expectedCitation = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.CitationNode);
    string expectedExperiment = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.ExperimentNode);
    string expectedInference = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.InferenceNode);
    string expectedLabel = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.LabelNode);
    string expectedNote = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.Note);
    string expectedMap = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.GenbankMapNode);

    List<OperonRegion> operonFeatureList = genMetadata.Features.OperonRegions;
    var locBuilder = new LocationBuilder();

    // Create a copy of the Operon region feature.
    OperonRegion firstOperon = operonFeatureList[0];
    OperonRegion cloneOperon = firstOperon.Clone();

    // Validate Operon region qualifiers.
    // Assert.AreEqual takes (expected, actual); keeping that order makes failure
    // messages report the XML value as "expected" and the parsed value as "actual".
    Assert.AreEqual(featureCount, operonFeatureList.Count.ToString((IFormatProvider)null));
    Assert.AreEqual(expectedDbReference, cloneOperon.DatabaseCrossReference[0]);
    Assert.AreEqual(expectedAllele, firstOperon.Allele);
    Assert.AreEqual(expectedCitation, firstOperon.Citation[0]);
    Assert.AreEqual(expectedExperiment, firstOperon.Experiment[0]);
    Assert.AreEqual(expectedMap, firstOperon.GenomicMapPosition);
    Assert.AreEqual(expectedInference, firstOperon.Inference[0]);
    Assert.AreEqual(expectedLabel, firstOperon.Label);
    Assert.AreEqual(
        expectedLocation,
        locBuilder.GetLocationString(genMetadata.Features.OperonRegions[0].Location));
    Assert.AreEqual(expectedNote, firstOperon.Note[0]);
    Assert.IsFalse(string.IsNullOrEmpty(firstOperon.Function.ToString()));
    Assert.IsTrue(string.IsNullOrEmpty(firstOperon.Operon));
    Assert.IsFalse(string.IsNullOrEmpty(firstOperon.Phenotype.ToString()));
    Assert.IsTrue(string.IsNullOrEmpty(firstOperon.StandardName));
    Assert.IsFalse(firstOperon.Pseudo);

    // Create new Operon features using both constructors (location string and location object).
    var operonRegion = new OperonRegion(expectedLocation);
    var operonRegionWithLoc = new OperonRegion(genMetadata.Features.OperonRegions[0].Location);

    // Set and validate qualifiers.
    operonRegion.Allele = expectedAllele;
    operonRegionWithLoc.GenomicMapPosition = expectedMap;
    Assert.AreEqual(expectedMap, operonRegionWithLoc.GenomicMapPosition);
    Assert.AreEqual(expectedAllele, operonRegion.Allele);
}
/// <summary>
/// Validate PrecursorRNA features: compares the qualifiers of the first precursor
/// RNA feature with the expected values read from the given XML node, then builds
/// precursor RNA features through both constructors and checks the qualifiers
/// set on them.
/// </summary>
/// <param name="nodeName">XML node name</param>
/// <param name="genMetadata">GenBank Metadata</param>
private void ValidateGenBankPrecursorRNAFeature(string nodeName, GenBankMetadata genMetadata)
{
    // Get Values from XML node.
    string expectedLocation = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.Location);
    string expectedAllele = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.AlleleNode);
    string featureCount = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.QualifierCount);
    string expectedDbReference = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.DbReferenceNode);
    string geneSymbol = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.GeneSymbol);
    string expectedCitation = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.CitationNode);
    string expectedExperiment = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.ExperimentNode);
    string expectedFunction = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.FunctionNode);
    string expectedGeneSynonym = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.GeneSynonymNode);
    string expectedInference = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.InferenceNode);
    string expectedLabel = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.LabelNode);
    string expectedLocusTag = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.LocusTagNode);
    string expectedNote = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.Note);
    string expectedOldLocusTag = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.OldLocusTagNode);
    string expectedMap = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.GenbankMapNode);

    List<PrecursorRna> precursorRNAFeatureList = genMetadata.Features.PrecursorRNAs;
    var locBuilder = new LocationBuilder();

    // Create a copy of Precursor RNA feature.
    PrecursorRna firstPrecursor = precursorRNAFeatureList[0];
    PrecursorRna clonePrecursorRNA = firstPrecursor.Clone();

    // Validate Precursor RNA qualifiers.
    // Assert.AreEqual takes (expected, actual); keeping that order makes failure
    // messages report the XML value as "expected" and the parsed value as "actual".
    Assert.AreEqual(featureCount, precursorRNAFeatureList.Count.ToString((IFormatProvider)null));
    Assert.AreEqual(geneSymbol, clonePrecursorRNA.GeneSymbol);
    Assert.AreEqual(expectedDbReference, clonePrecursorRNA.DatabaseCrossReference[0]);
    Assert.AreEqual(expectedAllele, firstPrecursor.Allele);
    Assert.AreEqual(expectedCitation, firstPrecursor.Citation[0]);
    Assert.AreEqual(expectedExperiment, firstPrecursor.Experiment[0]);
    Assert.AreEqual(expectedMap, firstPrecursor.GenomicMapPosition);
    Assert.AreEqual(expectedGeneSynonym, firstPrecursor.GeneSynonym[0]);
    Assert.AreEqual(expectedInference, firstPrecursor.Inference[0]);
    Assert.AreEqual(expectedLabel, firstPrecursor.Label);
    Assert.AreEqual(
        expectedLocation,
        locBuilder.GetLocationString(genMetadata.Features.PrecursorRNAs[0].Location));
    Assert.AreEqual(expectedNote, firstPrecursor.Note[0]);
    Assert.AreEqual(expectedOldLocusTag, firstPrecursor.OldLocusTag[0]);
    Assert.AreEqual(expectedLocusTag, firstPrecursor.LocusTag[0]);
    Assert.AreEqual(expectedFunction, firstPrecursor.Function[0]);
    Assert.IsTrue(string.IsNullOrEmpty(firstPrecursor.StandardName));
    Assert.IsFalse(string.IsNullOrEmpty(firstPrecursor.Product.ToString()));
    Assert.IsTrue(string.IsNullOrEmpty(firstPrecursor.Operon));
    Assert.IsFalse(firstPrecursor.TransSplicing);

    // Create new Precursor RNA features using both constructors
    // (location string and location object).
    var precursorRNA = new PrecursorRna(expectedLocation);
    var precursorRNAWithILoc = new PrecursorRna(genMetadata.Features.PrecursorRNAs[0].Location);

    // Set qualifiers and validate them.
    precursorRNA.Allele = expectedAllele;
    precursorRNA.GeneSymbol = geneSymbol;
    precursorRNAWithILoc.GenomicMapPosition = expectedMap;
    Assert.AreEqual(geneSymbol, precursorRNA.GeneSymbol);
    Assert.AreEqual(expectedAllele, precursorRNA.Allele);
    Assert.AreEqual(expectedMap, precursorRNAWithILoc.GenomicMapPosition);
}
/// <summary>
/// Validate ModifiedBase features: compares the qualifiers of the first modified
/// base feature with the expected values read from the given XML node, then builds
/// modified base features through both constructors and checks the qualifiers
/// set on them.
/// </summary>
/// <param name="nodeName">XML node name</param>
/// <param name="genMetadata">GenBank Metadata</param>
private void ValidateGenBankModifiedBaseFeature(string nodeName, GenBankMetadata genMetadata)
{
    // Get Values from XML node.
    string expectedLocation = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.Location);
    string expectedAllele = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.AlleleNode);
    string featureCount = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.QualifierCount);
    string expectedDbReference = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.DbReferenceNode);
    string geneSymbol = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.GeneSymbol);
    string expectedCitation = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.CitationNode);
    string expectedExperiment = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.ExperimentNode);
    string expectedGeneSynonym = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.GeneSynonymNode);
    string expectedInference = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.InferenceNode);
    string expectedLabel = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.LabelNode);
    string expectedLocusTag = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.LocusTagNode);
    string expectedNote = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.Note);
    string expectedOldLocusTag = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.OldLocusTagNode);
    string expectedMap = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.GenbankMapNode);

    List<ModifiedBase> modifiedBaseFeatureList = genMetadata.Features.ModifiedBases;
    var locBuilder = new LocationBuilder();

    // Create a copy of Modified base feature.
    ModifiedBase firstModifiedBase = modifiedBaseFeatureList[0];
    ModifiedBase cloneModifiedBase = firstModifiedBase.Clone();

    // Validate Modified Base qualifiers.
    // Assert.AreEqual takes (expected, actual); keeping that order makes failure
    // messages report the XML value as "expected" and the parsed value as "actual".
    Assert.AreEqual(featureCount, modifiedBaseFeatureList.Count.ToString((IFormatProvider)null));
    Assert.AreEqual(geneSymbol, cloneModifiedBase.GeneSymbol);
    Assert.AreEqual(expectedDbReference, cloneModifiedBase.DatabaseCrossReference[0]);
    Assert.AreEqual(expectedAllele, firstModifiedBase.Allele);
    Assert.AreEqual(expectedCitation, firstModifiedBase.Citation[0]);
    Assert.AreEqual(expectedExperiment, firstModifiedBase.Experiment[0]);
    Assert.AreEqual(expectedMap, firstModifiedBase.GenomicMapPosition);
    Assert.AreEqual(expectedGeneSynonym, firstModifiedBase.GeneSynonym[0]);
    Assert.AreEqual(expectedInference, firstModifiedBase.Inference[0]);
    Assert.AreEqual(expectedLabel, firstModifiedBase.Label);
    Assert.AreEqual(
        expectedLocation,
        locBuilder.GetLocationString(genMetadata.Features.ModifiedBases[0].Location));
    Assert.AreEqual(expectedNote, firstModifiedBase.Note[0]);
    Assert.AreEqual(expectedOldLocusTag, firstModifiedBase.OldLocusTag[0]);
    Assert.AreEqual(expectedLocusTag, firstModifiedBase.LocusTag[0]);
    Assert.IsFalse(string.IsNullOrEmpty(firstModifiedBase.ModifiedNucleotideBase.ToString()));

    // Create new ModifiedBase features using both constructors
    // (location string and location object).
    var modifiedBase = new ModifiedBase(expectedLocation);
    var modifiedBaseWithILoc = new ModifiedBase(genMetadata.Features.ModifiedBases[0].Location);

    // Set qualifiers and validate them.
    modifiedBase.Allele = expectedAllele;
    modifiedBase.GeneSymbol = geneSymbol;
    modifiedBaseWithILoc.GenomicMapPosition = expectedMap;
    Assert.AreEqual(geneSymbol, modifiedBase.GeneSymbol);
    Assert.AreEqual(expectedAllele, modifiedBase.Allele);
    Assert.AreEqual(expectedMap, modifiedBaseWithILoc.GenomicMapPosition);
}
/// <summary>
/// Reads the ORIGIN section of a GenBank file: keeps any data found on the ORIGIN
/// line itself in the metadata, then gathers the sequence lines that follow and
/// builds the sequence from the collected symbols.
/// </summary>
/// <param name="line">parse line</param>
/// <param name="metadata">The GenBank metadata.</param>
/// <param name="stream">The stream reader.</param>
private void ParseOrigin(ref string line, GenBankMetadata metadata, StreamReader stream)
{
    // Data on the ORIGIN line is optional; never store an empty string in the metadata.
    string originData = GetLineData(line, DataIndent);
    if (!String.IsNullOrEmpty(originData))
    {
        metadata.Origin = originData;
    }

    line = GoToNextLine(line, stream);

    var residues = new StringBuilder();

    // Sequence lines are recognised by a leading space.
    // NOTE(review): line[0] assumes GoToNextLine never hands back an empty string - confirm.
    while ((line != null) && line[0] == ' ')
    {
        int lineLength = line.Length;

        // Step over the leading blanks and digits by hand; a regex is too slow here.
        int position = 0;
        while (position < lineLength && (line[position] == ' ' || Char.IsNumber(line[position])))
        {
            position++;
        }

        // Take at most ten characters per group, then skip the single character after it.
        for (; position < lineLength; position += 11)
        {
            int groupLength = Math.Min(10, lineLength - position);
            residues.Append(line.Substring(position, groupLength));
        }

        line = GoToNextLine(line, stream);
    }

    string sequenceString = residues.ToString().Trim();
    if (string.IsNullOrEmpty(sequenceString))
    {
        // No sequence data followed the ORIGIN line; nothing to build.
        return;
    }

    // Prefer the alphabet configured on the parser; detect one from the data otherwise.
    IAlphabet alphabet = Alphabet;
    if (alphabet == null)
    {
        byte[] upperCaseSymbols = UTF8Encoding.UTF8.GetBytes(sequenceString.ToUpperInvariant());
        alphabet = Alphabets.AutoDetectAlphabet(upperCaseSymbols, 0, upperCaseSymbols.Length, alphabet);
        if (alphabet == null)
        {
            // NOTE(review): 'line' here is whatever follows the sequence lines, not the
            // sequence text itself - confirm this is the value the message should show.
            var message = String.Format(
                CultureInfo.InvariantCulture,
                Properties.Resource.InvalidSymbolInString,
                line);
            Trace.Report(message);
            throw new InvalidDataException(message);
        }
    }

    sequenceWithData = new Sequence(alphabet, sequenceString);
}
/// <summary>
/// Parses the GenBank headers from the GenBank file.
/// parses everything before the features section
/// </summary>
/// <param name="sequence">The sequence.</param>
/// <param name="noOfSequence">The current sequence index.</param>
/// <param name="line">parse line</param>
/// <param name="stream">The stream reader.</param>
/// <returns>The parsed line.</returns>
/// <exception cref="InvalidDataException">
/// Thrown when a second LOCUS line is found, when the PRIMARY column-header line
/// does not have exactly four tokens, when an unknown header is encountered, or
/// when no LOCUS line was parsed at all.
/// </exception>
private string ParseHeaders(ref Sequence sequence, int noOfSequence, string line, StreamReader stream)
{
    GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
    string data;
    string[] tokens;

    // only allow one locus line
    bool haveParsedLocus = false;
    string lineData;
    if (noOfSequence == 0)
    {
        line = string.Empty;
        line = GoToNextLine(line, stream);
    }

    // parse until we hit the features or sequence section
    bool haveFinishedHeaders = false;
    while ((line != null) && !haveFinishedHeaders)
    {
        switch (GetLineHeader(line, DataIndent))
        {
            case "LOCUS":
                if (haveParsedLocus)
                {
                    string message = String.Format(CultureInfo.CurrentCulture, Properties.Resource.ParserSecondLocus);
                    Trace.Report(message);
                    throw new InvalidDataException(message);
                }

                line = ParseLocusByTokens(line, ref sequence, stream);
                metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                haveParsedLocus = true;
                // don't go to next line; current line still needs to be processed
                break;

            case "VERSION":
                lineData = GetLineData(line, DataIndent);
                tokens = lineData.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
                if (tokens.Length == 0)
                {
                    // A VERSION line without data yields no tokens; indexing tokens[0]
                    // would throw, so report it like other malformed headers and move on.
                    ApplicationLog.WriteLine("WARN: unexpected VERSION header: " + line);
                }
                else
                {
                    // first token contains accession and version
                    Match m = Regex.Match(tokens[0], @"^(?<accession>\w+)\.(?<version>\d+)$");
                    metadata.Version = new GenBankVersion();
                    if (m.Success)
                    {
                        metadata.Version.Version = m.Groups["version"].Value;

                        // The first token in the data from the accession line is referred to as
                        // the primary accession number, and should be the one used here in the
                        // version line.
                        string versionLineAccession = m.Groups["accession"].Value;
                        if (metadata.Accession == null)
                        {
                            ApplicationLog.WriteLine("WARN: VERSION processed before ACCESSION");
                        }
                        else
                        {
                            if (!versionLineAccession.Equals(metadata.Accession.Primary))
                            {
                                ApplicationLog.WriteLine("WARN: VERSION tag doesn't match ACCESSION");
                            }
                            else
                            {
                                metadata.Version.Accession = metadata.Accession.Primary;
                            }
                        }
                    }

                    if (tokens.Length > 1)
                    {
                        // second token contains primary ID
                        m = Regex.Match(tokens[1], @"^GI:(?<primaryID>.*)");
                        if (m.Success)
                        {
                            metadata.Version.GiNumber = m.Groups["primaryID"].Value;
                        }
                    }
                }

                line = GoToNextLine(line, stream);
                break;

            case "PROJECT":
                lineData = GetLineData(line, DataIndent);
                tokens = lineData.Split(':');
                if (tokens.Length == 2)
                {
                    metadata.Project = new ProjectIdentifier { Name = tokens[0] };
                    tokens = tokens[1].Split(',');
                    for (int i = 0; i < tokens.Length; i++)
                    {
                        metadata.Project.Numbers.Add(tokens[i]);
                    }
                }
                else
                {
                    ApplicationLog.WriteLine("WARN: unexpected PROJECT header: " + line);
                }

                line = GoToNextLine(line, stream);
                break;

            case "SOURCE":
                line = ParseSource(line, ref sequence, stream);
                metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                // don't go to next line; current line still needs to be processed
                break;

            case "REFERENCE":
                line = ParseReferences(line, ref sequence, stream); // can encounter more than one
                metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                // don't go to next line; current line still needs to be processed
                break;

            case "COMMENT":
                line = ParseComments(line, ref sequence, stream); // can encounter more than one
                metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                // don't go to next line; current line still needs to be processed
                break;

            case "PRIMARY":
                // This header is followed by sequence info in a table format that could be
                // stored in a custom object. The first line contains column headers.
                // For now, just validate the presence of the headers, and save the data
                // as a string.
                lineData = GetLineData(line, DataIndent);
                tokens = lineData.Split("\t ".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);

                // The column-header line must contain exactly four tokens.
                if (tokens.Length != 4)
                {
                    string message = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ParserPrimaryLineError,
                        line);
                    Trace.Report(message);
                    throw new InvalidDataException(message);
                }

                string primaryData = ParseMultiLineData(ref line, Environment.NewLine, DataIndent, stream);
                metadata.Primary = primaryData;
                // don't go to next line; current line still needs to be processed
                break;

            // all the following are extracted the same way - possibly multiline
            case "DEFINITION":
                metadata.Definition = ParseMultiLineData(ref line, " ", DataIndent, stream);
                break;

            case "ACCESSION":
                data = ParseMultiLineData(ref line, " ", DataIndent, stream);
                metadata.Accession = new GenBankAccession();
                string[] accessions = data.Split(' ');
                metadata.Accession.Primary = accessions[0];
                for (int i = 1; i < accessions.Length; i++)
                {
                    metadata.Accession.Secondary.Add(accessions[i]);
                }

                break;

            case "DBLINK":
                data = ParseMultiLineData(ref line, "\n", DataIndent, stream);
                metadata.DbLinks = new List<CrossReferenceLink>();
                foreach (string link in data.Split('\n'))
                {
                    tokens = link.Split(':');
                    if (tokens.Length == 2)
                    {
                        CrossReferenceLink newLink = new CrossReferenceLink();
                        if (string.Compare(tokens[0], CrossReferenceType.Project.ToString(), StringComparison.OrdinalIgnoreCase) == 0)
                        {
                            newLink.Type = CrossReferenceType.Project;
                        }
                        else if (string.Compare(tokens[0], CrossReferenceType.BioProject.ToString(), StringComparison.OrdinalIgnoreCase) == 0)
                        {
                            newLink.Type = CrossReferenceType.BioProject;
                        }
                        else
                        {
                            newLink.Type = CrossReferenceType.None;
                            if (string.Compare(tokens[0], TraceAssemblyArchive, StringComparison.OrdinalIgnoreCase) == 0)
                            {
                                newLink.Type = CrossReferenceType.TraceAssemblyArchive;
                            }
                        }

                        tokens = tokens[1].Split(',');
                        for (int i = 0; i < tokens.Length; i++)
                        {
                            newLink.Numbers.Add(tokens[i]);
                        }

                        metadata.DbLinks.Add(newLink);
                    }
                    else
                    {
                        // ParseMultiLineData has already moved 'line' past this section,
                        // so report the offending link text rather than the next line.
                        ApplicationLog.WriteLine("WARN: unexpected DBLINK header: " + link);
                    }
                }

                break;

            case "DBSOURCE":
                metadata.DbSource = ParseMultiLineData(ref line, " ", DataIndent, stream);
                break;

            case "KEYWORDS":
                metadata.Keywords = ParseMultiLineData(ref line, " ", DataIndent, stream);
                break;

            case "SEGMENT":
                data = ParseMultiLineData(ref line, " ", DataIndent, stream);
                const string delimiter = "of";

                // Split on the whole word "of" (not on the individual characters 'o' and 'f').
                tokens = data.Split(new[] { delimiter }, StringSplitOptions.RemoveEmptyEntries);
                if (tokens.Length == 2)
                {
                    metadata.Segment = new SequenceSegment();
                    int outvalue;
                    if (int.TryParse(tokens[0].Trim(), out outvalue))
                    {
                        metadata.Segment.Current = outvalue;
                    }
                    else
                    {
                        // 'line' already points past the SEGMENT section; log its data instead.
                        ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + data);
                    }

                    if (int.TryParse(tokens[1].Trim(), out outvalue))
                    {
                        metadata.Segment.Count = outvalue;
                    }
                    else
                    {
                        ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + data);
                    }
                }
                else
                {
                    ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + data);
                }

                break;

            // all the following indicate sections beyond the headers parsed by this method
            case "FEATURES":
            case "BASE COUNT":
            case "ORIGIN":
            case "CONTIG":
                haveFinishedHeaders = true;
                break;

            default:
                string lineHeader = GetLineHeader(line, DataIndent);
                lineData = GetLineData(line, DataIndent);
                ApplicationLog.WriteLine(ToString() + "WARN: unknown {0} -> {1}", lineHeader, lineData);
                string errMessage = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParseHeaderError,
                    lineHeader);
                Trace.Report(errMessage);
                throw new InvalidDataException(errMessage);
        }
    }

    // check for required features
    if (!haveParsedLocus)
    {
        string message = string.Format(CultureInfo.CurrentCulture, Properties.Resource.INVALID_INPUT_FILE, Name);
        Trace.Report(message);
        throw new InvalidDataException(message);
    }

    return (line);
}
/// <summary>
/// Parses the GenBank SOURCE section (the SOURCE line, its continuation lines and
/// the ORGANISM sub-section) and stores the result in the sequence's GenBank metadata.
/// </summary>
/// <param name="line">The current line; parsing continues while it belongs to the SOURCE section.</param>
/// <param name="sequence">The sequence whose GenBank metadata receives the parsed source.</param>
/// <param name="stream">The stream reader.</param>
/// <returns>The first line that is not part of the SOURCE section (still to be processed by the caller), or null at end of input.</returns>
/// <exception cref="InvalidDataException">An indented sub-header other than ORGANISM is found.</exception>
private string ParseSource(string line, ref Sequence sequence, StreamReader stream)
{
    string source = string.Empty;
    string organism = string.Empty;
    string classLevels = string.Empty;

    while (line != null)
    {
        string lineHeader = GetLineHeader(line, DataIndent);
        string lineData;
        if (lineHeader == "SOURCE")
        {
            // data can be multiline. spec says last line must end with period
            // (note: this doesn't apply unless multiline)
            bool lastDotted = true;
            lineData = GetLineData(line, DataIndent);
            source = lineData;

            line = GoToNextLine(line, stream);
            lineHeader = GetLineHeader(line, DataIndent);
            while ((line != null) && (lineHeader == string.Empty))
            {
                source += " " + GetLineData(line, DataIndent);
                lastDotted = source.EndsWith(".", StringComparison.Ordinal);
                line = GoToNextLine(line, stream);
                lineHeader = GetLineHeader(line, DataIndent);
            }

            if (!lastDotted && Trace.Want(Trace.SeqWarnings))
            {
                Trace.Report("GenBank.ParseSource", Properties.Resource.OutOfSpec, source);
            }

            // don't go to next line; current line still needs to be processed
        }
        else if (line[0] == ' ')
        {
            if (lineHeader != "ORGANISM")
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParserInvalidSourceField,
                    lineHeader);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }

            // this also can be multiline
            lineData = GetLineData(line, DataIndent);
            organism = lineData;

            line = GoToNextLine(line, stream);
            lineHeader = GetLineHeader(line, DataIndent);
            while ((line != null) && (lineHeader == string.Empty))
            {
                // Always read the data of the line being examined. Previously the
                // organism branch appended whatever an earlier line had left in
                // lineData, duplicating old text and dropping the current line.
                lineData = GetLineData(line, DataIndent);

                if (line.EndsWith(";", StringComparison.Ordinal) || line.EndsWith(".", StringComparison.Ordinal))
                {
                    // Lines terminated by ';' or '.' are treated as taxonomy class levels.
                    if (!String.IsNullOrEmpty(classLevels))
                    {
                        classLevels += " ";
                    }

                    classLevels += lineData;
                }
                else
                {
                    // Anything else is a wrapped continuation of the organism name.
                    organism += " " + lineData;
                }

                line = GoToNextLine(line, stream);
                lineHeader = GetLineHeader(line, DataIndent);
            }

            // don't go to next line; current line still needs to be processed
        }
        else
        {
            // don't go to next line; current line still needs to be processed
            break;
        }
    }

    GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
    metadata.Source = new SequenceSource { CommonName = source };

    if (!string.IsNullOrEmpty(organism))
    {
        // First word is taken as the genus, the remainder as the species.
        int index = organism.IndexOf(" ", StringComparison.Ordinal);
        if (index > 0)
        {
            metadata.Source.Organism.Genus = organism.Substring(0, index);
            metadata.Source.Organism.Species = organism.Substring(index + 1);
        }
        else
        {
            metadata.Source.Organism.Genus = organism;
        }
    }

    metadata.Source.Organism.ClassLevels = classLevels;

    string trimmedLevels = classLevels.TrimEnd('.');
    if (trimmedLevels.Length > 0)
    {
        string[] levels = trimmedLevels.Split(";".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);

        // The split can be empty (e.g. class levels consisting only of ';'),
        // in which case there is no genus to reconcile.
        if (levels.Length > 0)
        {
            // The last class level is used as the genus. When it disagrees with the
            // first word of the organism name, the whole organism text becomes the species.
            string genus = levels[levels.Length - 1].Trim();
            string parsedGenus = metadata.Source.Organism.Genus ?? string.Empty;
            if (!genus.Equals(parsedGenus.Trim()))
            {
                metadata.Source.Organism.Species = organism;
                metadata.Source.Organism.Genus = genus;
            }
        }
    }

    return line;
}
// Writes the SOURCE header, the ORGANISM sub-header and the class levels line.
// Nothing is written when the metadata carries no source information.
private void WriteSource(GenBankMetadata metadata, TextWriter txtWriter)
{
    if (metadata.Source == null)
    {
        return;
    }

    string sourceName = string.IsNullOrEmpty(metadata.Source.CommonName)
        ? string.Empty
        : metadata.Source.CommonName;
    WriteHeaderSection("SOURCE", sourceName, txtWriter);

    // The genus (followed by a separating space) is only emitted when the
    // common name is not already identical to the species text.
    string organismText = string.Empty;
    bool speciesIsCommonName = sourceName.Equals(metadata.Source.Organism.Species);
    if (!speciesIsCommonName)
    {
        if (!string.IsNullOrEmpty(metadata.Source.Organism.Genus))
        {
            organismText = organismText + metadata.Source.Organism.Genus;
        }

        organismText = organismText + " ";
    }

    if (!string.IsNullOrEmpty(metadata.Source.Organism.Species))
    {
        organismText = organismText + metadata.Source.Organism.Species;
    }

    // The text is trimmed so that a lone space is never handed to the writer
    // when both genus and species are empty (the writer fails on that).
    WriteHeaderSection(" ORGANISM", organismText.Trim(), txtWriter);
    WriteHeaderSection(string.Empty, metadata.Source.Organism.ClassLevels, txtWriter);
}
/// <summary>
/// Reads feature rows from the cell range, starting at <paramref name="rowIndex"/>,
/// until the range ends or a row whose key is the metadata heading is reached.
/// </summary>
/// <param name="metadata">Metadata object that receives the base count and the features.</param>
/// <param name="cellRange">Range of cells</param>
/// <param name="rowIndex">Index of the first row to examine.</param>
/// <returns>Index of the row at which parsing stopped.</returns>
private static int ParseGenBankFeatures(GenBankMetadata metadata, object[,] cellRange, int rowIndex)
{
    int totalRows = cellRange.GetLength(0);

    while (rowIndex < totalRows)
    {
        object keyCell = cellRange[rowIndex, KeyColumnIndex];
        if (keyCell == null)
        {
            // Rows without a key carry nothing to parse.
            rowIndex++;
            continue;
        }

        string featureKey = keyCell.ToString().ToUpperInvariant();
        if (featureKey.Equals(METADATA))
        {
            // The metadata heading ends the features area.
            break;
        }

        if (cellRange.GetLength(1) < 3)
        {
            string formatError = String.Format(
                CultureInfo.InvariantCulture,
                Resources.UnrecognizedGenBankMetadataFormat,
                REFERENCE);
            throw new FormatException(formatError);
        }

        // NOTE(review): the sub-key cell is read but its value is not used afterwards.
        object subKeyCell = cellRange[rowIndex, SubKeyColumnIndex];
        string subKey = subKeyCell == null ? string.Empty : subKeyCell.ToString();

        object valueCell = cellRange[rowIndex, ValueColumnIndex];
        string cellText = valueCell == null ? string.Empty : valueCell.ToString();

        // Every remaining path consumes the current row.
        rowIndex++;

        if (featureKey.Equals(BASECOUNT))
        {
            metadata.BaseCount = cellText;
            continue;
        }

        if (string.IsNullOrWhiteSpace(cellText) || string.IsNullOrWhiteSpace(featureKey))
        {
            continue;
        }

        FeatureItem parsedFeature = StandardFeatureMap.GetStandardFeatureItem(new FeatureItem(featureKey, cellText));
        if (metadata.Features == null)
        {
            metadata.Features = new SequenceFeatures();
        }

        metadata.Features.All.Add(parsedFeature);

        // Qualifier rows follow their feature row; the helper returns the next unread row.
        rowIndex = ParseQualifiers(parsedFeature, cellRange, rowIndex);
    }

    return rowIndex;
}
// Emits one COMMENT header section per entry of the metadata's comment list,
// in list order.
private void WriteComments(GenBankMetadata metadata, TextWriter txtWriter)
{
    const string commentHeader = "COMMENT";

    foreach (string commentText in metadata.Comments)
    {
        WriteHeaderSection(commentHeader, commentText, txtWriter);
    }
}
/// <summary>
/// Validates the first GC_Signal feature of the given metadata against the
/// expected values stored under the given XML node, then checks that qualifiers
/// set on newly created GC_Signal instances read back unchanged.
/// </summary>
/// <param name="nodeName">XML node name holding the expected values.</param>
/// <param name="genMetadata">GenBank Metadata whose GC_Signal features are validated.</param>
private void ValidateGenBankGCSignalFeature(string nodeName, GenBankMetadata genMetadata)
{
    // Get Values from XML node.
    string expectedLocation = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.Location);
    string expectedAllele = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.AlleleNode);
    string featureCount = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.QualifierCount);
    string expectedDbReference = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.DbReferenceNode);
    string geneSymbol = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.GeneSymbol);
    string expectedCitation = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.CitationNode);
    string expectedExperiment = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.ExperimentNode);
    string expectedGeneSynonym = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.GeneSynonymNode);
    string expectedInference = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.InferenceNode);
    string expectedLabel = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.LabelNode);
    string expectedLocusTag = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.LocusTagNode);
    string expectedNote = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.Note);
    string expectedOldLocusTag = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.OldLocusTagNode);
    string expectedMap = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.GenbankMapNode);

    List<GcSingal> gcSignalFeatureList = genMetadata.Features.GCSignals;
    var locBuilder = new LocationBuilder();

    // Create a copy of GC_Signal feature.
    GcSingal cloneGCSignal = gcSignalFeatureList[0].Clone();

    // Validate GC_Signal qualifiers.
    // Assert.AreEqual takes (expected, actual); the expected value goes first so
    // that a failure message reports the two values the right way round.
    Assert.AreEqual(featureCount, gcSignalFeatureList.Count.ToString((IFormatProvider)null));
    Assert.AreEqual(geneSymbol, cloneGCSignal.GeneSymbol);
    Assert.AreEqual(expectedDbReference, cloneGCSignal.DatabaseCrossReference[0]);
    Assert.AreEqual(expectedAllele, gcSignalFeatureList[0].Allele);
    Assert.AreEqual(expectedCitation, gcSignalFeatureList[0].Citation[0]);
    Assert.AreEqual(expectedExperiment, gcSignalFeatureList[0].Experiment[0]);
    Assert.AreEqual(expectedMap, gcSignalFeatureList[0].GenomicMapPosition);
    Assert.AreEqual(expectedGeneSynonym, gcSignalFeatureList[0].GeneSynonym[0]);
    Assert.AreEqual(expectedInference, gcSignalFeatureList[0].Inference[0]);
    Assert.AreEqual(expectedLabel, gcSignalFeatureList[0].Label);
    Assert.AreEqual(
        expectedLocation,
        locBuilder.GetLocationString(genMetadata.Features.GCSignals[0].Location));
    Assert.AreEqual(expectedNote, gcSignalFeatureList[0].Note[0]);
    Assert.AreEqual(expectedOldLocusTag, gcSignalFeatureList[0].OldLocusTag[0]);
    Assert.AreEqual(expectedLocusTag, gcSignalFeatureList[0].LocusTag[0]);

    // Create a new GCSignal and validate the same.
    var gcSignal = new GcSingal(expectedLocation);
    var gcSignalWithILoc = new GcSingal(genMetadata.Features.GCSignals[0].Location);

    // Set qualifiers and validate them.
    gcSignal.Allele = expectedAllele;
    gcSignal.GeneSymbol = geneSymbol;
    gcSignalWithILoc.GenomicMapPosition = expectedMap;

    Assert.AreEqual(geneSymbol, gcSignal.GeneSymbol);
    Assert.AreEqual(expectedAllele, gcSignal.Allele);
    Assert.AreEqual(expectedMap, gcSignalWithILoc.GenomicMapPosition);
}
/// <summary>
/// Reads the ORIGIN section: stores any data found on the ORIGIN line itself,
/// collects the sequence text from the following lines that start with a space,
/// and builds the parsed sequence from it.
/// </summary>
/// <param name="line">The ORIGIN line on entry; on exit, the first line after the sequence rows.</param>
/// <param name="metadata">The GenBank metadata that receives the optional origin text.</param>
/// <param name="stream">The stream reader.</param>
/// <exception cref="InvalidDataException">No alphabet is configured and none can be auto-detected.</exception>
private void ParseOrigin(ref string line, GenBankMetadata metadata, StreamReader stream)
{
    // The ORIGIN line may carry optional text; an empty string is never stored.
    string originText = GetLineData(line, DataIndent);
    if (!String.IsNullOrEmpty(originText))
    {
        metadata.Origin = originText;
    }

    line = GoToNextLine(line, stream);

    var residues = new StringBuilder();
    while ((line != null) && line[0] == ' ')
    {
        // Hand-rolled scanning is used because a regex is too slow here.
        int lineLength = line.Length;

        // Skip the leading run of spaces and numeric characters.
        int position = 0;
        while (position < lineLength && (line[position] == ' ' || Char.IsNumber(line[position])))
        {
            position++;
        }

        // Copy up to 10 characters at a time, stepping 11 so that the single
        // character following each chunk is skipped.
        for (; position < lineLength; position += 11)
        {
            residues.Append(line, position, Math.Min(10, lineLength - position));
        }

        line = GoToNextLine(line, stream);
    }

    string sequenceString = residues.ToString().Trim();
    if (string.IsNullOrEmpty(sequenceString))
    {
        return;
    }

    // A configured alphabet wins; otherwise one is detected from the data.
    IAlphabet alphabet = Alphabet;
    if (alphabet == null)
    {
        byte[] tempData = UTF8Encoding.UTF8.GetBytes(sequenceString.ToUpperInvariant());
        alphabet = Alphabets.AutoDetectAlphabet(tempData, 0, tempData.Length, alphabet);
        if (alphabet == null)
        {
            var message = String.Format(CultureInfo.InvariantCulture, Properties.Resource.InvalidSymbolInString, line);
            Trace.Report(message);
            throw new InvalidDataException(message);
        }
    }

    sequenceWithData = new Sequence(alphabet, sequenceString);
}
/// <summary>
/// Construct initialization: loads the thermodynamic parameters, creates the
/// forward and reverse overlaps for every fragment, assembles the construct
/// sequence with its GenBank metadata, and computes the duplex melting
/// temperatures of the forward overlaps against their paired overlaps.
/// </summary>
/// <param name="fragList">
/// Fragment list. The list is rotated in place: a copy of the first fragment
/// is appended and the original first entry is removed.
/// </param>
/// <param name="settings">Designer settings (maximum lengths, thermodynamic settings and parameter path).</param>
/// <exception cref="TmThalParamException">Loading the thermodynamic parameters reported a message.</exception>
private void Init(ObservableCollection<Fragment> fragList, DesignerSettings settings)
{
    this.Overlaps = new List<Overlap>();
    this.Settings = settings;

    // Load the thermodynamic parameters; a non-empty message signals failure.
    Thermodynamics.thal_results thalResults = new Thermodynamics.thal_results();
    Thermodynamics.p3_get_thermodynamic_values(Settings.TmThalParamPath, ref thalResults);
    String loadMessage = new String(thalResults.msg).Trim('\0');
    if (!String.IsNullOrEmpty(loadMessage))
    {
        throw new TmThalParamException(loadMessage);
    }

    // Forward pass: walk the fragments in order, growing the assembled sequence.
    String assembled = "";
    String constructName = "";
    List<MiscFeature> geneFeatures = new List<MiscFeature>();
    int pairIndex;

    for (int i = 0; i < fragList.Count; i++)
    {
        Fragment fragment = fragList[i];
        constructName += fragment.Name;

        String fragmentSeq = fragment.GetString();
        int overhangLength = Math.Min(settings.MaxLen_5, assembled.Length);
        int specificLength = Math.Min(settings.MaxLen_3, fragmentSeq.Length);

        // Overhang comes from the end of what has been assembled so far,
        // the gene-specific part from the start of the current fragment.
        String overhang = assembled.Substring(assembled.Length - overhangLength, overhangLength);
        String geneSpecific = fragmentSeq.Substring(0, specificLength);

        // 1-based location of this fragment within the assembled construct.
        String location = (assembled.Length + 1).ToString() + ".." + (assembled.Length + fragmentSeq.Length).ToString();
        MiscFeature gene = new MiscFeature(location);
        gene.StandardName = fragment.Name;
        geneFeatures.Add(gene);

        assembled += fragmentSeq;

        if (i != 0)
        {
            pairIndex = 2 * fragList.Count - i;
            Overlaps.Add(new Overlap(
                fragment.Name + "-fwd",
                new Sequence(Alphabets.AmbiguousDNA, overhang),
                new Sequence(Alphabets.AmbiguousDNA, geneSpecific),
                settings.TmThalSettings,
                pairIndex));
        }
        else
        {
            // The first fragment gets the vector label and no overhang sequence.
            pairIndex = fragList.Count;
            Overlaps.Add(new Overlap(
                Designer.VectorLabel + fragment.Name + "-fwd",
                new Sequence(Alphabets.AmbiguousDNA, geneSpecific),
                settings.TmThalSettings,
                pairIndex));
        }
    }

    this.Sequence = new Sequence(Alphabets.AmbiguousDNA, assembled);

    // GenBank metadata describing the assembled construct.
    GenBankMetadata meta = new GenBankMetadata();
    meta.Locus = new GenBankLocusInfo();
    meta.Locus.MoleculeType = MoleculeType.DNA;
    meta.Locus.Name = constructName;
    meta.Locus.Date = System.DateTime.Now;
    meta.Locus.SequenceLength = assembled.Length;
    meta.Comments.Add("designed with mufasa");
    meta.Definition = "synthetic construct";
    meta.Features = new SequenceFeatures();
    meta.Features.All.AddRange(geneFeatures);
    this.Sequence.Metadata.Add("GenBank", meta);

    // Reverse pass: rotate the list so a copy of the first fragment sits at the
    // end, then walk it backwards using reverse complements.
    fragList.Add(new Fragment(fragList[0]));
    fragList.RemoveAt(0);

    String reverseAssembled = "";
    for (int i = fragList.Count - 1; i >= 0; i--)
    {
        Fragment fragment = fragList[i];
        String reverseSeq = fragment.GetReverseComplementString();
        int overhangLength = Math.Min(settings.MaxLen_5, reverseAssembled.Length);
        int specificLength = Math.Min(settings.MaxLen_3, reverseSeq.Length);

        String overhang = reverseAssembled.Substring(reverseAssembled.Length - overhangLength, overhangLength);
        String geneSpecific = reverseSeq.Substring(0, specificLength);

        reverseAssembled += reverseSeq;

        if (i != fragList.Count - 1)
        {
            pairIndex = i + 1;
            Overlaps.Add(new Overlap(
                fragment.Name + "-rev",
                new Sequence(Alphabets.AmbiguousDNA, overhang),
                new Sequence(Alphabets.AmbiguousDNA, geneSpecific),
                settings.TmThalSettings,
                pairIndex));
        }
        else
        {
            // The first reverse overlap gets the vector label and pairs with overlap 0.
            pairIndex = 0;
            Overlaps.Add(new Overlap(
                Designer.VectorLabel + fragment.Name + "-rev",
                new Sequence(Alphabets.AmbiguousDNA, geneSpecific),
                settings.TmThalSettings,
                pairIndex));
        }
    }

    // Duplex melting temperatures for the first fragList.Count overlaps,
    // each against the overlap its PairIndex points at.
    for (int i = 0; i < fragList.Count; i++)
    {
        Overlap current = Overlaps[i];
        current.HeterodimerMeltingTemperature = current.GetDuplexTemperature(Overlaps[current.PairIndex]);
    }
}
/// <summary>
/// Parses consecutive REFERENCE sections (each with its AUTHORS, CONSRTM, TITLE,
/// JOURNAL, REMARK, MEDLINE and PUBMED sub-fields) and appends them to the
/// reference list of the sequence's GenBank metadata.
/// </summary>
/// <param name="line">The current line; parsing continues while it belongs to a reference.</param>
/// <param name="sequence">The sequence whose GenBank metadata receives the references.</param>
/// <param name="stream">The stream reader.</param>
/// <returns>The first line that is not part of a reference (still to be processed by the caller), or null at end of input.</returns>
/// <exception cref="InvalidDataException">
/// A REFERENCE line does not start with a number, an unknown sub-field is found,
/// or a sub-field appears before any REFERENCE line.
/// </exception>
/// <exception cref="InvalidOperationException">The reference number cannot be converted to an integer.</exception>
private string ParseReferences(string line, ref Sequence sequence, StreamReader stream)
{
    GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
    IList<CitationReference> referenceList = metadata.References;
    CitationReference reference = null;

    while (line != null)
    {
        string lineHeader = GetLineHeader(line, DataIndent);
        if (lineHeader == "REFERENCE")
        {
            // add previous reference
            if (reference != null)
            {
                referenceList.Add(reference);
                reference = null;
            }

            // check for start/end e.g. (bases 1 to 118), or prose notes
            string lineData = GetLineData(line, DataIndent);
            Match m = Regex.Match(lineData, @"^(?<number>\d+)(\s+\((?<location>.*)\))?");
            if (!m.Success)
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParserReferenceError,
                    lineData);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }

            // create new reference
            string number = m.Groups["number"].Value;
            string location = m.Groups["location"].Value;
            int outValue;
            if (!int.TryParse(number, out outValue))
            {
                throw new InvalidOperationException(
                    string.Format(CultureInfo.CurrentCulture, Properties.Resource.InvalidRefNumber, number));
            }

            reference = new CitationReference();
            reference.Number = outValue;
            reference.Location = location;
            line = GoToNextLine(line, stream);
        }
        else if (line.StartsWith(" ", StringComparison.Ordinal))
        {
            if (reference == null)
            {
                // A sub-field without a preceding REFERENCE line has nothing to
                // attach to; report it instead of dereferencing null.
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParserInvalidReferenceField,
                    lineHeader);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }

            switch (lineHeader)
            {
                // all the following are extracted the same way - possibly multiline
                case "AUTHORS":
                    reference.Authors = ParseMultiLineData(ref line, " ", DataIndent, stream);
                    break;
                case "CONSRTM":
                    reference.Consortiums = ParseMultiLineData(ref line, " ", DataIndent, stream);
                    break;
                case "TITLE":
                    reference.Title = ParseMultiLineData(ref line, " ", DataIndent, stream);
                    break;
                case "JOURNAL":
                    reference.Journal = ParseMultiLineData(ref line, " ", DataIndent, stream);
                    break;
                case "REMARK":
                    reference.Remarks = ParseMultiLineData(ref line, " ", DataIndent, stream);
                    break;
                case "MEDLINE":
                    reference.Medline = ParseMultiLineData(ref line, " ", DataIndent, stream);
                    break;
                case "PUBMED":
                    reference.PubMed = ParseMultiLineData(ref line, " ", DataIndent, stream);
                    break;
                default:
                    string fieldMessage = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ParserInvalidReferenceField,
                        lineHeader);
                    Trace.Report(fieldMessage);
                    throw new InvalidDataException(fieldMessage);
            }
        }
        else
        {
            // don't go to next line; current line still needs to be processed
            break;
        }
    }

    // Add the last reference. Doing this after the loop also covers the case
    // where the input ends inside a reference (line == null), which previously
    // dropped the reference being built.
    if (reference != null)
    {
        referenceList.Add(reference);
    }

    return line;
}
/// <summary>
/// Transfers all available features from the MBF sequence metadata into a
/// BioPatML feature list. In this version only the feature name, its start and
/// end location, and the first value of each qualifier are populated.
/// </summary>
/// <param name="metadata">GenBank metadata whose features are converted.</param>
/// <returns>
/// The converted features; an empty list when the metadata has no features section.
/// </returns>
private FeatureList ExtractFeatures(GenBankMetadata metadata)
{
    FeatureList bioFeatureList = new FeatureList();

    // Features is optional on GenBank metadata; there is nothing to convert without it.
    if (metadata.Features == null)
    {
        return bioFeatureList;
    }

    foreach (FeatureItem item in metadata.Features.All)
    {
        // Construct the feature outline first. Strand is always assumed to be forward (+1).
        QUT.Bio.BioPatML.Sequences.Feature bioFeature =
            new QUT.Bio.BioPatML.Sequences.Feature(item.Key, item.Location.Start, item.Location.End, 1);
        bioFeatureList.Add(bioFeature);

        // Add the qualifier keys and values to the feature through an AnnotationList.
        AnnotationList annList = new AnnotationList();
        foreach (KeyValuePair<string, List<string>> qualifier in item.Qualifiers)
        {
            // A qualifier without any value has nothing to annotate; indexing
            // its first value would throw.
            if (qualifier.Value == null || qualifier.Value.Count == 0)
            {
                continue;
            }

            annList.Add(qualifier.Key, qualifier.Value[0]);
        }

        bioFeature.AddAnnotations(annList);
    }

    return bioFeatureList;
}
// Parses everything before the features section: consumes header lines from the
// reader and fills the sequence's GenBank metadata until a FEATURES, BASE COUNT,
// ORIGIN or CONTIG header (or the end of the input) is reached.
// Throws InvalidDataException for a second LOCUS line, a PRIMARY line without
// the expected column headers, an unknown header, or when no LOCUS line was found.
private void ParseHeaders(BioTextReader bioReader, ref Sequence sequence)
{
    GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
    string data = string.Empty;
    string[] tokens = null;

    // set data indent for headers
    bioReader.DataIndent = _dataIndent;

    // only allow one locus line
    bool haveParsedLocus = false;

    // parse until we hit the features or sequence section
    bool haveFinishedHeaders = false;
    while (bioReader.HasLines && !haveFinishedHeaders)
    {
        switch (bioReader.LineHeader)
        {
            case "LOCUS":
                if (haveParsedLocus)
                {
                    string message = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ParserSecondLocus,
                        bioReader.LocationString);
                    Trace.Report(message);
                    throw new InvalidDataException(message);
                }

                ParseLocus(bioReader, ref sequence);

                // ParseLocus may replace the sequence instance, so re-read the metadata.
                metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                haveParsedLocus = true;

                // don't go to next line; current line still needs to be processed
                break;

            case "VERSION":
                tokens = bioReader.LineData.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
                metadata.Version = new GenBankVersion();

                // first token contains accession and version
                if (tokens.Length > 0)
                {
                    Match m = Regex.Match(tokens[0], @"^(?<accession>\w+)\.(?<version>\d+)$");
                    if (m.Success)
                    {
                        metadata.Version.Version = m.Groups["version"].Value;

                        // The first token in the data from the accession line is referred to as
                        // the primary accession number, and should be the one used here in the
                        // version line.
                        string versionLineAccession = m.Groups["accession"].Value;
                        if (metadata.Accession == null)
                        {
                            ApplicationLog.WriteLine("WARN: VERSION processed before ACCESSION");
                        }
                        else if (!versionLineAccession.Equals(metadata.Accession.Primary))
                        {
                            ApplicationLog.WriteLine("WARN: VERSION tag doesn't match ACCESSION");
                        }
                        else
                        {
                            metadata.Version.Accession = metadata.Accession.Primary;
                        }
                    }
                }

                // second token contains primary ID; it is optional, so only look at
                // it when present (indexing it unconditionally throws on a VERSION
                // line that has no GI token).
                if (tokens.Length > 1)
                {
                    Match giMatch = Regex.Match(tokens[1], @"^GI:(?<primaryID>.*)");
                    if (giMatch.Success)
                    {
                        metadata.Version.GINumber = giMatch.Groups["primaryID"].Value;
                    }
                }

                bioReader.GoToNextLine();
                break;

            case "PROJECT":
                tokens = bioReader.LineData.Split(':');
                if (tokens.Length == 2)
                {
                    metadata.Project = new ProjectIdentifier();
                    metadata.Project.Name = tokens[0];
                    tokens = tokens[1].Split(',');
                    for (int i = 0; i < tokens.Length; i++)
                    {
                        metadata.Project.Numbers.Add(tokens[i]);
                    }
                }
                else
                {
                    ApplicationLog.WriteLine("WARN: unexpected PROJECT header: " + bioReader.Line);
                }

                bioReader.GoToNextLine();
                break;

            case "SOURCE":
                ParseSource(bioReader, ref sequence);
                metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

                // don't go to next line; current line still needs to be processed
                break;

            case "REFERENCE":
                ParseReferences(bioReader, ref sequence);   // can encounter more than one
                metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

                // don't go to next line; current line still needs to be processed
                break;

            case "COMMENT":
                ParseComments(bioReader, ref sequence);   // can encounter more than one
                metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

                // don't go to next line; current line still needs to be processed
                break;

            case "PRIMARY":
                // This header is followed by sequence info in a table format that could be
                // stored in a custom object. The first line contains column headers.
                // For now, just validate the presence of the headers, and save the data
                // as a string.
                int[] locs = new int[4];
                locs[0] = bioReader.LineData.IndexOf("TPA_SPAN", StringComparison.Ordinal);
                locs[1] = bioReader.LineData.IndexOf("PRIMARY_IDENTIFIER", StringComparison.Ordinal);
                locs[2] = bioReader.LineData.IndexOf("PRIMARY_SPAN", StringComparison.Ordinal);
                locs[3] = bioReader.LineData.IndexOf("COMP", StringComparison.Ordinal);
                if (locs[0] < 0 || locs[1] < 0 || locs[2] < 0 || locs[3] < 0)
                {
                    string message = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ParserPrimaryLineError,
                        bioReader.Line);
                    Trace.Report(message);
                    throw new InvalidDataException(message);
                }

                string primaryData = ParseMultiLineData(bioReader, Environment.NewLine);
                metadata.Primary = primaryData;

                // don't go to next line; current line still needs to be processed
                break;

            // all the following are extracted the same way - possibly multiline
            case "DEFINITION":
                metadata.Definition = ParseMultiLineData(bioReader, " ");
                break;

            case "ACCESSION":
                data = ParseMultiLineData(bioReader, " ");
                metadata.Accession = new GenBankAccession();
                string[] accessions = data.Split(' ');
                metadata.Accession.Primary = accessions[0];

                for (int i = 1; i < accessions.Length; i++)
                {
                    metadata.Accession.Secondary.Add(accessions[i]);
                }

                break;

            case "DBLINK":
                tokens = bioReader.LineData.Split(':');
                if (tokens.Length == 2)
                {
                    metadata.DBLink = new CrossReferenceLink();

                    // Anything that is not a project link is recorded as a trace assembly archive link.
                    if (string.Compare(tokens[0], CrossReferenceType.Project.ToString(), StringComparison.OrdinalIgnoreCase) == 0)
                    {
                        metadata.DBLink.Type = CrossReferenceType.Project;
                    }
                    else
                    {
                        metadata.DBLink.Type = CrossReferenceType.TraceAssemblyArchive;
                    }

                    tokens = tokens[1].Split(',');
                    for (int i = 0; i < tokens.Length; i++)
                    {
                        metadata.DBLink.Numbers.Add(tokens[i]);
                    }
                }
                else
                {
                    ApplicationLog.WriteLine("WARN: unexpected DBLINK header: " + bioReader.Line);
                }

                bioReader.GoToNextLine();
                break;

            case "DBSOURCE":
                metadata.DBSource = ParseMultiLineData(bioReader, " ");
                break;

            case "KEYWORDS":
                metadata.Keywords = ParseMultiLineData(bioReader, " ");
                break;

            case "SEGMENT":
                data = ParseMultiLineData(bioReader, " ");

                // NOTE: the delimiter is used as a character set, so the data is
                // split on 'o' and 'f' individually, not on the word "of".
                string delimeter = "of";
                tokens = data.Split(delimeter.ToCharArray(), StringSplitOptions.RemoveEmptyEntries);
                int outvalue;
                if (tokens.Length == 2)
                {
                    metadata.Segment = new SequenceSegment();
                    if (int.TryParse(tokens[0].Trim(), out outvalue))
                    {
                        metadata.Segment.Current = outvalue;
                    }
                    else
                    {
                        ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + bioReader.Line);
                    }

                    if (int.TryParse(tokens[1].Trim(), out outvalue))
                    {
                        metadata.Segment.Count = outvalue;
                    }
                    else
                    {
                        ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + bioReader.Line);
                    }
                }
                else
                {
                    ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + bioReader.Line);
                }

                break;

            // all the following indicate sections beyond the headers parsed by this method
            case "FEATURES":
            case "BASE COUNT":
            case "ORIGIN":
            case "CONTIG":
                haveFinishedHeaders = true;
                break;

            default:
                ApplicationLog.WriteLine(
                    ToString() + "WARN: unknown {0} -> {1}",
                    bioReader.LineHeader,
                    bioReader.LineData);
                string errMessage = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParseHeaderError,
                    bioReader.LineHeader);
                Trace.Report(errMessage);
                throw new InvalidDataException(errMessage);
        }
    }

    // check for required features
    if (!haveParsedLocus)
    {
        string message = string.Format(CultureInfo.CurrentCulture, Resource.INVAILD_INPUT_FILE, this.Name);
        Trace.Report(message);
        throw new InvalidDataException(message);
    }
}
// LOCUS is the first line in a GenBank record. Parses the locus name, sequence
// length, sequence type, strand type, molecule type, strand topology, division
// and date, stores them as the metadata's locus info, and advances the reader.
// The sequence is replaced by a new instance when the molecule type implies a
// different alphabet than the one the sequence currently has.
// Throws InvalidDataException for a malformed locus line, an invalid strand
// value or an alphabet conflict; FormatException for an invalid date or
// molecule type; InvalidOperationException when the length is not an integer.
private void ParseLocus(BioTextReader bioReader, ref Sequence sequence)
{
    GenBankLocusInfo locusInfo = new GenBankLocusInfo();

    // GenBank spec recommends token rather than position-based parsing, but this
    // is only partially possible without making extra assumptions about the presence
    // of optional fields.
    string[] tokens = bioReader.LineData.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

    // Name, length and sequence type are read by token; report a short line as an
    // invalid locus instead of failing with an index error.
    if (tokens.Length < 3)
    {
        string message = String.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.ParserInvalidLocus,
            bioReader.Line);
        Trace.Report(message);
        throw new InvalidDataException(message);
    }

    sequence.ID = tokens[0];
    locusInfo.Name = tokens[0];

    int sequenceLength;
    if (!int.TryParse(tokens[1], out sequenceLength))
    {
        string message = String.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.ParserInvalidLocus,
            bioReader.Line);
        Trace.Report(message);
        throw new InvalidOperationException(message);
    }

    locusInfo.SequenceLength = sequenceLength;

    string seqType = tokens[2];
    if (seqType != "bp" && seqType != "aa")
    {
        string message = String.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.ParserInvalidLocus,
            bioReader.Line);
        Trace.Report(message);
        throw new InvalidDataException(message);
    }

    // Determine format version and parse the remaining fields by position.
    string strandType;
    string strandTopology;
    string division;
    string rawDate;
    string molType = string.Empty;
    if (Helper.StringHasMatch(bioReader.GetLineField(31, 32), "bp", "aa"))
    {
        // older format
        strandType = bioReader.GetLineField(34, 36).Trim();
        strandTopology = bioReader.GetLineField(43, 52).Trim();
        division = bioReader.GetLineField(53, 56).Trim();
        rawDate = bioReader.GetLineField(63).Trim();

        // molecule type field is not used for amino acid chains
        if (seqType != "aa")
        {
            molType = bioReader.GetLineField(37, 42).Trim();
        }
    }
    else
    {
        // newer format
        strandType = bioReader.GetLineField(45, 47).Trim();
        strandTopology = bioReader.GetLineField(56, 63).Trim();
        division = bioReader.GetLineField(65, 67).Trim();
        rawDate = bioReader.GetLineField(69).Trim();

        // molecule type field is not used for amino acid chains
        if (seqType != "aa")
        {
            molType = bioReader.GetLineField(48, 53).Trim();
        }
    }

    // process strand type
    if (!Helper.StringHasMatch(strandType, string.Empty, "ss-", "ds-", "ms-"))
    {
        string message = String.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.ParserInvalidLocus,
            bioReader.Line);
        Trace.Report(message);
        throw new InvalidDataException(message);
    }

    locusInfo.Strand = Helper.GetStrandType(strandType);

    // process strand topology
    if (!Helper.StringHasMatch(strandTopology, string.Empty, "linear", "circular"))
    {
        string message = String.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.ParserInvalidStrand,
            strandTopology);
        Trace.Report(message);
        throw new InvalidDataException(message);
    }

    locusInfo.StrandTopology = Helper.GetStrandTopology(strandTopology);

    // process division; an unrecognized code falls back to None
    try
    {
        locusInfo.DivisionCode = (SequenceDivisionCode)Enum.Parse(typeof(SequenceDivisionCode), division);
    }
    catch (ArgumentException)
    {
        locusInfo.DivisionCode = SequenceDivisionCode.None;
    }

    // process date. The file format is machine-readable, so the date is parsed
    // with the invariant culture; the current culture could reject the month
    // abbreviation on systems with a non-English locale.
    DateTime date;
    if (!DateTime.TryParse(rawDate, CultureInfo.InvariantCulture, DateTimeStyles.None, out date))
    {
        string message = String.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.ParserInvalidDate,
            rawDate);
        Trace.Report(message);
        throw new FormatException(message);
    }

    locusInfo.Date = date;
    locusInfo.SequenceType = seqType;

    // process sequence type and molecule type
    MoleculeType moleculeType;
    if (seqType == "aa")
    {
        moleculeType = MoleculeType.Protein;
    }
    else
    {
        moleculeType = GetMoleculeType(molType);
        if (moleculeType == MoleculeType.Invalid)
        {
            string message = String.Format(
                CultureInfo.CurrentCulture,
                Properties.Resource.ParserInvalidLocus,
                bioReader.Line);
            Trace.Report(message);
            throw new FormatException(message);
        }
    }

    IAlphabet alphabet = GetAlphabet(moleculeType);
    if (alphabet != sequence.Alphabet)
    {
        // An explicitly configured alphabet must agree with the one implied by the locus.
        if (Alphabet != null && Alphabet != alphabet)
        {
            string message = Properties.Resource.ParserIncorrectAlphabet;
            Trace.Report(message);
            throw new InvalidDataException(message);
        }

        sequence = new Sequence(alphabet, Encoding, sequence);
        sequence.IsReadOnly = false;
    }

    sequence.MoleculeType = moleculeType;
    locusInfo.MoleculeType = moleculeType;

    GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
    metadata.Locus = locusInfo;
    bioReader.GoToNextLine();
}
/// <summary>
/// Copies the cells of all given ranges into one table and scans its first
/// column for section headings: rows following a metadata heading are parsed
/// as metadata, rows following a features heading are parsed as features.
/// Rows with any other (or no) value in the first column are skipped.
/// </summary>
/// <param name="ranges">Ranges whose rows are stacked, in order, into a single table.</param>
/// <returns>The GenBank metadata populated from the ranges.</returns>
public static GenBankMetadata RangeToGenBankMetadata(IList<Range> ranges)
{
    GenBankMetadata metadata = new GenBankMetadata();

    // Size the table: rows add up, the widest range determines the column count.
    int totalRows = 0;
    int maxColumns = 0;
    foreach (Range range in ranges)
    {
        totalRows += range.Rows.Count;
        if (maxColumns < range.Columns.Count)
        {
            maxColumns = range.Columns.Count;
        }
    }

    // One extra row and column so that the 1-based cell coordinates can be used directly.
    object[,] cells = new object[totalRows + 1, maxColumns + 1];

    int targetRow = 1;
    foreach (Range range in ranges)
    {
        for (int sourceRow = 1; sourceRow <= range.Rows.Count; sourceRow++)
        {
            for (int column = 1; column <= range.Columns.Count; column++)
            {
                cells[targetRow, column] = range[sourceRow, column].Value2 as object;
            }

            targetRow++;
        }
    }

    int rowIndex = 1;
    int rowLimit = cells.GetLength(0);
    while (rowIndex < rowLimit)
    {
        object headingCell = cells[rowIndex, 1];
        string heading = headingCell == null ? null : headingCell.ToString().ToUpperInvariant();

        // The heading row itself is always consumed; the section parsers
        // return the index of the next row to look at.
        rowIndex++;

        if (heading == METADATA)
        {
            rowIndex = ParseGenBankMetadata(metadata, cells, rowIndex);
        }
        else if (heading == FEATURES)
        {
            rowIndex = ParseGenBankFeatures(metadata, cells, rowIndex);
        }
    }

    return metadata;
}
// Reads the SOURCE header and its indented ORGANISM sub-section from the reader and
// stores the result as a new SequenceSource on the sequence's GenBank metadata.
// The reader is left positioned on the first line that belongs to neither section.
private static void ParseSource(BioTextReader bioReader, ref Sequence sequence)
{
    string commonName = string.Empty;
    string organismName = string.Empty;
    string taxonomy = string.Empty;

    while (bioReader.HasLines)
    {
        if (bioReader.LineHeader == "SOURCE")
        {
            commonName = bioReader.LineData;
            bioReader.GoToNextLine();

            // Continuation lines (lines without a header) are appended to the source text.
            bool continued = false;
            while (bioReader.HasLines && !bioReader.LineHasHeader)
            {
                commonName += " " + bioReader.LineData;
                continued = true;
                bioReader.GoToNextLine();
            }

            // The trailing-period check only applies when the text spanned several lines.
            bool endsWithPeriod = !continued || commonName.EndsWith(".", StringComparison.Ordinal);
            if (!endsWithPeriod && Trace.Want(Trace.SeqWarnings))
            {
                Trace.Report("GenBank.ParseSource", Properties.Resource.OutOfSpec, commonName);
            }

            // The reader already sits on the next unprocessed line.
            continue;
        }

        if (bioReader.Line[0] != ' ')
        {
            // Not an indented sub-header: the SOURCE section is over.
            break;
        }

        if (bioReader.LineHeader != "ORGANISM")
        {
            string message = String.Format(
                CultureInfo.CurrentCulture,
                Properties.Resource.ParserInvalidSourceField,
                bioReader.LineHeader);
            Trace.Report(message);
            throw new InvalidDataException(message);
        }

        organismName = bioReader.LineData;
        bioReader.GoToNextLine();

        // Continuation lines ending in ';' or '.' are collected as class levels;
        // any other continuation line extends the organism name.
        for (; bioReader.HasLines && !bioReader.LineHasHeader; bioReader.GoToNextLine())
        {
            string rawLine = bioReader.Line;
            bool isClassLevelLine =
                rawLine.EndsWith(";", StringComparison.Ordinal) ||
                rawLine.EndsWith(".", StringComparison.Ordinal);

            if (!isClassLevelLine)
            {
                organismName += " " + bioReader.LineData;
                continue;
            }

            if (!String.IsNullOrEmpty(taxonomy))
            {
                taxonomy += " ";
            }

            taxonomy += bioReader.LineData;
        }
    }

    GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
    metadata.Source = new SequenceSource();
    metadata.Source.CommonName = commonName;

    if (!string.IsNullOrEmpty(organismName))
    {
        // Text before the first space is the genus; everything after it is the species.
        int firstSpace = organismName.IndexOf(" ", StringComparison.Ordinal);
        if (firstSpace > 0)
        {
            metadata.Source.Organism.Genus = organismName.Substring(0, firstSpace);
            metadata.Source.Organism.Species = organismName.Substring(firstSpace + 1);
        }
        else
        {
            metadata.Source.Organism.Genus = organismName;
        }
    }

    metadata.Source.Organism.ClassLevels = taxonomy;
}
/// <summary>
/// Parses the SOURCE entry of a metadata cell grid. The row at <paramref name="rowIndex"/>
/// supplies the common name; the rows that follow it, up to the next row with a
/// non-blank key cell, are read as sub-key/value pairs (organism and class levels).
/// </summary>
/// <param name="metadata">Metadata object that receives the parsed source.</param>
/// <param name="cellRange">Cell grid being parsed.</param>
/// <param name="rowIndex">Index of the SOURCE row.</param>
/// <returns>Index of the first row that was not consumed.</returns>
/// <exception cref="FormatException">
/// The grid has fewer than three columns, or a sub-row has a blank sub-key cell.
/// </exception>
private static int ParseSource(GenBankMetadata metadata, object[,] cellRange, int rowIndex)
{
    object headerValueCell = cellRange[rowIndex, ValueColumnIndex];
    string value = headerValueCell == null ? string.Empty : headerValueCell.ToString();

    // NOTE(review): the source object is only created once a sub-row is seen, so a SOURCE
    // row without any sub-rows leaves metadata.Source untouched - confirm this is intended.
    for (rowIndex++; rowIndex < cellRange.GetLength(0); rowIndex++)
    {
        if (cellRange.GetLength(1) < 3)
        {
            throw new FormatException(String.Format(
                CultureInfo.InvariantCulture,
                Resources.UnrecognizedGenBankMetadataFormat,
                SOURCE));
        }

        // A non-blank key cell marks the start of the next top-level entry.
        object keyCell = cellRange[rowIndex, KeyColumnIndex];
        if (keyCell != null && !string.IsNullOrWhiteSpace(keyCell.ToString()))
        {
            break;
        }

        object subKeyCell = cellRange[rowIndex, SubKeyColumnIndex];
        if (subKeyCell == null || string.IsNullOrWhiteSpace(subKeyCell.ToString()))
        {
            throw new FormatException(String.Format(
                CultureInfo.InvariantCulture,
                Resources.UnrecognizedGenBankMetadataFormat,
                SOURCE));
        }

        if (metadata.Source == null)
        {
            // At this point 'value' still holds the text of the most recently read value cell
            // (the SOURCE row's own value when this is the first sub-row).
            metadata.Source = new SequenceSource();
            metadata.Source.CommonName = value;
        }

        string subKey = subKeyCell.ToString().ToUpperInvariant();

        object valueCell = cellRange[rowIndex, ValueColumnIndex];
        value = valueCell == null ? string.Empty : valueCell.ToString();

        if (metadata.Source.Organism == null)
        {
            metadata.Source.Organism = new OrganismInfo();
        }

        if (subKey == SOURCE_ORGANISM)
        {
            if (!string.IsNullOrWhiteSpace(value))
            {
                // First word is the genus; the remaining words, re-joined with single
                // spaces, form the species.
                string[] words = value.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
                metadata.Source.Organism.Genus = words[0];
                if (words.Length > 1)
                {
                    metadata.Source.Organism.Species = string.Join(" ", words, 1, words.Length - 1);
                }
            }
        }
        else if (subKey == SOURCE_CLASSLEVELS)
        {
            metadata.Source.Organism.ClassLevels = value;
        }
    }

    return rowIndex;
}
// Reads the end of a record: the optional BASE COUNT line, the ORIGIN line followed by
// the sequence data lines, and the optional CONTIG entry. Stops after consuming the "//"
// terminator line or when the reader runs out of lines. Any other header is reported
// and raised as an InvalidDataException.
private void ParseSequence(BioTextReader bioReader, ref Sequence sequence)
{
    GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

    // set data indent for sequence headers
    bioReader.DataIndent = _dataIndent;

    while (bioReader.HasLines)
    {
        if (bioReader.Line.StartsWith("//", StringComparison.Ordinal))
        {
            // end of sequence record
            bioReader.GoToNextLine();
            return;
        }

        string header = bioReader.LineHeader;

        if (header == "BASE COUNT")
        {
            // The BASE COUNT linetype is obsolete and was removed from the GenBank
            // flatfile format in October 2003, but it is kept when present. The
            // untrimmed text is taken because it starts with a right justified column.
            metadata.BaseCount = bioReader.Line.Substring(_dataIndent);
            bioReader.GoToNextLine();
        }
        else if (header == "ORIGIN")
        {
            // The origin line can carry optional data; an empty string is not stored.
            string originData = bioReader.LineData;
            if (!String.IsNullOrEmpty(originData))
            {
                metadata.Origin = originData;
            }

            bioReader.GoToNextLine();

            IAlphabet detected = null;
            while (bioReader.HasLines && bioReader.Line[0] == ' ')
            {
                // Fixed-column slicing is used here because a regex is too slow:
                // chunks of up to 10 characters start at offset 10 and repeat every 11.
                string line = bioReader.Line;
                for (int offset = 10; offset < line.Length; offset += 11)
                {
                    string chunk = line.Substring(offset, Math.Min(10, line.Length - offset));

                    if (Alphabet == null)
                    {
                        // No alphabet was preset, so it is identified from the data itself.
                        detected = IdentifyAlphabet(detected, chunk);
                        if (detected == null)
                        {
                            string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Resource.InvalidSymbolInString,
                                line);
                            Trace.Report(message);
                            throw new InvalidDataException(message);
                        }

                        if (sequence.Alphabet != detected)
                        {
                            // Rebuild the sequence under the newly identified alphabet.
                            Sequence converted = new Sequence(detected, Encoding, sequence);
                            converted.MoleculeType = sequence.MoleculeType;
                            converted.IsReadOnly = false;
                            sequence.Clear();
                            sequence = converted;
                        }
                    }

                    sequence.InsertRange(sequence.Count, chunk);
                }

                bioReader.GoToNextLine();
            }
        }
        else if (header == "CONTIG")
        {
            // The reader is not advanced here; the current line still needs to be processed.
            metadata.Contig = ParseMultiLineData(bioReader, Environment.NewLine);
        }
        else
        {
            string message = String.Format(
                CultureInfo.CurrentCulture,
                Properties.Resource.ParserUnexpectedLineInSequence,
                bioReader.Line);
            Trace.Report(message);
            throw new InvalidDataException(message);
        }
    }
}
/// <summary>
/// Parses the metadata section of a GenBank cell grid. Rows are read from
/// <paramref name="rowIndex"/> until the grid ends or a row whose key cell (upper-cased)
/// equals the features heading is reached. Rows with a blank key cell are skipped, and
/// rows with a key that is not handled below are ignored.
/// </summary>
/// <param name="metadata">Metadata object that receives the parsed values.</param>
/// <param name="cellRange">Cell grid being parsed.</param>
/// <param name="rowIndex">Index of the first row to parse.</param>
/// <returns>Index of the first row that was not consumed.</returns>
/// <exception cref="FormatException">
/// The ACCESSION value is blank, or the SEGMENT value cannot be read as two integers.
/// </exception>
private static int ParseGenBankMetadata(GenBankMetadata metadata, object[,] cellRange, int rowIndex)
{
    while (rowIndex < cellRange.GetLength(0))
    {
        object keyCell = cellRange[rowIndex, KeyColumnIndex];
        if (null == keyCell || string.IsNullOrWhiteSpace(keyCell.ToString()))
        {
            rowIndex++;
            continue;
        }

        string key = keyCell.ToString().ToUpperInvariant();
        if (key.Equals(FEATURES))
        {
            break;
        }

        object valueCell = cellRange[rowIndex, ValueColumnIndex];
        string value = valueCell != null ? valueCell.ToString() : string.Empty;

        string message;
        string[] tokens;
        switch (key)
        {
            case LOCUS:
                // The sub-parser returns the next unread row; step back one row because
                // the shared increment at the bottom of the loop moves forward again.
                rowIndex = ParseLocus(metadata, cellRange, rowIndex);
                rowIndex--;
                break;

            case DEFINITION:
                metadata.Definition = value;
                break;

            case ACCESSION:
                // Validate first so that a failed parse does not leave an empty
                // accession object behind on the metadata.
                if (string.IsNullOrWhiteSpace(value))
                {
                    message = String.Format(
                        CultureInfo.InvariantCulture,
                        Resources.UnrecognizedGenBankMetadataFormat,
                        ACCESSION);
                    throw new FormatException(message);
                }

                // Empty entries are dropped so that leading or repeated spaces do not
                // produce an empty primary or secondary accession.
                string[] accessions = value.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
                metadata.Accession = new GenBankAccession();
                metadata.Accession.Primary = accessions[0];
                for (int i = 1; i < accessions.Length; i++)
                {
                    metadata.Accession.Secondary.Add(accessions[i]);
                }

                break;

            case DBLINK:
                // Only a blank value is skipped. (The guard used to be inverted, which
                // caused every non-blank DBLINK value to be ignored.)
                if (string.IsNullOrWhiteSpace(value))
                {
                    break;
                }

                // Expected shape: "<type>:<number>[,<number>...]".
                tokens = value.Split(':');
                if (tokens.Length == 2)
                {
                    if (metadata.DbLinks == null)
                    {
                        metadata.DbLinks = new List<CrossReferenceLink>(2);
                    }

                    var curLink = new CrossReferenceLink();
                    if (string.Compare(tokens[0], CrossReferenceType.Project.ToString(), StringComparison.OrdinalIgnoreCase) == 0)
                    {
                        curLink.Type = CrossReferenceType.Project;
                    }
                    else if (string.Compare(tokens[0], CrossReferenceType.BioProject.ToString(), StringComparison.OrdinalIgnoreCase) == 0)
                    {
                        curLink.Type = CrossReferenceType.BioProject;
                    }
                    else
                    {
                        curLink.Type = CrossReferenceType.TraceAssemblyArchive;
                    }

                    tokens = tokens[1].Split(',');
                    for (int i = 0; i < tokens.Length; i++)
                    {
                        curLink.Numbers.Add(tokens[i]);
                    }

                    metadata.DbLinks.Add(curLink);
                }

                break;

            case DBSOURCE:
                metadata.DbSource = value;
                break;

            case VERSION:
                if (string.IsNullOrWhiteSpace(value))
                {
                    break;
                }

                tokens = value.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                // first token contains accession and version
                Match m = Regex.Match(tokens[0], @"^(?<accession>\w+)\.(?<version>\d+)$");
                metadata.Version = new GenBankVersion();

                if (m.Success)
                {
                    metadata.Version.Version = m.Groups["version"].Value;

                    // The first token in the data from the accession line is referred to as
                    // the primary accession number, and should be the one used here in the
                    // version line.
                    metadata.Version.Accession = m.Groups["accession"].Value;
                }

                if (tokens.Length > 1)
                {
                    // second token contains primary ID
                    m = Regex.Match(tokens[1], @"^GI:(?<primaryID>.*)");
                    if (m.Success)
                    {
                        metadata.Version.GiNumber = m.Groups["primaryID"].Value;
                    }
                }

                break;

            case SEGMENT:
                if (string.IsNullOrWhiteSpace(value))
                {
                    break;
                }

                // Note: this splits on the individual characters ' ', 'o' and 'f', not on
                // the literal " of "; "1 of 3" therefore yields the tokens "1" and "3".
                tokens = value.Split(" of ".ToArray(), StringSplitOptions.RemoveEmptyEntries);
                if (tokens.Length == 2)
                {
                    int current;
                    int count;
                    if (int.TryParse(tokens[0], out current) && int.TryParse(tokens[1], out count))
                    {
                        metadata.Segment = new SequenceSegment();
                        metadata.Segment.Current = current;
                        metadata.Segment.Count = count;
                    }
                }

                if (metadata.Segment == null)
                {
                    // Name the entry that actually failed (this used to say ACCESSION).
                    message = String.Format(
                        CultureInfo.InvariantCulture,
                        Resources.UnrecognizedGenBankMetadataFormat,
                        SEGMENT);
                    throw new FormatException(message);
                }

                break;

            case KEYWORDS:
                metadata.Keywords = value;
                break;

            case SOURCE:
                rowIndex = ParseSource(metadata, cellRange, rowIndex);
                rowIndex--;
                break;

            case REFERENCE:
                rowIndex = ParseReference(metadata, cellRange, rowIndex);
                rowIndex--;
                break;

            case PRIMARY:
                metadata.Primary = value;
                break;

            case COMMENT:
                if (!string.IsNullOrWhiteSpace(value))
                {
                    // One comment entry per non-empty line of the cell text.
                    tokens = value.Split(Environment.NewLine.ToCharArray(), StringSplitOptions.RemoveEmptyEntries);
                    foreach (string str in tokens)
                    {
                        metadata.Comments.Add(str);
                    }
                }

                break;
        }

        rowIndex++;
    }

    return rowIndex;
}
/// <summary>
/// Validate GenBank features for medium size sequences.
/// Reads the expected counts and feature keys from the given XML node, parses the
/// GenBank file named by that node and compares the parsed features against them.
/// </summary>
/// <param name="nodeName">xml node name.</param>
/// <param name="methodName">DNA,RNA or Protein method</param>
private void ValidateGenBankFeatures(string nodeName, string methodName)
{
    // Get Values from XML node.
    string filePath = utilityObj.xmlUtil.GetTextValue(
        nodeName, Constants.FilePathNode);
    string mRNAFeatureCount = utilityObj.xmlUtil.GetTextValue(
        nodeName, Constants.mRNACount);
    string exonFeatureCount = utilityObj.xmlUtil.GetTextValue(
        nodeName, Constants.ExonCount);
    string intronFeatureCount = utilityObj.xmlUtil.GetTextValue(
        nodeName, Constants.IntronCount);
    string cdsFeatureCount = utilityObj.xmlUtil.GetTextValue(
        nodeName, Constants.CDSCount);
    string allFeaturesCount = utilityObj.xmlUtil.GetTextValue(
        nodeName, Constants.GenBankFeaturesCount);
    string expectedCDSKey = utilityObj.xmlUtil.GetTextValue(
        nodeName, Constants.CDSKey);
    string expectedIntronKey = utilityObj.xmlUtil.GetTextValue(
        nodeName, Constants.IntronKey);
    string expectedExonKey = utilityObj.xmlUtil.GetTextValue(
        nodeName, Constants.ExonKey);
    string mRNAKey = utilityObj.xmlUtil.GetTextValue(
        nodeName, Constants.mRNAKey);
    string sourceKeyName = utilityObj.xmlUtil.GetTextValue(
        nodeName, Constants.SourceKey);

    // Parse a file. The result is materialized once so that counting and indexing
    // below do not enumerate (and potentially re-run) the parse several times.
    ISequenceParser parserObj = new GenBankParser();
    IList<ISequence> sequenceList = parserObj.Parse(filePath).ToList();

    // GenBank metadata: taken from the only sequence when there is exactly one,
    // otherwise from the second sequence.
    ISequence parsedSequence = 1 == sequenceList.Count
        ? sequenceList[0]
        : sequenceList[1];
    GenBankMetadata metadata =
        parsedSequence.Metadata[Constants.GenBank] as GenBankMetadata;

    // Fail with a clear assertion instead of a NullReferenceException further down.
    Assert.IsNotNull(metadata);

    // Validate GenBank Features (expected value first, actual value second).
    Assert.AreEqual(Convert.ToInt32(allFeaturesCount, null),
        metadata.Features.All.Count);
    Assert.AreEqual(Convert.ToInt32(cdsFeatureCount, null),
        metadata.Features.CodingSequences.Count);
    Assert.AreEqual(Convert.ToInt32(exonFeatureCount, null),
        metadata.Features.Exons.Count);
    Assert.AreEqual(Convert.ToInt32(intronFeatureCount, null),
        metadata.Features.Introns.Count);
    Assert.AreEqual(Convert.ToInt32(mRNAFeatureCount, null),
        metadata.Features.MessengerRNAs.Count);
    Assert.AreEqual(0, metadata.Features.Attenuators.Count);
    Assert.AreEqual(0, metadata.Features.CAATSignals.Count);
    Assert.AreEqual(0, metadata.Features.DisplacementLoops.Count);
    Assert.AreEqual(0, metadata.Features.Enhancers.Count);

    // Validate GenBank feature list.
    if ((0 == string.Compare(methodName, "DNA",
        CultureInfo.CurrentCulture, CompareOptions.IgnoreCase))
        || (0 == string.Compare(methodName, "RNA",
        CultureInfo.CurrentCulture, CompareOptions.IgnoreCase)))
    {
        IList<FeatureItem> featureList = metadata.Features.All;
        Assert.AreEqual(sourceKeyName, featureList[0].Key.ToString(null));
        Assert.AreEqual(expectedCDSKey, featureList[1].Key.ToString(null));
        Assert.AreEqual(expectedCDSKey, featureList[2].Key.ToString(null));
        Assert.AreEqual(mRNAKey, featureList[10].Key.ToString(null));
        Assert.AreEqual(expectedExonKey, featureList[12].Key.ToString(null));
        Assert.AreEqual(expectedIntronKey, featureList[18].Key.ToString(null));
        ApplicationLog.WriteLine(
            "GenBank Features P1: Successfully validated the GenBank Features");
    }
    else if ((0 == string.Compare(methodName, "Protein",
        CultureInfo.CurrentCulture, CompareOptions.IgnoreCase)))
    {
        IList<FeatureItem> featureList = metadata.Features.All;
        Assert.AreEqual(expectedIntronKey, featureList[10].Key.ToString(null));
        Assert.AreEqual(expectedExonKey, featureList[18].Key.ToString(null));
        ApplicationLog.WriteLine(
            "GenBank Features P1: Successfully validated the GenBank Features");
    }
}