Пример #1
0
        /// <summary>
        /// Private copy constructor used by the clone method: fills this instance
        /// from the members of <paramref name="other"/>.
        /// </summary>
        /// <remarks>
        /// Locus, Accession, Version, Project, Segment, Source and Features are copied through
        /// their own <c>Clone()</c> and only when they are non-null on the source instance.
        /// Each entry of References is cloned into a new list. DbLinks (when non-null) and
        /// Comments are copied into new lists. Definition, DbSource, Keywords, Primary,
        /// BaseCount, Origin and Contig are assigned directly.
        /// </remarks>
        /// <param name="other">GenBankMetadata instance to copy from.</param>
        private GenBankMetadata(GenBankMetadata other)
        {
            // LOCUS / DEFINITION / ACCESSION / VERSION / PROJECT
            if (other.Locus != null) { Locus = other.Locus.Clone(); }
            Definition = other.Definition;
            if (other.Accession != null) { Accession = other.Accession.Clone(); }
            if (other.Version != null) { Version = other.Version.Clone(); }
            if (other.Project != null) { Project = other.Project.Clone(); }

            // DBLINK: new list over the same link objects.
            if (other.DbLinks != null) { DbLinks = other.DbLinks.ToList(); }

            // DBSOURCE / KEYWORDS / SEGMENT / SOURCE
            DbSource = other.DbSource;
            Keywords = other.Keywords;
            if (other.Segment != null) { Segment = other.Segment.Clone(); }
            if (other.Source != null) { Source = other.Source.Clone(); }

            // REFERENCE: every citation is cloned individually.
            List<CitationReference> clonedReferences = new List<CitationReference>();
            foreach (CitationReference citation in other.References)
            {
                clonedReferences.Add(citation.Clone());
            }

            References = clonedReferences;

            // COMMENT / PRIMARY
            Comments = new List<string>(other.Comments);
            Primary = other.Primary;

            // FEATURES
            if (other.Features != null) { Features = other.Features.Clone(); }

            // Trailing sequence-related fields.
            BaseCount = other.BaseCount;
            Origin = other.Origin;
            Contig = other.Contig;
        }
Пример #2
0
        /// <summary>
        /// Writes all the header sections that come before the features section:
        /// LOCUS, DEFINITION, ACCESSION, VERSION, PROJECT, DBLINK, DBSOURCE, KEYWORDS,
        /// SEGMENT, the source block, the references, the comments and PRIMARY.
        /// Nothing is written when the sequence's GenBank metadata entry is null.
        /// </summary>
        /// <param name="sequence">Sequence whose metadata entry under Helper.GenBankMetadataKey is written.</param>
        /// <param name="txtWriter">Writer handed to every section-writing helper.</param>
        private void WriteHeaders(ISequence sequence, TextWriter txtWriter)
        {
            GenBankMetadata genBankData = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
            if (genBankData == null)
            {
                return;
            }

            WriteLocus(sequence, txtWriter);
            WriteHeaderSection("DEFINITION", genBankData.Definition, txtWriter);

            // VERSION is built from the primary accession, so it is only emitted
            // when an accession is present.
            if (genBankData.Accession != null)
            {
                WriteHeaderSection("ACCESSION", Helper.GetGenBankAccession(genBankData.Accession), txtWriter);

                if (genBankData.Version != null)
                {
                    string versionText = genBankData.Accession.Primary + "." + genBankData.Version.Version;
                    bool hasGiNumber = !string.IsNullOrEmpty(genBankData.Version.GiNumber);
                    if (hasGiNumber)
                    {
                        versionText += "  GI:" + genBankData.Version.GiNumber;
                    }

                    if (versionText.Length > 0)
                    {
                        WriteHeaderSection("VERSION", versionText, txtWriter);
                    }
                }
            }

            if (genBankData.Project != null)
            {
                WriteHeaderSection("PROJECT", Helper.GetProjectIdentifier(genBankData.Project), txtWriter);
            }

            // DBLINK is skipped for both a missing and an empty link collection.
            if (genBankData.DbLinks != null && genBankData.DbLinks.Count > 0)
            {
                WriteHeaderSection("DBLINK", Helper.GetCrossReferenceLink(genBankData.DbLinks), txtWriter);
            }

            WriteHeaderSection("DBSOURCE", genBankData.DbSource, txtWriter);
            WriteHeaderSection("KEYWORDS", genBankData.Keywords, txtWriter);

            if (genBankData.Segment != null)
            {
                WriteHeaderSection("SEGMENT", Helper.GetSequenceSegment(genBankData.Segment), txtWriter);
            }

            WriteSource(genBankData, txtWriter);
            WriteReferences(genBankData, txtWriter);
            WriteComments(genBankData, txtWriter);
            WriteHeaderSection("PRIMARY", genBankData.Primary, txtWriter);
        }
Пример #3
0
        /// <summary>
        /// Writes the FEATURES section: the section header, then for each feature its key and
        /// location string followed by one line per qualifier value.
        /// Nothing is written when the GenBank metadata entry or its Features member is null.
        /// </summary>
        /// <param name="sequence">Sequence whose metadata entry under Helper.GenBankMetadataKey supplies the features.</param>
        /// <param name="txtWriter">Writer handed to WriteFeatureSection.</param>
        /// <exception cref="InvalidOperationException">Thrown when LocationBuilder is null.</exception>
        private void WriteFeatures(ISequence sequence, TextWriter txtWriter)
        {
            const string quote = "\"";

            ILocationBuilder locBuilder = LocationBuilder;
            if (locBuilder == null)
            {
                throw new InvalidOperationException(Properties.Resource.NullLocationBuild);
            }

            GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
            if (metadata == null || metadata.Features == null)
            {
                return;
            }

            WriteFeatureSection("FEATURES", "Location/Qualifiers", txtWriter);

            // write the features in the order they were put in the list
            foreach (FeatureItem feature in metadata.Features.All)
            {
                WriteFeatureSection(FeatureHeaderIndentString + feature.Key, locBuilder.GetLocationString(feature.Location), txtWriter);

                // The sub-items of a feature are referred to as qualifiers.  These do not have
                // unique keys, so each key maps to a list of values.
                foreach (KeyValuePair<string, List<string>> qualifierList in feature.Qualifiers)
                {
                    foreach (string qualifierValue in qualifierList.Value)
                    {
                        // A null or empty value produces a bare "/key" line with no "=" part.
                        string data = "/" + qualifierList.Key;
                        if (!string.IsNullOrEmpty(qualifierValue))
                        {
                            data += "=";

                            // Ordinal comparison: the test is for the literal quote character.
                            // A culture-sensitive comparison can skip ignorable characters and
                            // report a leading/trailing quote that is not actually there.
                            if (!qualifierValue.StartsWith(quote, StringComparison.Ordinal))
                            {
                                data += quote;
                            }

                            data += qualifierValue;

                            if (!qualifierValue.EndsWith(quote, StringComparison.Ordinal))
                            {
                                data += quote;
                            }
                        }

                        // use a blank header; the qualifier key is part of the data
                        WriteFeatureSection(string.Empty, data, txtWriter);
                    }
                }
            }
        }
Пример #4
0
        /// <summary>
        /// Parses the trailing part of a GenBank record: the optional BASE COUNT line,
        /// the ORIGIN block and the CONTIG block, up to and including the "//" terminator line.
        /// </summary>
        /// <param name="line">Current line; updated as the reader advances. Null ends parsing.</param>
        /// <param name="sequence">The sequence whose GenBank metadata entry receives the parsed values.</param>
        /// <param name="stream">The stream reader.</param>
        /// <exception cref="InvalidDataException">
        /// Thrown (after being reported to Trace) when a line header is none of
        /// BASE COUNT, ORIGIN or CONTIG.
        /// </exception>
        private void ParseSequence(ref string line, ref Sequence sequence, StreamReader stream)
        {
            GenBankMetadata genBankData = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

            // Keep consuming until input runs out or the "//" end-of-record marker is reached.
            while (line != null && !line.StartsWith("//", StringComparison.Ordinal))
            {
                // set data indent for sequence headers
                string header = GetLineHeader(line, DataIndent);

                if (header == "BASE COUNT")
                {
                    // The BASE COUNT linetype is obsolete and was removed from the GenBank
                    // flat-file format in October 2003, but it is kept when present. The
                    // untrimmed text is stored because it starts with a right-justified column.
                    genBankData.BaseCount = line.Substring(DataIndent);
                    line = GoToNextLine(line, stream);
                }
                else if (header == "ORIGIN")
                {
                    // Change note carried over from the original: alphabet validation used to
                    // run on every line, which hurt performance on large sequences; it now
                    // happens after the sequence has been parsed.
                    ParseOrigin(ref line, genBankData, stream);
                }
                else if (header == "CONTIG")
                {
                    // No explicit advance here: the current line still needs to be processed.
                    genBankData.Contig = ParseMultiLineData(ref line, Environment.NewLine, DataIndent, stream);
                }
                else
                {
                    string unexpectedLineMessage = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ParserUnexpectedLineInSequence,
                        line);
                    Trace.Report(unexpectedLineMessage);
                    throw new InvalidDataException(unexpectedLineMessage);
                }
            }

            // A non-null line at this point is the "//" terminator: step past it
            // so the caller resumes after the end of this sequence record.
            if (line != null)
            {
                line = GoToNextLine(line, stream);
            }
        }
        /// <summary>
        /// Derives a display name for a sequence from the species, the source common name
        /// and the definition line of its GenBank metadata.
        /// </summary>
        /// <param name="metadata">
        /// The metadata. Its Source (with Organism.Species and CommonName) and Definition are read.
        /// </param>
        /// <returns>
        /// The definition or the dominant name when one contains the other; otherwise
        /// both joined as "name | definition".
        /// </returns>
        /// <exception cref="Exception">
        /// Thrown when neither the common name contains the species
        /// nor the species contains the common name.
        /// </exception>
        public static string ExtractSequenceName(GenBankMetadata metadata)
        {
            string species = metadata.Source.Organism.Species.GetLargestRepeatingSubstring();
            string commonName = metadata.Source.CommonName;

            // Boilerplate endings removed from the definition, applied one after another in this order.
            string[] boilerplateEndings =
            {
                ", complete genome.",
                ", complete sequence.",
                ", complete CDS.",
                ", complete cds.",
                ", genome."
            };

            string definition = metadata.Definition;
            foreach (string ending in boilerplateEndings)
            {
                definition = definition.TrimEnd(ending);
            }

            // The common name is the more specific of the two names.
            if (commonName.Contains(species))
            {
                return definition.Contains(commonName)
                    ? definition
                    : commonName.Contains(definition)
                        ? commonName
                        : commonName + " | " + definition;
            }

            // The species is the more specific of the two names.
            if (species.Contains(commonName))
            {
                return definition.Contains(species)
                    ? definition
                    : species.Contains(definition)
                        ? species
                        : species + " | " + definition;
            }

            string mismatchMessage = "Sequences names are not equal. CommonName = " + commonName +
                                     ", Species = " + species +
                                     ", Definition = " + definition;
            throw new Exception(mismatchMessage);
        }
Пример #6
0
        /// <summary>
        /// Writes one REFERENCE block per citation in the metadata: the REFERENCE line itself
        /// followed by the AUTHORS, CONSRTM, TITLE, JOURNAL, MEDLINE, PUBMED and REMARK sub-sections.
        /// Nothing is written when the References collection is null.
        /// </summary>
        /// <param name="metadata">Metadata whose References are written.</param>
        /// <param name="txtWriter">Writer handed to WriteHeaderSection.</param>
        private void WriteReferences(GenBankMetadata metadata, TextWriter txtWriter)
        {
            if (metadata.References == null)
            {
                return;
            }

            foreach (CitationReference citation in metadata.References)
            {
                // First line: the reference number, optionally followed by the
                // location in parentheses after padding the number to three characters.
                string firstLine = citation.Number.ToString(CultureInfo.InvariantCulture);
                bool hasLocation = !string.IsNullOrEmpty(citation.Location);
                if (hasLocation)
                {
                    firstLine = firstLine.PadRight(3) + "(" + citation.Location + ")";
                }

                WriteHeaderSection("REFERENCE", firstLine, txtWriter);

                // Sub-sections, in output order.
                WriteHeaderSection("  AUTHORS", citation.Authors, txtWriter);
                WriteHeaderSection("  CONSRTM", citation.Consortiums, txtWriter);
                WriteHeaderSection("  TITLE", citation.Title, txtWriter);
                WriteHeaderSection("  JOURNAL", citation.Journal, txtWriter);
                WriteHeaderSection("  MEDLINE", citation.Medline, txtWriter);
                WriteHeaderSection("  PUBMED", citation.PubMed, txtWriter);
                WriteHeaderSection("  REMARK", citation.Remarks, txtWriter);
            }
        }
 /// <summary>
 /// Classifies a sequence by looking for well-known keywords in its GenBank definition.
 /// </summary>
 /// <param name="metadata">
 /// The metadata; only its Definition is read.
 /// </param>
 /// <returns>
 /// The first matching <c>Aliases.Feature</c> value, checked in this order:
 /// "mitochondrion", "chloroplast", "plasmid", "plastid";
 /// <c>Aliases.Feature.FullGenome</c> when none of them occurs.
 /// </returns>
 public int ExtractSequenceFeature(GenBankMetadata metadata)
 {
     // Lower-case with the invariant culture: a culture-sensitive ToLower() maps 'I' to a
     // dotless 'ı' under Turkish/Azeri cultures, so an upper-case "MITOCHONDRION",
     // "PLASMID" or "PLASTID" would no longer match the keywords below.
     string name = metadata.Definition.ToLowerInvariant();
     if (name.Contains("mitochondrion"))
     {
         return Aliases.Feature.MitochondrionGenome;
     }
     else if (name.Contains("chloroplast"))
     {
         return Aliases.Feature.ChloroplastGenome;
     }
     else if (name.Contains("plasmid"))
     {
         return Aliases.Feature.Plasmid;
     }
     else if (name.Contains("plastid"))
     {
         return Aliases.Feature.Plastid;
     }
     else
     {
         return Aliases.Feature.FullGenome;
     }
 }
Пример #8
0
        /// <summary>
        ///     Validates the first transit peptide feature of the metadata (and a clone of it)
        ///     against the expected values stored under the given XML node, then checks that
        ///     qualifiers set on newly constructed TransitPeptide instances read back unchanged.
        /// </summary>
        /// <param name="nodeName">XML node name holding the expected values</param>
        /// <param name="genMetadata">GenBank Metadata</param>
        private void ValidateGenBankTrnsitPeptideFeature(string nodeName,
                                                         GenBankMetadata genMetadata)
        {
            // Get expected values from the XML node.
            string expectedLocation = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.Location);
            string expectedAllele = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.AlleleNode);
            string featureCount = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.QualifierCount);
            string expectedDbReference = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.DbReferenceNode);
            string geneSymbol = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.GeneSymbol);
            string expectedCitation = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.CitationNode);
            string expectedExperiment = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.ExperimentNode);
            string expectedFunction = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.FunctionNode);
            string expectedGeneSynonym = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.GeneSynonymNode);
            string expectedInference = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.InferenceNode);
            string expectedLabel = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.LabelNode);
            string expectedLocusTag = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.LocusTagNode);
            string expectedNote = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.Note);
            string expectedOldLocusTag = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.OldLocusTagNode);
            string expectedMap = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.GenbankMapNode);

            List<TransitPeptide> transitPeptideFeatureList =
                genMetadata.Features.TransitPeptides;
            var locBuilder = new LocationBuilder();

            // Create a copy of the first transit peptide feature.
            TransitPeptide cloneTransit = transitPeptideFeatureList[0].Clone();

            // Validate transit peptide qualifiers.
            // Arguments are passed as (expected, actual) so that a failing assertion
            // reports the two values under the right labels.
            Assert.AreEqual(featureCount,
                            transitPeptideFeatureList.Count.ToString((IFormatProvider) null));
            Assert.AreEqual(geneSymbol, cloneTransit.GeneSymbol);
            Assert.AreEqual(expectedDbReference,
                            cloneTransit.DatabaseCrossReference[0]);
            Assert.AreEqual(expectedAllele,
                            transitPeptideFeatureList[0].Allele);
            Assert.AreEqual(expectedCitation,
                            transitPeptideFeatureList[0].Citation[0]);
            Assert.AreEqual(expectedExperiment,
                            transitPeptideFeatureList[0].Experiment[0]);
            Assert.AreEqual(expectedMap,
                            transitPeptideFeatureList[0].GenomicMapPosition);
            Assert.AreEqual(expectedGeneSynonym,
                            transitPeptideFeatureList[0].GeneSynonym[0]);
            Assert.AreEqual(expectedInference,
                            transitPeptideFeatureList[0].Inference[0]);
            Assert.AreEqual(expectedLabel,
                            transitPeptideFeatureList[0].Label);
            Assert.AreEqual(expectedLocation,
                            locBuilder.GetLocationString(
                                genMetadata.Features.TransitPeptides[0].Location));
            Assert.AreEqual(expectedNote,
                            transitPeptideFeatureList[0].Note[0]);
            Assert.AreEqual(expectedOldLocusTag,
                            transitPeptideFeatureList[0].OldLocusTag[0]);
            Assert.AreEqual(expectedLocusTag,
                            transitPeptideFeatureList[0].LocusTag[0]);
            Assert.AreEqual(expectedFunction,
                            transitPeptideFeatureList[0].Function[0]);

            // Create new TransitPeptide instances: one from the location string,
            // one from the location object of the parsed feature.
            var tPeptide = new TransitPeptide(expectedLocation);
            var tPeptideWithILoc = new TransitPeptide(
                genMetadata.Features.TransitPeptides[0].Location);

            // Set qualifiers and validate that they read back unchanged.
            tPeptide.Allele = expectedAllele;
            tPeptide.GeneSymbol = geneSymbol;
            tPeptideWithILoc.GenomicMapPosition = expectedMap;
            Assert.AreEqual(geneSymbol, tPeptide.GeneSymbol);
            Assert.AreEqual(expectedAllele, tPeptide.Allele);
            Assert.AreEqual(expectedMap,
                            tPeptideWithILoc.GenomicMapPosition);
        }
Пример #9
0
        /// <summary>
        /// Parses one REFERENCE entry from the cell range and appends it to
        /// <c>metadata.References</c>.
        /// </summary>
        /// <remarks>
        /// The value cell of the row at <paramref name="rowIndex"/> supplies the reference number
        /// and the optional parenthesised location. The following rows are read as sub-key/value
        /// pairs until a row with a non-blank key cell is found or the range ends. Rows with a
        /// blank value are skipped; sub-keys not listed in the switch are ignored.
        /// </remarks>
        /// <param name="metadata">Metadata object that receives the parsed reference</param>
        /// <param name="cellRange">Range of cells</param>
        /// <param name="rowIndex">Index of the row that starts the reference entry</param>
        /// <returns>Index of the first row that was not consumed</returns>
        /// <exception cref="FormatException">
        /// Thrown when the reference number cannot be parsed as an integer, when the range has
        /// fewer than three columns, or when a continuation row has a blank sub-key.
        /// </exception>
        private static int ParseReference(GenBankMetadata metadata, object[,] cellRange, int rowIndex)
        {
            object headerCell = cellRange[rowIndex, ValueColumnIndex];
            string value = headerCell != null ? headerCell.ToString() : string.Empty;
            rowIndex++;

            CitationReference reference = new CitationReference();
            if (!string.IsNullOrWhiteSpace(value))
            {
                // check for start/end e.g. (bases 1 to 118), or prose notes
                Match m = Regex.Match(value,
                    @"^(?<number>\d+)(\s+\((?<location>.*)\))?");
                if (m.Success)
                {
                    int number;
                    if (!int.TryParse(m.Groups["number"].Value, out number))
                    {
                        throw new FormatException(String.Format(
                            CultureInfo.InvariantCulture,
                            Resources.UnrecognizedGenBankMetadataFormat,
                            REFERENCE));
                    }

                    reference.Number = number;
                    reference.Location = m.Groups["location"].Value;
                }
            }

            while (rowIndex < cellRange.GetLength(0))
            {
                if (3 > cellRange.GetLength(1))
                {
                    throw new FormatException(String.Format(
                        CultureInfo.InvariantCulture,
                        Resources.UnrecognizedGenBankMetadataFormat,
                        REFERENCE));
                }

                // A non-blank key cell marks the start of the next section: stop here
                // and leave that row for the caller.
                object keyCell = cellRange[rowIndex, KeyColumnIndex];
                if (keyCell != null && !string.IsNullOrWhiteSpace(keyCell.ToString()))
                {
                    break;
                }

                object subKeyCell = cellRange[rowIndex, SubKeyColumnIndex];
                if (subKeyCell == null || string.IsNullOrWhiteSpace(subKeyCell.ToString()))
                {
                    throw new FormatException(String.Format(
                        CultureInfo.InvariantCulture,
                        Resources.UnrecognizedGenBankMetadataFormat,
                        REFERENCE));
                }

                string subKey = subKeyCell.ToString().ToUpperInvariant();
                object valueCell = cellRange[rowIndex, ValueColumnIndex];
                value = valueCell != null ? valueCell.ToString() : string.Empty;

                // Rows with a blank value carry no data. They must still fall through to the
                // row increment below: skipping it (as a bare "continue" would) re-reads the
                // same row forever.
                if (!string.IsNullOrWhiteSpace(value))
                {
                    switch (subKey)
                    {
                        case REFERENCE_AUTHORS:
                            reference.Authors = value;
                            break;
                        case REFERENCE_CONSORTIUMS:
                            reference.Consortiums = value;
                            break;
                        case REFERENCE_JOURNAL:
                            reference.Journal = value;
                            break;
                        case REFERENCE_MEDLINE:
                            reference.Medline = value;
                            break;
                        case REFERENCE_PUBMED:
                            reference.PubMed = value;
                            break;
                        case REFERENCE_REMARK:
                            reference.Remarks = value;
                            break;
                        case REFERENCE_TITLE:
                            reference.Title = value;
                            break;
                    }
                }

                rowIndex++;
            }

            metadata.References.Add(reference);

            return rowIndex;
        }
Пример #10
0
        /// <summary>
        /// Parses the LOCUS sub-key rows of the cell range into <c>metadata.Locus</c>.
        /// </summary>
        /// <remarks>
        /// The row at the incoming <paramref name="rowIndex"/> is not read; parsing starts on the
        /// next row and continues until a row with a non-blank key cell is found or the range ends.
        /// <c>metadata.Locus</c> is created on the first valid sub-key row if it is null.
        /// A blank sequence length is tolerated; every other typed field must be present and parseable.
        /// </remarks>
        /// <param name="metadata">Metadata object whose Locus is filled in</param>
        /// <param name="cellRange">Range of cells</param>
        /// <param name="rowIndex">Index of the row that starts the LOCUS entry</param>
        /// <returns>Index of the first row that was not consumed</returns>
        /// <exception cref="FormatException">
        /// Thrown when the range has fewer than three columns, when a row has a blank or unknown
        /// sub-key, or when a field value cannot be converted to its target type.
        /// </exception>
        private static int ParseLocus(GenBankMetadata metadata, object[,] cellRange, int rowIndex)
        {
            rowIndex++;

            while (rowIndex < cellRange.GetLength(0))
            {
                if (3 > cellRange.GetLength(1))
                {
                    throw CreateLocusFormatException(LOCUS);
                }

                // A non-blank key cell marks the start of the next section: stop here
                // and leave that row for the caller.
                object keyCell = cellRange[rowIndex, KeyColumnIndex];
                if (keyCell != null && !string.IsNullOrWhiteSpace(keyCell.ToString()))
                {
                    break;
                }

                object subKeyCell = cellRange[rowIndex, SubKeyColumnIndex];
                if (subKeyCell == null || string.IsNullOrWhiteSpace(subKeyCell.ToString()))
                {
                    throw CreateLocusFormatException(LOCUS);
                }

                if (metadata.Locus == null)
                {
                    metadata.Locus = new GenBankLocusInfo();
                }

                string subKey = subKeyCell.ToString().ToUpperInvariant();
                object valueCell = cellRange[rowIndex, ValueColumnIndex];
                string value = valueCell != null ? valueCell.ToString() : string.Empty;

                switch (subKey)
                {
                    case LOCUS_NAME:
                        metadata.Locus.Name = value;
                        break;

                    case LOCUS_SEQLEN:
                        // A blank length leaves SequenceLength untouched. A non-blank one is
                        // parsed with the same rules int.Parse(value) applies, but a bad or
                        // out-of-range number is reported through the same formatted
                        // FormatException as every other LOCUS field.
                        if (!string.IsNullOrWhiteSpace(value))
                        {
                            int sequenceLength;
                            if (!int.TryParse(value, out sequenceLength))
                            {
                                throw CreateLocusFormatException(LOCUS_SEQLEN);
                            }

                            metadata.Locus.SequenceLength = sequenceLength;
                        }

                        break;

                    case LOCUS_SEQTYPE:
                        metadata.Locus.SequenceType = value;
                        break;

                    case LOCUS_MOLTYPE:
                        MoleculeType moleculeType;
                        if (string.IsNullOrWhiteSpace(value) || !Enum.TryParse<MoleculeType>(value, true, out moleculeType))
                        {
                            throw CreateLocusFormatException(LOCUS_MOLTYPE);
                        }

                        metadata.Locus.MoleculeType = moleculeType;
                        break;

                    case LOCUS_STRANTTOPOLOGY:
                        SequenceStrandTopology strandTopology;
                        if (string.IsNullOrWhiteSpace(value) || !Enum.TryParse<SequenceStrandTopology>(value, true, out strandTopology))
                        {
                            throw CreateLocusFormatException(LOCUS_STRANTTOPOLOGY);
                        }

                        metadata.Locus.StrandTopology = strandTopology;
                        break;

                    case LOCUS_STRANDTYPE:
                        SequenceStrandType strandType;
                        if (string.IsNullOrWhiteSpace(value) || !Enum.TryParse<SequenceStrandType>(value, true, out strandType))
                        {
                            throw CreateLocusFormatException(LOCUS_STRANDTYPE);
                        }

                        metadata.Locus.Strand = strandType;
                        break;

                    case LOCUS_DIVISIONCODE:
                        SequenceDivisionCode divisionCode;
                        if (string.IsNullOrWhiteSpace(value) || !Enum.TryParse<SequenceDivisionCode>(value, true, out divisionCode))
                        {
                            throw CreateLocusFormatException(LOCUS_DIVISIONCODE);
                        }

                        metadata.Locus.DivisionCode = divisionCode;
                        break;

                    case LOCUS_DATE:
                        DateTime date;
                        if (string.IsNullOrWhiteSpace(value) || !DateTime.TryParse(value, out date))
                        {
                            throw CreateLocusFormatException(LOCUS_DATE);
                        }

                        metadata.Locus.Date = date;
                        break;

                    default:
                        throw CreateLocusFormatException(LOCUS);
                }

                rowIndex++;
            }

            return rowIndex;
        }

        /// <summary>
        /// Builds the FormatException raised when part of the LOCUS entry cannot be interpreted.
        /// </summary>
        /// <param name="fieldName">Name inserted into the UnrecognizedGenBankMetadataFormat resource message.</param>
        /// <returns>The exception to throw.</returns>
        private static FormatException CreateLocusFormatException(string fieldName)
        {
            return new FormatException(String.Format(
                CultureInfo.InvariantCulture,
                Resources.UnrecognizedGenBankMetadataFormat,
                fieldName));
        }
Пример #11
0
        /// <summary>
        ///     Validate GenBank LTR features
        /// </summary>
        /// <param name="nodeName">XML node name</param>
        /// <param name="genMetadata">GenBank Metadata</param>
        private void ValidateGenBankLTRFeature(string nodeName,
                                               GenBankMetadata genMetadata)
        {
            // Get Values from XML node.           
            string expectedLocation = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.Location);
            string expectedAllele = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.AlleleNode);
            string featureCount = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.QualifierCount);
            string expectedDbReference = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.DbReferenceNode);
            string geneSymbol = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.GeneSymbol);
            string expectedCitation = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.CitationNode);
            string expectedExperiment = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.ExperimentNode);
            string expectedFunction = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.FunctionNode);
            string expectedGeneSynonym = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.GeneSynonymNode);
            string expectedInference = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.InferenceNode);
            string expectedLabel = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.LabelNode);
            string expectedLocusTag = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.LocusTagNode);
            string expectedNote = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.Note);
            string expectedOldLocusTag = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.OldLocusTagNode);
            string expectedMap = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.GenbankMapNode);

            List<LongTerminalRepeat> LTRFeatureList =
                genMetadata.Features.LongTerminalRepeats;
            var locBuilder = new LocationBuilder();

            // Create a copy of Long Terminal Repeat feature.
            LongTerminalRepeat cloneLTR = LTRFeatureList[0].Clone();

            // Validate Long Terminal Repeat qualifiers.
            Assert.AreEqual(LTRFeatureList.Count.ToString((IFormatProvider) null),
                            featureCount);
            Assert.AreEqual(cloneLTR.GeneSymbol, geneSymbol);
            Assert.AreEqual(cloneLTR.DatabaseCrossReference[0],
                            expectedDbReference);
            Assert.AreEqual(LTRFeatureList[0].Allele,
                            expectedAllele);
            Assert.AreEqual(LTRFeatureList[0].Citation[0],
                            expectedCitation);
            Assert.AreEqual(LTRFeatureList[0].Experiment[0],
                            expectedExperiment);
            Assert.AreEqual(LTRFeatureList[0].GenomicMapPosition,
                            expectedMap);
            Assert.AreEqual(LTRFeatureList[0].GeneSynonym[0],
                            expectedGeneSynonym);
            Assert.AreEqual(LTRFeatureList[0].Inference[0],
                            expectedInference);
            Assert.AreEqual(LTRFeatureList[0].Label,
                            expectedLabel);
            Assert.AreEqual(locBuilder.GetLocationString(
                genMetadata.Features.LongTerminalRepeats[0].Location),
                            expectedLocation);
            Assert.AreEqual(LTRFeatureList[0].Note[0],
                            expectedNote);
            Assert.AreEqual(LTRFeatureList[0].OldLocusTag[0],
                            expectedOldLocusTag);
            Assert.AreEqual(LTRFeatureList[0].LocusTag[0],
                            expectedLocusTag);
            Assert.AreEqual(LTRFeatureList[0].Function[0],
                            expectedFunction);
            Assert.IsTrue(string.IsNullOrEmpty(LTRFeatureList[0].StandardName));

            // Create a new LTR and validate.
            var ltr =
                new LongTerminalRepeat(expectedLocation);
            var ltrWithILoc = new LongTerminalRepeat(
                genMetadata.Features.LongTerminalRepeats[0].Location);

            // Set qualifiers and validate them.
            ltr.Allele = expectedAllele;
            ltr.GeneSymbol = geneSymbol;
            ltrWithILoc.GenomicMapPosition = expectedMap;
            Assert.AreEqual(ltr.GeneSymbol, geneSymbol);
            Assert.AreEqual(ltr.Allele, expectedAllele);
            Assert.AreEqual(ltrWithILoc.GenomicMapPosition,
                            expectedMap);
        }
Пример #12
0
        /// <summary>
        /// Writes a REFERENCE header section, followed by its sub-sections, for
        /// every citation reference held by the metadata.
        /// </summary>
        /// <param name="metadata">GenBank metadata whose references are written.</param>
        /// <param name="txtWriter">Writer handed to WriteHeaderSection for each section.</param>
        private void WriteReferences(GenBankMetadata metadata, TextWriter txtWriter)
        {
            // Nothing to write when the metadata has no reference list.
            if (metadata.References == null)
            {
                return;
            }

            foreach (CitationReference citation in metadata.References)
            {
                // First line: the reference number, optionally followed by "(location)".
                string firstLine = citation.Number.ToString(CultureInfo.InvariantCulture);
                bool hasLocation = !string.IsNullOrEmpty(citation.Location);
                if (hasLocation)
                {
                    // The number is padded to three characters before the location is appended.
                    firstLine = string.Concat(firstLine.PadRight(3), "(", citation.Location, ")");
                }

                WriteHeaderSection("REFERENCE", firstLine, txtWriter);

                // Sub-sections, in the order they are written out.
                WriteHeaderSection("  AUTHORS", citation.Authors, txtWriter);
                WriteHeaderSection("  CONSRTM", citation.Consortiums, txtWriter);
                WriteHeaderSection("  TITLE", citation.Title, txtWriter);
                WriteHeaderSection("  JOURNAL", citation.Journal, txtWriter);
                WriteHeaderSection("  MEDLINE", citation.Medline, txtWriter);
                WriteHeaderSection("  PUBMED", citation.PubMed, txtWriter);
                WriteHeaderSection("  REMARK", citation.Remarks, txtWriter);
            }
        }
Пример #13
0
        /// <summary>
        ///     Validate StemLoop features
        /// </summary>
        /// <remarks>
        ///     Expected values are read from the XML node and compared with the first
        ///     StemLoop feature of the metadata (and with a clone of it). Two new
        ///     StemLoop instances are then created, one from the location string and
        ///     one from the parsed location, to check that qualifiers which are set
        ///     can be read back.
        /// </remarks>
        /// <param name="nodeName">XML node name</param>
        /// <param name="genMetadata">GenBank Metadata</param>
        private void ValidateGenBankStemLoopFeature(string nodeName,
                                                    GenBankMetadata genMetadata)
        {
            // Get Values from XML node.
            string expectedLocation = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.Location);
            string expectedAllele = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.AlleleNode);
            string featureCount = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.QualifierCount);
            string expectedDbReference = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.DbReferenceNode);
            string geneSymbol = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.GeneSymbol);
            string expectedCitation = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.CitationNode);
            string expectedExperiment = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.ExperimentNode);
            string expectedFunction = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.FunctionNode);
            string expectedGeneSynonym = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.GeneSynonymNode);
            string expectedInference = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.InferenceNode);
            string expectedLabel = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.LabelNode);
            string expectedLocusTag = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.LocusTagNode);
            string expectedNote = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.Note);
            string expectedOldLocusTag = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.OldLocusTagNode);
            string expectedMap = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.GenbankMapNode);

            List<StemLoop> sLoopFeatureList = genMetadata.Features.StemLoops;
            var locBuilder = new LocationBuilder();

            // Check the feature count before indexing, so a wrong count is reported
            // as an assertion failure rather than an out-of-range exception.
            Assert.AreEqual(featureCount,
                            sLoopFeatureList.Count.ToString((IFormatProvider) null));

            // Create a copy of StemLoop feature.
            StemLoop cloneSLoop = sLoopFeatureList[0].Clone();

            // Validate StemLoop qualifiers (expected value first, actual second).
            Assert.AreEqual(geneSymbol, cloneSLoop.GeneSymbol);
            Assert.AreEqual(expectedDbReference,
                            cloneSLoop.DatabaseCrossReference[0]);
            Assert.AreEqual(expectedAllele,
                            sLoopFeatureList[0].Allele);
            Assert.AreEqual(expectedCitation,
                            sLoopFeatureList[0].Citation[0]);
            Assert.AreEqual(expectedExperiment,
                            sLoopFeatureList[0].Experiment[0]);
            Assert.AreEqual(expectedMap,
                            sLoopFeatureList[0].GenomicMapPosition);
            Assert.AreEqual(expectedGeneSynonym,
                            sLoopFeatureList[0].GeneSynonym[0]);
            Assert.AreEqual(expectedInference,
                            sLoopFeatureList[0].Inference[0]);
            Assert.AreEqual(expectedLabel,
                            sLoopFeatureList[0].Label);
            Assert.AreEqual(expectedLocation,
                            locBuilder.GetLocationString(
                                genMetadata.Features.StemLoops[0].Location));
            Assert.AreEqual(expectedNote,
                            sLoopFeatureList[0].Note[0]);
            Assert.AreEqual(expectedOldLocusTag,
                            sLoopFeatureList[0].OldLocusTag[0]);
            Assert.AreEqual(expectedLocusTag,
                            sLoopFeatureList[0].LocusTag[0]);
            Assert.AreEqual(expectedFunction,
                            sLoopFeatureList[0].Function[0]);
            Assert.IsTrue(string.IsNullOrEmpty(sLoopFeatureList[0].Operon));
            Assert.IsTrue(string.IsNullOrEmpty(sLoopFeatureList[0].StandardName));

            // Create a new StemLoop and validate the same.
            var stemLoop = new StemLoop(expectedLocation);
            var stemLoopWithILoc = new StemLoop(
                genMetadata.Features.StemLoops[0].Location);

            // Set qualifiers and validate them.
            stemLoop.Allele = expectedAllele;
            stemLoop.GeneSymbol = geneSymbol;
            stemLoopWithILoc.GenomicMapPosition = expectedMap;
            Assert.AreEqual(geneSymbol, stemLoop.GeneSymbol);
            Assert.AreEqual(expectedAllele, stemLoop.Allele);
            Assert.AreEqual(expectedMap,
                            stemLoopWithILoc.GenomicMapPosition);
        }
Пример #14
0
        /// <summary>
        /// Writes the LOCUS line for the given sequence.
        /// </summary>
        /// <remarks>
        /// Molecule type, strand type, strand topology, division code and date come
        /// from the locus info of the sequence's GenBank metadata when it is present.
        /// Without locus info the molecule type falls back to the sequence alphabet,
        /// the strand and division fields stay empty and the date is the current
        /// local time.
        /// </remarks>
        /// <param name="sequence">Sequence whose ID, count, alphabet and metadata are written.</param>
        /// <param name="txtWriter">Writer that receives the LOCUS line.</param>
        private static void WriteLocus(ISequence sequence, TextWriter txtWriter)
        {
            // determine molecule and sequence type
            GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

            GenBankLocusInfo locusInfo = null;
            string           molType   = sequence.Alphabet.Name;

            if (metadata != null)
            {
                locusInfo = metadata.Locus;
            }

            // Metadata may be present without locus info; only take the molecule
            // type from the locus when there is one, otherwise keep the alphabet name.
            if (locusInfo != null)
            {
                molType = locusInfo.MoleculeType.ToString();
            }

            string seqType;

            if (sequence.Alphabet.Name != null)
            {
                if (molType == Alphabets.Protein.Name)
                {
                    seqType = "aa";
                    molType = string.Empty; // protein files don't use molecule type
                }
                else
                {
                    seqType = "bp";
                }
            }
            else
            {
                // The alphabet has no name, so decide by comparing the alphabet itself.
                if (sequence.Alphabet == Alphabets.Protein)
                {
                    seqType = "aa";
                    molType = string.Empty; // protein files don't use molecule type
                }
                else
                {
                    seqType = "bp";

                    if (sequence.Alphabet == Alphabets.DNA)
                    {
                        molType = Alphabets.DNA.Name;
                    }
                    else
                    {
                        molType = Alphabets.RNA.Name;
                    }
                }
            }

            // retrieve metadata fields
            string   strandType     = string.Empty;
            string   strandTopology = string.Empty;
            string   division       = string.Empty;
            DateTime date           = DateTime.Now;

            if (locusInfo != null)
            {
                strandType = Helper.GetStrandType(locusInfo.Strand);

                strandTopology = Helper.GetStrandTopology(locusInfo.StrandTopology);
                if (locusInfo.DivisionCode != SequenceDivisionCode.None)
                {
                    division = locusInfo.DivisionCode.ToString();
                }

                date = locusInfo.Date;
            }

            // The date is formatted with the invariant culture and upper-cased,
            // e.g. "dd-MMM-yyyy" -> "01-JAN-2000".
            txtWriter.WriteLine("{0,-12}{1,-16} {2,11} {3} {4,3}{5,-6}  {6,-8} {7,3} {8}",
                                "LOCUS",
                                sequence.ID,
                                sequence.Count,
                                seqType,
                                strandType,
                                molType,
                                strandTopology,
                                division,
                                date.ToString("dd-MMM-yyyy", CultureInfo.InvariantCulture).ToUpperInvariant());
        }
Пример #15
0
        /// <summary>
        /// Flattens GenBank metadata, followed by its features, into rows of
        /// strings and returns them as a two-dimensional string array.
        /// </summary>
        /// <param name="metadata">GenBank Metadata</param>
        /// <returns>string array of metadata</returns>
        public static string[,] GenBankMetadataToRange(GenBankMetadata metadata)
        {
            List<string[]> rows = new List<string[]>();

            // Header row of the metadata section.
            rows.Add(new[] { Properties.Resources.GenbankMetadataHeader });

            // Locus: one title row followed by one row per locus field.
            var locus = metadata.Locus;
            if (locus != null)
            {
                rows.Add(new[] { Properties.Resources.GenbankMetadataLocus });
                AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataName, locus.Name);
                AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataSeqLength, locus.SequenceLength.ToString());
                AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataSeqType, locus.SequenceType);
                AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataStrandType, Helper.GetStrandType(locus.Strand));
                AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataMoleculeType, locus.MoleculeType.ToString());
                AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataStrandTopology, Helper.GetStrandTopology(locus.StrandTopology));
                AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataDivisionCode, locus.DivisionCode.ToString());
                AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataDate, locus.Date.ToString("dd-MMM-yyyy").ToUpper());
            }

            if (!string.IsNullOrWhiteSpace(metadata.Definition))
            {
                AddNameValuePair(rows, 0, Properties.Resources.GenbankMetadataDefinition, "", metadata.Definition);
            }

            // Accession: primary accession followed by every secondary one,
            // each preceded by a single space (a null entry contributes only the space).
            if (metadata.Accession != null)
            {
                StringBuilder secondaryText = new StringBuilder();
                foreach (string secondary in metadata.Accession.Secondary)
                {
                    secondaryText.Append(" ").Append(secondary);
                }

                AddNameValuePair(rows, 0, Properties.Resources.GenbankMetadataAccession, "", metadata.Accession.Primary + secondaryText.ToString());
            }

            // DB links: "<type>:<number>,<number>,", each number followed by a comma.
            if (metadata.DbLinks != null)
            {
                foreach (var dbLink in metadata.DbLinks)
                {
                    StringBuilder numberText = new StringBuilder();
                    foreach (string number in dbLink.Numbers)
                    {
                        numberText.Append(number).Append(",");
                    }

                    AddNameValuePair(rows, 0, Properties.Resources.GenbankMetadataDBLink, "", dbLink.Type.ToString() + ":" + numberText.ToString());
                }
            }

            if (!string.IsNullOrWhiteSpace(metadata.DbSource))
            {
                AddNameValuePair(rows, 0, Properties.Resources.GenbankMetadataDBSource, "", metadata.DbSource);
            }

            // Version: "<accession>.<version> <GI label><gi number>", missing parts left empty.
            if (metadata.Version != null)
            {
                string versionText = (metadata.Version.Accession ?? string.Empty)
                    + "."
                    + (metadata.Version.Version ?? string.Empty)
                    + " "
                    + Properties.Resources.GenbankMetadataGI
                    + (metadata.Version.GiNumber ?? string.Empty);
                AddNameValuePair(rows, 0, Properties.Resources.GenbankMetadataVersion, "", versionText);
            }

            if (metadata.Segment != null)
            {
                AddNameValuePair(rows, 0, Properties.Resources.GenbankMetadataSegment, "", metadata.Segment.Current + " of " + metadata.Segment.Count);
            }

            // Keywords are passed on unconditionally.
            AddNameValuePair(rows, 0, Properties.Resources.GenbankMetadataKeywords, "", metadata.Keywords);

            if (metadata.Source != null)
            {
                AddNameValuePair(rows, 0, Properties.Resources.GenbankMetadataSource, "", metadata.Source.CommonName ?? string.Empty);

                var organism = metadata.Source.Organism;
                string organismName = (organism.Genus ?? string.Empty) + " " + (organism.Species ?? string.Empty);
                AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataOrganism, organismName);
                AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataClassLevels, organism.ClassLevels ?? string.Empty);
            }

            // References: a "<number> (<location>)" row followed by the citation details.
            foreach (CitationReference citation in metadata.References)
            {
                AddNameValuePair(rows, 0, Properties.Resources.GenbankMetadataReference, "", citation.Number.ToString() + " (" + citation.Location + ")");
                AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataAuthors, citation.Authors);
                AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataTitle, citation.Title);
                AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataJournal, citation.Journal);
                AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataConsortiums, citation.Consortiums);
                AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataMedLine, citation.Medline);
                AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataPubMed, citation.PubMed);
                AddNameValuePair(rows, 1, Properties.Resources.GenbankMetadataRemarks, citation.Remarks);
            }

            if (!string.IsNullOrWhiteSpace(metadata.Primary))
            {
                AddNameValuePair(rows, 0, Properties.Resources.GenbankMetadataPrimary, "", metadata.Primary);
            }

            // Comments: all comment lines joined with the platform line break into one value.
            if (metadata.Comments != null && metadata.Comments.Count > 0)
            {
                string commentText = string.Join(Environment.NewLine, metadata.Comments);
                AddNameValuePair(rows, 0, Properties.Resources.GenbankMetadataComment, "", commentText);
            }

            if (metadata.Features != null)
            {
                // Header row of the features section.
                rows.Add(new[] { Properties.Resources.GenbankFeaturesHeader });

                foreach (FeatureItem feature in metadata.Features.All)
                {
                    // Feature row: the feature key followed by its location string.
                    LocationBuilder locationBuilder = new LocationBuilder();
                    rows.Add(new[] { feature.Key, locationBuilder.GetLocationString(feature.Location) });

                    // One row per qualifier value, under the feature row.
                    foreach (string qualifierName in feature.Qualifiers.Keys)
                    {
                        foreach (string qualifierValue in feature.Qualifiers[qualifierName])
                        {
                            AddNameValuePair(rows, 1, qualifierName, qualifierValue);
                        }
                    }
                }
            }

            if (!string.IsNullOrWhiteSpace(metadata.BaseCount))
            {
                AddNameValuePair(rows, 0, Properties.Resources.GenbankMetadataBaseCount, "", metadata.BaseCount);
            }

            return ConvertToArray(rows);
        }
Пример #16
0
        /// <summary>
        ///     Validate GenBank CDS features
        /// </summary>
        /// <remarks>
        ///     Expected values are read from the XML node and compared with the first
        ///     coding sequence feature of the metadata (and with a clone of it). Two new
        ///     CodingSequence instances are then created, one from the location string
        ///     and one from the parsed location, to check GetTranslation and that
        ///     qualifiers which are set can be read back.
        /// </remarks>
        /// <param name="nodeName">XML node name</param>
        /// <param name="genMetadata">GenBank Metadata</param>
        private void ValidateGenBankCDSFeatures(string nodeName,
                                                GenBankMetadata genMetadata)
        {
            // Get Values from XML node.
            string expectedLocation = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.Location);
            string expectedAllele = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.AlleleNode);
            string featureCount = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.QualifierCount);
            string expectedDbReference = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.DbReferenceNode);
            string geneSymbol = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.GeneSymbol);
            string expectedCitation = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.CitationNode);
            string expectedExperiment = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.ExperimentNode);
            string expectedInference = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.InferenceNode);
            string expectedLabel = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.LabelNode);
            string expectedNote = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.Note);
            string expectedMap = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.GenbankMapNode);
            string expectedTranslation = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.GenbankTranslationNode);
            string expectedCodonStart = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.CodonStartNode);

            List<CodingSequence> codingSequenceFeatureList =
                genMetadata.Features.CodingSequences;
            var locBuilder = new LocationBuilder();

            // Check the feature count before indexing, so a wrong count is reported
            // as an assertion failure rather than an out-of-range exception.
            Assert.AreEqual(featureCount,
                            codingSequenceFeatureList.Count.ToString((IFormatProvider) null));

            // Create a copy of Coding Seq Region feature.
            CodingSequence cloneCDS = codingSequenceFeatureList[0].Clone();

            // Validate Coding Seq Region qualifiers (expected value first, actual second).
            Assert.AreEqual(expectedDbReference,
                            cloneCDS.DatabaseCrossReference[0]);
            Assert.AreEqual(geneSymbol, cloneCDS.GeneSymbol);
            Assert.AreEqual(expectedAllele,
                            codingSequenceFeatureList[0].Allele);
            Assert.AreEqual(expectedCitation,
                            codingSequenceFeatureList[0].Citation[0]);
            Assert.AreEqual(expectedExperiment,
                            codingSequenceFeatureList[0].Experiment[0]);
            Assert.AreEqual(expectedMap,
                            codingSequenceFeatureList[0].GenomicMapPosition);
            Assert.AreEqual(expectedInference,
                            codingSequenceFeatureList[0].Inference[0]);
            Assert.AreEqual(expectedLabel,
                            codingSequenceFeatureList[0].Label);
            Assert.AreEqual(expectedLocation,
                            locBuilder.GetLocationString(
                                genMetadata.Features.CodingSequences[0].Location));
            Assert.AreEqual(expectedNote,
                            codingSequenceFeatureList[0].Note[0]);
            Assert.AreEqual(expectedCodonStart,
                            codingSequenceFeatureList[0].CodonStart[0]);
            Assert.AreEqual(expectedTranslation,
                            codingSequenceFeatureList[0].Translation);
            Assert.IsFalse(string.IsNullOrEmpty(codingSequenceFeatureList[0].Codon.ToString()));
            Assert.IsTrue(string.IsNullOrEmpty(codingSequenceFeatureList[0].EnzymeCommissionNumber));
            Assert.IsTrue(string.IsNullOrEmpty(codingSequenceFeatureList[0].Number));
            Assert.IsTrue(string.IsNullOrEmpty(codingSequenceFeatureList[0].Operon));
            Assert.IsFalse(codingSequenceFeatureList[0].Pseudo);
            Assert.IsFalse(codingSequenceFeatureList[0].RibosomalSlippage);
            Assert.IsTrue(string.IsNullOrEmpty(codingSequenceFeatureList[0].StandardName));
            Assert.IsFalse(string.IsNullOrEmpty(codingSequenceFeatureList[0].TranslationalExcept.ToString()));
            Assert.IsTrue(string.IsNullOrEmpty(codingSequenceFeatureList[0].TranslationTable));
            Assert.IsFalse(codingSequenceFeatureList[0].TransSplicing);
            Assert.IsTrue(string.IsNullOrEmpty(codingSequenceFeatureList[0].Exception));

            // Create a new CDS feature using constructor.
            var cds = new CodingSequence(expectedLocation);
            var cdsWithLoc = new CodingSequence(
                genMetadata.Features.CodingSequences[0].Location);
            Sequence seq = cds.GetTranslation();
            Assert.IsNotNull(seq);

            // Set and validate qualifiers.
            cds.Allele = expectedAllele;
            cdsWithLoc.GeneSymbol = geneSymbol;
            cdsWithLoc.GenomicMapPosition = expectedMap;
            Assert.AreEqual(expectedMap, cdsWithLoc.GenomicMapPosition);
            Assert.AreEqual(expectedAllele, cds.Allele);
            Assert.AreEqual(geneSymbol, cdsWithLoc.GeneSymbol);
        }
Пример #17
0
        /// <summary>
        ///     Validate GenBank Non Coding RNA features
        /// </summary>
        /// <remarks>
        ///     Expected values are read from the XML node and compared with the first
        ///     non coding RNA feature of the metadata (and with a clone of it). Two new
        ///     NonCodingRna instances are then created, one from the parsed location and
        ///     one from the location string, to check that the class qualifier can be
        ///     set and read back.
        /// </remarks>
        /// <param name="nodeName">XML node name</param>
        /// <param name="genMetadata">GenBank Metadata</param>
        private void ValidateGenBankNonCodingRNA(string nodeName,
                                                 GenBankMetadata genMetadata)
        {
            // Get Values from XML node.
            string expectedLocation = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.Location);
            string featureCount = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.QualifierCount);
            string expectedLabel = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.LabelNode);
            string expectedNonCodingRnaClass = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.NonCodingRnaClassNode);

            List<NonCodingRna> nonCodingRNAFeatureList =
                genMetadata.Features.NonCodingRNAs;
            var locBuilder = new LocationBuilder();

            // Check the feature count before indexing, so a wrong count is reported
            // as an assertion failure rather than an out-of-range exception.
            Assert.AreEqual(featureCount,
                            nonCodingRNAFeatureList.Count.ToString((IFormatProvider) null));

            // Create a copy of Non coding RNA feature.
            NonCodingRna cloneNonCodingRNA =
                nonCodingRNAFeatureList[0].Clone();

            // Validate Non Coding RNA Region qualifiers (expected value first, actual second).
            Assert.AreEqual(expectedNonCodingRnaClass,
                            nonCodingRNAFeatureList[0].NonCodingRnaClass);
            Assert.AreEqual(expectedLabel,
                            cloneNonCodingRNA.Label);
            Assert.AreEqual(expectedLocation,
                            locBuilder.GetLocationString(
                                genMetadata.Features.NonCodingRNAs[0].Location));

            // Create a non Coding RNA and validate the same.
            var nRNA =
                new NonCodingRna(genMetadata.Features.NonCodingRNAs[0].Location);
            var nRNAWithLocation =
                new NonCodingRna(expectedLocation);

            // Set properties
            nRNA.NonCodingRnaClass = expectedNonCodingRnaClass;
            nRNAWithLocation.NonCodingRnaClass = expectedNonCodingRnaClass;

            // Validate created nRNA.
            Assert.AreEqual(expectedNonCodingRnaClass,
                            nRNA.NonCodingRnaClass);
            Assert.AreEqual(expectedNonCodingRnaClass,
                            nRNAWithLocation.NonCodingRnaClass);
        }
Пример #18
0
        /// <summary>
        ///     Validate GenBank RibosomeBindingSite features
        /// </summary>
        /// <remarks>
        ///     Expected values are read from the XML node and compared with the first
        ///     ribosome binding site feature of the metadata (and with a clone of it).
        ///     Two new RibosomeBindingSite instances are then created, one from the
        ///     location string and one from the parsed location, to check that
        ///     qualifiers which are set can be read back.
        /// </remarks>
        /// <param name="nodeName">XML node name</param>
        /// <param name="genMetadata">GenBank Metadata</param>
        private void ValidateGenBankRibosomeBindingSite(string nodeName,
                                                        GenBankMetadata genMetadata)
        {
            // Get Values from XML node.
            string expectedLocation = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.Location);
            string expectedAllele = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.AlleleNode);
            string featureCount = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.QualifierCount);
            string expectedDbReference = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.DbReferenceNode);
            string geneSymbol = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.GeneSymbol);
            string expectedCitation = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.CitationNode);
            string expectedExperiment = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.ExperimentNode);
            string expectedInference = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.InferenceNode);
            string expectedLabel = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.LabelNode);
            string expectedNote = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.Note);
            string expectedMap = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.GenbankMapNode);

            List<RibosomeBindingSite> ribosomeSite =
                genMetadata.Features.RibosomeBindingSites;

            // Check the feature count before indexing, so a wrong count is reported
            // as an assertion failure rather than an out-of-range exception.
            Assert.AreEqual(featureCount,
                            ribosomeSite.Count.ToString((IFormatProvider) null));

            // Create a copy of RibosomeBindingSite Region feature.
            RibosomeBindingSite cloneRibosomeSite =
                ribosomeSite[0].Clone();
            var locBuilder = new LocationBuilder();

            // Validate RibosomeBindingSite qualifiers (expected value first, actual second).
            Assert.AreEqual(expectedDbReference,
                            cloneRibosomeSite.DatabaseCrossReference[0]);
            Assert.AreEqual(geneSymbol,
                            cloneRibosomeSite.GeneSymbol);
            Assert.AreEqual(expectedAllele,
                            ribosomeSite[0].Allele);
            Assert.AreEqual(expectedCitation,
                            ribosomeSite[0].Citation[0]);
            Assert.AreEqual(expectedExperiment,
                            ribosomeSite[0].Experiment[0]);
            Assert.AreEqual(expectedInference,
                            ribosomeSite[0].Inference[0]);
            Assert.AreEqual(expectedLabel,
                            ribosomeSite[0].Label);
            Assert.AreEqual(expectedLocation,
                            locBuilder.GetLocationString(
                                genMetadata.Features.RibosomeBindingSites[0].Location));
            Assert.AreEqual(expectedNote,
                            ribosomeSite[0].Note[0]);
            Assert.AreEqual(expectedMap,
                            ribosomeSite[0].GenomicMapPosition);
            Assert.IsNotNull(ribosomeSite[0].OldLocusTag[0]);
            Assert.IsNotNull(ribosomeSite[0].LocusTag[0]);
            Assert.IsNotNull(ribosomeSite[0].StandardName);

            // Create a new RibosomeBindingSite feature using constructor.
            var ribosomeBindingSite =
                new RibosomeBindingSite(expectedLocation);
            var ribosomeBindingSiteLoc =
                new RibosomeBindingSite(
                    genMetadata.Features.RibosomeBindingSites[0].Location);

            // Set and validate qualifiers.
            ribosomeBindingSite.Allele = expectedAllele;
            ribosomeBindingSiteLoc.GeneSymbol = geneSymbol;
            ribosomeBindingSiteLoc.GenomicMapPosition = expectedMap;
            Assert.AreEqual(expectedMap,
                            ribosomeBindingSiteLoc.GenomicMapPosition);
            Assert.AreEqual(expectedAllele, ribosomeBindingSite.Allele);
            Assert.AreEqual(geneSymbol,
                            ribosomeBindingSiteLoc.GeneSymbol);
        }
Пример #19
0
        /// <summary>
        ///     Validate GenBank UnsureSequenceRegion features.
        ///     Compares the first UnsureSequenceRegion feature (and a clone of it) with
        ///     the expected values read from the XML node, then checks that qualifiers
        ///     set on newly constructed features read back unchanged.
        /// </summary>
        /// <param name="nodeName">XML node name holding the expected values</param>
        /// <param name="genMetadata">GenBank Metadata whose features are validated</param>
        private void ValidateGenBankUnsureSequenceRegion(string nodeName,
                                                         GenBankMetadata genMetadata)
        {
            // Get expected values from the XML node.
            string expectedLocation = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.Location);
            string expectedAllele = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.AlleleNode);
            string featureCount = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.QualifierCount);
            string expectedDbReference = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.DbReferenceNode);
            string geneSymbol = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.GeneSymbol);
            string expectedCitation = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.CitationNode);
            string expectedExperiment = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.ExperimentNode);
            string expectedInference = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.InferenceNode);
            string expectedLabel = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.LabelNode);
            string expectedNote = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.Note);
            string expectedMap = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.GenbankMapNode);

            List<UnsureSequenceRegion> unsureSeqRegionFeatureList =
                genMetadata.Features.UnsureSequenceRegions;
            UnsureSequenceRegion firstRegion = unsureSeqRegionFeatureList[0];

            // Create a copy of Unsure Seq Region feature.
            UnsureSequenceRegion cloneUnSureSeqRegion = firstRegion.Clone();
            var locBuilder = new LocationBuilder();

            // Validate Unsure Seq Region qualifiers.
            // Assert.AreEqual takes (expected, actual); keeping that order makes the
            // failure message report the values the right way round.
            Assert.AreEqual(featureCount,
                            unsureSeqRegionFeatureList.Count.ToString((IFormatProvider) null));
            Assert.AreEqual(expectedDbReference,
                            cloneUnSureSeqRegion.DatabaseCrossReference[0]);
            Assert.AreEqual(geneSymbol,
                            cloneUnSureSeqRegion.GeneSymbol);
            Assert.AreEqual(expectedAllele,
                            firstRegion.Allele);
            Assert.AreEqual(expectedCitation,
                            firstRegion.Citation[0]);
            Assert.AreEqual(expectedExperiment,
                            firstRegion.Experiment[0]);
            Assert.AreEqual(expectedMap,
                            firstRegion.GenomicMapPosition);
            Assert.AreEqual(expectedInference,
                            firstRegion.Inference[0]);
            Assert.AreEqual(expectedLabel,
                            firstRegion.Label);
            Assert.AreEqual(expectedLocation,
                            locBuilder.GetLocationString(
                                genMetadata.Features.UnsureSequenceRegions[0].Location));
            Assert.AreEqual(expectedNote,
                            firstRegion.Note[0]);

            // NOTE(review): if Compare is a collection, ToString() yields a type name, so
            // this check presumably only guards against a null Compare - confirm intent.
            Assert.IsFalse(string.IsNullOrEmpty(firstRegion.Compare.ToString()));
            Assert.IsTrue(string.IsNullOrEmpty(firstRegion.Replace));

            // Create new Unsure features using both constructors
            // (location string and location object).
            var unsureRegion =
                new UnsureSequenceRegion(expectedLocation);
            var unsureRegionWithLoc =
                new UnsureSequenceRegion(
                    genMetadata.Features.UnsureSequenceRegions[0].Location);

            // Set and validate qualifiers.
            unsureRegion.Allele = expectedAllele;
            unsureRegionWithLoc.GeneSymbol = geneSymbol;
            unsureRegionWithLoc.GenomicMapPosition = expectedMap;
            Assert.AreEqual(expectedMap,
                            unsureRegionWithLoc.GenomicMapPosition);
            Assert.AreEqual(expectedAllele, unsureRegion.Allele);
            Assert.AreEqual(geneSymbol,
                            unsureRegionWithLoc.GeneSymbol);
        }
Пример #20
0
        /// <summary>
        ///     Validate GenBank Operon features.
        ///     Compares the first OperonRegion feature (and a clone of it) with the
        ///     expected values read from the XML node, then checks that qualifiers set
        ///     on newly constructed features read back unchanged.
        /// </summary>
        /// <param name="nodeName">XML node name holding the expected values</param>
        /// <param name="genMetadata">GenBank Metadata whose features are validated</param>
        private void ValidateGenBankOperon(string nodeName,
                                           GenBankMetadata genMetadata)
        {
            // Get expected values from the XML node.
            string expectedLocation = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.Location);
            string expectedAllele = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.AlleleNode);
            string featureCount = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.QualifierCount);
            string expectedDbReference = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.DbReferenceNode);
            string expectedCitation = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.CitationNode);
            string expectedExperiment = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.ExperimentNode);
            string expectedInference = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.InferenceNode);
            string expectedLabel = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.LabelNode);
            string expectedNote = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.Note);
            string expectedMap = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.GenbankMapNode);

            List<OperonRegion> operonFeatureList =
                genMetadata.Features.OperonRegions;
            var locBuilder = new LocationBuilder();
            OperonRegion firstOperon = operonFeatureList[0];

            // Create a copy of the Operon feature.
            OperonRegion cloneOperon = firstOperon.Clone();

            // Validate Operon region qualifiers.
            // Assert.AreEqual takes (expected, actual); keeping that order makes the
            // failure message report the values the right way round.
            Assert.AreEqual(featureCount,
                            operonFeatureList.Count.ToString((IFormatProvider) null));
            Assert.AreEqual(expectedDbReference,
                            cloneOperon.DatabaseCrossReference[0]);
            Assert.AreEqual(expectedAllele,
                            firstOperon.Allele);
            Assert.AreEqual(expectedCitation,
                            firstOperon.Citation[0]);
            Assert.AreEqual(expectedExperiment,
                            firstOperon.Experiment[0]);
            Assert.AreEqual(expectedMap,
                            firstOperon.GenomicMapPosition);
            Assert.AreEqual(expectedInference,
                            firstOperon.Inference[0]);
            Assert.AreEqual(expectedLabel,
                            firstOperon.Label);
            Assert.AreEqual(expectedLocation,
                            locBuilder.GetLocationString(
                                genMetadata.Features.OperonRegions[0].Location));
            Assert.AreEqual(expectedNote,
                            firstOperon.Note[0]);

            // NOTE(review): ToString() on Function/Phenotype presumably yields a type
            // name when these are collections, so the two IsFalse checks below may only
            // guard against null values - confirm intent.
            Assert.IsFalse(string.IsNullOrEmpty(firstOperon.Function.ToString()));
            Assert.IsTrue(string.IsNullOrEmpty(firstOperon.Operon));
            Assert.IsFalse(string.IsNullOrEmpty(firstOperon.Phenotype.ToString()));
            Assert.IsTrue(string.IsNullOrEmpty(firstOperon.StandardName));
            Assert.IsFalse(firstOperon.Pseudo);

            // Create new Operon features using both constructors
            // (location string and location object).
            var operonRegion =
                new OperonRegion(expectedLocation);
            var operonRegionWithLoc = new OperonRegion(
                genMetadata.Features.OperonRegions[0].Location);

            // Set and validate qualifiers.
            operonRegion.Allele = expectedAllele;
            operonRegionWithLoc.GenomicMapPosition = expectedMap;
            Assert.AreEqual(expectedMap,
                            operonRegionWithLoc.GenomicMapPosition);
            Assert.AreEqual(expectedAllele, operonRegion.Allele);
        }
Пример #21
0
        /// <summary>
        ///     Validate PrecursorRNA features.
        ///     Compares the first PrecursorRna feature (and a clone of it) with the
        ///     expected values read from the XML node, then checks that qualifiers set
        ///     on newly constructed features read back unchanged.
        /// </summary>
        /// <param name="nodeName">XML node name holding the expected values</param>
        /// <param name="genMetadata">GenBank Metadata whose features are validated</param>
        private void ValidateGenBankPrecursorRNAFeature(string nodeName,
                                                        GenBankMetadata genMetadata)
        {
            // Get expected values from the XML node.
            string expectedLocation = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.Location);
            string expectedAllele = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.AlleleNode);
            string featureCount = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.QualifierCount);
            string expectedDbReference = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.DbReferenceNode);
            string geneSymbol = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.GeneSymbol);
            string expectedCitation = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.CitationNode);
            string expectedExperiment = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.ExperimentNode);
            string expectedFunction = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.FunctionNode);
            string expectedGeneSynonym = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.GeneSynonymNode);
            string expectedInference = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.InferenceNode);
            string expectedLabel = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.LabelNode);
            string expectedLocusTag = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.LocusTagNode);
            string expectedNote = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.Note);
            string expectedOldLocusTag = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.OldLocusTagNode);
            string expectedMap = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.GenbankMapNode);

            List<PrecursorRna> precursorRNAFeatureList =
                genMetadata.Features.PrecursorRNAs;
            var locBuilder = new LocationBuilder();
            PrecursorRna firstPrecursorRNA = precursorRNAFeatureList[0];

            // Create a copy of Precursor RNA feature.
            PrecursorRna clonePrecursorRNA = firstPrecursorRNA.Clone();

            // Validate Precursor RNA qualifiers.
            // Assert.AreEqual takes (expected, actual); keeping that order makes the
            // failure message report the values the right way round.
            Assert.AreEqual(featureCount,
                            precursorRNAFeatureList.Count.ToString((IFormatProvider) null));
            Assert.AreEqual(geneSymbol,
                            clonePrecursorRNA.GeneSymbol);
            Assert.AreEqual(expectedDbReference,
                            clonePrecursorRNA.DatabaseCrossReference[0]);
            Assert.AreEqual(expectedAllele,
                            firstPrecursorRNA.Allele);
            Assert.AreEqual(expectedCitation,
                            firstPrecursorRNA.Citation[0]);
            Assert.AreEqual(expectedExperiment,
                            firstPrecursorRNA.Experiment[0]);
            Assert.AreEqual(expectedMap,
                            firstPrecursorRNA.GenomicMapPosition);
            Assert.AreEqual(expectedGeneSynonym,
                            firstPrecursorRNA.GeneSynonym[0]);
            Assert.AreEqual(expectedInference,
                            firstPrecursorRNA.Inference[0]);
            Assert.AreEqual(expectedLabel, firstPrecursorRNA.Label);
            Assert.AreEqual(expectedLocation,
                            locBuilder.GetLocationString(
                                genMetadata.Features.PrecursorRNAs[0].Location));
            Assert.AreEqual(expectedNote,
                            firstPrecursorRNA.Note[0]);
            Assert.AreEqual(expectedOldLocusTag,
                            firstPrecursorRNA.OldLocusTag[0]);
            Assert.AreEqual(expectedLocusTag,
                            firstPrecursorRNA.LocusTag[0]);
            Assert.AreEqual(expectedFunction,
                            firstPrecursorRNA.Function[0]);
            Assert.IsTrue(string.IsNullOrEmpty(firstPrecursorRNA.StandardName));

            // NOTE(review): if Product is a collection, ToString() yields a type name, so
            // this check presumably only guards against a null Product - confirm intent.
            Assert.IsFalse(string.IsNullOrEmpty(firstPrecursorRNA.Product.ToString()));
            Assert.IsTrue(string.IsNullOrEmpty(firstPrecursorRNA.Operon));
            Assert.IsFalse(firstPrecursorRNA.TransSplicing);

            // Create new Precursor RNA features using both constructors
            // (location string and location object).
            var precursorRNA = new PrecursorRna(expectedLocation);
            var precursorRNAWithILoc = new PrecursorRna(
                genMetadata.Features.PrecursorRNAs[0].Location);

            // Set qualifiers and validate them.
            precursorRNA.Allele = expectedAllele;
            precursorRNA.GeneSymbol = geneSymbol;
            precursorRNAWithILoc.GenomicMapPosition = expectedMap;
            Assert.AreEqual(geneSymbol, precursorRNA.GeneSymbol);
            Assert.AreEqual(expectedAllele, precursorRNA.Allele);
            Assert.AreEqual(expectedMap, precursorRNAWithILoc.GenomicMapPosition);
        }
Пример #22
0
        /// <summary>
        ///     Validate ModifiedBase features.
        ///     Compares the first ModifiedBase feature (and a clone of it) with the
        ///     expected values read from the XML node, then checks that qualifiers set
        ///     on newly constructed features read back unchanged.
        /// </summary>
        /// <param name="nodeName">XML node name holding the expected values</param>
        /// <param name="genMetadata">GenBank Metadata whose features are validated</param>
        private void ValidateGenBankModifiedBaseFeature(string nodeName,
                                                        GenBankMetadata genMetadata)
        {
            // Get expected values from the XML node.
            string expectedLocation = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.Location);
            string expectedAllele = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.AlleleNode);
            string featureCount = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.QualifierCount);
            string expectedDbReference = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.DbReferenceNode);
            string geneSymbol = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.GeneSymbol);
            string expectedCitation = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.CitationNode);
            string expectedExperiment = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.ExperimentNode);
            string expectedGeneSynonym = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.GeneSynonymNode);
            string expectedInference = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.InferenceNode);
            string expectedLabel = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.LabelNode);
            string expectedLocusTag = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.LocusTagNode);
            string expectedNote = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.Note);
            string expectedOldLocusTag = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.OldLocusTagNode);
            string expectedMap = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.GenbankMapNode);

            List<ModifiedBase> modifiedBaseFeatureList =
                genMetadata.Features.ModifiedBases;
            var locBuilder = new LocationBuilder();
            ModifiedBase firstModifiedBase = modifiedBaseFeatureList[0];

            // Create a copy of Modified base feature.
            ModifiedBase cloneModifiedBase = firstModifiedBase.Clone();

            // Validate Modified Base qualifiers.
            // Assert.AreEqual takes (expected, actual); keeping that order makes the
            // failure message report the values the right way round.
            Assert.AreEqual(featureCount,
                            modifiedBaseFeatureList.Count.ToString((IFormatProvider) null));
            Assert.AreEqual(geneSymbol,
                            cloneModifiedBase.GeneSymbol);
            Assert.AreEqual(expectedDbReference,
                            cloneModifiedBase.DatabaseCrossReference[0]);
            Assert.AreEqual(expectedAllele,
                            firstModifiedBase.Allele);
            Assert.AreEqual(expectedCitation,
                            firstModifiedBase.Citation[0]);
            Assert.AreEqual(expectedExperiment,
                            firstModifiedBase.Experiment[0]);
            Assert.AreEqual(expectedMap,
                            firstModifiedBase.GenomicMapPosition);
            Assert.AreEqual(expectedGeneSynonym,
                            firstModifiedBase.GeneSynonym[0]);
            Assert.AreEqual(expectedInference,
                            firstModifiedBase.Inference[0]);
            Assert.AreEqual(expectedLabel,
                            firstModifiedBase.Label);
            Assert.AreEqual(expectedLocation,
                            locBuilder.GetLocationString(
                                genMetadata.Features.ModifiedBases[0].Location));
            Assert.AreEqual(expectedNote,
                            firstModifiedBase.Note[0]);
            Assert.AreEqual(expectedOldLocusTag,
                            firstModifiedBase.OldLocusTag[0]);
            Assert.AreEqual(expectedLocusTag,
                            firstModifiedBase.LocusTag[0]);

            // NOTE(review): depending on the type of ModifiedNucleotideBase, ToString()
            // may never be empty, so this check presumably only guards against a null
            // value - confirm intent.
            Assert.IsFalse(string.IsNullOrEmpty(firstModifiedBase.ModifiedNucleotideBase.ToString()));

            // Create new ModifiedBase features using both constructors
            // (location string and location object).
            var modifiedBase = new ModifiedBase(expectedLocation);
            var modifiedBaseWithILoc = new ModifiedBase(
                genMetadata.Features.ModifiedBases[0].Location);

            // Set qualifiers and validate them.
            modifiedBase.Allele = expectedAllele;
            modifiedBase.GeneSymbol = geneSymbol;
            modifiedBaseWithILoc.GenomicMapPosition = expectedMap;
            Assert.AreEqual(geneSymbol, modifiedBase.GeneSymbol);
            Assert.AreEqual(expectedAllele, modifiedBase.Allele);
            Assert.AreEqual(expectedMap,
                            modifiedBaseWithILoc.GenomicMapPosition);
        }
Пример #23
0
        /// <summary>
        /// Parses the GenBank Origin data from the GenBank file: stores the optional data
        /// found on the origin line in the metadata, then reads the sequence lines that
        /// follow and builds the sequence from them.
        /// </summary>
        /// <param name="line">
        /// On entry, the origin line. On exit, the first line read after the sequence data.
        /// </param>
        /// <param name="metadata">The GenBank metadata that receives the Origin value.</param>
        /// <param name="stream">The stream reader.</param>
        /// <exception cref="InvalidDataException">
        /// Thrown when no alphabet is set on the parser and none can be detected for the
        /// sequence data.
        /// </exception>
        private void ParseOrigin(ref string line, GenBankMetadata metadata, StreamReader stream)
        {
            // The origin line can contain optional data; don't put empty string into
            // metadata.
            string lineData = GetLineData(line, DataIndent);
            if (!String.IsNullOrEmpty(lineData))
            {
                metadata.Origin = lineData;
            }

            line = GoToNextLine(line, stream);

            var sequenceBuilder = new StringBuilder();

            // Sequence lines start with a space. The length check keeps an empty line
            // from throwing on line[0]; it ends the sequence data like any other line
            // that does not start with a space.
            while (line != null && line.Length > 0 && line[0] == ' ')
            {
                // Using a regex is too slow.
                int len = line.Length;

                // Skip the leading spaces and the position number.
                int k = 0;
                while (k < len && (line[k] == ' ' || Char.IsNumber(line[k])))
                {
                    k++;
                }

                // Copy up to 10 characters at a time, then advance 11 to step over the
                // single separator character between groups. Appending the slice
                // directly avoids allocating a temporary substring per group.
                while (k < len)
                {
                    sequenceBuilder.Append(line, k, Math.Min(10, len - k));
                    k += 11;
                }

                line = GoToNextLine(line, stream);
            }

            string sequenceString = sequenceBuilder.ToString().Trim();
            if (string.IsNullOrEmpty(sequenceString))
            {
                // No sequence data: leave sequenceWithData untouched.
                return;
            }

            // Prefer the alphabet configured on the parser; otherwise detect one
            // from the upper-cased sequence bytes.
            IAlphabet alphabet = null;
            if (Alphabet == null)
            {
                byte[] tempData = UTF8Encoding.UTF8.GetBytes(sequenceString.ToUpperInvariant());
                alphabet = Alphabets.AutoDetectAlphabet(tempData, 0, tempData.Length, alphabet);

                if (alphabet == null)
                {
                    // NOTE(review): at this point 'line' is the line read after the
                    // sequence data, not the sequence itself - confirm this is the
                    // intended argument for the message.
                    var message = String.Format(CultureInfo.InvariantCulture, Properties.Resource.InvalidSymbolInString, line);
                    Trace.Report(message);
                    throw new InvalidDataException(message);
                }
            }
            else
            {
                alphabet = Alphabet;
            }

            sequenceWithData = new Sequence(alphabet, sequenceString);
        }
Пример #24
0
        /// <summary>
        /// Parses the GenBank headers from the GenBank file.
        /// parses everything before the features section
        /// </summary>
        /// <param name="sequence">The sequence.</param>
        /// <param name="noOfSequence">The current sequence index.</param>
        /// <param name="line">parse line</param>
        /// <param name="stream">The stream reader.</param>
        /// <returns>The parsed line.</returns>
        private string ParseHeaders(ref Sequence sequence, int noOfSequence, string line, StreamReader stream)
        {
            GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
            string          data;

            string[] tokens;

            // only allow one locus line
            bool   haveParsedLocus = false;
            string lineData;

            if (noOfSequence == 0)
            {
                line = string.Empty;
                line = GoToNextLine(line, stream);
            }

            // parse until we hit the features or sequence section
            bool haveFinishedHeaders = false;

            while ((line != null) && !haveFinishedHeaders)
            {
                switch (GetLineHeader(line, DataIndent))
                {
                case "LOCUS":
                    if (haveParsedLocus)
                    {
                        string message = String.Format(CultureInfo.CurrentCulture, Properties.Resource.ParserSecondLocus);
                        Trace.Report(message);
                        throw new InvalidDataException(message);
                    }

                    line            = ParseLocusByTokens(line, ref sequence, stream);
                    metadata        = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                    haveParsedLocus = true;
                    // don't go to next line; current line still needs to be processed
                    break;

                case "VERSION":
                    lineData = GetLineData(line, DataIndent);

                    tokens = lineData.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                    // first token contains accession and version
                    Match m = Regex.Match(tokens[0], @"^(?<accession>\w+)\.(?<version>\d+)$");
                    metadata.Version = new GenBankVersion();

                    if (m.Success)
                    {
                        metadata.Version.Version = m.Groups["version"].Value;
                        // The first token in the data from the accession line is referred to as
                        // the primary accession number, and should be the one used here in the
                        // version line.
                        string versionLineAccession = m.Groups["accession"].Value;
                        if (metadata.Accession == null)
                        {
                            ApplicationLog.WriteLine("WARN: VERSION processed before ACCESSION");
                        }
                        else
                        {
                            if (!versionLineAccession.Equals(metadata.Accession.Primary))
                            {
                                ApplicationLog.WriteLine("WARN: VERSION tag doesn't match ACCESSION");
                            }
                            else
                            {
                                metadata.Version.Accession = metadata.Accession.Primary;
                            }
                        }
                    }

                    if (tokens.Length > 1)
                    {
                        // second token contains primary ID
                        m = Regex.Match(tokens[1], @"^GI:(?<primaryID>.*)");
                        if (m.Success)
                        {
                            metadata.Version.GiNumber = m.Groups["primaryID"].Value;
                        }
                    }

                    line = GoToNextLine(line, stream);
                    break;

                case "PROJECT":
                    lineData = GetLineData(line, DataIndent);
                    tokens   = lineData.Split(':');

                    if (tokens.Length == 2)
                    {
                        metadata.Project = new ProjectIdentifier {
                            Name = tokens[0]
                        };
                        tokens = tokens[1].Split(',');
                        for (int i = 0; i < tokens.Length; i++)
                        {
                            metadata.Project.Numbers.Add(tokens[i]);
                        }
                    }
                    else
                    {
                        ApplicationLog.WriteLine("WARN: unexpected PROJECT header: " + line);
                    }

                    line = GoToNextLine(line, stream);
                    break;

                case "SOURCE":
                    line     = ParseSource(line, ref sequence, stream);
                    metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                    // don't go to next line; current line still needs to be processed
                    break;

                case "REFERENCE":
                    line     = ParseReferences(line, ref sequence, stream);   // can encounter more than one
                    metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                    // don't go to next line; current line still needs to be processed
                    break;

                case "COMMENT":
                    line     = ParseComments(line, ref sequence, stream);   // can encounter more than one
                    metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                    // don't go to next line; current line still needs to be processed
                    break;

                case "PRIMARY":
                    // This header is followed by sequence info in a table format that could be
                    // stored in a custom object.  The first line contains column headers.
                    // For now, just validate the presence of the headers, and save the data
                    // as a string.
                    lineData = GetLineData(line, DataIndent);
                    tokens   = lineData.Split("\t ".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);

                    // Validate that the header line contains exactly four tokens.
                    if (tokens.Length != 4)
                    {
                        string message = String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.ParserPrimaryLineError,
                            line);
                        Trace.Report(message);
                        throw new InvalidDataException(message);
                    }

                    string primaryData = ParseMultiLineData(ref line, Environment.NewLine, DataIndent, stream);
                    metadata.Primary = primaryData;

                    // don't go to next line; current line still needs to be processed
                    break;

                // all the following are extracted the same way - possibly multiline
                case "DEFINITION":
                    metadata.Definition = ParseMultiLineData(ref line, " ", DataIndent, stream);
                    break;

                case "ACCESSION":
                    data = ParseMultiLineData(ref line, " ", DataIndent, stream);
                    metadata.Accession = new GenBankAccession();
                    string[] accessions = data.Split(' ');
                    metadata.Accession.Primary = accessions[0];

                    for (int i = 1; i < accessions.Length; i++)
                    {
                        metadata.Accession.Secondary.Add(accessions[i]);
                    }

                    break;

                case "DBLINK":
                    data             = ParseMultiLineData(ref line, "\n", DataIndent, stream);
                    metadata.DbLinks = new List <CrossReferenceLink>();
                    foreach (string link in data.Split('\n'))
                    {
                        tokens = link.Split(':');

                        if (tokens.Length == 2)
                        {
                            CrossReferenceLink newLink = new CrossReferenceLink();
                            if (string.Compare(tokens[0], CrossReferenceType.Project.ToString(), StringComparison.OrdinalIgnoreCase) == 0)
                            {
                                newLink.Type = CrossReferenceType.Project;
                            }
                            else if (string.Compare(tokens[0], CrossReferenceType.BioProject.ToString(), StringComparison.OrdinalIgnoreCase) == 0)
                            {
                                newLink.Type = CrossReferenceType.BioProject;
                            }
                            else
                            {
                                newLink.Type = CrossReferenceType.None;
                                if (string.Compare(tokens[0], TraceAssemblyArchive, StringComparison.OrdinalIgnoreCase) == 0)
                                {
                                    newLink.Type = CrossReferenceType.TraceAssemblyArchive;
                                }
                            }
                            tokens = tokens[1].Split(',');
                            for (int i = 0; i < tokens.Length; i++)
                            {
                                newLink.Numbers.Add(tokens[i]);
                            }
                            metadata.DbLinks.Add(newLink);
                        }
                        else
                        {
                            ApplicationLog.WriteLine("WARN: unexpected DBLINK header: " + line);
                        }
                    }
                    break;

                case "DBSOURCE":
                    metadata.DbSource = ParseMultiLineData(ref line, " ", DataIndent, stream);
                    break;

                case "KEYWORDS":
                    metadata.Keywords = ParseMultiLineData(ref line, " ", DataIndent, stream);
                    break;

                case "SEGMENT":
                    data = ParseMultiLineData(ref line, " ", DataIndent, stream);
                    const string delimeter = "of";
                    tokens = data.Split(delimeter.ToCharArray(), StringSplitOptions.RemoveEmptyEntries);
                    if (tokens.Length == 2)
                    {
                        metadata.Segment = new SequenceSegment();
                        int outvalue;
                        if (int.TryParse(tokens[0].Trim(), out outvalue))
                        {
                            metadata.Segment.Current = outvalue;
                        }
                        else
                        {
                            ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + line);
                        }

                        if (int.TryParse(tokens[1].Trim(), out outvalue))
                        {
                            metadata.Segment.Count = outvalue;
                        }
                        else
                        {
                            ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + line);
                        }
                    }
                    else
                    {
                        ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + line);
                    }

                    break;

                // all the following indicate sections beyond the headers parsed by this method
                case "FEATURES":
                case "BASE COUNT":
                case "ORIGIN":
                case "CONTIG":
                    haveFinishedHeaders = true;
                    break;

                default:
                    string lineHeader = GetLineHeader(line, DataIndent);
                    lineData = GetLineData(line, DataIndent);
                    ApplicationLog.WriteLine(ToString() + "WARN: unknown {0} -> {1}", lineHeader, lineData);
                    string errMessage = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ParseHeaderError,
                        lineHeader);
                    Trace.Report(errMessage);
                    throw new InvalidDataException(errMessage);
                }
            }

            // check for required features
            if (!haveParsedLocus)
            {
                string message = string.Format(CultureInfo.CurrentCulture, Properties.Resource.INVALID_INPUT_FILE, Name);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }

            return(line);
        }
Пример #25
0
        /// <summary>
        /// Parses the SOURCE section of a GenBank record, including its ORGANISM
        /// sub-section and the class-level lines that follow it, and stores the result
        /// in the sequence's GenBank metadata.
        /// </summary>
        /// <param name="line">The current line to start parsing from.</param>
        /// <param name="sequence">The sequence whose GenBank metadata receives the source information.</param>
        /// <param name="stream">The stream reader supplying the following lines.</param>
        /// <returns>The first line that is not part of the SOURCE section (may be null).</returns>
        /// <exception cref="InvalidDataException">
        /// An indented sub-header other than ORGANISM is found inside the section.
        /// </exception>
        private string ParseSource(string line, ref Sequence sequence, StreamReader stream)
        {
            string source      = string.Empty;
            string organism    = string.Empty;
            string classLevels = string.Empty;

            while (line != null)
            {
                string lineHeader = GetLineHeader(line, DataIndent);
                if (lineHeader == "SOURCE")
                {
                    // data can be multiline. spec says last line must end with period
                    // (note: this doesn't apply unless multiline)
                    bool lastDotted = true;
                    source = GetLineData(line, DataIndent);

                    line       = GoToNextLine(line, stream);
                    lineHeader = GetLineHeader(line, DataIndent);
                    while ((line != null) && (lineHeader == string.Empty))
                    {
                        source    += " " + GetLineData(line, DataIndent);
                        lastDotted = source.EndsWith(".", StringComparison.Ordinal);
                        line       = GoToNextLine(line, stream);
                        lineHeader = GetLineHeader(line, DataIndent);
                    }

                    if (!lastDotted && Trace.Want(Trace.SeqWarnings))
                    {
                        Trace.Report("GenBank.ParseSource", Properties.Resource.OutOfSpec, source);
                    }

                    // don't go to next line; current line still needs to be processed
                }
                else if (line.StartsWith(" ", StringComparison.Ordinal))
                {
                    // StartsWith (rather than line[0]) cannot raise an index error on an empty line.
                    if (lineHeader != "ORGANISM")
                    {
                        string message = String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.ParserInvalidSourceField,
                            lineHeader);
                        Trace.Report(message);
                        throw new InvalidDataException(message);
                    }

                    // this also can be multiline
                    organism = GetLineData(line, DataIndent);

                    line       = GoToNextLine(line, stream);
                    lineHeader = GetLineHeader(line, DataIndent);
                    while ((line != null) && (lineHeader == string.Empty))
                    {
                        // Always read the data of the continuation line being examined.
                        // Previously the organism branch appended a stale value left over
                        // from an earlier line, duplicating text in wrapped organism names.
                        string lineData = GetLineData(line, DataIndent);

                        // Lines ending in ';' or '.' are treated as class-level lines;
                        // anything else is a continuation of the organism name.
                        if (line.EndsWith(";", StringComparison.Ordinal) || line.EndsWith(".", StringComparison.Ordinal))
                        {
                            if (!String.IsNullOrEmpty(classLevels))
                            {
                                classLevels += " ";
                            }

                            classLevels += lineData;
                        }
                        else
                        {
                            organism += " " + lineData;
                        }

                        line       = GoToNextLine(line, stream);
                        lineHeader = GetLineHeader(line, DataIndent);
                    }

                    // don't go to next line; current line still needs to be processed
                }
                else
                {
                    // don't go to next line; current line still needs to be processed
                    break;
                }
            }

            GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

            metadata.Source = new SequenceSource {
                CommonName = source
            };
            if (!string.IsNullOrEmpty(organism))
            {
                // First word is taken as the genus, the remainder as the species.
                int index = organism.IndexOf(" ", StringComparison.Ordinal);
                if (index > 0)
                {
                    metadata.Source.Organism.Genus   = organism.Substring(0, index);
                    metadata.Source.Organism.Species = organism.Substring(index + 1);
                }
                else
                {
                    metadata.Source.Organism.Genus = organism;
                }
            }

            metadata.Source.Organism.ClassLevels = classLevels;
            string trimmedLevels = classLevels.TrimEnd('.');
            if (trimmedLevels.Length > 0)
            {
                // The last class level is compared with the genus derived from the organism
                // name; when they differ, the class level wins and the whole organism text
                // becomes the species.
                string genus = trimmedLevels.Split(";".ToCharArray(), StringSplitOptions.RemoveEmptyEntries).Last().Trim();

                // Genus is only assigned above when an organism name was read; guard
                // against it still being null before trimming.
                string currentGenus = metadata.Source.Organism.Genus ?? string.Empty;
                if (!genus.Equals(currentGenus.Trim()))
                {
                    metadata.Source.Organism.Species = organism;
                    metadata.Source.Organism.Genus   = genus;
                }
            }

            return(line);
        }
Пример #26
0
        /// <summary>
        /// Writes the SOURCE header, its ORGANISM sub-header and the class-level line.
        /// Nothing is written when the metadata carries no source.
        /// </summary>
        /// <param name="metadata">GenBank metadata holding the source information.</param>
        /// <param name="txtWriter">Writer that receives the header sections.</param>
        private void WriteSource(GenBankMetadata metadata, TextWriter txtWriter)
        {
            if (metadata.Source == null)
            {
                return;
            }

            string commonName = string.IsNullOrEmpty(metadata.Source.CommonName)
                ? string.Empty
                : metadata.Source.CommonName;

            WriteHeaderSection("SOURCE", commonName, txtWriter);

            // The genus (followed by a separating space) is only emitted when the
            // common name is not identical to the species.
            string genusPrefix = string.Empty;
            if (!commonName.Equals(metadata.Source.Organism.Species))
            {
                genusPrefix = string.IsNullOrEmpty(metadata.Source.Organism.Genus)
                    ? " "
                    : metadata.Source.Organism.Genus + " ";
            }

            string species = metadata.Source.Organism.Species;
            string organismLine = string.IsNullOrEmpty(species)
                ? genusPrefix
                : genusPrefix + species;

            // Organism might be empty, trim the value to ensure that a string with one space is not written (writer fails on this)
            WriteHeaderSection("  ORGANISM", organismLine.Trim(), txtWriter);
            WriteHeaderSection(string.Empty, metadata.Source.Organism.ClassLevels, txtWriter);
        }
Пример #27
0
        /// <summary>
        /// Helper method to parse the features of GenBank data from a range of cells.
        /// Rows are consumed until a row whose key equals METADATA is reached or the
        /// range is exhausted.
        /// </summary>
        /// <param name="metadata">Metadata object that receives the base count and the features.</param>
        /// <param name="cellRange">Range of cells (rows x columns).</param>
        /// <param name="rowIndex">Index of the row at which to start parsing.</param>
        /// <returns>Index of the row at which parsing stopped.</returns>
        /// <exception cref="FormatException">A keyed row is found but the range has fewer than three columns.</exception>
        private static int ParseGenBankFeatures(GenBankMetadata metadata, object[,] cellRange, int rowIndex)
        {
            while (rowIndex < cellRange.GetLength(0))
            {
                // Rows without a key are skipped.
                object keyCell = cellRange[rowIndex, KeyColumnIndex];
                if (keyCell == null)
                {
                    rowIndex++;
                    continue;
                }

                string key = keyCell.ToString().ToUpperInvariant();
                if (key.Equals(METADATA))
                {
                    // Start of the next metadata block; leave the row for the caller.
                    break;
                }

                if (3 > cellRange.GetLength(1))
                {
                    throw new FormatException(String.Format(
                        CultureInfo.InvariantCulture,
                        Resources.UnrecognizedGenBankMetadataFormat,
                        REFERENCE));
                }

                object valueCell = cellRange[rowIndex, ValueColumnIndex];
                string value = valueCell != null ? valueCell.ToString() : string.Empty;

                if (key.Equals(BASECOUNT))
                {
                    metadata.BaseCount = value;
                    rowIndex++;
                }
                else if (!string.IsNullOrWhiteSpace(value) && !string.IsNullOrWhiteSpace(key))
                {
                    FeatureItem featureItem = StandardFeatureMap.GetStandardFeatureItem(new FeatureItem(key, value));
                    if (metadata.Features == null)
                    {
                        metadata.Features = new SequenceFeatures();
                    }

                    metadata.Features.All.Add(featureItem);

                    // The qualifier rows follow the feature row; ParseQualifiers returns
                    // the index of the row at which it stopped.
                    rowIndex++;
                    rowIndex = ParseQualifiers(featureItem, cellRange, rowIndex);
                }
                else
                {
                    rowIndex++;
                }
            }

            return rowIndex;
        }
Пример #28
0
 // Emits one COMMENT header section for every entry in the metadata's
 // comment list, in list order.
 private void WriteComments(GenBankMetadata metadata, TextWriter txtWriter)
 {
     foreach (var commentText in metadata.Comments)
     {
         WriteHeaderSection("COMMENT", commentText, txtWriter);
     }
 }
Пример #29
0
        /// <summary>
        ///     Checks the first GC_Signal feature of the given metadata (and a clone of it)
        ///     against the expected values stored under an XML node, then verifies that
        ///     qualifiers set on freshly constructed GC_Signal features read back unchanged.
        /// </summary>
        /// <param name="nodeName">XML node name holding the expected values</param>
        /// <param name="genMetadata">GenBank Metadata to validate</param>
        private void ValidateGenBankGCSignalFeature(string nodeName,
                                                    GenBankMetadata genMetadata)
        {
            // Expected values, read from the XML node.
            string expectedLocation = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.Location);
            string expectedAllele = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.AlleleNode);
            string expectedCount = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.QualifierCount);
            string expectedDbReference = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.DbReferenceNode);
            string expectedGeneSymbol = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.GeneSymbol);
            string expectedCitation = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.CitationNode);
            string expectedExperiment = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.ExperimentNode);
            string expectedGeneSynonym = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.GeneSynonymNode);
            string expectedInference = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.InferenceNode);
            string expectedLabel = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.LabelNode);
            string expectedLocusTag = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.LocusTagNode);
            string expectedNote = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.Note);
            string expectedOldLocusTag = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.OldLocusTagNode);
            string expectedMap = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.GenbankMapNode);

            List<GcSingal> signals = genMetadata.Features.GCSignals;
            var locationBuilder = new LocationBuilder();

            // Work on the first GC_Signal feature and on a copy of it.
            GcSingal firstSignal = signals[0];
            GcSingal clonedSignal = firstSignal.Clone();

            // Qualifier checks.
            Assert.AreEqual(signals.Count.ToString((IFormatProvider) null), expectedCount);
            Assert.AreEqual(clonedSignal.GeneSymbol, expectedGeneSymbol);
            Assert.AreEqual(clonedSignal.DatabaseCrossReference[0], expectedDbReference);
            Assert.AreEqual(firstSignal.Allele, expectedAllele);
            Assert.AreEqual(firstSignal.Citation[0], expectedCitation);
            Assert.AreEqual(firstSignal.Experiment[0], expectedExperiment);
            Assert.AreEqual(firstSignal.GenomicMapPosition, expectedMap);
            Assert.AreEqual(firstSignal.GeneSynonym[0], expectedGeneSynonym);
            Assert.AreEqual(firstSignal.Inference[0], expectedInference);
            Assert.AreEqual(firstSignal.Label, expectedLabel);

            string actualLocation = locationBuilder.GetLocationString(
                genMetadata.Features.GCSignals[0].Location);
            Assert.AreEqual(actualLocation, expectedLocation);

            Assert.AreEqual(firstSignal.Note[0], expectedNote);
            Assert.AreEqual(firstSignal.OldLocusTag[0], expectedOldLocusTag);
            Assert.AreEqual(firstSignal.LocusTag[0], expectedLocusTag);

            // Build new GC_Signal features, one from the location string and one
            // from the location object, and check the qualifiers set on them.
            var signalFromString = new GcSingal(expectedLocation);
            var signalFromLocation = new GcSingal(genMetadata.Features.GCSignals[0].Location);

            signalFromString.Allele = expectedAllele;
            signalFromString.GeneSymbol = expectedGeneSymbol;
            signalFromLocation.GenomicMapPosition = expectedMap;

            Assert.AreEqual(signalFromString.GeneSymbol, expectedGeneSymbol);
            Assert.AreEqual(signalFromString.Allele, expectedAllele);
            Assert.AreEqual(signalFromLocation.GenomicMapPosition, expectedMap);
        }
Пример #30
0
        /// <summary>
        /// Reads the ORIGIN section: stores any text on the ORIGIN line in the metadata,
        /// collects the sequence characters from the following space-indented lines and,
        /// when any were found, builds the sequence from them.
        /// </summary>
        /// <param name="line">Current line; on return, the first line after the section (may be null).</param>
        /// <param name="metadata">The GenBank metadata that receives the optional ORIGIN text.</param>
        /// <param name="stream">The stream reader supplying the following lines.</param>
        /// <exception cref="InvalidDataException">No alphabet is configured and none could be detected.</exception>
        private void ParseOrigin(ref string line, GenBankMetadata metadata, StreamReader stream)
        {
            // The ORIGIN line may carry optional text; an empty string is never stored.
            string originText = GetLineData(line, DataIndent);
            if (!String.IsNullOrEmpty(originText))
            {
                metadata.Origin = originText;
            }

            var residues = new StringBuilder();

            for (line = GoToNextLine(line, stream);
                 (line != null) && line[0] == ' ';
                 line = GoToNextLine(line, stream))
            {
                int length = line.Length;

                // Hand-rolled scan past the leading spaces and digits (a regex is too slow here).
                int cursor = 0;
                while (cursor < length)
                {
                    char current = line[cursor];
                    if (current != ' ' && !Char.IsNumber(current))
                    {
                        break;
                    }

                    cursor++;
                }

                // Copy up to ten characters at a time, advancing eleven positions so the
                // single character after each group is skipped.
                for (; cursor < length; cursor += 11)
                {
                    residues.Append(line, cursor, Math.Min(10, length - cursor));
                }
            }

            string sequenceString = residues.ToString().Trim();
            if (string.IsNullOrEmpty(sequenceString))
            {
                return;
            }

            // Use the configured alphabet when there is one; otherwise detect it
            // from the upper-cased sequence bytes.
            IAlphabet alphabet = Alphabet;
            if (alphabet == null)
            {
                byte[] upperCased = UTF8Encoding.UTF8.GetBytes(sequenceString.ToUpperInvariant());
                alphabet = Alphabets.AutoDetectAlphabet(upperCased, 0, upperCased.Length, alphabet);

                if (alphabet == null)
                {
                    string message = String.Format(CultureInfo.InvariantCulture, Properties.Resource.InvalidSymbolInString, line);
                    Trace.Report(message);
                    throw new InvalidDataException(message);
                }
            }

            sequenceWithData = new Sequence(alphabet, sequenceString);
        }
Пример #31
0
        /// <summary>
        /// Construct initialization. Loads the thermodynamic parameters, builds the
        /// forward and reverse overlaps for every fragment, assembles the concatenated
        /// construct sequence with its GenBank metadata, and computes the heterodimer
        /// melting temperature of the forward overlaps.
        /// </summary>
        /// <remarks>
        /// The fragment list is modified: a copy of the first fragment is appended and
        /// the original first element is removed before the reverse overlaps are built.
        /// The list must not be empty, because <c>fragList[0]</c> is read unconditionally.
        /// </remarks>
        /// <param name="fragList">Fragment list.</param>
        /// <param name="settings">Designer settings (maximum 5' and 3' lengths, thermodynamic settings and parameter path).</param>
        /// <exception cref="TmThalParamException">Loading the thermodynamic parameters reported a message.</exception>
        private void Init(ObservableCollection<Fragment> fragList, DesignerSettings settings)
        {
            this.Overlaps = new List<Overlap>();
            this.Settings = settings;

            // Load the thermodynamic parameters; a non-empty message (after stripping
            // the NUL padding) is treated as a failure.
            Thermodynamics.thal_results results = new Thermodynamics.thal_results();
            Thermodynamics.p3_get_thermodynamic_values(Settings.TmThalParamPath, ref results);
            String message = new String(results.msg);
            message = message.Trim('\0');

            if (!String.IsNullOrEmpty(message))
            {
                throw new TmThalParamException(message);
            }

            //forward
            // seq_5 accumulates the construct built so far; seq_3 is the current fragment.
            String seq_5 = "";
            String seq_3 = "";
            String name = "";
            List<MiscFeature> featList = new List<MiscFeature>();

            int pairIndex;
            int len_5;
            int len_3;

            for (int i = 0; i < fragList.Count; i++)
            {
                name += fragList[i].Name;
                seq_3 = fragList[i].GetString();
                len_5 = Math.Min(settings.MaxLen_5, seq_5.Length);
                len_3 = Math.Min(settings.MaxLen_3, seq_3.Length);
                // Overhang: tail of everything before this fragment; gene-specific part: head of this fragment.
                String overhang_5 = seq_5.Substring(seq_5.Length - len_5, len_5);
                String geneSpecific_3 = seq_3.Substring(0, len_3);
                // 1-based "start..end" location of this fragment within the construct.
                String loc = (seq_5.Length + 1).ToString() + ".." + (seq_5.Length + seq_3.Length).ToString();
                MiscFeature gene = new MiscFeature(loc);
                gene.StandardName = fragList[i].Name;
                featList.Add(gene);
                seq_5 += seq_3;


                // pairIndex is the position in Overlaps of the matching reverse overlap
                // added by the reverse loop below (forward entries occupy indices
                // 0..Count-1, reverse entries follow).
                if (i == 0)
                {
                    pairIndex = fragList.Count;
                    Overlaps.Add(new Overlap(Designer.VectorLabel + fragList[i].Name + "-fwd", new Sequence(Alphabets.AmbiguousDNA, geneSpecific_3), settings.TmThalSettings, pairIndex));
                }
                else
                {
                    pairIndex = 2 * fragList.Count - i;
                    Overlaps.Add(new Overlap(fragList[i].Name + "-fwd", new Sequence(Alphabets.AmbiguousDNA, overhang_5), new Sequence(Alphabets.AmbiguousDNA, geneSpecific_3), settings.TmThalSettings, pairIndex));
                }
            }

            this.Sequence = new Sequence(Alphabets.AmbiguousDNA, seq_5);
            //meta
            GenBankMetadata meta = new GenBankMetadata();
            meta.Locus = new GenBankLocusInfo();
            meta.Locus.MoleculeType = MoleculeType.DNA;
            meta.Locus.Name = name;
            meta.Locus.Date = System.DateTime.Now;
            meta.Locus.SequenceLength = seq_5.Length;
            meta.Comments.Add("designed with mufasa");
            meta.Definition = "synthetic construct";
            meta.Features = new SequenceFeatures();
            meta.Features.All.AddRange(featList);
            this.Sequence.Metadata.Add("GenBank", meta);

            //reverse
            // Rotate the list: a copy of the first fragment goes to the end and the
            // original first element is removed. This changes the caller's collection.
            fragList.Add(new Fragment(fragList[0]));
            fragList.RemoveAt(0);
            seq_5 = "";
            seq_3 = "";
            // Walk the rotated list backwards. Here seq_5 holds the current fragment's
            // reverse complement and seq_3 accumulates the reverse-complemented construct.
            for (int i = fragList.Count - 1; i >= 0; i--)
            {
                seq_5 = fragList[i].GetReverseComplementString();
                len_5 = Math.Min(settings.MaxLen_5, seq_3.Length);
                len_3 = Math.Min(settings.MaxLen_3, seq_5.Length);
                String overhang_5 = seq_3.Substring(seq_3.Length - len_5, len_5);
                String geneSpecific_3 = seq_5.Substring(0, len_3);
                seq_3 += seq_5;

                // pairIndex points back at the matching forward overlap.
                if (i == fragList.Count - 1)
                {
                    pairIndex = 0;
                    Overlaps.Add(new Overlap(Designer.VectorLabel + fragList[i].Name + "-rev", new Sequence(Alphabets.AmbiguousDNA, geneSpecific_3), settings.TmThalSettings, pairIndex));
                }
                else
                {
                    pairIndex = i + 1;
                    Overlaps.Add(new Overlap(fragList[i].Name + "-rev", new Sequence(Alphabets.AmbiguousDNA, overhang_5), new Sequence(Alphabets.AmbiguousDNA, geneSpecific_3), settings.TmThalSettings, pairIndex));
                }
            }

            // Only the first fragList.Count overlaps (the forward ones) are visited here.
            // NOTE(review): the reverse overlaps do not get HeterodimerMeltingTemperature
            // assigned in this method - confirm that is intended.
            for (int i = 0; i < fragList.Count; i++)
            {
                //Duplex melting temperatures
                Overlaps[i].HeterodimerMeltingTemperature = Overlaps[i].GetDuplexTemperature(Overlaps[Overlaps[i].PairIndex]);
            }
        }
Пример #32
0
        /// <summary>
        /// Parses the REFERENCE sections of a GenBank record and appends one
        /// <see cref="CitationReference"/> per section to the metadata's reference list.
        /// </summary>
        /// <param name="line">The current line to start parsing from.</param>
        /// <param name="sequence">The sequence whose GenBank metadata receives the references.</param>
        /// <param name="stream">The stream reader supplying the following lines.</param>
        /// <returns>The first line that is not part of a REFERENCE section (may be null).</returns>
        /// <exception cref="InvalidDataException">
        /// A REFERENCE line does not start with a number, or an unknown sub-field is found.
        /// </exception>
        /// <exception cref="InvalidOperationException">The reference number cannot be parsed as an integer.</exception>
        private string ParseReferences(string line, ref Sequence sequence, StreamReader stream)
        {
            GenBankMetadata           metadata      = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
            IList <CitationReference> referenceList = metadata.References;
            CitationReference         reference     = null;

            while (line != null)
            {
                string lineHeader = GetLineHeader(line, DataIndent);
                if (lineHeader == "REFERENCE")
                {
                    // a new section starts: store the reference built so far
                    if (reference != null)
                    {
                        referenceList.Add(reference);
                    }

                    // check for start/end e.g. (bases 1 to 118), or prose notes
                    string lineData = GetLineData(line, DataIndent);

                    Match m = Regex.Match(lineData, @"^(?<number>\d+)(\s+\((?<location>.*)\))?");
                    if (!m.Success)
                    {
                        string message = String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.ParserReferenceError,
                            lineData);
                        Trace.Report(message);
                        throw new InvalidDataException(message);
                    }

                    // create new reference
                    string number   = m.Groups["number"].Value;
                    string location = m.Groups["location"].Value;
                    reference = new CitationReference();
                    int outValue;
                    if (!int.TryParse(number, out outValue))
                    {
                        throw new InvalidOperationException(string.Format(CultureInfo.CurrentCulture, Properties.Resource.InvalidRefNumber, number));
                    }
                    reference.Number   = outValue;
                    reference.Location = location;
                    line = GoToNextLine(line, stream);
                }
                else if (line.StartsWith(" ", StringComparison.Ordinal))
                {
                    switch (lineHeader)
                    {
                    // all the following are extracted the same way - possibly multiline
                    case "AUTHORS":
                        reference.Authors = ParseMultiLineData(ref line, " ", DataIndent, stream);
                        break;

                    case "CONSRTM":
                        reference.Consortiums = ParseMultiLineData(ref line, " ", DataIndent, stream);
                        break;

                    case "TITLE":
                        reference.Title = ParseMultiLineData(ref line, " ", DataIndent, stream);
                        break;

                    case "JOURNAL":
                        reference.Journal = ParseMultiLineData(ref line, " ", DataIndent, stream);
                        break;

                    case "REMARK":
                        reference.Remarks = ParseMultiLineData(ref line, " ", DataIndent, stream);
                        break;

                    case "MEDLINE":
                        reference.Medline = ParseMultiLineData(ref line, " ", DataIndent, stream);
                        break;

                    case "PUBMED":
                        reference.PubMed = ParseMultiLineData(ref line, " ", DataIndent, stream);
                        break;

                    default:
                        string message = String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.ParserInvalidReferenceField,
                            lineHeader);
                        Trace.Report(message);
                        throw new InvalidDataException(message);
                    }
                }
                else
                {
                    // don't go to next line; current line still needs to be processed
                    break;
                }
            }

            // Store the last reference exactly once, whether the loop ended on a
            // non-reference line or because the input ran out (line == null).
            // Previously a reference still being built when the input ended was lost.
            if (reference != null)
            {
                referenceList.Add(reference);
            }

            return(line);
        }
        /// <summary>
        /// This method transfers all available features from the MBF sequence metadata
        /// and populates them into the BioPatML feature data type.
        /// In this version only the feature key, start and end location, and the first
        /// value of every qualifier are populated.
        /// </summary>
        /// <param name="metadata">GenBank metadata whose features are converted.</param>
        /// <returns>
        /// The converted features; an empty list when the metadata has no feature table.
        /// </returns>
        private FeatureList ExtractFeatures(GenBankMetadata metadata)
        {
            FeatureList bioFeatureList = new FeatureList();

            // Features is left null when no feature table was attached to the
            // metadata; there is nothing to convert in that case.
            if (metadata.Features == null)
            {
                return bioFeatureList;
            }

            foreach (FeatureItem item in metadata.Features.All)
            {
                // Construct the feature outline first.
                // Strand is always assumed to be forward +1
                QUT.Bio.BioPatML.Sequences.Feature bioFeature = new QUT.Bio.BioPatML.Sequences.Feature
                                        (item.Key, item.Location.Start, item.Location.End, 1);

                bioFeatureList.Add(bioFeature);

                // Add the qualifier keys and values to the feature using an AnnotationList.
                AnnotationList annList = new AnnotationList();

                foreach (KeyValuePair<string, List<string>> qualifier in item.Qualifiers)
                {
                    // Only the first value of each qualifier is carried over.
                    // NOTE(review): assumes every qualifier has at least one value - confirm.
                    annList.Add(qualifier.Key, qualifier.Value[0]);
                }

                bioFeature.AddAnnotations(annList);
            }

            return bioFeatureList;
        }
Пример #34
0
        /// <summary>
        /// Parses every header section that comes before the features section of a
        /// GenBank record and stores the results in the sequence's GenBank metadata.
        /// Parsing stops, without consuming the line, at the first FEATURES,
        /// BASE COUNT, ORIGIN or CONTIG header, or when the reader runs out of lines.
        /// </summary>
        /// <param name="bioReader">Reader supplying the lines of the record.</param>
        /// <param name="sequence">
        /// Sequence being populated. It is passed by reference to the section parsers,
        /// so the metadata reference is re-read after each of those calls.
        /// </param>
        /// <exception cref="InvalidDataException">
        /// Thrown for a second LOCUS line, a PRIMARY line missing its column headers,
        /// an unknown header, or a record with no LOCUS line at all.
        /// </exception>
        private void ParseHeaders(BioTextReader bioReader, ref Sequence sequence)
        {
            GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
            string          data     = string.Empty;

            string[] tokens = null;
            // set data indent for headers
            bioReader.DataIndent = _dataIndent;

            // only allow one locus line
            bool haveParsedLocus = false;

            // parse until we hit the features or sequence section
            bool haveFinishedHeaders = false;

            while (bioReader.HasLines && !haveFinishedHeaders)
            {
                switch (bioReader.LineHeader)
                {
                case "LOCUS":
                    if (haveParsedLocus)
                    {
                        string message = String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.ParserSecondLocus,
                            bioReader.LocationString);
                        Trace.Report(message);
                        throw new InvalidDataException(message);
                    }
                    ParseLocus(bioReader, ref sequence);
                    metadata        = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                    haveParsedLocus = true;
                    // don't go to next line; current line still needs to be processed
                    break;

                case "VERSION":
                    tokens = bioReader.LineData.Split(new char[] { ' ' },
                                                      StringSplitOptions.RemoveEmptyEntries);

                    // A VERSION line without any data cannot be interpreted; report it
                    // the same way other malformed headers are reported and move on.
                    if (tokens.Length == 0)
                    {
                        ApplicationLog.WriteLine("WARN: unexpected VERSION header: " + bioReader.Line);
                        bioReader.GoToNextLine();
                        break;
                    }

                    // first token contains accession and version
                    Match m = Regex.Match(tokens[0], @"^(?<accession>\w+)\.(?<version>\d+)$");
                    metadata.Version = new GenBankVersion();

                    if (m.Success)
                    {
                        metadata.Version.Version = m.Groups["version"].Value;
                        // The first token in the data from the accession line is referred to as
                        // the primary accession number, and should be the one used here in the
                        // version line.
                        string versionLineAccession = m.Groups["accession"].Value;
                        if (metadata.Accession == null)
                        {
                            ApplicationLog.WriteLine("WARN: VERSION processed before ACCESSION");
                        }
                        else
                        {
                            if (!versionLineAccession.Equals(metadata.Accession.Primary))
                            {
                                ApplicationLog.WriteLine("WARN: VERSION tag doesn't match ACCESSION");
                            }
                            else
                            {
                                metadata.Version.Accession = metadata.Accession.Primary;
                            }
                        }
                    }

                    // The second token, when present, contains the primary ID (GI number).
                    // It is optional: records without it must not fail with an index error.
                    if (tokens.Length > 1)
                    {
                        m = Regex.Match(tokens[1], @"^GI:(?<primaryID>.*)");
                        if (m.Success)
                        {
                            metadata.Version.GINumber = m.Groups["primaryID"].Value;
                        }
                    }
                    bioReader.GoToNextLine();
                    break;

                case "PROJECT":
                    // expected shape: "<name>:<number>[,<number>...]"
                    tokens = bioReader.LineData.Split(':');
                    if (tokens.Length == 2)
                    {
                        metadata.Project      = new ProjectIdentifier();
                        metadata.Project.Name = tokens[0];
                        tokens = tokens[1].Split(',');
                        for (int i = 0; i < tokens.Length; i++)
                        {
                            metadata.Project.Numbers.Add(tokens[i]);
                        }
                    }
                    else
                    {
                        ApplicationLog.WriteLine("WARN: unexpected PROJECT header: " + bioReader.Line);
                    }
                    bioReader.GoToNextLine();
                    break;

                case "SOURCE":
                    ParseSource(bioReader, ref sequence);
                    metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                    // don't go to next line; current line still needs to be processed
                    break;

                case "REFERENCE":
                    ParseReferences(bioReader, ref sequence);       // can encounter more than one
                    metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                    // don't go to next line; current line still needs to be processed
                    break;

                case "COMMENT":
                    ParseComments(bioReader, ref sequence);       // can encounter more than one
                    metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                    // don't go to next line; current line still needs to be processed
                    break;

                case "PRIMARY":
                    // This header is followed by sequence info in a table format that could be
                    // stored in a custom object.  The first line contains column headers.
                    // For now, just validate the presence of the headers, and save the data
                    // as a string.
                    int[] locs = new int[4];
                    locs[0] = bioReader.LineData.IndexOf("TPA_SPAN", StringComparison.Ordinal);
                    locs[1] = bioReader.LineData.IndexOf("PRIMARY_IDENTIFIER", StringComparison.Ordinal);
                    locs[2] = bioReader.LineData.IndexOf("PRIMARY_SPAN", StringComparison.Ordinal);
                    locs[3] = bioReader.LineData.IndexOf("COMP", StringComparison.Ordinal);
                    if (locs[0] < 0 || locs[1] < 0 || locs[2] < 0 || locs[3] < 0)
                    {
                        string message = String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.ParserPrimaryLineError,
                            bioReader.Line);
                        Trace.Report(message);
                        throw new InvalidDataException(message);
                    }
                    string primaryData = ParseMultiLineData(bioReader, Environment.NewLine);
                    metadata.Primary = primaryData;
                    // don't go to next line; current line still needs to be processed
                    break;

                // all the following are extracted the same way - possibly multiline
                case "DEFINITION":
                    metadata.Definition = ParseMultiLineData(bioReader, " ");
                    break;

                case "ACCESSION":
                    // first accession is the primary one, any others are secondary
                    data = ParseMultiLineData(bioReader, " ");
                    metadata.Accession = new GenBankAccession();
                    string[] accessions = data.Split(' ');
                    metadata.Accession.Primary = accessions[0];

                    for (int i = 1; i < accessions.Length; i++)
                    {
                        metadata.Accession.Secondary.Add(accessions[i]);
                    }
                    break;

                case "DBLINK":
                    // expected shape: "<type>:<number>[,<number>...]"
                    tokens = bioReader.LineData.Split(':');
                    if (tokens.Length == 2)
                    {
                        metadata.DBLink = new CrossReferenceLink();
                        // anything that is not "Project" is recorded as a Trace Assembly Archive link
                        if (string.Compare(tokens[0],
                                           CrossReferenceType.Project.ToString(),
                                           StringComparison.OrdinalIgnoreCase) == 0)
                        {
                            metadata.DBLink.Type = CrossReferenceType.Project;
                        }
                        else
                        {
                            metadata.DBLink.Type = CrossReferenceType.TraceAssemblyArchive;
                        }

                        tokens = tokens[1].Split(',');
                        for (int i = 0; i < tokens.Length; i++)
                        {
                            metadata.DBLink.Numbers.Add(tokens[i]);
                        }
                    }
                    else
                    {
                        ApplicationLog.WriteLine("WARN: unexpected DBLINK header: " + bioReader.Line);
                    }
                    bioReader.GoToNextLine();
                    break;

                case "DBSOURCE":
                    metadata.DBSource = ParseMultiLineData(bioReader, " ");
                    break;

                case "KEYWORDS":
                    metadata.Keywords = ParseMultiLineData(bioReader, " ");
                    break;

                case "SEGMENT":
                    data = ParseMultiLineData(bioReader, " ");
                    // Split treats "of" as the two separator characters 'o' and 'f'; because
                    // empty entries are removed, "<current> of <count>" still yields two tokens.
                    string delimeter = "of";
                    tokens = data.Split(delimeter.ToCharArray(), StringSplitOptions.RemoveEmptyEntries);
                    int outvalue;
                    if (tokens.Length == 2)
                    {
                        metadata.Segment = new SequenceSegment();
                        if (int.TryParse(tokens[0].Trim(), out outvalue))
                        {
                            metadata.Segment.Current = outvalue;
                        }
                        else
                        {
                            ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + bioReader.Line);
                        }

                        if (int.TryParse(tokens[1].Trim(), out outvalue))
                        {
                            metadata.Segment.Count = outvalue;
                        }
                        else
                        {
                            ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + bioReader.Line);
                        }
                    }
                    else
                    {
                        ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + bioReader.Line);
                    }
                    break;

                // all the following indicate sections beyond the headers parsed by this method
                case "FEATURES":
                case "BASE COUNT":
                case "ORIGIN":
                case "CONTIG":
                    haveFinishedHeaders = true;
                    break;

                default:
                    ApplicationLog.WriteLine(ToString() + "WARN: unknown {0} -> {1}", bioReader.LineHeader, bioReader.LineData);
                    string errMessage = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ParseHeaderError,
                        bioReader.LineHeader);
                    Trace.Report(errMessage);
                    throw new InvalidDataException(errMessage);
                }
            }

            // check for required features
            if (!haveParsedLocus)
            {
                string message = string.Format(CultureInfo.CurrentCulture, Resource.INVAILD_INPUT_FILE, this.Name);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }
        }
Пример #35
0
        /// <summary>
        /// Parses the LOCUS line, which is the first line in a GenBank record, and stores
        /// the result as the Locus of the sequence's GenBank metadata. The sequence ID and
        /// molecule type are set as well, and the reader is advanced past the LOCUS line.
        /// </summary>
        /// <param name="bioReader">Reader whose current line is the LOCUS line.</param>
        /// <param name="sequence">
        /// Sequence being populated. It is replaced by a new instance when the alphabet
        /// implied by the LOCUS line differs from the sequence's current alphabet.
        /// </param>
        /// <exception cref="InvalidDataException">
        /// Thrown when the line has fewer than three tokens, the unit is neither "bp" nor
        /// "aa", the strand type or topology is not recognised, or the alphabet conflicts
        /// with the one configured on the parser.
        /// </exception>
        /// <exception cref="InvalidOperationException">Thrown when the sequence length is not an integer.</exception>
        /// <exception cref="FormatException">Thrown when the date or the molecule type is invalid.</exception>
        private void ParseLocus(BioTextReader bioReader, ref Sequence sequence)
        {
            GenBankLocusInfo locusInfo = new GenBankLocusInfo();

            // GenBank spec recommends token rather than position-based parsing, but this
            // is only partially possible without making extra assumptions about the presence
            // of optional fields.
            string[] tokens = bioReader.LineData.Split(new char[] { ' ' },
                                                       StringSplitOptions.RemoveEmptyEntries);

            // Name, length and unit are read by index below; reject short lines with the
            // parser's own error instead of letting an index error escape.
            if (tokens.Length < 3)
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParserInvalidLocus,
                    bioReader.Line);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }

            sequence.ID    = tokens[0];
            locusInfo.Name = tokens[0];

            int sequenceLength;

            if (!int.TryParse(tokens[1], out sequenceLength))
            {
                // Same exception type as before, now carrying a message that names the line.
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParserInvalidLocus,
                    bioReader.Line);
                Trace.Report(message);
                throw new InvalidOperationException(message);
            }
            locusInfo.SequenceLength = sequenceLength;

            string seqType = tokens[2];

            if (seqType != "bp" && seqType != "aa")
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParserInvalidLocus,
                    bioReader.Line);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }

            // Determine format version and parse the remaining fields by position.
            string strandType;
            string strandTopology;
            string division;
            string rawDate;
            string molType = string.Empty;

            if (Helper.StringHasMatch(bioReader.GetLineField(31, 32), "bp", "aa"))
            {
                // older format
                strandType     = bioReader.GetLineField(34, 36).Trim();
                strandTopology = bioReader.GetLineField(43, 52).Trim();
                division       = bioReader.GetLineField(53, 56).Trim();
                rawDate        = bioReader.GetLineField(63).Trim();

                // molecule type field is not used for amino acid chains
                if (seqType != "aa")
                {
                    molType = bioReader.GetLineField(37, 42).Trim();
                }
            }
            else
            {
                // newer format
                strandType     = bioReader.GetLineField(45, 47).Trim();
                strandTopology = bioReader.GetLineField(56, 63).Trim();
                division       = bioReader.GetLineField(65, 67).Trim();
                rawDate        = bioReader.GetLineField(69).Trim();

                // molecule type field is not used for amino acid chains
                if (seqType != "aa")
                {
                    molType = bioReader.GetLineField(48, 53).Trim();
                }
            }

            // process strand type
            if (!Helper.StringHasMatch(strandType, string.Empty, "ss-", "ds-", "ms-"))
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParserInvalidLocus,
                    bioReader.Line);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }
            locusInfo.Strand = Helper.GetStrandType(strandType);

            // process strand topology
            if (!Helper.StringHasMatch(strandTopology, string.Empty, "linear", "circular"))
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParserInvalidStrand,
                    strandTopology);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }

            locusInfo.StrandTopology = Helper.GetStrandTopology(strandTopology);

            // process division; an unrecognised or empty code falls back to None
            try
            {
                locusInfo.DivisionCode = (SequenceDivisionCode)Enum.Parse(typeof(SequenceDivisionCode), division);
            }
            catch (ArgumentException)
            {
                locusInfo.DivisionCode = SequenceDivisionCode.None;
            }

            // process date
            DateTime date;

            // The current-culture attempt is kept first so every value that parsed before
            // still parses to the same result. The invariant-culture fallback lets English
            // month abbreviations (e.g. "21-JUN-1999") parse on machines whose current
            // culture uses different month names.
            if (!DateTime.TryParse(rawDate, out date) &&
                !DateTime.TryParse(rawDate, CultureInfo.InvariantCulture, DateTimeStyles.None, out date))
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParserInvalidDate,
                    rawDate);
                Trace.Report(message);
                throw new FormatException(message);
            }

            locusInfo.Date         = date;
            locusInfo.SequenceType = seqType;

            // process sequence type and molecule type
            MoleculeType moleculeType;

            if (seqType == "aa")
            {
                moleculeType = MoleculeType.Protein;
            }
            else
            {
                moleculeType = GetMoleculeType(molType);

                if (moleculeType == MoleculeType.Invalid)
                {
                    string message = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ParserInvalidLocus,
                        bioReader.Line);
                    Trace.Report(message);
                    throw new FormatException(message);
                }
            }

            IAlphabet alphabet = GetAlphabet(moleculeType);

            if (alphabet != sequence.Alphabet)
            {
                // An alphabet configured on the parser must agree with the LOCUS line.
                if (Alphabet != null && Alphabet != alphabet)
                {
                    string message = Properties.Resource.ParserIncorrectAlphabet;
                    Trace.Report(message);
                    throw new InvalidDataException(message);
                }
                sequence            = new Sequence(alphabet, Encoding, sequence);
                sequence.IsReadOnly = false;
            }

            sequence.MoleculeType  = moleculeType;
            locusInfo.MoleculeType = moleculeType;
            // read the metadata from the (possibly replaced) sequence
            GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

            metadata.Locus = locusInfo;
            bioReader.GoToNextLine();
        }
Пример #36
0
        /// <summary>
        /// Builds a GenBankMetadata object from a list of worksheet ranges. The rows of
        /// all ranges are stacked, in order, into a single grid which is then scanned
        /// top to bottom: a row whose first column upper-cases to the METADATA heading
        /// hands the following rows to the metadata parser, and a row whose first column
        /// upper-cases to the FEATURES heading hands them to the features parser. Every
        /// other row is skipped.
        /// </summary>
        /// <param name="ranges">Ranges whose cells hold the metadata and features.</param>
        /// <returns>The metadata populated from the ranges.</returns>
        public static GenBankMetadata RangeToGenBankMetadata(IList<Range> ranges)
        {
            GenBankMetadata metadata = new GenBankMetadata();

            // The grid needs one row per source row and as many columns as the widest range.
            int totalRows = 0;
            int maxColumns = 0;
            foreach (Range range in ranges)
            {
                totalRows += range.Rows.Count;

                int columnCount = range.Columns.Count;
                if (maxColumns < columnCount)
                {
                    maxColumns = columnCount;
                }
            }

            // Indices are kept 1-based, so row 0 and column 0 of the grid stay unused.
            object[,] cells = new object[totalRows + 1, maxColumns + 1];

            int targetRow = 1;
            foreach (Range range in ranges)
            {
                for (int sourceRow = 1; sourceRow <= range.Rows.Count; sourceRow++, targetRow++)
                {
                    for (int column = 1; column <= range.Columns.Count; column++)
                    {
                        cells[targetRow, column] = range[sourceRow, column].Value2 as object;
                    }
                }
            }

            int currentRow = 1;
            while (currentRow < cells.GetLength(0))
            {
                object heading = cells[currentRow, 1];

                // Whatever the row holds, scanning resumes no earlier than the next row.
                currentRow++;

                if (heading == null)
                {
                    continue;
                }

                switch (heading.ToString().ToUpperInvariant())
                {
                    case METADATA:
                        currentRow = ParseGenBankMetadata(metadata, cells, currentRow);
                        break;
                    case FEATURES:
                        currentRow = ParseGenBankFeatures(metadata, cells, currentRow);
                        break;
                }
            }

            return metadata;
        }
Пример #37
0
        /// <summary>
        /// Parses the SOURCE header and its ORGANISM sub-header and stores the result as
        /// the Source of the sequence's GenBank metadata. On return the reader is left on
        /// the first line that belongs to neither section.
        /// </summary>
        /// <param name="bioReader">Reader supplying the lines of the record.</param>
        /// <param name="sequence">Sequence whose GenBank metadata receives the source info.</param>
        /// <exception cref="InvalidDataException">
        /// Thrown when an indented line following SOURCE has a header other than ORGANISM.
        /// </exception>
        private static void ParseSource(BioTextReader bioReader, ref Sequence sequence)
        {
            string commonName   = string.Empty;
            string organismName = string.Empty;
            string taxonomy     = string.Empty;

            bool inSourceSection = true;
            while (inSourceSection && bioReader.HasLines)
            {
                if (bioReader.LineHeader == "SOURCE")
                {
                    // Data can span several lines. The spec says the last line must end
                    // with a period, which is only checked when there are continuation lines.
                    bool endsWithPeriod = true;
                    commonName = bioReader.LineData;

                    for (bioReader.GoToNextLine();
                         bioReader.HasLines && !bioReader.LineHasHeader;
                         bioReader.GoToNextLine())
                    {
                        commonName     = commonName + " " + bioReader.LineData;
                        endsWithPeriod = commonName.EndsWith(".", StringComparison.Ordinal);
                    }

                    if (!endsWithPeriod && Trace.Want(Trace.SeqWarnings))
                    {
                        Trace.Report("GenBank.ParseSource", Properties.Resource.OutOfSpec, commonName);
                    }

                    // the reader already sits on the next line to process
                }
                else if (bioReader.Line[0] == ' ')
                {
                    // an indented header inside the SOURCE section has to be ORGANISM
                    if (bioReader.LineHeader != "ORGANISM")
                    {
                        string message = String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.ParserInvalidSourceField,
                            bioReader.LineHeader);
                        Trace.Report(message);
                        throw new InvalidDataException(message);
                    }

                    organismName = bioReader.LineData;

                    // Continuation lines ending in ';' or '.' are collected as class
                    // levels; any other continuation line extends the organism name.
                    for (bioReader.GoToNextLine();
                         bioReader.HasLines && !bioReader.LineHasHeader;
                         bioReader.GoToNextLine())
                    {
                        string line = bioReader.Line;
                        bool isClassLevelLine = line.EndsWith(";", StringComparison.Ordinal)
                                                || line.EndsWith(".", StringComparison.Ordinal);

                        if (!isClassLevelLine)
                        {
                            organismName += " " + bioReader.LineData;
                            continue;
                        }

                        if (!String.IsNullOrEmpty(taxonomy))
                        {
                            taxonomy += " ";
                        }

                        taxonomy += bioReader.LineData;
                    }

                    // the reader already sits on the next line to process
                }
                else
                {
                    // neither SOURCE nor an indented line: leave it for the caller
                    inSourceSection = false;
                }
            }

            GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

            metadata.Source            = new SequenceSource();
            metadata.Source.CommonName = commonName;

            if (!string.IsNullOrEmpty(organismName))
            {
                // The text before the first space is the genus, everything after it the species.
                int firstSpace = organismName.IndexOf(" ", StringComparison.Ordinal);
                if (firstSpace > 0)
                {
                    metadata.Source.Organism.Genus   = organismName.Substring(0, firstSpace);
                    metadata.Source.Organism.Species = organismName.Substring(firstSpace + 1);
                }
                else
                {
                    metadata.Source.Organism.Genus = organismName;
                }
            }

            metadata.Source.Organism.ClassLevels = taxonomy;
        }
Пример #38
0
        /// <summary>
        /// Reads the source section of a metadata grid. The value on the starting row is
        /// used as the common name; the rows below it, up to the next row with a
        /// non-blank key cell, supply the organism and class-level sub-entries.
        /// </summary>
        /// <param name="metadata">Metadata object that receives the source info.</param>
        /// <param name="cellRange">Grid of cell values.</param>
        /// <param name="rowIndex">Index of the row on which the source section starts.</param>
        /// <returns>
        /// Index of the row that ended the section (the next row with a non-blank key),
        /// or the row count of the grid when the section runs to the end.
        /// </returns>
        /// <exception cref="FormatException">
        /// Thrown when the grid has fewer than three columns or a row inside the section
        /// has a blank sub-key cell.
        /// </exception>
        private static int ParseSource(GenBankMetadata metadata, object[,] cellRange, int rowIndex)
        {
            object headerValueCell = cellRange[rowIndex, ValueColumnIndex];
            string value = headerValueCell == null ? string.Empty : headerValueCell.ToString();

            for (rowIndex++; rowIndex < cellRange.GetLength(0); rowIndex++)
            {
                if (cellRange.GetLength(1) < 3)
                {
                    throw new FormatException(String.Format(
                                CultureInfo.InvariantCulture,
                                Resources.UnrecognizedGenBankMetadataFormat,
                                SOURCE));
                }

                // A non-blank key cell marks the start of the next section.
                object keyCell = cellRange[rowIndex, KeyColumnIndex];
                if (keyCell != null && !string.IsNullOrWhiteSpace(keyCell.ToString()))
                {
                    break;
                }

                // Inside the section every row must name a sub-key.
                object subKeyCell = cellRange[rowIndex, SubKeyColumnIndex];
                if (subKeyCell == null || string.IsNullOrWhiteSpace(subKeyCell.ToString()))
                {
                    throw new FormatException(String.Format(
                              CultureInfo.InvariantCulture,
                              Resources.UnrecognizedGenBankMetadataFormat,
                              SOURCE));
                }

                // The source is created on the first sub-row; at that point "value" still
                // holds the text read from the starting row.
                if (metadata.Source == null)
                {
                    metadata.Source = new SequenceSource();
                    metadata.Source.CommonName = value;
                }

                string subKey = subKeyCell.ToString().ToUpperInvariant();

                object valueCell = cellRange[rowIndex, ValueColumnIndex];
                value = valueCell == null ? string.Empty : valueCell.ToString();

                if (metadata.Source.Organism == null)
                {
                    metadata.Source.Organism = new OrganismInfo();
                }

                if (subKey == SOURCE_ORGANISM)
                {
                    if (!string.IsNullOrWhiteSpace(value))
                    {
                        // The first word is the genus, the remaining words form the species.
                        string[] words = value.Split(" ".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);

                        metadata.Source.Organism.Genus = words[0];

                        if (words.Length > 1)
                        {
                            metadata.Source.Organism.Species = string.Join(" ", words, 1, words.Length - 1);
                        }
                    }
                }
                else if (subKey == SOURCE_CLASSLEVELS)
                {
                    metadata.Source.Organism.ClassLevels = value;
                }
            }

            return rowIndex;
        }
Пример #39
0
        /// <summary>
        /// Parses the trailing part of a GenBank record: the optional BASE COUNT line,
        /// the ORIGIN line with the sequence data that follows it, and CONTIG data.
        /// Parsing ends after the "//" terminator line or when the reader runs out of lines.
        /// </summary>
        /// <param name="bioReader">Reader supplying the lines of the record.</param>
        /// <param name="sequence">
        /// Sequence that receives the symbols. When no alphabet is configured on the parser
        /// and the detected alphabet differs from the sequence's, it is replaced by a new
        /// instance.
        /// </param>
        /// <exception cref="InvalidDataException">
        /// Thrown when no alphabet can be identified for the sequence data, or when a line
        /// with an unexpected header is found.
        /// </exception>
        private void ParseSequence(BioTextReader bioReader, ref Sequence sequence)
        {
            // NOTE(review): this reference is read once and is not refreshed when the
            // sequence is replaced below - confirm the replacement shares the metadata object.
            GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

            // set data indent for sequence headers
            bioReader.DataIndent = _dataIndent;

            while (bioReader.HasLines)
            {
                if (bioReader.Line.StartsWith("//", StringComparison.Ordinal))
                {
                    // end of sequence record
                    bioReader.GoToNextLine();
                    return;
                }

                string header = bioReader.LineHeader;

                if (header == "BASE COUNT")
                {
                    // The BASE COUNT linetype is obsolete and was removed from the GenBank
                    // flatfile format in October 2003, but it is used when present. The
                    // untrimmed text is taken since it starts with a right justified column.
                    metadata.BaseCount = bioReader.Line.Substring(_dataIndent);
                    bioReader.GoToNextLine();
                }
                else if (header == "ORIGIN")
                {
                    // The origin line can contain optional data; an empty string is not stored.
                    string originData = bioReader.LineData;
                    if (!String.IsNullOrEmpty(originData))
                    {
                        metadata.Origin = originData;
                    }

                    IAlphabet detectedAlphabet = null;

                    // Sequence data lines are the ones starting with a space.
                    for (bioReader.GoToNextLine();
                         bioReader.HasLines && bioReader.Line[0] == ' ';
                         bioReader.GoToNextLine())
                    {
                        string line       = bioReader.Line;
                        int    lineLength = line.Length;

                        // Walk the line by offset instead of using a regex, which is too
                        // slow: data starts at offset 10 and is read in chunks of up to 10
                        // characters, each followed by one skipped character.
                        for (int offset = 10; offset < lineLength; offset += 11)
                        {
                            string chunk = line.Substring(offset, Math.Min(10, lineLength - offset));

                            if (Alphabet == null)
                            {
                                detectedAlphabet = IdentifyAlphabet(detectedAlphabet, chunk);

                                if (detectedAlphabet == null)
                                {
                                    string symbolError = String.Format(CultureInfo.CurrentCulture, Resource.InvalidSymbolInString, line);
                                    Trace.Report(symbolError);
                                    throw new InvalidDataException(symbolError);
                                }

                                if (sequence.Alphabet != detectedAlphabet)
                                {
                                    // switch to a sequence built on the detected alphabet
                                    Sequence replacement = new Sequence(detectedAlphabet, Encoding, sequence);
                                    replacement.MoleculeType = sequence.MoleculeType;
                                    replacement.IsReadOnly   = false;
                                    sequence.Clear();
                                    sequence = replacement;
                                }
                            }

                            sequence.InsertRange(sequence.Count, chunk);
                        }
                    }
                }
                else if (header == "CONTIG")
                {
                    // the reader is not advanced here; the current line still needs processing
                    metadata.Contig = ParseMultiLineData(bioReader, Environment.NewLine);
                }
                else
                {
                    string unexpectedLine = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ParserUnexpectedLineInSequence,
                        bioReader.Line);
                    Trace.Report(unexpectedLine);
                    throw new InvalidDataException(unexpectedLine);
                }
            }
        }
Пример #40
0
        /// <summary>
        /// Helper method to parse the header (metadata) rows of GenBank data.
        /// Rows are consumed from <paramref name="rowIndex"/> until either a row whose
        /// key is FEATURES is reached or the range is exhausted. Rows whose key cell is
        /// null or whitespace are skipped, and rows with an unrecognized key are ignored.
        /// </summary>
        /// <param name="metadata">Metadata object to populate</param>
        /// <param name="cellRange">Range of cells</param>
        /// <param name="rowIndex">Index of the row at which parsing starts</param>
        /// <returns>
        /// Index of the FEATURES row, or the number of rows in the range when no
        /// FEATURES row is found.
        /// </returns>
        /// <exception cref="FormatException">
        /// The ACCESSION value is empty, or a non-empty SEGMENT value could not be
        /// parsed and no segment has been set on the metadata.
        /// </exception>
        private static int ParseGenBankMetadata(GenBankMetadata metadata, object[,] cellRange, int rowIndex)
        {
            int rowCount = cellRange.GetLength(0);
            while (rowIndex < rowCount)
            {
                object keyCell = cellRange[rowIndex, KeyColumnIndex];
                string keyText = keyCell != null ? keyCell.ToString() : null;
                if (string.IsNullOrWhiteSpace(keyText))
                {
                    // Rows without a key carry nothing this method handles.
                    rowIndex++;
                    continue;
                }

                string key = keyText.ToUpperInvariant();
                if (key.Equals(FEATURES))
                {
                    // The features section is handled by the caller; stop on its first row.
                    break;
                }

                object valueCell = cellRange[rowIndex, ValueColumnIndex];
                string value = valueCell != null ? valueCell.ToString() : string.Empty;
                string[] tokens;

                switch (key)
                {
                    case LOCUS:
                        // The helper returns the index of the row after the section;
                        // step back one because the loop increments below.
                        rowIndex = ParseLocus(metadata, cellRange, rowIndex) - 1;
                        break;

                    case DEFINITION:
                        metadata.Definition = value;
                        break;

                    case ACCESSION:
                        metadata.Accession = new GenBankAccession();
                        if (string.IsNullOrWhiteSpace(value))
                        {
                            throw new FormatException(String.Format(
                                CultureInfo.InvariantCulture,
                                Resources.UnrecognizedGenBankMetadataFormat,
                                ACCESSION));
                        }

                        // Drop empty entries so leading or repeated spaces cannot yield
                        // an empty primary or secondary accession. The value is known to
                        // contain a non-whitespace character, so there is at least one token.
                        string[] accessions = value.Split(
                            new char[] { ' ' },
                            StringSplitOptions.RemoveEmptyEntries);
                        metadata.Accession.Primary = accessions[0];
                        for (int i = 1; i < accessions.Length; i++)
                        {
                            metadata.Accession.Secondary.Add(accessions[i]);
                        }

                        break;

                    case DBLINK:
                        // Nothing to parse when the value cell is empty.
                        if (string.IsNullOrWhiteSpace(value))
                        {
                            break;
                        }

                        // Expected shape is "<type>:<number>[,<number>...]".
                        tokens = value.Split(':');
                        if (tokens.Length == 2)
                        {
                            if (metadata.DbLinks == null)
                            {
                                metadata.DbLinks = new List<CrossReferenceLink>(2);
                            }

                            var link = new CrossReferenceLink();
                            if (string.Equals(
                                tokens[0],
                                CrossReferenceType.Project.ToString(),
                                StringComparison.OrdinalIgnoreCase))
                            {
                                link.Type = CrossReferenceType.Project;
                            }
                            else if (string.Equals(
                                tokens[0],
                                CrossReferenceType.BioProject.ToString(),
                                StringComparison.OrdinalIgnoreCase))
                            {
                                link.Type = CrossReferenceType.BioProject;
                            }
                            else
                            {
                                link.Type = CrossReferenceType.TraceAssemblyArchive;
                            }

                            foreach (string number in tokens[1].Split(','))
                            {
                                link.Numbers.Add(number);
                            }

                            metadata.DbLinks.Add(link);
                        }

                        break;

                    case DBSOURCE:
                        metadata.DbSource = value;
                        break;

                    case VERSION:
                        if (string.IsNullOrWhiteSpace(value))
                        {
                            break;
                        }

                        tokens = value.Split(
                            new char[] { ' ' },
                            StringSplitOptions.RemoveEmptyEntries);

                        // first token contains accession and version
                        Match m = Regex.Match(tokens[0], @"^(?<accession>\w+)\.(?<version>\d+)$");
                        metadata.Version = new GenBankVersion();
                        if (m.Success)
                        {
                            metadata.Version.Version = m.Groups["version"].Value;
                            // The first token in the data from the accession line is referred to as
                            // the primary accession number, and should be the one used here in the
                            // version line.
                            metadata.Version.Accession = m.Groups["accession"].Value;
                        }

                        if (tokens.Length > 1)
                        {
                            // second token contains primary ID
                            m = Regex.Match(tokens[1], @"^GI:(?<primaryID>.*)");
                            if (m.Success)
                            {
                                metadata.Version.GiNumber = m.Groups["primaryID"].Value;
                            }
                        }

                        break;

                    case SEGMENT:
                        if (string.IsNullOrWhiteSpace(value))
                        {
                            break;
                        }

                        // Splits on the individual characters ' ', 'o' and 'f' (not on the
                        // substring " of "), so "1 of 3" yields { "1", "3" }.
                        tokens = value.Split(" of ".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);
                        if (tokens.Length == 2)
                        {
                            int current;
                            int count;
                            if (int.TryParse(tokens[0], out current) && int.TryParse(tokens[1], out count))
                            {
                                metadata.Segment = new SequenceSegment();
                                metadata.Segment.Current = current;
                                metadata.Segment.Count = count;
                            }
                        }

                        if (metadata.Segment == null)
                        {
                            // Name the section that actually failed to parse.
                            throw new FormatException(String.Format(
                                CultureInfo.InvariantCulture,
                                Resources.UnrecognizedGenBankMetadataFormat,
                                SEGMENT));
                        }

                        break;

                    case KEYWORDS:
                        metadata.Keywords = value;
                        break;

                    case SOURCE:
                        rowIndex = ParseSource(metadata, cellRange, rowIndex) - 1;
                        break;

                    case REFERENCE:
                        rowIndex = ParseReference(metadata, cellRange, rowIndex) - 1;
                        break;

                    case PRIMARY:
                        metadata.Primary = value;
                        break;

                    case COMMENT:
                        if (!string.IsNullOrWhiteSpace(value))
                        {
                            // One comment entry per non-empty line of the cell text.
                            tokens = value.Split(Environment.NewLine.ToCharArray(), StringSplitOptions.RemoveEmptyEntries);
                            foreach (string line in tokens)
                            {
                                metadata.Comments.Add(line);
                            }
                        }

                        break;
                }

                rowIndex++;
            }

            return rowIndex;
        }
Пример #41
0
        /// <summary>
        ///     Validate GenBank features for medium size sequences.
        ///     Parses the file named by the XML node, takes the GenBank metadata of the
        ///     first sequence when exactly one sequence is parsed (otherwise of the
        ///     second), and compares feature counts and selected feature keys with the
        ///     expected values read from the same XML node.
        /// </summary>
        /// <param name="nodeName">xml node name.</param>
        /// <param name="methodName">DNA,RNA or Protein method</param>
        private void ValidateGenBankFeatures(string nodeName, string methodName)
        {
            // Get Values from XML node.
            string filePath = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.FilePathNode);
            string mRNAFeatureCount = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.mRNACount);
            string exonFeatureCount = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.ExonCount);
            string intronFeatureCount = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.IntronCount);
            string cdsFeatureCount = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.CDSCount);
            string allFeaturesCount = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.GenBankFeaturesCount);
            string expectedCDSKey = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.CDSKey);
            string expectedIntronKey = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.IntronKey);
            string expectedExonKey = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.ExonKey);
            string mRNAKey = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.mRNAKey);
            string sourceKeyName = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.SourceKey);

            // Parse a file. Materialize the result once so that counting and indexing
            // do not enumerate the parser output repeatedly.
            ISequenceParser parserObj = new GenBankParser();
            IList<ISequence> sequenceList = parserObj.Parse(filePath).ToList();

            // GenBank metadata: first sequence when there is exactly one, else the second.
            ISequence sequence = sequenceList.Count == 1 ? sequenceList[0] : sequenceList[1];
            var metadata = sequence.Metadata[Constants.GenBank] as GenBankMetadata;
            Assert.IsNotNull(metadata);

            // Validate GenBank Features (expected value first, actual value second).
            Assert.AreEqual(Convert.ToInt32(allFeaturesCount, null),
                            metadata.Features.All.Count);
            Assert.AreEqual(Convert.ToInt32(cdsFeatureCount, null),
                            metadata.Features.CodingSequences.Count);
            Assert.AreEqual(Convert.ToInt32(exonFeatureCount, null),
                            metadata.Features.Exons.Count);
            Assert.AreEqual(Convert.ToInt32(intronFeatureCount, null),
                            metadata.Features.Introns.Count);
            Assert.AreEqual(Convert.ToInt32(mRNAFeatureCount, null),
                            metadata.Features.MessengerRNAs.Count);
            Assert.AreEqual(0, metadata.Features.Attenuators.Count);
            Assert.AreEqual(0, metadata.Features.CAATSignals.Count);
            Assert.AreEqual(0, metadata.Features.DisplacementLoops.Count);
            Assert.AreEqual(0, metadata.Features.Enhancers.Count);

            // Validate GenBank feature list. Method names are identifiers, so compare
            // them ordinally rather than with culture-specific casing rules.
            bool isNucleotide =
                string.Equals(methodName, "DNA", StringComparison.OrdinalIgnoreCase)
                || string.Equals(methodName, "RNA", StringComparison.OrdinalIgnoreCase);

            if (isNucleotide)
            {
                IList<FeatureItem> featureList = metadata.Features.All;
                Assert.AreEqual(sourceKeyName, featureList[0].Key.ToString(null));
                Assert.AreEqual(expectedCDSKey, featureList[1].Key.ToString(null));
                Assert.AreEqual(expectedCDSKey, featureList[2].Key.ToString(null));
                Assert.AreEqual(mRNAKey, featureList[10].Key.ToString(null));
                Assert.AreEqual(expectedExonKey, featureList[12].Key.ToString(null));
                Assert.AreEqual(expectedIntronKey, featureList[18].Key.ToString(null));
                ApplicationLog.WriteLine(
                    "GenBank Features P1: Successfully validated the GenBank Features");
            }
            else if (string.Equals(methodName, "Protein", StringComparison.OrdinalIgnoreCase))
            {
                IList<FeatureItem> featureList = metadata.Features.All;
                Assert.AreEqual(expectedIntronKey, featureList[10].Key.ToString(null));
                Assert.AreEqual(expectedExonKey, featureList[18].Key.ToString(null));
                ApplicationLog.WriteLine(
                    "GenBank Features P1: Successfully validated the GenBank Features");
            }
        }