コード例 #1
0
        // Emits every header section that precedes the features section, in this
        // order: LOCUS, DEFINITION, ACCESSION, VERSION, PROJECT, DBLINK, DBSOURCE,
        // KEYWORDS, SEGMENT, source, references, comments and PRIMARY.
        // Nothing is written when the sequence has no GenBank metadata entry value.
        private void WriteHeaders(ISequence sequence, TextWriter writer)
        {
            GenBankMetadata genBankData = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
            if (genBankData == null)
            {
                return;
            }

            WriteLocus(sequence, writer);
            WriteHeaderSection("DEFINITION", genBankData.Definition, writer);

            if (genBankData.Accession != null)
            {
                WriteHeaderSection("ACCESSION", Helper.GetGenBankAccession(genBankData.Accession), writer);

                // VERSION is only written together with an accession because its
                // text begins with the primary accession number.
                if (genBankData.Version != null)
                {
                    string versionText = genBankData.Accession.Primary + "." + genBankData.Version.Version;

                    // the GI number is optional and is appended after two spaces
                    if (!string.IsNullOrEmpty(genBankData.Version.GINumber))
                    {
                        versionText += "  GI:" + genBankData.Version.GINumber;
                    }

                    if (versionText.Length > 0)
                    {
                        WriteHeaderSection("VERSION", versionText, writer);
                    }
                }
            }

            if (genBankData.Project != null)
            {
                string projectText = Helper.GetProjectIdentifier(genBankData.Project);
                WriteHeaderSection("PROJECT", projectText, writer);
            }

            if (genBankData.DBLink != null)
            {
                string linkText = Helper.GetCrossReferenceLink(genBankData.DBLink);
                WriteHeaderSection("DBLINK", linkText, writer);
            }

            WriteHeaderSection("DBSOURCE", genBankData.DBSource, writer);
            WriteHeaderSection("KEYWORDS", genBankData.Keywords, writer);

            if (genBankData.Segment != null)
            {
                string segmentText = Helper.GetSequenceSegment(genBankData.Segment);
                WriteHeaderSection("SEGMENT", segmentText, writer);
            }

            WriteSource(genBankData, writer);
            WriteReferences(genBankData, writer);
            WriteComments(genBankData, writer);
            WriteHeaderSection("PRIMARY", genBankData.Primary, writer);
        }
コード例 #2
0
        // Reads the final part of a record: the optional BASE COUNT line, the ORIGIN
        // section and the CONTIG section.  Parsing stops after the "//" terminator
        // line has been consumed, or when the reader has no more lines.  Any other
        // line header is reported and raises InvalidDataException.
        private void ParseSequence(MBFTextReader mbfReader, ref Sequence sequence)
        {
            GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

            // set data indent for sequence headers
            mbfReader.DataIndent = _dataIndent;

            while (mbfReader.HasLines)
            {
                // "//" marks the end of the sequence record
                if (mbfReader.Line.StartsWith("//", StringComparison.Ordinal))
                {
                    mbfReader.GoToNextLine();
                    return;
                }

                string header = mbfReader.LineHeader;

                if (header == "BASE COUNT")
                {
                    // The BASE COUNT linetype is obsolete and was removed from the
                    // GenBank flatfile format in October 2003, but it is kept when
                    // present.  The untrimmed text is taken because the data starts
                    // with a right justified column.
                    metadata.BaseCount = mbfReader.Line.Substring(_dataIndent);
                    mbfReader.GoToNextLine();
                }
                else if (header == "ORIGIN")
                {
                    // Change Note: the original implementation validated the alphabet
                    // on every line, which hurt performance on large sequences; the
                    // alphabet is now validated after the sequence has been parsed.
                    ParseOrigin(mbfReader, metadata, ref sequence);
                }
                else if (header == "CONTIG")
                {
                    // don't go to next line; current line still needs to be processed
                    metadata.Contig = ParseMultiLineData(mbfReader, Environment.NewLine);
                }
                else
                {
                    string message = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ParserUnexpectedLineInSequence,
                        mbfReader.Line);
                    Trace.Report(message);
                    throw new InvalidDataException(message);
                }
            }
        }
コード例 #3
0
        // Writes the FEATURES table: the "FEATURES / Location/Qualifiers" heading,
        // then each feature (key plus location string) followed by one line per
        // qualifier value.  Throws InvalidOperationException when no location
        // builder is set; writes nothing when the metadata or its feature list
        // is null.
        private void WriteFeatures(ISequence sequence, TextWriter writer)
        {
            ILocationBuilder builder = LocationBuilder;
            if (builder == null)
            {
                throw new InvalidOperationException(Resource.NullLocationBuild);
            }

            GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
            if (metadata == null || metadata.Features == null)
            {
                return;
            }

            WriteFeatureSection("FEATURES", "Location/Qualifiers", writer);

            // features are written in the order in which they appear in the list
            foreach (FeatureItem item in metadata.Features.All)
            {
                string featureHeader = _featureHeaderIndentString + item.Key;
                string locationText  = builder.GetLocationString(item.Location);
                WriteFeatureSection(featureHeader, locationText, writer);

                // Qualifiers are the sub-items of a feature.  Their keys are not
                // unique, so each key maps to a list of values.
                foreach (KeyValuePair <string, List <string> > qualifier in item.Qualifiers)
                {
                    string qualifierPrefix = "/" + qualifier.Key;

                    foreach (string qualifierText in qualifier.Value)
                    {
                        // an empty value is written as the bare "/key"
                        string line = qualifierText != string.Empty
                            ? qualifierPrefix + "=" + qualifierText
                            : qualifierPrefix;

                        // use a blank header; the qualifier key is part of the data
                        WriteFeatureSection(string.Empty, line, writer);
                    }
                }
            }
        }
コード例 #4
0
        // Writes one REFERENCE section per citation in metadata.References, each
        // followed by its AUTHORS, CONSRTM, TITLE, JOURNAL, MEDLINE, PUBMED and
        // REMARK sub-sections.  Does nothing when the reference list is null.
        private void WriteReferences(GenBankMetadata metadata, TextWriter writer)
        {
            if (metadata.References == null)
            {
                return;
            }

            foreach (CitationReference citation in metadata.References)
            {
                // First line: the reference number, followed by the location in
                // parentheses when one is present (number padded to three columns).
                string number    = citation.Number.ToString();
                string firstLine = string.IsNullOrEmpty(citation.Location)
                    ? number
                    : number.PadRight(3) + "(" + citation.Location + ")";

                WriteHeaderSection("REFERENCE", firstLine, writer);
                WriteHeaderSection("  AUTHORS", citation.Authors, writer);
                WriteHeaderSection("  CONSRTM", citation.Consortiums, writer);
                WriteHeaderSection("  TITLE", citation.Title, writer);
                WriteHeaderSection("  JOURNAL", citation.Journal, writer);
                WriteHeaderSection("  MEDLINE", citation.Medline, writer);
                WriteHeaderSection("   PUBMED", citation.PubMed, writer);
                WriteHeaderSection("  REMARK", citation.Remarks, writer);
            }
        }
コード例 #5
0
        // Parses the SOURCE section and its ORGANISM sub-section, then stores the
        // result in a new SequenceSource on the sequence's GenBank metadata:
        //   - CommonName        : the (possibly multi-line) SOURCE text
        //   - Organism.Genus    : first word of the ORGANISM text
        //   - Organism.Species  : remainder of the ORGANISM text, if any
        //   - Organism.ClassLevels : continuation lines ending in ';' or '.'
        // The reader is left on the first line that belongs to neither section.
        // Throws InvalidDataException when an indented line is not ORGANISM.
        private static void ParseSource(MBFTextReader mbfReader, ref Sequence sequence)
        {
            string source      = string.Empty;
            string organism    = string.Empty;
            string classLevels = string.Empty;

            while (mbfReader.HasLines)
            {
                if (mbfReader.LineHeader == "SOURCE")
                {
                    // data can be multiline. spec says last line must end with period
                    // (note: this doesn't apply unless multiline)
                    bool lastDotted = true;
                    source = mbfReader.LineData;

                    mbfReader.GoToNextLine();
                    while (mbfReader.HasLines && !mbfReader.LineHasHeader)
                    {
                        source    += " " + mbfReader.LineData;
                        lastDotted = source.EndsWith(".", StringComparison.Ordinal);
                        mbfReader.GoToNextLine();
                    }

                    if (!lastDotted && Trace.Want(Trace.SeqWarnings))
                    {
                        Trace.Report("GenBank.ParseSource", Properties.Resource.OutOfSpec, source);
                    }

                    // don't go to next line; current line still needs to be processed
                }
                else if (mbfReader.Line.StartsWith(" ", StringComparison.Ordinal))
                {
                    // StartsWith (rather than indexing Line[0]) keeps an empty line
                    // from raising IndexOutOfRangeException; such a line simply ends
                    // the section, matching how ParseReferences tests for indentation.
                    if (mbfReader.LineHeader != "ORGANISM")
                    {
                        string message = String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.ParserInvalidSourceField,
                            mbfReader.LineHeader);
                        Trace.Report(message);
                        throw new InvalidDataException(message);
                    }

                    // this also can be multiline
                    organism = mbfReader.LineData;

                    mbfReader.GoToNextLine();
                    while (mbfReader.HasLines && !mbfReader.LineHasHeader)
                    {
                        // Continuation lines that end with ';' or '.' are treated as
                        // class levels; any other line continues the organism name.
                        if (mbfReader.Line.EndsWith(";", StringComparison.Ordinal) || mbfReader.Line.EndsWith(".", StringComparison.Ordinal))
                        {
                            if (!String.IsNullOrEmpty(classLevels))
                            {
                                classLevels += " ";
                            }

                            classLevels += mbfReader.LineData;
                        }
                        else
                        {
                            organism += " " + mbfReader.LineData;
                        }
                        mbfReader.GoToNextLine();
                    }

                    // don't go to next line; current line still needs to be processed
                }
                else
                {
                    // don't go to next line; current line still needs to be processed
                    break;
                }
            }

            GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

            metadata.Source            = new SequenceSource();
            metadata.Source.CommonName = source;
            if (!string.IsNullOrEmpty(organism))
            {
                // split "Genus species ..." at the first space
                int index = organism.IndexOf(" ", StringComparison.Ordinal);
                if (index > 0)
                {
                    metadata.Source.Organism.Genus = organism.Substring(0, index);

                    // index is always inside the string here, so index + 1 is at
                    // most organism.Length and yields an empty species at worst
                    metadata.Source.Organism.Species = organism.Substring(index + 1);
                }
                else
                {
                    metadata.Source.Organism.Genus = organism;
                }
            }

            metadata.Source.Organism.ClassLevels = classLevels;
        }
コード例 #6
0
        // Parses one or more consecutive REFERENCE sections and appends a
        // CitationReference for each to the References list of the sequence's
        // GenBank metadata.  Each REFERENCE line supplies the number and optional
        // parenthesised location; the indented AUTHORS, CONSRTM, TITLE, JOURNAL,
        // REMARK, MEDLINE and PUBMED lines fill in the remaining fields.
        // The reader is left on the first line that is neither a REFERENCE line
        // nor an indented sub-field line.
        // Throws InvalidDataException for a malformed REFERENCE line or an unknown
        // sub-field, and InvalidOperationException when the reference number does
        // not fit in an int.
        private static void ParseReferences(MBFTextReader mbfReader, ref Sequence sequence)
        {
            GenBankMetadata           metadata      = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
            IList <CitationReference> referenceList = metadata.References;
            CitationReference         reference     = null;

            while (mbfReader.HasLines)
            {
                if (mbfReader.LineHeader == "REFERENCE")
                {
                    // add previous reference
                    if (reference != null)
                    {
                        referenceList.Add(reference);
                    }

                    // check for start/end e.g. (bases 1 to 118), or prose notes
                    Match m = Regex.Match(mbfReader.LineData,
                                          @"^(?<number>\d+)(\s+\((?<location>.*)\))?");
                    if (!m.Success)
                    {
                        string message = String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.ParserReferenceError,
                            mbfReader.LineData);
                        Trace.Report(message);
                        throw new InvalidDataException(message);
                    }

                    // create new reference
                    string number   = m.Groups["number"].Value;
                    string location = m.Groups["location"].Value;
                    reference = new CitationReference();
                    int outValue;
                    if (!int.TryParse(number, out outValue))
                    {
                        // the regex guarantees digits, so this is reached only when
                        // the number is too large for an int
                        throw new InvalidOperationException(String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.ParserReferenceError,
                            mbfReader.LineData));
                    }
                    reference.Number   = outValue;
                    reference.Location = location;
                    mbfReader.GoToNextLine();
                }
                else if (mbfReader.Line.StartsWith(" ", StringComparison.Ordinal))
                {
                    switch (mbfReader.LineHeader)
                    {
                    // all the following are extracted the same way - possibly multiline
                    case "AUTHORS":
                        reference.Authors = ParseMultiLineData(mbfReader, " ");
                        break;

                    case "CONSRTM":
                        reference.Consortiums = ParseMultiLineData(mbfReader, " ");
                        break;

                    case "TITLE":
                        reference.Title = ParseMultiLineData(mbfReader, " ");
                        break;

                    case "JOURNAL":
                        reference.Journal = ParseMultiLineData(mbfReader, " ");
                        break;

                    case "REMARK":
                        reference.Remarks = ParseMultiLineData(mbfReader, " ");
                        break;

                    case "MEDLINE":
                        reference.Medline = ParseMultiLineData(mbfReader, " ");
                        break;

                    case "PUBMED":
                        reference.PubMed = ParseMultiLineData(mbfReader, " ");
                        break;

                    default:
                        string message = String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.ParserInvalidReferenceField,
                            mbfReader.LineHeader);
                        Trace.Report(message);
                        throw new InvalidDataException(message);
                    }
                }
                else
                {
                    // don't go to next line; current line still needs to be processed
                    break;
                }
            }

            // Add the last reference here rather than only in the break branch, so
            // it is also kept when the loop ends because the reader ran out of lines.
            if (reference != null)
            {
                referenceList.Add(reference);
            }
        }
コード例 #7
0
        // Parses everything before the features section.  Each recognised header
        // (LOCUS, VERSION, PROJECT, SOURCE, REFERENCE, COMMENT, PRIMARY, DEFINITION,
        // ACCESSION, DBLINK, DBSOURCE, KEYWORDS, SEGMENT) is stored on the sequence's
        // GenBank metadata.  Parsing stops, without consuming the line, at FEATURES,
        // BASE COUNT, ORIGIN or CONTIG.
        // Throws InvalidDataException for a second LOCUS line, a PRIMARY line that
        // does not have four column headers, an unknown header, or when no LOCUS
        // line was found at all.
        private void ParseHeaders(MBFTextReader mbfReader, ref Sequence sequence)
        {
            GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
            string          data     = string.Empty;

            string[] tokens = null;
            // set data indent for headers
            mbfReader.DataIndent = _dataIndent;

            // only allow one locus line
            bool haveParsedLocus = false;

            // parse until we hit the features or sequence section
            bool haveFinishedHeaders = false;

            while (mbfReader.HasLines && !haveFinishedHeaders)
            {
                switch (mbfReader.LineHeader)
                {
                case "LOCUS":
                    if (haveParsedLocus)
                    {
                        string message = String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.ParserSecondLocus,
                            mbfReader.LocationString);
                        Trace.Report(message);
                        throw new InvalidDataException(message);
                    }
                    ParseLocusByTokens(mbfReader, ref sequence);
                    // re-read the metadata after the sequence has been passed by ref
                    metadata        = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                    haveParsedLocus = true;
                    // don't go to next line; current line still needs to be processed
                    break;

                case "VERSION":
                    tokens = mbfReader.LineData.Split(new char[] { ' ' },
                                                      StringSplitOptions.RemoveEmptyEntries);
                    metadata.Version = new GenBankVersion();

                    // first token contains accession and version
                    if (tokens.Length > 0)
                    {
                        Match versionMatch = Regex.Match(tokens[0], @"^(?<accession>\w+)\.(?<version>\d+)$");

                        if (versionMatch.Success)
                        {
                            metadata.Version.Version = versionMatch.Groups["version"].Value;
                            // The first token in the data from the accession line is referred to as
                            // the primary accession number, and should be the one used here in the
                            // version line.
                            string versionLineAccession = versionMatch.Groups["accession"].Value;
                            if (metadata.Accession == null)
                            {
                                ApplicationLog.WriteLine("WARN: VERSION processed before ACCESSION");
                            }
                            else
                            {
                                if (!versionLineAccession.Equals(metadata.Accession.Primary))
                                {
                                    ApplicationLog.WriteLine("WARN: VERSION tag doesn't match ACCESSION");
                                }
                                else
                                {
                                    metadata.Version.Accession = metadata.Accession.Primary;
                                }
                            }
                        }
                    }
                    else
                    {
                        ApplicationLog.WriteLine("WARN: unexpected VERSION header: " + mbfReader.Line);
                    }

                    // The second token, when present, contains the primary ID.  It is
                    // optional: a VERSION line may carry only "accession.version", so
                    // the token count is checked before indexing.
                    if (tokens.Length > 1)
                    {
                        Match giMatch = Regex.Match(tokens[1], @"^GI:(?<primaryID>.*)");
                        if (giMatch.Success)
                        {
                            metadata.Version.GINumber = giMatch.Groups["primaryID"].Value;
                        }
                    }
                    mbfReader.GoToNextLine();
                    break;

                case "PROJECT":
                    // expected form is "name:number[,number...]"
                    tokens = mbfReader.LineData.Split(':');
                    if (tokens.Length == 2)
                    {
                        metadata.Project      = new ProjectIdentifier();
                        metadata.Project.Name = tokens[0];
                        tokens = tokens[1].Split(',');
                        for (int i = 0; i < tokens.Length; i++)
                        {
                            metadata.Project.Numbers.Add(tokens[i]);
                        }
                    }
                    else
                    {
                        ApplicationLog.WriteLine("WARN: unexpected PROJECT header: " + mbfReader.Line);
                    }
                    mbfReader.GoToNextLine();
                    break;

                case "SOURCE":
                    ParseSource(mbfReader, ref sequence);
                    metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                    // don't go to next line; current line still needs to be processed
                    break;

                case "REFERENCE":
                    ParseReferences(mbfReader, ref sequence);       // can encounter more than one
                    metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                    // don't go to next line; current line still needs to be processed
                    break;

                case "COMMENT":
                    ParseComments(mbfReader, ref sequence);       // can encounter more than one
                    metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                    // don't go to next line; current line still needs to be processed
                    break;

                case "PRIMARY":
                    // This header is followed by sequence info in a table format that could be
                    // stored in a custom object.  The first line contains column headers.
                    // For now, just validate the presence of the headers, and save the data
                    // as a string.
                    tokens = mbfReader.LineData.Split("\t ".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);

                    // The first line must contain exactly four column headers.
                    if (tokens.Length != 4)
                    {
                        string message = String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.ParserPrimaryLineError,
                            mbfReader.Line);
                        Trace.Report(message);
                        throw new InvalidDataException(message);
                    }

                    string primaryData = ParseMultiLineData(mbfReader, Environment.NewLine);
                    metadata.Primary = primaryData;
                    // don't go to next line; current line still needs to be processed
                    break;

                // all the following are extracted the same way - possibly multiline
                case "DEFINITION":
                    metadata.Definition = ParseMultiLineData(mbfReader, " ");
                    break;

                case "ACCESSION":
                    data = ParseMultiLineData(mbfReader, " ");
                    metadata.Accession = new GenBankAccession();
                    string[] accessions = data.Split(' ');
                    // the first accession is the primary one; the rest are secondary
                    metadata.Accession.Primary = accessions[0];

                    for (int i = 1; i < accessions.Length; i++)
                    {
                        metadata.Accession.Secondary.Add(accessions[i]);
                    }
                    break;

                case "DBLINK":
                    // expected form is "type:number[,number...]"
                    tokens = mbfReader.LineData.Split(':');
                    if (tokens.Length == 2)
                    {
                        metadata.DBLink = new CrossReferenceLink();
                        // any type other than Project is stored as TraceAssemblyArchive
                        if (string.Compare(tokens[0],
                                           CrossReferenceType.Project.ToString(),
                                           StringComparison.OrdinalIgnoreCase) == 0)
                        {
                            metadata.DBLink.Type = CrossReferenceType.Project;
                        }
                        else
                        {
                            metadata.DBLink.Type = CrossReferenceType.TraceAssemblyArchive;
                        }

                        tokens = tokens[1].Split(',');
                        for (int i = 0; i < tokens.Length; i++)
                        {
                            metadata.DBLink.Numbers.Add(tokens[i]);
                        }
                    }
                    else
                    {
                        ApplicationLog.WriteLine("WARN: unexpected DBLINK header: " + mbfReader.Line);
                    }
                    mbfReader.GoToNextLine();
                    break;

                case "DBSOURCE":
                    metadata.DBSource = ParseMultiLineData(mbfReader, " ");
                    break;

                case "KEYWORDS":
                    metadata.Keywords = ParseMultiLineData(mbfReader, " ");
                    break;

                case "SEGMENT":
                    data = ParseMultiLineData(mbfReader, " ");
                    // Note: ToCharArray makes this split on the individual characters
                    // 'o' and 'f'; with empty entries removed, "1 of 3" yields the two
                    // tokens "1 " and " 3".
                    string delimeter = "of";
                    tokens = data.Split(delimeter.ToCharArray(), StringSplitOptions.RemoveEmptyEntries);
                    int outvalue;

                    // ParseMultiLineData has already moved the reader past the SEGMENT
                    // lines, so the warnings below report the parsed data rather than
                    // the reader's current line.
                    if (tokens.Length == 2)
                    {
                        metadata.Segment = new SequenceSegment();
                        if (int.TryParse(tokens[0].Trim(), out outvalue))
                        {
                            metadata.Segment.Current = outvalue;
                        }
                        else
                        {
                            ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + data);
                        }

                        if (int.TryParse(tokens[1].Trim(), out outvalue))
                        {
                            metadata.Segment.Count = outvalue;
                        }
                        else
                        {
                            ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + data);
                        }
                    }
                    else
                    {
                        ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + data);
                    }
                    break;

                // all the following indicate sections beyond the headers parsed by this method
                case "FEATURES":
                case "BASE COUNT":
                case "ORIGIN":
                case "CONTIG":
                    haveFinishedHeaders = true;
                    break;

                default:
                    ApplicationLog.WriteLine(ToString() + "WARN: unknown {0} -> {1}", mbfReader.LineHeader, mbfReader.LineData);
                    string errMessage = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ParseHeaderError,
                        mbfReader.LineHeader);
                    Trace.Report(errMessage);
                    throw new InvalidDataException(errMessage);
                }
            }

            // a LOCUS line is required
            if (!haveParsedLocus)
            {
                string message = string.Format(CultureInfo.CurrentCulture, Resource.INVALID_INPUT_FILE, this.Name);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }
        }
コード例 #8
0
        // Writes the LOCUS line: sequence ID, length, unit ("aa" or "bp"), strand
        // type, molecule type, strand topology, division code and date.
        // Strand, topology, division and date come from the metadata's locus info
        // when it is available; otherwise the text fields are empty and the date
        // is the current time.  The date is always formatted with the invariant
        // culture so that the month abbreviation does not depend on the locale of
        // the machine writing the file.
        private void WriteLocus(ISequence sequence, TextWriter writer)
        {
            // determine molecule and sequence type
            GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

            GenBankLocusInfo locusInfo = null;

            if (metadata != null)
            {
                locusInfo = metadata.Locus;
            }

            string molType = sequence.MoleculeType.ToString();
            string seqType;

            if (sequence.MoleculeType != MoleculeType.Invalid)
            {
                if (sequence.MoleculeType == MoleculeType.Protein)
                {
                    seqType = "aa";
                    molType = string.Empty; // protein files don't use molecule type
                }
                else
                {
                    seqType = "bp";
                }
            }
            else
            {
                // no usable molecule type: fall back to the sequence's alphabet
                if (sequence.Alphabet == Alphabets.Protein)
                {
                    seqType = "aa";
                    molType = string.Empty; // protein files don't use molecule type
                }
                else
                {
                    seqType = "bp";

                    // anything that is not the DNA alphabet is written as RNA
                    if (sequence.Alphabet == Alphabets.DNA)
                    {
                        molType = MoleculeType.DNA.ToString();
                    }
                    else
                    {
                        molType = MoleculeType.RNA.ToString();
                    }
                }
            }

            // retrieve metadata fields
            string   strandType     = string.Empty;
            string   strandTopology = string.Empty;
            string   division       = string.Empty;
            DateTime date           = DateTime.Now;

            if (locusInfo != null)
            {
                strandType = Helper.GetStrandType(locusInfo.Strand);

                strandTopology = Helper.GetStrandTopology(locusInfo.StrandTopology);
                if (locusInfo.DivisionCode != SequenceDivisionCode.None)
                {
                    division = locusInfo.DivisionCode.ToString();
                }

                date = locusInfo.Date;
            }

            // "MMM" and upper-casing are culture-sensitive, so both use the
            // invariant culture to always produce e.g. "01-JAN-2010".
            string dateText = date.ToString("dd-MMM-yyyy", CultureInfo.InvariantCulture).ToUpperInvariant();

            writer.WriteLine("{0,-12}{1,-16} {2,11} {3} {4,3}{5,-6}  {6,-8} {7,3} {8}",
                             "LOCUS",
                             sequence.ID,
                             sequence.Count,
                             seqType,
                             strandType,
                             molType,
                             strandTopology,
                             division,
                             dateText);
        }