Esempio n. 1
0
        /// <summary>
        /// Parses a locus string into a <see cref="GenBankLocusInfo"/>.
        /// </summary>
        /// <remarks>
        /// The text is split on single spaces and every non-empty token is offered to the
        /// token parsers in order. The first parser that accepts a token is removed from
        /// the list, so each parser consumes at most one token.
        /// </remarks>
        /// <param name="locusText">Locus text.</param>
        /// <returns>
        /// Locus containing the info in the passed in string.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="locusText"/> is null.
        /// </exception>
        /// <exception cref="Exception">
        /// No token was recognized as a sequence type.
        /// </exception>
        public GenBankLocusInfo Parse(string locusText)
        {
            if (locusText == null)
            {
                throw new ArgumentNullException("locusText");
            }

            line = locusText;

            var locus        = new GenBankLocusInfo();
            var tokenParsers = GetLocusTokenParsers(locus);

            foreach (string token in line.Split(' '))
            {
                // Consecutive spaces produce empty entries; they carry no information.
                if (string.IsNullOrEmpty(token))
                {
                    continue;
                }

                // Copy to a local so the lambda does not capture the loop variable.
                string currentToken = token;
                Func <string, bool> usedParser = tokenParsers.FirstOrDefault(parser => parser(currentToken));

                if (usedParser == null)
                {
                    // No remaining parser accepted the token: report it and move on,
                    // there is nothing to remove from the parser list.
                    Trace.Report(string.Format(CultureInfo.InvariantCulture,
                                               Properties.Resource.GenBankFailedToParseLocusTokenFormat, token, line));
                    continue;
                }

                tokenParsers.Remove(usedParser);
            }

            if (string.IsNullOrEmpty(locus.SequenceType))
            {
                string message = string.Format(CultureInfo.InvariantCulture,
                                               Properties.Resource.GenBankUnknownLocusFormat, line);
                Trace.Report(message);

                // The exception type is kept as-is so existing callers keep working.
                throw new Exception(message);
            }

            // A sequence type of "aa" forces the molecule type to protein.
            if (string.Equals(locus.SequenceType, "aa", StringComparison.OrdinalIgnoreCase))
            {
                locus.MoleculeType = MoleculeType.Protein;
            }

            return locus;
        }
Esempio n. 2
0
        /// <summary>
        /// The LOCUS format has defined positions for each individual value in the LOCUS but through experimentation
        /// and some reading this format is not followed.  Instead we have to parse each token and interpret which value
        /// each token belongs to.  Luckily there is a standard set of values for all but the DATE and LOCUS ID, which we can
        /// infer based on the string.
        /// </summary>
        /// <remarks>
        /// Each parser writes its result into <paramref name="locus"/> every time it is invoked, including
        /// the "not recognized" value when the token does not match, and returns true only on a match.
        /// Numbers and dates are parsed with the invariant culture so the result does not depend on the
        /// culture of the current thread.
        /// </remarks>
        /// <param name="locus">Locus instance that the returned parsers populate.</param>
        /// <returns>
        /// Ordered list of token parsers; each returns true when it recognized the token it was given.
        /// </returns>
        private static List <Func <string, bool> > GetLocusTokenParsers(GenBankLocusInfo locus)
        {
            //
            // The order of token parsers matter, the items which we know definitions for must be parsed
            // before those which are inferred.
            //
            return
                (new List <Func <string, bool> >
            {
                //
                // 1. LOCUS Token
                //

                token => token == "LOCUS",

                //
                // 2. Strand Topology
                //

                token =>
                {
                    locus.StrandTopology = FirstOrDefault(
                        LocusConstants.MoleculeTopologies,
                        topology => string.Equals(topology.Key, token, StringComparison.OrdinalIgnoreCase),
                        new KeyValuePair <string, SequenceStrandTopology>("", SequenceStrandTopology.None)).Value;

                    return locus.StrandTopology != SequenceStrandTopology.None;
                },

                //
                // 3. Strand & Molecule Definition Token
                //

                token =>
                {
                    //
                    // Strand and molecule definition are one token defining two separate attributes, such
                    // as ds-DNA so the parsing occurs on one token.
                    //

                    var strandEntry = FirstOrDefault(
                        LocusConstants.SequenceStrandTypes,
                        strand => token.StartsWith(strand.Key, StringComparison.OrdinalIgnoreCase),
                        new KeyValuePair <string, SequenceStrandType>("", SequenceStrandType.None));

                    locus.Strand = strandEntry.Value;

                    // Strip the strand prefix that actually matched; the rest names the molecule.
                    string moleculeText = token;
                    if (locus.Strand != SequenceStrandType.None)
                    {
                        moleculeText = token.Substring(strandEntry.Key.Length);
                    }

                    locus.MoleculeType = FirstOrDefault(
                        LocusConstants.AlphabetTypes,
                        moleculeType => string.Equals(moleculeText, moleculeType.Key, StringComparison.OrdinalIgnoreCase),
                        new KeyValuePair <string, MoleculeType>("", MoleculeType.Invalid)).Value;

                    return locus.MoleculeType != MoleculeType.Invalid ||
                           locus.Strand != SequenceStrandType.None;
                },

                //
                // 4. Division Code
                //

                token =>
                {
                    locus.DivisionCode = FirstOrDefault(
                        LocusConstants.SequenceDivisionCodes,
                        divisionCode => string.Equals(divisionCode.Key, token, StringComparison.OrdinalIgnoreCase),
                        new KeyValuePair <string, SequenceDivisionCode>("", SequenceDivisionCode.None)).Value;

                    return locus.DivisionCode != SequenceDivisionCode.None;
                },

                //
                // 5. Sequence Length
                //

                token =>
                {
                    int length;
                    bool parsed = int.TryParse(
                        token,
                        System.Globalization.NumberStyles.Integer,
                        System.Globalization.CultureInfo.InvariantCulture,
                        out length);
                    if (parsed)
                    {
                        locus.SequenceLength = length;
                    }
                    return parsed;
                },

                //
                // 6. Sequence Type
                //

                token =>
                {
                    locus.SequenceType = LocusConstants.SequenceTypes.FirstOrDefault(
                        sequenceType => string.Equals(sequenceType, token, StringComparison.OrdinalIgnoreCase));

                    return !string.IsNullOrEmpty(locus.SequenceType);
                },

                //
                // 7. Date
                //

                token =>
                {
                    DateTime dateTime;
                    bool parsed = DateTime.TryParse(
                        token,
                        System.Globalization.CultureInfo.InvariantCulture,
                        System.Globalization.DateTimeStyles.None,
                        out dateTime);
                    if (parsed)
                    {
                        locus.Date = dateTime;
                    }
                    return parsed;
                },

                //
                // 8. Sequence Name / ID
                //    Catch-all: accepts any token that no earlier parser recognized.
                //

                token =>
                {
                    locus.Name = token;
                    return true;
                }
            });
        }
Esempio n. 3
0
        /// <summary>
        /// Parses the LOCUS line, which is the first line in a GenBank record, and stores the
        /// result in the GenBank metadata of <paramref name="sequence"/>.
        /// </summary>
        /// <param name="bioReader">Reader positioned on the LOCUS line; advanced to the next line on success.</param>
        /// <param name="sequence">
        /// Sequence being populated. It is replaced by a new instance when the alphabet derived
        /// from the locus differs from the sequence's current alphabet.
        /// </param>
        /// <exception cref="InvalidDataException">
        /// The line has fewer than three tokens, or the sequence type, strand type or strand topology
        /// is not recognized, or the alphabet conflicts with the configured one.
        /// </exception>
        /// <exception cref="InvalidOperationException">The sequence length is not an integer.</exception>
        /// <exception cref="FormatException">The date or the molecule type cannot be parsed.</exception>
        private void ParseLocus(BioTextReader bioReader, ref Sequence sequence)
        {
            GenBankLocusInfo locusInfo = new GenBankLocusInfo();

            // GenBank spec recommends token rather than position-based parsing, but this
            // is only partially possible without making extra assumptions about the presence
            // of optional fields.
            string[] tokens = bioReader.LineData.Split(new char[] { ' ' },
                                                       StringSplitOptions.RemoveEmptyEntries);

            // Name, length and sequence type are read by token index below, so a shorter
            // line is rejected up front instead of failing with an index error.
            if (tokens.Length < 3)
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParserInvalidLocus,
                    bioReader.Line);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }

            sequence.ID    = tokens[0];
            locusInfo.Name = tokens[0];

            int sequenceLength;

            if (!int.TryParse(tokens[1], NumberStyles.Integer, CultureInfo.InvariantCulture, out sequenceLength))
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParserInvalidLocus,
                    bioReader.Line);
                Trace.Report(message);
                throw new InvalidOperationException(message);
            }
            locusInfo.SequenceLength = sequenceLength;

            string seqType = tokens[2];

            if (seqType != "bp" && seqType != "aa")
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParserInvalidLocus,
                    bioReader.Line);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }

            // Determine format version and parse the remaining fields by position.
            string strandType;
            string strandTopology;
            string division;
            string rawDate;
            string molType = string.Empty;

            if (Helper.StringHasMatch(bioReader.GetLineField(31, 32), "bp", "aa"))
            {
                // older format
                strandType     = bioReader.GetLineField(34, 36).Trim();
                strandTopology = bioReader.GetLineField(43, 52).Trim();
                division       = bioReader.GetLineField(53, 56).Trim();
                rawDate        = bioReader.GetLineField(63).Trim();

                // molecule type field is not used for amino acid chains
                if (seqType != "aa")
                {
                    molType = bioReader.GetLineField(37, 42).Trim();
                }
            }
            else
            {
                // newer format
                strandType     = bioReader.GetLineField(45, 47).Trim();
                strandTopology = bioReader.GetLineField(56, 63).Trim();
                division       = bioReader.GetLineField(65, 67).Trim();
                rawDate        = bioReader.GetLineField(69).Trim();

                // molecule type field is not used for amino acid chains
                if (seqType != "aa")
                {
                    molType = bioReader.GetLineField(48, 53).Trim();
                }
            }

            // process strand type
            if (!Helper.StringHasMatch(strandType, string.Empty, "ss-", "ds-", "ms-"))
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParserInvalidLocus,
                    bioReader.Line);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }
            locusInfo.Strand = Helper.GetStrandType(strandType);

            // process strand topology
            if (!Helper.StringHasMatch(strandTopology, string.Empty, "linear", "circular"))
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParserInvalidStrand,
                    strandTopology);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }

            locusInfo.StrandTopology = Helper.GetStrandTopology(strandTopology);

            // process division; an unrecognized or empty division maps to None
            try
            {
                locusInfo.DivisionCode = (SequenceDivisionCode)Enum.Parse(typeof(SequenceDivisionCode), division);
            }
            catch (ArgumentException)
            {
                locusInfo.DivisionCode = SequenceDivisionCode.None;
            }

            // process date; parsed with the invariant culture so English month
            // abbreviations are understood regardless of the current thread culture
            DateTime date;

            if (!DateTime.TryParse(rawDate, CultureInfo.InvariantCulture, DateTimeStyles.None, out date))
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParserInvalidDate,
                    rawDate);
                Trace.Report(message);
                throw new FormatException(message);
            }

            locusInfo.Date         = date;
            locusInfo.SequenceType = seqType;

            // process sequence type and molecule type
            MoleculeType moleculeType;

            if (seqType == "aa")
            {
                moleculeType = MoleculeType.Protein;
            }
            else
            {
                moleculeType = GetMoleculeType(molType);

                if (moleculeType == MoleculeType.Invalid)
                {
                    string message = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ParserInvalidLocus,
                        bioReader.Line);
                    Trace.Report(message);
                    throw new FormatException(message);
                }
            }

            IAlphabet alphabet = GetAlphabet(moleculeType);

            if (alphabet != sequence.Alphabet)
            {
                // An explicitly configured alphabet must agree with the one the locus implies.
                if (Alphabet != null && Alphabet != alphabet)
                {
                    string message = Properties.Resource.ParserIncorrectAlphabet;
                    Trace.Report(message);
                    throw new InvalidDataException(message);
                }
                sequence            = new Sequence(alphabet, Encoding, sequence);
                sequence.IsReadOnly = false;
            }

            sequence.MoleculeType  = moleculeType;
            locusInfo.MoleculeType = moleculeType;
            GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

            metadata.Locus = locusInfo;
            bioReader.GoToNextLine();
        }
Esempio n. 4
0
        /// <summary>
        /// Writes the LOCUS line for <paramref name="sequence"/> to <paramref name="txtWriter"/>.
        /// </summary>
        /// <remarks>
        /// Molecule type, strand, topology, division and date come from the GenBank locus metadata
        /// when it is present. Otherwise the molecule type is derived from the sequence alphabet,
        /// the optional fields are left empty and the current date is written.
        /// </remarks>
        /// <param name="sequence">Sequence whose locus information is written.</param>
        /// <param name="txtWriter">Writer that receives the formatted LOCUS line.</param>
        private static void WriteLocus(ISequence sequence, TextWriter txtWriter)
        {
            // determine molecule and sequence type
            GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

            GenBankLocusInfo locusInfo = null;
            string           molType   = sequence.Alphabet.Name;

            // Metadata can exist without a locus; fall back to the alphabet-derived
            // defaults in that case instead of dereferencing a null locus.
            if (metadata != null && metadata.Locus != null)
            {
                locusInfo = metadata.Locus;
                molType   = locusInfo.MoleculeType.ToString();
            }

            string seqType;

            if (sequence.Alphabet.Name != null)
            {
                if (molType == Alphabets.Protein.Name)
                {
                    seqType = "aa";
                    molType = string.Empty; // protein files don't use molecule type
                }
                else
                {
                    seqType = "bp";
                }
            }
            else
            {
                // Alphabet has no name: decide by comparing against the known alphabets.
                if (sequence.Alphabet == Alphabets.Protein)
                {
                    seqType = "aa";
                    molType = string.Empty; // protein files don't use molecule type
                }
                else
                {
                    seqType = "bp";

                    if (sequence.Alphabet == Alphabets.DNA)
                    {
                        molType = Alphabets.DNA.Name;
                    }
                    else
                    {
                        molType = Alphabets.RNA.Name;
                    }
                }
            }

            // retrieve metadata fields
            string   strandType     = string.Empty;
            string   strandTopology = string.Empty;
            string   division       = string.Empty;
            DateTime date           = DateTime.Now;

            if (locusInfo != null)
            {
                strandType = Helper.GetStrandType(locusInfo.Strand);

                strandTopology = Helper.GetStrandTopology(locusInfo.StrandTopology);
                if (locusInfo.DivisionCode != SequenceDivisionCode.None)
                {
                    division = locusInfo.DivisionCode.ToString();
                }

                date = locusInfo.Date;
            }

            // Fixed-width layout; the date is written as upper-case dd-MMM-yyyy.
            txtWriter.WriteLine("{0,-12}{1,-16} {2,11} {3} {4,3}{5,-6}  {6,-8} {7,3} {8}",
                                "LOCUS",
                                sequence.ID,
                                sequence.Count,
                                seqType,
                                strandType,
                                molType,
                                strandTopology,
                                division,
                                date.ToString("dd-MMM-yyyy", CultureInfo.InvariantCulture).ToUpperInvariant());
        }
 /// <summary>
 /// Reports whether the strand topology recorded in the given locus is circular.
 /// </summary>
 /// <param name="locus">The locus information belonging to the MBF sequence.</param>
 /// <returns>True when the topology is circular; otherwise false.</returns>
 private bool IsCircularStrand(GenBankLocusInfo locus)
 {
     return locus.StrandTopology == SequenceStrandTopology.Circular;
 }