/// <summary>
/// Parses a locus string into a <see cref="GenBankLocusInfo"/>.
/// The text is split on single spaces and every non-empty token is offered to the
/// remaining token parsers in order; the first parser that accepts a token consumes it
/// and is then retired so it cannot claim a second token.
/// </summary>
/// <param name="locusText">Locus text.</param>
/// <returns>
/// Locus containing the info in the passed in string.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="locusText"/> is null.</exception>
/// <exception cref="Exception">No token in the text was recognised as a sequence type.</exception>
public GenBankLocusInfo Parse(string locusText)
{
    if (locusText == null)
    {
        throw new ArgumentNullException("locusText");
    }

    line = locusText;
    var locus = new GenBankLocusInfo();
    var tokenParsers = GetLocusTokenParsers(locus);

    foreach (string token in line.Split(' '))
    {
        // Consecutive spaces produce empty entries; they carry no information.
        if (string.IsNullOrEmpty(token))
        {
            continue;
        }

        // Local copy so the lambda does not close over the foreach variable.
        string currentToken = token;
        Func<string, bool> usedParser = tokenParsers.FirstOrDefault(parser => parser(currentToken));
        if (usedParser == null)
        {
            Trace.Report(string.Format(
                CultureInfo.InvariantCulture,
                Properties.Resource.GenBankFailedToParseLocusTokenFormat,
                token,
                line));
            continue;
        }

        // A parser that accepted a token is done; remove it so later tokens skip it.
        tokenParsers.Remove(usedParser);
    }

    if (string.IsNullOrEmpty(locus.SequenceType))
    {
        string message = string.Format(
            CultureInfo.InvariantCulture,
            Properties.Resource.GenBankUnknownLocusFormat,
            line);
        Trace.Report(message);

        // Exception type intentionally left unchanged so existing callers keep working.
        throw new Exception(message);
    }

    // An amino-acid sequence type overrides whatever molecule type the tokens produced.
    if (string.Equals(locus.SequenceType, "aa", StringComparison.OrdinalIgnoreCase))
    {
        locus.MoleculeType = MoleculeType.Protein;
    }

    return locus;
}
/// <summary>
/// The LOCUS format has defined positions for each individual value in the LOCUS but through experimentation
/// and some reading this format is not followed. Instead we have to parse each token and interpret which value
/// each token belongs to. Luckily there is a standard set of values for all but the DATE and LOCUS ID, which we can
/// infer based on the string.
/// </summary>
/// <param name="locus">Locus object that the returned parsers write their results into.</param>
/// <returns>
/// Ordered list of token parsers. Each parser returns true when it recognised the token it was given.
/// Note that a parser also overwrites its target property with the "not found" value when it does
/// not recognise a token.
/// </returns>
private static List<Func<string, bool>> GetLocusTokenParsers(GenBankLocusInfo locus)
{
    //
    // The order of token parsers matter, the items which we know definitions for must be parsed
    // before those which are inferred.
    //
    return new List<Func<string, bool>>
    {
        //
        // 1. LOCUS Token
        //
        token => token == "LOCUS",

        //
        // 2. Strand Topology
        //
        token =>
        {
            locus.StrandTopology = FirstOrDefault(
                LocusConstants.MoleculeTopologies,
                topology => string.Equals(topology.Key, token, StringComparison.OrdinalIgnoreCase),
                new KeyValuePair<string, SequenceStrandTopology>("", SequenceStrandTopology.None)).Value;
            return locus.StrandTopology != SequenceStrandTopology.None;
        },

        //
        // 3. Strand & Molecule Definition Token
        //
        token =>
        {
            //
            // Strand and molecule definition are one token defining two separate attributes, such
            // as ds-DNA so the parsing occurs on one token.
            //
            var strandEntry = FirstOrDefault(
                LocusConstants.SequenceStrandTypes,
                strand => token.StartsWith(strand.Key, StringComparison.OrdinalIgnoreCase),
                new KeyValuePair<string, SequenceStrandType>("", SequenceStrandType.None));
            locus.Strand = strandEntry.Value;

            // Strip the strand prefix that actually matched before looking up the molecule type.
            string moleculeText = token;
            if (locus.Strand != SequenceStrandType.None)
            {
                moleculeText = token.Substring(strandEntry.Key.Length);
            }

            locus.MoleculeType = FirstOrDefault(
                LocusConstants.AlphabetTypes,
                moleculeType => string.Equals(moleculeText, moleculeType.Key, StringComparison.OrdinalIgnoreCase),
                new KeyValuePair<string, MoleculeType>("", MoleculeType.Invalid)).Value;

            return locus.MoleculeType != MoleculeType.Invalid || locus.Strand != SequenceStrandType.None;
        },

        //
        // 4. Division Code
        //
        token =>
        {
            locus.DivisionCode = FirstOrDefault(
                LocusConstants.SequenceDivisionCodes,
                divisionCode => string.Equals(divisionCode.Key, token, StringComparison.OrdinalIgnoreCase),
                new KeyValuePair<string, SequenceDivisionCode>("", SequenceDivisionCode.None)).Value;
            return locus.DivisionCode != SequenceDivisionCode.None;
        },

        //
        // 5. Sequence Length
        //
        token =>
        {
            // The file content is machine-readable, so parse independently of the user's culture.
            int length;
            bool result = int.TryParse(token, NumberStyles.Integer, CultureInfo.InvariantCulture, out length);
            if (result)
            {
                locus.SequenceLength = length;
            }

            return result;
        },

        //
        // 6. Sequence Type
        //
        token =>
        {
            locus.SequenceType = LocusConstants.SequenceTypes.FirstOrDefault(
                sequenceType => string.Equals(sequenceType, token, StringComparison.OrdinalIgnoreCase));
            return !string.IsNullOrEmpty(locus.SequenceType);
        },

        //
        // 7. Date
        //
        token =>
        {
            // Invariant culture so month abbreviations parse the same on every machine.
            DateTime dateTime;
            bool result = DateTime.TryParse(token, CultureInfo.InvariantCulture, DateTimeStyles.None, out dateTime);
            if (result)
            {
                locus.Date = dateTime;
            }

            return result;
        },

        //
        // 8. Sequence Name / ID (catch-all: accepts any token, so it must stay last)
        //
        token =>
        {
            locus.Name = token;
            return true;
        }
    };
}
/// <summary>
/// Parses the LOCUS line (the first line in a GenBank record) from the reader's current line,
/// stores the result in the sequence's GenBank metadata and advances the reader to the next line.
/// Name, length and sequence type are read as whitespace-separated tokens; the remaining fields
/// are read by column position, using one of two column layouts.
/// </summary>
/// <param name="bioReader">Reader whose current line is expected to be the LOCUS line.</param>
/// <param name="sequence">
/// Sequence being populated. It is replaced by a new <c>Sequence</c> when the alphabet implied
/// by the LOCUS line differs from the sequence's current alphabet.
/// </param>
/// <exception cref="InvalidDataException">
/// The line has fewer than three tokens, an unknown sequence type, strand type or topology,
/// or its alphabet conflicts with the parser's configured alphabet.
/// </exception>
/// <exception cref="InvalidOperationException">The sequence length token is not an integer.</exception>
/// <exception cref="FormatException">The date or the molecule type cannot be interpreted.</exception>
private void ParseLocus(BioTextReader bioReader, ref Sequence sequence)
{
    GenBankLocusInfo locusInfo = new GenBankLocusInfo();

    // GenBank spec recommends token rather than position-based parsing, but this
    // is only partially possible without making extra assumptions about the presence
    // of optional fields.
    string[] tokens = bioReader.LineData.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

    // Name, length and sequence type are mandatory; fail with a descriptive error
    // instead of an index-out-of-range on a truncated line.
    if (tokens.Length < 3)
    {
        string message = String.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.ParserInvalidLocus,
            bioReader.Line);
        Trace.Report(message);
        throw new InvalidDataException(message);
    }

    sequence.ID = tokens[0];
    locusInfo.Name = tokens[0];

    int sequenceLength;
    if (!int.TryParse(tokens[1], NumberStyles.Integer, CultureInfo.InvariantCulture, out sequenceLength))
    {
        string message = String.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.ParserInvalidLocus,
            bioReader.Line);
        Trace.Report(message);
        throw new InvalidOperationException(message);
    }

    locusInfo.SequenceLength = sequenceLength;

    string seqType = tokens[2];
    if (seqType != "bp" && seqType != "aa")
    {
        string message = String.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.ParserInvalidLocus,
            bioReader.Line);
        Trace.Report(message);
        throw new InvalidDataException(message);
    }

    // Determine format version and parse the remaining fields by position.
    string strandType;
    string strandTopology;
    string division;
    string rawDate;
    string molType = string.Empty;
    if (Helper.StringHasMatch(bioReader.GetLineField(31, 32), "bp", "aa"))
    {
        // older format
        strandType = bioReader.GetLineField(34, 36).Trim();
        strandTopology = bioReader.GetLineField(43, 52).Trim();
        division = bioReader.GetLineField(53, 56).Trim();
        rawDate = bioReader.GetLineField(63).Trim();

        // molecule type field is not used for amino acid chains
        if (seqType != "aa")
        {
            molType = bioReader.GetLineField(37, 42).Trim();
        }
    }
    else
    {
        // newer format
        strandType = bioReader.GetLineField(45, 47).Trim();
        strandTopology = bioReader.GetLineField(56, 63).Trim();
        division = bioReader.GetLineField(65, 67).Trim();
        rawDate = bioReader.GetLineField(69).Trim();

        // molecule type field is not used for amino acid chains
        if (seqType != "aa")
        {
            molType = bioReader.GetLineField(48, 53).Trim();
        }
    }

    // process strand type
    if (!Helper.StringHasMatch(strandType, string.Empty, "ss-", "ds-", "ms-"))
    {
        string message = String.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.ParserInvalidLocus,
            bioReader.Line);
        Trace.Report(message);
        throw new InvalidDataException(message);
    }

    locusInfo.Strand = Helper.GetStrandType(strandType);

    // process strand topology
    if (!Helper.StringHasMatch(strandTopology, string.Empty, "linear", "circular"))
    {
        string message = String.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.ParserInvalidStrand,
            strandTopology);
        Trace.Report(message);
        throw new InvalidDataException(message);
    }

    locusInfo.StrandTopology = Helper.GetStrandTopology(strandTopology);

    // process division; an unrecognised or empty division falls back to None
    try
    {
        locusInfo.DivisionCode = (SequenceDivisionCode)Enum.Parse(typeof(SequenceDivisionCode), division);
    }
    catch (ArgumentException)
    {
        locusInfo.DivisionCode = SequenceDivisionCode.None;
    }

    // process date; invariant culture so month abbreviations parse the same on every machine
    DateTime date;
    if (!DateTime.TryParse(rawDate, CultureInfo.InvariantCulture, DateTimeStyles.None, out date))
    {
        string message = String.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.ParserInvalidDate,
            rawDate);
        Trace.Report(message);
        throw new FormatException(message);
    }

    locusInfo.Date = date;
    locusInfo.SequenceType = seqType;

    // process sequence type and molecule type
    MoleculeType moleculeType;
    if (seqType == "aa")
    {
        moleculeType = MoleculeType.Protein;
    }
    else
    {
        moleculeType = GetMoleculeType(molType);

        if (moleculeType == MoleculeType.Invalid)
        {
            string message = String.Format(
                CultureInfo.CurrentCulture,
                Properties.Resource.ParserInvalidLocus,
                bioReader.Line);
            Trace.Report(message);
            throw new FormatException(message);
        }
    }

    IAlphabet alphabet = GetAlphabet(moleculeType);
    if (alphabet != sequence.Alphabet)
    {
        // An explicitly configured alphabet must agree with what the file declares.
        if (Alphabet != null && Alphabet != alphabet)
        {
            string message = Properties.Resource.ParserIncorrectAlphabet;
            Trace.Report(message);
            throw new InvalidDataException(message);
        }

        sequence = new Sequence(alphabet, Encoding, sequence);
        sequence.IsReadOnly = false;
    }

    sequence.MoleculeType = moleculeType;
    locusInfo.MoleculeType = moleculeType;

    GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
    metadata.Locus = locusInfo;
    bioReader.GoToNextLine();
}
/// <summary>
/// Writes the LOCUS line for a sequence to the given writer.
/// Molecule type, strand, topology, division and date come from the sequence's GenBank
/// metadata when a locus is present; otherwise they are derived from the alphabet, left
/// empty, or (for the date) set to the current time.
/// </summary>
/// <param name="sequence">Sequence whose ID, count, alphabet and metadata are written.</param>
/// <param name="txtWriter">Writer that receives the formatted LOCUS line.</param>
private static void WriteLocus(ISequence sequence, TextWriter txtWriter)
{
    // determine molecule and sequence type
    GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

    GenBankLocusInfo locusInfo = null;
    string molType = sequence.Alphabet.Name;

    // Metadata may exist without a locus; fall back to the alphabet in that case
    // instead of dereferencing a null locus.
    if (metadata != null && metadata.Locus != null)
    {
        locusInfo = metadata.Locus;
        molType = locusInfo.MoleculeType.ToString();
    }

    string seqType;
    if (sequence.Alphabet.Name != null)
    {
        if (molType == Alphabets.Protein.Name)
        {
            seqType = "aa";
            molType = string.Empty; // protein files don't use molecule type
        }
        else
        {
            seqType = "bp";
        }
    }
    else
    {
        // Unnamed alphabet: identify it by comparing against the known alphabet instances.
        if (sequence.Alphabet == Alphabets.Protein)
        {
            seqType = "aa";
            molType = string.Empty; // protein files don't use molecule type
        }
        else
        {
            seqType = "bp";

            if (sequence.Alphabet == Alphabets.DNA)
            {
                molType = Alphabets.DNA.Name;
            }
            else
            {
                molType = Alphabets.RNA.Name;
            }
        }
    }

    // retrieve metadata fields
    string strandType = string.Empty;
    string strandTopology = string.Empty;
    string division = string.Empty;
    DateTime date = DateTime.Now;

    if (locusInfo != null)
    {
        strandType = Helper.GetStrandType(locusInfo.Strand);

        strandTopology = Helper.GetStrandTopology(locusInfo.StrandTopology);
        if (locusInfo.DivisionCode != SequenceDivisionCode.None)
        {
            division = locusInfo.DivisionCode.ToString();
        }

        date = locusInfo.Date;
    }

    txtWriter.WriteLine("{0,-12}{1,-16} {2,11} {3} {4,3}{5,-6} {6,-8} {7,3} {8}",
                        "LOCUS",
                        sequence.ID,
                        sequence.Count,
                        seqType,
                        strandType,
                        molType,
                        strandTopology,
                        division,
                        date.ToString("dd-MMM-yyyy", CultureInfo.InvariantCulture).ToUpperInvariant());
}
/// <summary>
/// Reports whether the strand topology recorded in the given GenBank locus is circular.
/// </summary>
/// <param name="locus">The locus information belonging to the sequence.</param>
/// <returns>
/// True when the locus topology is <c>SequenceStrandTopology.Circular</c>; otherwise false.
/// </returns>
private bool IsCircularStrand(GenBankLocusInfo locus)
{
    return locus.StrandTopology == SequenceStrandTopology.Circular;
}