/// <summary>
/// Parses a locus string into a <see cref="GenBankLocusInfo"/>.
/// </summary>
/// <remarks>
/// The text is split on spaces and every non-empty token is offered to the
/// ordered list of token parsers. The first parser that accepts a token is
/// removed from the list so it cannot claim a later token.
/// </remarks>
/// <param name="locusText">Locus text.</param>
/// <returns>
/// Locus containing the info in the passed in string.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="locusText"/> is null.</exception>
/// <exception cref="Exception">No sequence type token was recognized in the text.</exception>
public GenBankLocusInfo Parse(string locusText)
{
    if (locusText == null)
    {
        // Fail with a descriptive error instead of a NullReferenceException on Split.
        throw new ArgumentNullException("locusText");
    }

    this.line = locusText;

    var locus = new GenBankLocusInfo();
    var tokenParsers = GetLocusTokenParsers(locus);

    foreach (string token in this.line.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries))
    {
        // Copy to a local so the lambda below does not capture the loop variable.
        string currentToken = token;
        Func<string, bool> usedParser = tokenParsers.FirstOrDefault(parser => parser(currentToken));

        if (usedParser == null)
        {
            // No remaining parser recognizes this token: report it and move on.
            Trace.Report(string.Format(
                CultureInfo.InvariantCulture,
                Resource.GenBankFailedToParseLocusTokenFormat,
                token,
                this.line));
            continue;
        }

        // Each parser is single-use; drop it once it has consumed a token.
        tokenParsers.Remove(usedParser);
    }

    if (string.IsNullOrEmpty(locus.SequenceType))
    {
        string message = string.Format(
            CultureInfo.InvariantCulture,
            Resource.GenBankUnknownLocusFormat,
            this.line);
        Trace.Report(message);

        // Exception type kept as-is so existing callers that catch it keep working.
        throw new Exception(message);
    }

    // A sequence type of "aa" forces the molecule type to protein.
    if (string.Equals(locus.SequenceType, "aa", StringComparison.OrdinalIgnoreCase))
    {
        locus.MoleculeType = MoleculeType.Protein;
    }

    return locus;
}
/// <summary>
/// Writes the LOCUS line for a sequence to the given writer.
/// </summary>
/// <remarks>
/// The date is always formatted with the invariant culture and upper-cased
/// invariantly, so the month abbreviation does not depend on the culture of
/// the machine writing the file.
/// </remarks>
/// <param name="sequence">Sequence whose ID, count, molecule type, alphabet and GenBank metadata are written.</param>
/// <param name="writer">Writer that receives the formatted LOCUS line.</param>
private void WriteLocus(ISequence sequence, TextWriter writer)
{
    // NOTE(review): the indexer is kept from the original; presumably the caller
    // guarantees the GenBank metadata key is present - confirm.
    GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
    GenBankLocusInfo locusInfo = null;
    if (metadata != null)
    {
        locusInfo = metadata.Locus;
    }

    // determine molecule and sequence type
    bool moleculeKnown = sequence.MoleculeType != MoleculeType.Invalid;
    bool isProtein = moleculeKnown
        ? sequence.MoleculeType == MoleculeType.Protein
        : sequence.Alphabet == Alphabets.Protein;

    string seqType = isProtein ? "aa" : "bp";
    string molType;
    if (isProtein)
    {
        // protein files don't use molecule type
        molType = string.Empty;
    }
    else if (moleculeKnown)
    {
        molType = sequence.MoleculeType.ToString();
    }
    else if (sequence.Alphabet == Alphabets.DNA)
    {
        // Molecule type not set: infer it from the alphabet.
        molType = MoleculeType.DNA.ToString();
    }
    else
    {
        molType = MoleculeType.RNA.ToString();
    }

    // retrieve metadata fields; without locus info the fields stay empty and the date is "now"
    string strandType = string.Empty;
    string strandTopology = string.Empty;
    string division = string.Empty;
    DateTime date = DateTime.Now;

    if (locusInfo != null)
    {
        strandType = Helper.GetStrandType(locusInfo.Strand);
        strandTopology = Helper.GetStrandTopology(locusInfo.StrandTopology);
        if (locusInfo.DivisionCode != SequenceDivisionCode.None)
        {
            division = locusInfo.DivisionCode.ToString();
        }

        date = locusInfo.Date;
    }

    // Invariant culture keeps the month abbreviation and upper-casing
    // independent of the current thread culture.
    string dateText = date
        .ToString("dd-MMM-yyyy", System.Globalization.CultureInfo.InvariantCulture)
        .ToUpperInvariant();

    writer.WriteLine(
        "{0,-12}{1,-16} {2,11} {3} {4,3}{5,-6} {6,-8} {7,3} {8}",
        "LOCUS",
        sequence.ID,
        sequence.Count,
        seqType,
        strandType,
        molType,
        strandTopology,
        division,
        dateText);
}
/// <summary>
/// The LOCUS format has defined positions for each individual value in the LOCUS but through experimentation
/// and some reading this format is not followed. Instead we have to parse each token and interpret which value
/// each token belongs to. Luckily there is a standard set of values for all but the DATE and LOCUS ID, which we can
/// infer based on the string.
/// </summary>
/// <param name="locus">Locus info that the returned parsers write into as they examine tokens.</param>
/// <returns>
/// Ordered list of token parsers. Each parser returns true when it recognizes the token it was given;
/// the final parser accepts any token as the sequence name.
/// </returns>
private static List<Func<string, bool>> GetLocusTokenParsers(GenBankLocusInfo locus)
{
    //
    // The order of token parsers matter, the items which we know definitions for must be parsed
    // before those which are inferred.
    //
    return new List<Func<string, bool>>
    {
        //
        // 1. LOCUS Token
        //
        token => token == "LOCUS",

        //
        // 2. Strand Topology
        //
        token =>
        {
            locus.StrandTopology = FirstOrDefault(
                LocusConstants.MoleculeTopologies,
                topology => string.Equals(topology.Key, token, StringComparison.OrdinalIgnoreCase),
                new KeyValuePair<string, SequenceStrandTopology>("", SequenceStrandTopology.None)).Value;
            return locus.StrandTopology != SequenceStrandTopology.None;
        },

        //
        // 3. Strand & Molecule Definition Token
        //
        token =>
        {
            //
            // Strand and molecule definition are one token defining two separate attributes, such
            // as ds-DNA so the parsing occurs on one token.
            //
            var strandEntry = FirstOrDefault(
                LocusConstants.SequenceStrandTypes,
                strand => token.StartsWith(strand.Key, StringComparison.OrdinalIgnoreCase),
                new KeyValuePair<string, SequenceStrandType>("", SequenceStrandType.None));
            locus.Strand = strandEntry.Value;

            // Strip the strand prefix using the key that actually matched, so the
            // removed length always corresponds to the matched prefix.
            string moleculeText = token;
            if (locus.Strand != SequenceStrandType.None)
            {
                moleculeText = token.Substring(strandEntry.Key.Length);
            }

            locus.MoleculeType = FirstOrDefault(
                LocusConstants.MoleculeTypes,
                moleculeType => string.Equals(moleculeText, moleculeType.Key, StringComparison.OrdinalIgnoreCase),
                new KeyValuePair<string, MoleculeType>("", MoleculeType.Invalid)).Value;

            // The token is claimed if either half was recognized.
            return locus.MoleculeType != MoleculeType.Invalid || locus.Strand != SequenceStrandType.None;
        },

        //
        // 4. Division Code
        //
        token =>
        {
            locus.DivisionCode = FirstOrDefault(
                LocusConstants.SequenceDivisionCodes,
                divisionCode => string.Equals(divisionCode.Key, token, StringComparison.OrdinalIgnoreCase),
                new KeyValuePair<string, SequenceDivisionCode>("", SequenceDivisionCode.None)).Value;
            return locus.DivisionCode != SequenceDivisionCode.None;
        },

        //
        // 5. Sequence Length
        //
        token =>
        {
            int length;
            bool parsed = int.TryParse(
                token,
                System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture,
                out length);
            if (parsed)
            {
                locus.SequenceLength = length;
            }

            return parsed;
        },

        //
        // 6. Sequence Type
        //
        token =>
        {
            locus.SequenceType = LocusConstants.SequenceTypes.FirstOrDefault(
                sequenceType => string.Equals(sequenceType, token, StringComparison.OrdinalIgnoreCase));
            return !string.IsNullOrEmpty(locus.SequenceType);
        },

        //
        // 7. Date
        //
        token =>
        {
            // Parse with the invariant culture so English month abbreviations
            // are recognized regardless of the current thread culture.
            DateTime dateTime;
            bool parsed = DateTime.TryParse(
                token,
                System.Globalization.CultureInfo.InvariantCulture,
                System.Globalization.DateTimeStyles.None,
                out dateTime);
            if (parsed)
            {
                locus.Date = dateTime;
            }

            return parsed;
        },

        //
        // 8. Sequence Name / ID - accepts whatever token reaches it
        //
        token =>
        {
            locus.Name = token;
            return true;
        }
    };
}