/// <summary>
/// Uses the known list of regexes to parse lineIn, keeping the
/// result that fills in the most metadata. In the event of a tie,
/// first result wins - so regex list order matters.
/// Populates the WebSearchInfo field but does not perform
/// the actual search - that's done elsewhere.
/// </summary>
/// <param name="lineIn">the text to be parsed (typically a FASTA header line, with or without its leading '>')</param>
/// <returns>
/// The metadata produced by the regex that filled in the most fields, or null when
/// lineIn is null or empty, or when no regex produced at least one non-empty metadata field.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException">
/// Thrown when a matching regex's replacement output contains a line without a
/// "type:value" colon separator, or a non-empty value whose type is not recognized.
/// </exception>
public ProteinMetadata ParseProteinMetaData(String lineIn)
{
    // Null is treated like empty input rather than dereferenced
    if (String.IsNullOrEmpty(lineIn))
        return null;
    var line = lineIn.Replace('\t', ' '); // regularize whitespace for simpler regexes

    // Skip the leading '>' of the header. If there is a second >, then this is a
    // custom name, and not a real FASTA sequence - skip that one too. The second
    // '>' is only considered when the first character is also '>', so ordinary text
    // whose second character happens to be '>' does not lose its first character.
    int start = 0;
    if (line[0] == '>') // line is non-empty here: Replace(char, char) preserves length
    {
        start = 1;
        if (line.Length > 1 && line[1] == '>')
        {
            start = 2;
        }
    }

    // The text handed to every regex - computed once instead of once per use
    string header = line.Substring(start);

    ProteinMetadata bestResult = null;
    var bestCount = 0;
    foreach (var r in _regexFasta)
    {
        if (!r.RegexPattern.Match(header).Success)
            continue;

        // a hit - now use the replacement expression to get the ProteinMetadata parts,
        // one "type:value" entry per output line
        string[] regexOutputs = r.RegexPattern.Replace(header, r.RegexReplacement).Split('\n');
        var headerResult = new DbProteinName();
        string searchterm = null; // assume no webservice lookup unless told otherwise
        int dbColumnsFound = 0;

        // Walk the entries last-to-first, so when a type appears more than once
        // the earliest entry is assigned last and therefore wins
        for (var n = regexOutputs.Length; n-- > 0;)
        {
            var split = regexOutputs[n].Split(new[] {':'}, 2); // split on first colon only
            if (split.Length != 2)
            {
                // NOTE(review): the single-string ArgumentOutOfRangeException constructor
                // treats its argument as the parameter name, so this text is reported as
                // the parameter name rather than as the message. Left unchanged here
                // (and below) for compatibility.
                throw new ArgumentOutOfRangeException(
                    String.Format("Fasta RegEx failure in \'{0}\'", // Not L10N
                        header));
            }

            var type = split[0].Trim();
            var val = split[1].Trim();
            if (val.Contains("${")) // failed match // Not L10N
            {
                val = String.Empty;
            }
            if (val.Length == 0)
                continue; // nothing captured for this entry (its type is not validated)

            dbColumnsFound++; // valid entry
            switch (type)
            {
                case "name": // Not L10N
                    headerResult.Name = val;
                    break;
                case "description": // Not L10N
                    headerResult.Description = val;
                    break;
                case "accession": // Not L10N
                    headerResult.Accession = val;
                    break;
                case "preferredname": // Not L10N
                    headerResult.PreferredName = val;
                    break;
                case "gene": // Not L10N
                    headerResult.Gene = val;
                    break;
                case "species": // Not L10N
                    headerResult.Species = val;
                    break;
                case "searchterm": // Not L10N
                    dbColumnsFound--; // not actually a db column
                    searchterm = val;
                    break;
                default:
                    throw new ArgumentOutOfRangeException(
                        String.Format("Unknown Fasta RegEx output formatter type \'{0}\'", // Not L10N
                            regexOutputs[n]));
            }
        }

        if (headerResult.GetProteinMetadata().HasMissingMetadata())
        {
            if (searchterm != null)
            {
                // shave off any alternatives (might look like "IPI:IPI00197700.1|SWISS-PROT:P04638|ENSEMBL:ENSRNOP00000004662|REFSEQ:NP_037244")
                searchterm = searchterm.Split('|')[0];
                // a reasonable accession value will have at least one digit in it, and won't have things like tabs and parens and braces that confuse web services
                if ("0123456789".Any(searchterm.Contains) && !" \t()[]".Any(searchterm.Contains)) // Not L10N
                {
                    // we'll need to hit the webservices to get this missing info.
                    // The first character is passed separately from the rest of the term
                    // (presumably a code selecting the web service - confirm against WebSearchTerm)
                    headerResult.SetWebSearchTerm(new WebSearchTerm(searchterm[0], searchterm.Substring(1)));
                }
            }
        }
        if (headerResult.GetProteinMetadata().WebSearchInfo.IsEmpty())
            headerResult.SetWebSearchCompleted(); // no search possible

        // Strictly-greater comparison: a tie goes to the first hit, so order matters
        if (dbColumnsFound > bestCount)
        {
            bestCount = dbColumnsFound; // best match so far
            bestResult = headerResult.GetProteinMetadata();
        }
    }
    return bestResult;
}