Exemplo n.º 1
0
 /// <summary>
 /// Creates a Protein from its database record, copying the sequence and,
 /// when a primary name is supplied, the metadata carried by that name.
 /// </summary>
 /// <param name="proteomeDb">passed through to the base constructor</param>
 /// <param name="protein">database record supplying the sequence; also passed to the base constructor</param>
 /// <param name="primaryName">primary name record, or null if none is available</param>
 internal Protein(ProteomeDbPath proteomeDb, DbProtein protein, DbProteinName primaryName)
     : base(proteomeDb, protein)
 {
     Sequence = protein.Sequence;
     if (primaryName == null)
     {
         return; // nothing further to capture without a primary name
     }

     _proteinMetadata = primaryName.GetProteinMetadata();

     var owner = primaryName.Protein;
     if (owner == null)
     {
         return; // no parent record to enumerate names from
     }

     // Collect the non-primary names up front, rather than going back to the db later
     var alternatives = new List<ProteinMetadata>();
     foreach (var candidate in owner.Names)
     {
         if (candidate.IsPrimary)
         {
             continue;
         }
         alternatives.Add(candidate.GetProteinMetadata());
     }
     _alternativeNames = alternatives;
 }
        /// <summary>
        /// Uses the known list of regexes to parse lineIn, keeping the
        /// result that fills in the most metadata.  In the event of a tie,
        /// first result wins - so regex list order matters.
        /// Populates the WebSearchInfo field but does not perform 
        /// the actual search - that's done elsewhere.
        /// </summary>
        /// <param name="lineIn">the text to be parsed; a leading '&gt;' (or '&gt;&gt;' for a custom name) is skipped before matching</param>
        /// <returns>
        /// the metadata produced by the best-matching regex, or null when lineIn is null or empty,
        /// or when no regex yields at least one metadata column
        /// </returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// a matching regex's replacement expression produced an output line that is not of the
        /// form "type:value", or whose type is not one of the recognized field names
        /// </exception>
        public ProteinMetadata ParseProteinMetaData(String lineIn)
        {
            // Treat null like empty input instead of failing with a NullReferenceException
            if (String.IsNullOrEmpty(lineIn))
                return null;

            var line = lineIn.Replace('\t', ' '); // regularize whitespace for simpler regexes

            // Skip the leading > of a FASTA header. If there is a second >, then this is
            // a custom name, and not a real FASTA sequence - skip that one as well.
            // The second > only counts when it directly follows a leading >, so text
            // such as "a>b" is left intact.
            int start = 0;
            if (line[0] == '>')
            {
                start = (line.Length > 1 && line[1] == '>') ? 2 : 1;
            }
            var header = line.Substring(start); // same text for every regex, so compute it once

            ProteinMetadata bestResult = null;
            var bestCount = 0;
            foreach (var r in _regexFasta)
            {
                Match match = r.RegexPattern.Match(header);
                if (!match.Success)
                    continue;

                // a hit - now use the replacement expression to get the ProteinMetadata parts,
                // one "type:value" entry per output line
                string[] regexOutputs = r.RegexPattern.Replace(header, r.RegexReplacement).Split('\n');
                var headerResult = new DbProteinName();
                string searchterm = null; // assume no webservice lookup unless told otherwise
                int dbColumnsFound = 0;
                // walk the outputs last-to-first, so the earliest entry of a given type is assigned last
                for (var n = regexOutputs.Length - 1; n >= 0; n--)
                {
                    var split = regexOutputs[n].Split(new[] {':'}, 2); // split on first colon only
                    if (split.Length != 2)
                    {
                        // two-argument constructor: the first argument is the parameter name, the second the message
                        throw new ArgumentOutOfRangeException("lineIn", // Not L10N
                            String.Format("Fasta RegEx failure in \'{0}\'",  // Not L10N
                                header));
                    }

                    var type = split[0].Trim();
                    var val = split[1].Trim();
                    if (val.Contains("${")) // failed match - the group reference was left unsubstituted // Not L10N
                    {
                        val = String.Empty;
                    }
                    if (val.Length == 0)
                        continue; // nothing captured for this field

                    dbColumnsFound++; // valid entry
                    switch (type)
                    {
                        case "name": // Not L10N
                            headerResult.Name = val;
                            break;
                        case "description": // Not L10N
                            headerResult.Description = val;
                            break;
                        case "accession": // Not L10N
                            headerResult.Accession = val;
                            break;
                        case "preferredname": // Not L10N
                            headerResult.PreferredName = val;
                            break;
                        case "gene": // Not L10N
                            headerResult.Gene = val;
                            break;
                        case "species": // Not L10N
                            headerResult.Species = val;
                            break;
                        case "searchterm": // Not L10N
                            dbColumnsFound--; // not actually a db column
                            searchterm = val;
                            break;
                        default:
                            throw new ArgumentOutOfRangeException("lineIn", // Not L10N
                                String.Format("Unknown Fasta RegEx output formatter type \'{0}\'",    // Not L10N
                                    regexOutputs[n]));
                    }
                }

                if (searchterm != null && headerResult.GetProteinMetadata().HasMissingMetadata())
                {
                    // shave off any alternatives (might look like "IPI:IPI00197700.1|SWISS-PROT:P04638|ENSEMBL:ENSRNOP00000004662|REFSEQ:NP_037244")
                    searchterm = searchterm.Split('|')[0];
                    // a reasonable accession value will have at least one digit in it, and won't have things like tabs and parens and braces that confuse web services
                    // (the digit requirement also guarantees searchterm is non-empty before it is indexed below)
                    if ("0123456789".Any(searchterm.Contains) && !" \t()[]".Any(searchterm.Contains))  // Not L10N
                        headerResult.SetWebSearchTerm(new WebSearchTerm(searchterm[0], searchterm.Substring(1))); // we'll need to hit the webservices to get this missing info
                }
                if (headerResult.GetProteinMetadata().WebSearchInfo.IsEmpty())
                    headerResult.SetWebSearchCompleted(); // no search possible
                if (dbColumnsFound > bestCount)
                {
                    bestCount = dbColumnsFound; // best match so far - tie goes to the first hit so order matters
                    bestResult = headerResult.GetProteinMetadata();
                }
            }
            return bestResult;
        }