Example #1
0
 /// <summary>
 /// Exercises IpiToUniprotMap.MapToUniprot with IPI identifiers at the edges and middle of the
 /// mapping range, plus identifiers for which the test expects the input to be returned unchanged.
 /// </summary>
 public void TestIpiToUniprot()
 {
     var mapper = new IpiToUniprotMap();

     // Each row is { expected result, query }. Rows are checked in the order listed.
     var cases = new[]
     {
         new[] {"O95793", "IPI0000001"},     // The 0th one
         new[] {"Q28062", "IPI1028485"},     // The nth one
         new[] {"IPI1028486", "IPI1028486"}, // The n+1th nonexistent one
         new[] {"IPI1027205", "IPI1027205"}, // A midrange nonexistent one
         new[] {"Q2V4A8", "IPI0657347"},     // Somewhere in the middle
         new[] {"IPI0000000", "IPI0000000"}, // The -1th nonexistent one
         new[] {"nonsense", "nonsense"},     // The really nonexistent one
         new[] {"O14686", "IPI0297859"},     // Somewhere in the middle
         new[] {"P42025", "IPI029469"}       // Somewhere toward the front
     };

     for (var i = 0; i < cases.Length; i++)
     {
         string expected = cases[i][0];
         string query = cases[i][1];
         Assert.AreEqual(expected, mapper.MapToUniprot(query));
     }
 }
        /// <summary>
        /// Peruse the list of proteins, access webservices as needed to resolve protein metadata.
        /// note: best to use this in its own thread since it may block on web access
        /// </summary>
        /// <remarks>
        /// This is an iterator method (it uses yield return), so none of the work below - including the
        /// web access and the rate-limiting sleeps - happens until the caller enumerates the result.
        /// The work proceeds in two phases:
        /// 1) a local pass that translates IPI identifiers, weeds out search terms that are not worth
        ///    sending to a webservice, and yields every protein that does not need a search;
        /// 2) a batched search pass over each search type in turn (GENINFO, then ENTREZ, then UNIPROTKB),
        ///    adapting the batch size to how well the lookups are going.
        /// </remarks>
        /// <param name="proteinsToSearch">the proteins to populate</param>
        /// <param name="progressMonitor">For checking operation cancellation (may be null, in which case cancellation is never detected)</param>
        /// <param name="singleBatch">if true, just do one batch and plan to come back later for more</param>
        /// <returns>IEnumerable of the proteins it actually populated (some don't need it, some have to wait for a decent web connection)</returns>
        public IEnumerable<ProteinSearchInfo> DoWebserviceLookup(IEnumerable<ProteinSearchInfo> proteinsToSearch, IProgressMonitor progressMonitor, bool singleBatch)
        {
            const int SINGLE_BATCH_SIZE = 500; // If caller has indicated that it wants to do a single batch and return for more later, stop after this many successes
            const int ENTREZ_RATELIMIT = 333; // 1/3 second between requests on Entrez
            const int UNIPROTKB_RATELIMIT = 10; // Minimum msec between requests on UniprotKB
            var searchOrder = new[]
            {
                GENINFO_TAG,
                ENTREZ_TAG,
                UNIPROTKB_TAG // this order matters - we may take entrez results into uniprot for further search
            };
            var minSearchTermLen = new Dictionary<char, int>
            {
                {GENINFO_TAG, 3}, // some gi numbers are quite small
                {ENTREZ_TAG, 6},
                {UNIPROTKB_TAG, 6} // if you feed uniprot a partial search term you get a huge response
            };
            var ratelimit = new Dictionary<char, int>
            {
                {GENINFO_TAG, ENTREZ_RATELIMIT}, // search on Entrez, but don't mix with non-GI searches
                {ENTREZ_TAG, ENTREZ_RATELIMIT},
                {UNIPROTKB_TAG, UNIPROTKB_RATELIMIT}
            };

            // sort out the various webservices so we can batch up
            // (materialize once - the list is walked again for every batch below)
            var proteins = proteinsToSearch.ToArray();
            foreach (var prot in proteins)
            {
                // translate from IPI to Uniprot if needed
                var search = prot.GetProteinMetadata().GetPendingSearchTerm().ToUpperInvariant();
                // Ordinal comparison: this is a machine identifier prefix, not linguistic text,
                // so the match must not depend on the current culture's collation rules.
                if (search.StartsWith("IPI", StringComparison.Ordinal)) // Not L10N
                {
                    if (_ipiMapper == null)
                        _ipiMapper = new IpiToUniprotMap();
                    string mapped = _ipiMapper.MapToUniprot(search);
                    if (mapped == search) // no mapping from that IPI
                    {
                        prot.SetWebSearchCompleted(); // no resolution for that IPI value
                    }
                    else
                    {
                        prot.Accession = mapped; // go ahead and note the Uniprot accession, even though we haven't searched yet
                        prot.SetWebSearchTerm(new WebSearchTerm(UNIPROTKB_TAG,mapped)); 
                    }
                }
                else if (prot.GetProteinMetadata().GetSearchType() == UNIPROTKB_TAG)
                {
                    // Check for homegrown numbering schemes
                    long dummy;
                    if (Int64.TryParse(search, out dummy))
                    {
                        prot.SetWebSearchCompleted();  // All numbers, not a Uniprot ID
                    }
                    // Some uniprot records just don't have gene info - have we already parsed all there is to know from description?
                    else if (prot.GetProteinMetadata().HasMissingMetadata())
                    {
                        // Probe with a placeholder gene: if that alone fills the gap, gene is the only thing missing
                        var test = prot.GetProteinMetadata().ChangeGene("test"); // Not L10N
                        if (!test.HasMissingMetadata()) // We got everything except gene info
                        {
                            // Check to see if this is a standard uniprot formatted header, if so assume that GN was omitted on purpose
                            if (prot.GetProteinMetadata().Description.Contains(string.Format(" OS={0} ", (prot.GetProteinMetadata().Species)))) // Not L10N
                                prot.SetWebSearchCompleted(); // There's not going to be any gene info
                        }
                    }
                }

                if (prot.GetProteinMetadata().NeedsSearch())
                {
                    // if you feed uniprot a partial search term you get a huge response                    
                    int minLen = minSearchTermLen[prot.GetProteinMetadata().GetSearchType()];
                    if (prot.GetProteinMetadata().GetPendingSearchTerm().Length < minLen)
                    {
                        prot.SetWebSearchCompleted(); // don't attempt it
                    }
                }

                if (!(prot.GetProteinMetadata().NeedsSearch()))
                {
                    yield return prot;  // doesn't need search, just pass it back unchanged
                }
            }
            _ipiMapper = null;  // Done with this now

            // CONSIDER(bspratt): Could this be simplified?
            var cancelled = false;
            var politeStopwatch = new Stopwatch();
            var totalSuccessesThisBatch = 0;
            politeStopwatch.Start();
            var furtherUniprotSearches = new List<ProteinSearchInfo>(); // Uniprot searches that result from intermediate searches
            foreach (var searchType in searchOrder)
            {
                // These tallies are per search type; totalSuccessesThisBatch above spans all search types
                var consecutiveFailures = 0;
                var failureCount = 0;
                var successCount = 0;
                cancelled |= ((progressMonitor != null) && progressMonitor.IsCanceled);
                if (cancelled)
                    break;
                var politenessIntervalMsec = ratelimit[searchType];
                var batchsizeIncreaseThreshold = 2;  // If you can do this many in a row at reduced batchsize, try increasing it
                while (true)
                {
                    // Note this is read before any shrinking of _maxBatchSize further down in this pass
                    var idealBatchsize = _maxBatchSize[searchType];
                    cancelled |= ((progressMonitor != null) && progressMonitor.IsCanceled);
                    if (cancelled)
                        break;
                    var type = searchType; // local copy for use in the lambda below
                    // Follow-on Uniprot searches produced by earlier search types take priority over the original Uniprot searches
                    var furtherUniprotSearchesToProcess = Equals(searchType, UNIPROTKB_TAG)
                        ? furtherUniprotSearches.Where(s => (s.GetProteinMetadata().NeedsSearch())).ToList()
                        : null;
                    var searchlist = (furtherUniprotSearchesToProcess != null && furtherUniprotSearchesToProcess.Any()) 
                        ? furtherUniprotSearchesToProcess 
                        : proteins.Where(s => (s.GetProteinMetadata().NeedsSearch() &&
                                               s.GetProteinMetadata().GetSearchType() == type)).ToList();
                    if (!searchlist.Any())
                        break; // Nothing left to do for this search type
                    // try to batch up requests - reduce batch size if responses are ambiguous
                    var nextSearch = 0;
                    var searches = new List<ProteinSearchInfo>();
                    while ((nextSearch < searchlist.Count) && (searches.Count < _batchsize[searchType]))
                    {
                        // Unique sequence length helps a lot in disambiguation
                        // (anything skipped here stays in need of search and is picked up by a later pass)
                        if (searches.All(s => s.SeqLength != searchlist[nextSearch].SeqLength))
                            searches.Add(searchlist[nextSearch]);
                        nextSearch++;
                    }

                    // Be a good citizen - no more than three hits per second for Entrez
                    var snoozeMs = politenessIntervalMsec - politeStopwatch.ElapsedMilliseconds;
                    if (snoozeMs > 0)
                        Thread.Sleep((int)snoozeMs);
                    politeStopwatch.Restart();
                    // Check again, since we may have just slept for a while
                    cancelled |= ((progressMonitor != null) && progressMonitor.IsCanceled);
                    if (cancelled)
                        break;
                    var processed = DoWebserviceLookup(searches, searchType, progressMonitor);
                    if (processed < 0) // Returns negative on web access error
                    {
                        if (processed == -2)
                        {
                            // We're creating URLs that are too long
                            _maxBatchSize[searchType] = Math.Max(1, (int)(_maxBatchSize[searchType] * 0.75));
                        }
                        else
                        {
                            // Some error, we should just try again later so don't retry now
                            break;
                        }
                    }
                    var success = false;
                    foreach (var s in searches)
                    {
                        if (s.GetProteinMetadata().GetPendingSearchTerm().Length == 0)
                        {
                            yield return s; // We've processed it
                            if (s.Status == ProteinSearchInfo.SearchStatus.success)
                            {
                                success = true;
                                _successCountAtThisBatchsize[searchType]++;
                            }
                        }
                        else
                        {
                            // possibly an Entrez search we wish to further process in Uniprot
                            var newSearchType = s.GetProteinMetadata().GetSearchType();
                            if ((newSearchType != searchType) && (Equals(newSearchType, UNIPROTKB_TAG)))
                            {
                                success = true; // Entrez search worked, leads us to a UniprotKB search
                                s.Status = ProteinSearchInfo.SearchStatus.unsearched; // Not yet searched as UniprotKB
                                furtherUniprotSearches.Add(s);
                                _successCountAtThisBatchsize[searchType]++;
                            }
                            else if (_batchsize[searchType] == 1)
                            {
                                // No ambiguity is possible at batchsize==1, this one just plain didn't work
                                s.SetWebSearchCompleted();
                                yield return s;
                            }
                        }
                    }

                    // Review the overall history - how is this working out?
                    foreach (var s in searchlist)
                    {
                        if (s.Status == ProteinSearchInfo.SearchStatus.success)
                        {
                            successCount++;
                            totalSuccessesThisBatch++;
                            consecutiveFailures = 0;
                        }
                        else if (s.Status == ProteinSearchInfo.SearchStatus.failure)
                        {
                            failureCount++;
                            consecutiveFailures++;
                        }
                        if ((consecutiveFailures > (MAX_CONSECUTIVE_PROTEIN_METATDATA_LOOKUP_FAILURES + successCount)) ||
                            ((failureCount + successCount) > 100 &&
                            failureCount / (double)(failureCount + successCount) > .5))
                        {
                            // We have failed a bunch in a row, or more than half overall.  Assume the rest are the same as this streak, and bail.
                            // That  "+ successCount" term above guards against the case where we're a few hundred successes in then 
                            // we hit a bad patch (though this is unlikely - FASTA files tend to be internally consistent).
                            foreach (var ss in searchlist)
                            {
                                if (ss.GetProteinMetadata().GetPendingSearchTerm().Length > 0)
                                {
                                    ss.NoteSearchFailure();
                                    ss.SetWebSearchCompleted(); // Just tag this as having been tried
                                    yield return ss; // And move on
                                }
                            }
                            break; // Leaves only this review loop - the batch size bookkeeping below still runs
                        }
                    }

                    if (success)
                    {
                        if ((_successCountAtThisBatchsize[searchType] > batchsizeIncreaseThreshold) && (_batchsize[searchType] < idealBatchsize))
                        {
                            // Things are going well at this batch size - try doubling it, up to the ideal size
                            _batchsize[searchType] = Math.Min(idealBatchsize, _batchsize[searchType]*2);
                            _successCountAtThisBatchsize[searchType] = 0;
                        }
                        if (singleBatch && (totalSuccessesThisBatch >= SINGLE_BATCH_SIZE))
                        {  // Probably called from a background loader that's trying to be polite
                            break; // done with this search type for now, but go on to the next if any (especially for follow-on Uniprot searches)
                        }
                    }
                    else
                    {
                        // Nothing in this batch worked - halve the batch size (never below 1) and start counting afresh
                        _batchsize[searchType] = Math.Max(1, _batchsize[searchType] / 2);
                        _successCountAtThisBatchsize[searchType] = 0;
                        batchsizeIncreaseThreshold = Math.Max(batchsizeIncreaseThreshold, batchsizeIncreaseThreshold * 2); // Get increasingly pessimistic (watch for integer rollover)
                    }
                }
            }
        }