public void TestIpiToUniprot()
{
    // Table of probes: each row is { search term handed to the mapper, expected result }.
    // Rows whose two entries are identical are terms the test expects to come back unchanged
    // (no mapping available). Rows are kept in the original probe order so that the first
    // failing case, if any, is the same one the straight-line version would have reported.
    var probes = new[]
    {
        new[] {"IPI0000001", "O95793"},     // The 0th one
        new[] {"IPI1028485", "Q28062"},     // The nth one
        new[] {"IPI1028486", "IPI1028486"}, // The n+1th nonexistent one
        new[] {"IPI1027205", "IPI1027205"}, // A midrange nonexistent one
        new[] {"IPI0657347", "Q2V4A8"},     // Somewhere in the middle
        new[] {"IPI0000000", "IPI0000000"}, // The -1th nonexistent one
        new[] {"nonsense", "nonsense"},     // The really nonexistent one
        new[] {"IPI0297859", "O14686"},     // Somewhere in the middle
        new[] {"IPI029469", "P42025"}       // Somewhere toward the front
    };

    var ipiMap = new IpiToUniprotMap();
    foreach (var probe in probes)
    {
        var searchTerm = probe[0];
        var expected = probe[1];
        Assert.AreEqual(expected, ipiMap.MapToUniprot(searchTerm));
    }
}
/// <summary> /// Peruse the list of proteins, access webservices as needed to resolve protein metadata. /// note: best to use this in its own thread since it may block on web access /// </summary> /// <param name="proteinsToSearch">the proteins to populate</param> /// <param name="progressMonitor">For checking operation cancellation</param> /// <param name="singleBatch">if true, just do one batch and plan to come back later for more</param> /// <returns>IEnumerable of the proteins it actually populated (some don't need it, some have to wait for a decent web connection)</returns> public IEnumerable<ProteinSearchInfo> DoWebserviceLookup(IEnumerable<ProteinSearchInfo> proteinsToSearch, IProgressMonitor progressMonitor, bool singleBatch) { const int SINGLE_BATCH_SIZE = 500; // If caller has indicated that it wants to do a single batch and return for more later, stop after this many successes const int ENTREZ_RATELIMIT = 333; // 1/3 second between requests on Entrez const int UNIPROTKB_RATELIMIT = 10; var searchOrder = new[] { GENINFO_TAG, ENTREZ_TAG, UNIPROTKB_TAG // this order matters - we may take entrez results into uniprot for further search }; var minSearchTermLen = new Dictionary<char, int> { {GENINFO_TAG, 3}, // some gi numbers are quite small {ENTREZ_TAG, 6}, {UNIPROTKB_TAG, 6} // if you feed uniprot a partial search term you get a huge response }; var ratelimit = new Dictionary<char, int> { {GENINFO_TAG, ENTREZ_RATELIMIT}, // search on Entrez, but don't mix with non-GI searches {ENTREZ_TAG, ENTREZ_RATELIMIT}, {UNIPROTKB_TAG, UNIPROTKB_RATELIMIT} }; // sort out the various webservices so we can batch up var proteins = proteinsToSearch.ToArray(); foreach (var prot in proteins) { // translate from IPI to Uniprot if needed var search = prot.GetProteinMetadata().GetPendingSearchTerm().ToUpperInvariant(); if (search.StartsWith("IPI")) // Not L10N { if (_ipiMapper == null) _ipiMapper = new IpiToUniprotMap(); string mapped = _ipiMapper.MapToUniprot(search); if 
(mapped == search) // no mapping from that IPI { prot.SetWebSearchCompleted(); // no resolution for that IPI value } else { prot.Accession = mapped; // go ahead and note the Uniprot accession, even though we haven't searched yet prot.SetWebSearchTerm(new WebSearchTerm(UNIPROTKB_TAG,mapped)); } } else if (prot.GetProteinMetadata().GetSearchType() == UNIPROTKB_TAG) { // Check for homegrown numbering schemes long dummy; if (Int64.TryParse(search, out dummy)) { prot.SetWebSearchCompleted(); // All numbers, not a Uniprot ID } // Some uniprot records just don't have gene info - have we already parsed all there is to know from description? else if (prot.GetProteinMetadata().HasMissingMetadata()) { var test = prot.GetProteinMetadata().ChangeGene("test"); // Not L10N if (!test.HasMissingMetadata()) // We got everything except gene info { // Check to see if this is a standard uniprot formatted header, if so assume that GN was ommitted on purpose if (prot.GetProteinMetadata().Description.Contains(string.Format(" OS={0} ", (prot.GetProteinMetadata().Species)))) // Not L10N prot.SetWebSearchCompleted(); // There's not going to be any gene info } } } if (prot.GetProteinMetadata().NeedsSearch()) { // if you feed uniprot a partial search term you get a huge response int minLen = minSearchTermLen[prot.GetProteinMetadata().GetSearchType()]; if (prot.GetProteinMetadata().GetPendingSearchTerm().Length < minLen) { prot.SetWebSearchCompleted(); // don't attempt it } } if (!(prot.GetProteinMetadata().NeedsSearch())) { yield return prot; // doesn't need search, just pass it back unchanged } } _ipiMapper = null; // Done with this now // CONSIDER(bspratt): Could this be simplified? 
var cancelled = false; var politeStopwatch = new Stopwatch(); var totalSuccessesThisBatch = 0; politeStopwatch.Start(); var furtherUniprotSearches = new List<ProteinSearchInfo>(); // Uniprot searches that result from intermediate searches foreach (var searchType in searchOrder) { var consecutiveFailures = 0; var failureCount = 0; var successCount = 0; cancelled |= ((progressMonitor != null) && progressMonitor.IsCanceled); if (cancelled) break; var politenessIntervalMsec = ratelimit[searchType]; var batchsizeIncreaseThreshold = 2; // If you can do this many in a row at reduced batchsize, try increasing it while (true) { var idealBatchsize = _maxBatchSize[searchType]; cancelled |= ((progressMonitor != null) && progressMonitor.IsCanceled); if (cancelled) break; var type = searchType; var furtherUniprotSearchesToProcess = Equals(searchType, UNIPROTKB_TAG) ? furtherUniprotSearches.Where(s => (s.GetProteinMetadata().NeedsSearch())).ToList() : null; var searchlist = (furtherUniprotSearchesToProcess != null && furtherUniprotSearchesToProcess.Any()) ? 
furtherUniprotSearchesToProcess : proteins.Where(s => (s.GetProteinMetadata().NeedsSearch() && s.GetProteinMetadata().GetSearchType() == type)).ToList(); if (!searchlist.Any()) break; // try to batch up requests - reduce batch size if responses are ambiguous var nextSearch = 0; var searches = new List<ProteinSearchInfo>(); while ((nextSearch < searchlist.Count) && (searches.Count < _batchsize[searchType])) { // Unique sequence length helps a lot in disambiguation if (searches.All(s => s.SeqLength != searchlist[nextSearch].SeqLength)) searches.Add(searchlist[nextSearch]); nextSearch++; } // Be a good citizen - no more than three hits per second for Entrez var snoozeMs = politenessIntervalMsec - politeStopwatch.ElapsedMilliseconds; if (snoozeMs > 0) Thread.Sleep((int)snoozeMs); politeStopwatch.Restart(); cancelled |= ((progressMonitor != null) && progressMonitor.IsCanceled); if (cancelled) break; var processed = DoWebserviceLookup(searches, searchType, progressMonitor); if (processed < 0) // Returns negative on web access error { if (processed == -2) { // We're creating URLs that are too long _maxBatchSize[searchType] = Math.Max(1, (int)(_maxBatchSize[searchType] * 0.75)); } else { // Some error, we should just try again later so don't retry now break; } } var success = false; foreach (var s in searches) { if (s.GetProteinMetadata().GetPendingSearchTerm().Length == 0) { yield return s; // We've processed it if (s.Status == ProteinSearchInfo.SearchStatus.success) { success = true; _successCountAtThisBatchsize[searchType]++; } } else { // possibly an Entrez search we wish to further process in Uniprot var newSearchType = s.GetProteinMetadata().GetSearchType(); if ((newSearchType != searchType) && (Equals(newSearchType, UNIPROTKB_TAG))) { success = true; // Entrez search worked, leads us to a UniprotKB search s.Status = ProteinSearchInfo.SearchStatus.unsearched; // Not yet search as UniprotKB furtherUniprotSearches.Add(s); _successCountAtThisBatchsize[searchType]++; } 
else if (_batchsize[searchType] == 1) { // No ambiguity is possible at batchsize==1, this one just plain didn't work s.SetWebSearchCompleted(); yield return s; } } } // Review the overall history - how is this working out? foreach (var s in searchlist) { if (s.Status == ProteinSearchInfo.SearchStatus.success) { successCount++; totalSuccessesThisBatch++; consecutiveFailures = 0; } else if (s.Status == ProteinSearchInfo.SearchStatus.failure) { failureCount++; consecutiveFailures++; } if ((consecutiveFailures > (MAX_CONSECUTIVE_PROTEIN_METATDATA_LOOKUP_FAILURES + successCount)) || ((failureCount + successCount) > 100 && failureCount / (double)(failureCount + successCount) > .5)) { // We have failed a bunch in a row, or more than half overall. Assume the rest are the same as this streak, and bail. // That "+ successCount" term above guards against the case where we're a few hundred successes in then // we hit a bad patch (though this is unlikely - FASTA files tend to be internally consistent). 
foreach (var ss in searchlist) { if (ss.GetProteinMetadata().GetPendingSearchTerm().Length > 0) { ss.NoteSearchFailure(); ss.SetWebSearchCompleted(); // Just tag this as having been tried yield return ss; // And move on } } break; } } if (success) { if ((_successCountAtThisBatchsize[searchType] > batchsizeIncreaseThreshold) && (_batchsize[searchType] < idealBatchsize)) { _batchsize[searchType] = Math.Min(idealBatchsize, _batchsize[searchType]*2); _successCountAtThisBatchsize[searchType] = 0; } if (singleBatch && (totalSuccessesThisBatch >= SINGLE_BATCH_SIZE)) { // Probably called from a background loader that's trying to be polite break; // done with this search type for now, but go on to the next if any (especially for follow-on Uniprot searches) } } else { _batchsize[searchType] = Math.Max(1, _batchsize[searchType] / 2); _successCountAtThisBatchsize[searchType] = 0; batchsizeIncreaseThreshold = Math.Max(batchsizeIncreaseThreshold, batchsizeIncreaseThreshold * 2); // Get increasingly pessimistic (watch for integer rollover) } } } }