/// <summary>
/// Reports whether the web service (Uniprot, specifically) can be reached.
/// The probe is performed at most once; the answer is cached in _hasWebAccess
/// and returned directly on every later call.
/// </summary>
/// <returns>true when a lookup of the known-good Uniprot search target yields at least one result, false otherwise</returns>
public bool HasWebAccess()
{
    if (_hasWebAccess.HasValue)
    {
        return _hasWebAccess.Value; // Already probed - reuse the cached answer
    }

    // First request: run one small search for a target that is expected to exist
    var probeProtein = ParseProteinLine(KNOWNGOOD_UNIPROT_SEARCH_TARGET);
    var probeMetadata = new ProteinMetadata(KNOWNGOOD_UNIPROT_SEARCH_TARGET, string.Empty, null, null, null, null,
        UNIPROTKB_TAG + KNOWNGOOD_UNIPROT_SEARCH_TARGET);
    var probe = new ProteinSearchInfo(new DbProteinName(probeProtein, probeMetadata),
        KNOWNGOOD_UNIPROT_SEARCH_TARGET_SEQLEN);

    _hasWebAccess = DoWebserviceLookup(new[] { probe }, null, true).Any();
    return _hasWebAccess.Value;
}
/// <summary>
/// Handles web access for deriving missing protein metadata.
///
/// For GENINFO/ENTREZ searches this first requests Entrez summaries, which are converted into
/// follow-up UniprotKB search terms wherever a caption or replacement accession is reported,
/// and then runs a full (slower) Entrez search for whatever search terms remain.
/// For UNIPROTKB searches it reads the tab-separated Uniprot response.
/// The responses are then matched back to the entries of <paramref name="proteins"/>, whose
/// metadata and status are updated in place.
/// </summary>
/// <param name="proteins">items to search; their metadata and search status are updated in place</param>
/// <param name="searchType">Uniprot or Entrez (compared against GENINFO_TAG, ENTREZ_TAG and UNIPROTKB_TAG)</param>
/// <param name="progressMonitor">For detecting operation cancellation, may be null</param>
/// <returns>
/// -1 if every attempt failed and we need to try again later,
/// -2 if the service rejected a multi-protein request as malformed or too long (caller should reduce the batch size),
/// else number of proteins looked up
/// </returns>
private int DoWebserviceLookup(IList<ProteinSearchInfo> proteins, char searchType, IProgressMonitor progressMonitor)
{
    int lookupCount = _webSearchProvider is FakeWebSearchProvider ? proteins.Count : 0; // Fake websearch provider used in tests just claims victory, returns 0 for WebRetryCount
    var searchterms = _webSearchProvider.ListSearchTerms(proteins);
    if (searchterms.Count == 0)
        return 0; // no work, but not error either
    var responses = new List<ProteinSearchInfo>();
    // Remembered across retries: once the known-good seed has been inserted into searchterms it
    // stays there, so a retry must keep treating the first reply as the seed rather than a real hit.
    bool addedKnowngood = false;
    for (var retries = _webSearchProvider.WebRetryCount(); retries-- > 0;) // be patient with the web
    {
        if (searchterms.Count == 0)
            break;
        if ((progressMonitor != null) && progressMonitor.IsCanceled)
            break;
        var caught = false;
        try
        {
            string urlString; // left at outer scope for exception debugging ease
            if ((searchType == GENINFO_TAG) || (searchType == ENTREZ_TAG))
            {
                // first try to get enough summary information to redo this seach in uniprot

                // throw in something we know will hit (Note: it's important that this particular value appear in the unit tests, so we can mimic web response)
                string knowngood = (searchType == GENINFO_TAG) ? KNOWNGOOD_GENINFO_SEARCH_TARGET : KNOWNGOOD_ENTREZ_SEARCH_TARGET; // Not L10N
                if (!searchterms.Any(searchterm => SimilarSearchTerms(searchterm, knowngood)))
                {
                    searchterms.Insert(0, knowngood); // ensure at least one response if connection is good
                    addedKnowngood = true;
                }

                urlString = _webSearchProvider.ConstructEntrezURL(searchterms, true); // get in summary form

                /*
                 * a search on XP_915497 and 15834432 yields something like this (but don't mix GI and non GI in practice):
                    <DocSum>
                    <Id>82891194</Id>
                    <Item Name="Caption" Type="String">XP_915497</Item>
                    <Item Name="Title" Type="String">
                    PREDICTED: similar to Syntaxin binding protein 3 (UNC-18 homolog 3) (UNC-18C) (MUNC-18-3) [Mus musculus]
                    </Item>
                    <Item Name="Extra" Type="String">gi|82891194|ref|XP_915497.1|[82891194]</Item>
                    <Item Name="Gi" Type="Integer">82891194</Item>
                    <Item Name="CreateDate" Type="String">2005/12/01</Item>
                    <Item Name="UpdateDate" Type="String">2005/12/01</Item>
                    <Item Name="Flags" Type="Integer">512</Item>
                    <Item Name="TaxId" Type="Integer">10090</Item>
                    <Item Name="Length" Type="Integer">566</Item>
                    <Item Name="Status" Type="String">replaced</Item>
                    <Item Name="ReplacedBy" Type="String">NP_035634</Item>   <-- useful for Uniprot search
                    <Item Name="Comment" Type="String">
                    <![CDATA[ This record was replaced or removed. ]]>
                    </Item>
                    </DocSum>
                    <DocSum>
                    <Id>15834432</Id>
                    <Item Name="Caption" Type="String">NP_313205</Item>   <-- useful for Uniprot search
                    <Item Name="Title" Type="String">
                    30S ribosomal protein S18 [Escherichia coli O157:H7 str. Sakai]
                    </Item>
                    <Item Name="Extra" Type="String">gi|15834432|ref|NP_313205.1|[15834432]</Item>
                    <Item Name="Gi" Type="Integer">15834432</Item>
                    <Item Name="CreateDate" Type="String">2001/03/07</Item>
                    <Item Name="UpdateDate" Type="String">2013/12/20</Item>
                    <Item Name="Flags" Type="Integer">512</Item>
                    <Item Name="TaxId" Type="Integer">386585</Item>
                    <Item Name="Length" Type="Integer">75</Item>
                    <Item Name="Status" Type="String">live</Item>
                    <Item Name="ReplacedBy" Type="String"/>
                    <Item Name="Comment" Type="String">
                    <![CDATA[ ]]>
                    </Item>
                    </DocSum>
                 */

                using (var xmlTextReader = _webSearchProvider.GetXmlTextReader(urlString))
                {
                    var elementName = String.Empty;
                    var response = new ProteinSearchInfo();
                    bool dummy = addedKnowngood;
                    string id = null;
                    string caption = null;
                    string replacedBy = null;
                    string attrName = null;
                    string length = null;
                    while (xmlTextReader.Read())
                    {
                        switch (xmlTextReader.NodeType)
                        {
                            case XmlNodeType.Element: // The node is an element.
                                elementName = xmlTextReader.Name;
                                attrName = xmlTextReader.GetAttribute("Name"); // Not L10N
                                break;
                            case XmlNodeType.Text: // text for current element
                                if ("Id" == elementName) // this will be the input GI number, or GI equivalent of input // Not L10N
                                {
                                    id = NullForEmpty(xmlTextReader.Value);
                                }
                                else if ("ERROR" == elementName) // Not L10N
                                {
                                    // we made connection, but some trouble on their end
                                    throw new WebException(xmlTextReader.Value);
                                }
                                else if ("Item" == elementName) // Not L10N
                                {
                                    var value = NullForEmpty(xmlTextReader.Value);
                                    if (value != null)
                                    {
                                        switch (attrName)
                                        {
                                            case "ReplacedBy": // Not L10N
                                                replacedBy = value; // a better read on name
                                                break;
                                            case "Caption": // Not L10N
                                                caption = value; // a better read on name
                                                break;
                                            case "Length": // Not L10N
                                                length = value; // Useful for disambiguation
                                                break;
                                        }
                                    }
                                }
                                break;
                            case XmlNodeType.EndElement:
                                if ("DocSum" == xmlTextReader.Name) // Not L10N
                                {
                                    if (dummy)
                                    {
                                        dummy = false; // first returned is just the known-good seed, the rest are useful
                                    }
                                    else
                                    {
                                        // can we transfer this search to UniprotKB? Gets us the proper accession ID,
                                        // and avoids downloading sequence data we already have or just don't want
                                        string newSearchTerm = null;
                                        string intermediateName = null;
                                        if (replacedBy != null)
                                        {
                                            newSearchTerm = replacedBy; // Ref|XP_nnn -> GI -> NP_yyyy
                                            intermediateName = caption;
                                        }
                                        else if (caption != null)
                                        {
                                            newSearchTerm = caption; // GI -> NP_yyyy
                                            intermediateName = id;
                                        }
                                        if (newSearchTerm != null)
                                        {
                                            response.Accession = newSearchTerm; // a decent accession if uniprot doesn't find it
                                            response.Description = intermediateName; // stow this here to help make the connection between searches
                                            response.SetWebSearchTerm(new WebSearchTerm(UNIPROTKB_TAG, newSearchTerm));
                                            int intLength;
                                            if (!int.TryParse(length, out intLength))
                                                intLength = 0;
                                            response.SeqLength = intLength; // Useful for disambiguation
                                            responses.Add(response);
                                            foreach (var value in new[] {id, caption})
                                            {
                                                // note as altname for association with the original search
                                                if (response.Protein == null)
                                                    response.Protein = new DbProtein();
                                                response.Protein.Names.Add(new DbProteinName(null, new ProteinMetadata(value, null)));
                                                // and remove from consideration for the full-data Entrez search
                                                var val = value;
                                                var oldSearches = searchterms.Where(s => SimilarSearchTerms(s, val)).ToArray();
                                                if (oldSearches.Any())
                                                {
                                                    // conceivably same search term is in there twice, just replace the first
                                                    searchterms.Remove(oldSearches[0]); // don't do the more verbose Entrez search
                                                }
                                            }
                                        }
                                    }
                                    response = new ProteinSearchInfo(); // and start another
                                    // Forget everything gathered for this DocSum, including its length, so a
                                    // record lacking a Length item cannot inherit the previous record's value
                                    id = caption = replacedBy = length = null;
                                }
                                break;
                        }
                    }
                    xmlTextReader.Close();
                }

                if (searchterms.Count > (addedKnowngood ? 1 : 0))
                {
                    // now do full entrez search - unfortunately this pulls down sequence information so it's slow and we try to avoid it
                    urlString = _webSearchProvider.ConstructEntrezURL(searchterms, false); // not a summary

                    using (var xmlTextReader = _webSearchProvider.GetXmlTextReader(urlString))
                    {
                        var elementName = String.Empty;
                        var latestGbQualifierName = string.Empty;
                        var response = new ProteinSearchInfo(); // and start another
                        bool dummy = addedKnowngood;
                        while (xmlTextReader.Read())
                        {
                            switch (xmlTextReader.NodeType)
                            {
                                case XmlNodeType.Element: // The node is an element.
                                    elementName = xmlTextReader.Name;
                                    break;
                                case XmlNodeType.Text: // text for current element
                                    if ("GBSeq_organism" == elementName) // Not L10N
                                    {
                                        response.Species = NullForEmpty(xmlTextReader.Value);
                                    }
                                    else if ("GBSeq_locus" == elementName) // Not L10N
                                    {
                                        response.PreferredName = NullForEmpty(xmlTextReader.Value); // a better read on name
                                    }
                                    else if ("GBSeq_primary-accession" == elementName) // Not L10N
                                    {
                                        response.Accession = NullForEmpty(xmlTextReader.Value);
                                    }
                                    else if ("GBSeq_definition" == elementName) // Not L10N
                                    {
                                        if (String.IsNullOrEmpty(response.Description))
                                            response.Description = NullForEmpty(xmlTextReader.Value);
                                    }
                                    else if ("GBQualifier_name" == elementName) // Not L10N
                                    {
                                        latestGbQualifierName = NullForEmpty(xmlTextReader.Value);
                                    }
                                    else if (("GBQualifier_value" == elementName) && // Not L10N
                                             ("gene" == latestGbQualifierName)) // Not L10N
                                    {
                                        response.Gene = NullForEmpty(xmlTextReader.Value);
                                    }
                                    else if ("GBSeqid" == elementName) // Not L10N
                                    {
                                        // alternate name
                                        // use this as a way to associate this result with a search -
                                        // accession may be completely unlike the search term in GI case
                                        if (response.Protein == null)
                                            response.Protein = new DbProtein();
                                        response.Protein.Names.Add(new DbProteinName(null, new ProteinMetadata(NullForEmpty(xmlTextReader.Value), null)));
                                    }
                                    else if ("GBSeq_length" == elementName) // Not L10N
                                    {
                                        int length;
                                        if (!int.TryParse(xmlTextReader.Value, out length))
                                            length = 0;
                                        response.SeqLength = length;
                                    }
                                    break;
                                case XmlNodeType.EndElement:
                                    if ("GBSeq" == xmlTextReader.Name) // Not L10N
                                    {
                                        if (dummy)
                                        {
                                            dummy = false; // first returned is just the known-good seed, the rest are useful
                                        }
                                        else
                                        {
                                            responses.Add(response);
                                        }
                                        response = new ProteinSearchInfo(); // and start another
                                    }
                                    break;
                            }
                        }
                        xmlTextReader.Close();
                    }
                } // end full entrez search
            } // End if GENINFO or ENTREZ
            else if (searchType == UNIPROTKB_TAG)
            {
                int timeout = _webSearchProvider.GetTimeoutMsec(searchterms.Count); // 10 secs + 1 more for every 5 search terms
                urlString = _webSearchProvider.ConstructUniprotURL(searchterms);
                using (var webResponseStream = _webSearchProvider.GetWebResponseStream(urlString, timeout))
                {
                    if (webResponseStream != null)
                    {
                        using (var reader = new StreamReader(webResponseStream))
                        {
                            if (!reader.EndOfStream)
                            {
                                var header = reader.ReadLine(); // eat the header
                                string[] fieldNames = header.Split('\t'); // Not L10N
                                // Normally comes in as Entry\tEntry name\tStatus\tProtein names\tGene names\tOrganism\tLength, but could be any order
                                int colAccession = Array.IndexOf(fieldNames, "Entry"); // Not L10N
                                int colPreferredName = Array.IndexOf(fieldNames, "Entry name"); // Not L10N
                                int colDescription = Array.IndexOf(fieldNames, "Protein names"); // Not L10N
                                int colGene = Array.IndexOf(fieldNames, "Gene names"); // Not L10N
                                int colSpecies = Array.IndexOf(fieldNames, "Organism"); // Not L10N
                                int colLength = Array.IndexOf(fieldNames, "Length"); // Not L10N
                                int colStatus = Array.IndexOf(fieldNames, "Status"); // Not L10N
                                while (!reader.EndOfStream)
                                {
                                    var line = reader.ReadLine();
                                    if (line != null)
                                    {
                                        // NOTE(review): a missing required column (index -1) or a short line
                                        // throws here and is handled by the general catch below as a failed attempt
                                        string[] fields = line.Split('\t'); // Not L10N
                                        int length = 0;
                                        if (colLength >= 0)
                                            int.TryParse(fields[colLength], out length);
                                        var response = new ProteinSearchInfo
                                        {
                                            ProteinDbInfo = new DbProteinName
                                            {
                                                Accession = NullForEmpty(fields[colAccession]),
                                                PreferredName = NullForEmpty(fields[colPreferredName]),
                                                Description = NullForEmpty(fields[colDescription]),
                                                Gene = NullForEmpty(fields[colGene]),
                                                Species = NullForEmpty(fields[colSpecies]),
                                            },
                                            SeqLength = length,
                                            ReviewStatus = NullForEmpty(colStatus >= 0 ? fields[colStatus] : null) // Reviewed or unreviewed
                                        };
                                        responses.Add(response);
                                    }
                                }
                            }
                            reader.Close();
                        }
                        webResponseStream.Close();
                    }
                }
            } // End if Uniprot
        }
        catch (WebException ex)
        {
            if (ex.Status == WebExceptionStatus.ProtocolError)
            {
                switch (((HttpWebResponse)ex.Response).StatusCode)
                {
                    case HttpStatusCode.BadRequest:
                    case HttpStatusCode.RequestUriTooLong:
                        // malformed search, stop trying
                        if (proteins.Count == 1)
                        {
                            proteins[0].SetWebSearchCompleted(); // No more need for lookup
                            return 1; // We resolved one
                        }
                        return -2; // Probably asked for too many at once, caller will go into batch reduction mode
                }
            }
            caught = true;
        }
        catch
        {
            // Deliberately broad: any failure while talking to the service or parsing its reply
            // is treated as a failed attempt that may be retried
            caught = true;
        }
        if (caught)
        {
            if (retries == 0)
                return -1; // just try again later
            Thread.Sleep(1000);
            continue;
        }

        if (responses.Count > 0)
        {
            const string STATUS_REVIEWED = "reviewed"; // Uniprot reviewed status // Not L10N

            // now see if responses are ambiguous or not
            if (proteins.Count == 1)
            {
                // Any responses must belong to this protein - or this isn't a protein at all (user named it "peptide6" for example).
                // Can get multiple results for single uniprot code, but we'll ignore those
                // since we're not in the market for alternative proteins (in fact we're likely
                // resolving metadata for one here).
                ProteinSearchInfo result = null;
                // See if we can uniquely match by sequence length
                int length = proteins[0].SeqLength;
                if (length == 0)
                {
                    // From a peptide list, probably - sequence unknown
                    if (responses.Count(r => Equals(r.ReviewStatus, STATUS_REVIEWED)) == 1)
                    {
                        result = responses.First(r => Equals(r.ReviewStatus, STATUS_REVIEWED));
                    }
                    else if (responses.Count(r => Equals(r.Accession, proteins[0].Accession)) == 1)
                    {
                        result = responses.First(r => Equals(r.Accession, proteins[0].Accession));
                    }
                    else
                    {
                        if (responses.Count != 1)
                        {
                            // Ambiguous - don't make uneducated guesses. But if all responses share species or gene etc note that
                            var common = ProteinSearchInfo.Intersection(responses);
                            if (common != null)
                            {
                                var old = proteins[0].GetProteinMetadata();
                                proteins[0].ChangeProteinMetadata(MergeSearchResult(common.GetProteinMetadata(), old));
                            }
                            proteins[0].SetWebSearchCompleted(); // We aren't going to get an answer
                            proteins[0].NoteSearchFailure();
                            break;
                        }
                        result = responses.First(); // We got an unambiguous response
                    }
                }
                else if (responses.Count(r => r.SeqLength == length) == 1)
                {
                    result = responses.First(r => r.SeqLength == length);
                }
                else if (responses.Count(r => r.SeqLength == length && Equals(r.ReviewStatus, STATUS_REVIEWED)) == 1) // Narrow it down to reviewed only
                {
                    result = responses.First(r => r.SeqLength == length && Equals(r.ReviewStatus, STATUS_REVIEWED));
                }
                if (result == null)
                {
                    if ((length > 0) && (responses.Count(r => r.SeqLength == length) == 0)) // No plausible matches (nothing of the proper length)
                    {
                        proteins[0].SetWebSearchCompleted(); // We aren't going to get an answer
                        proteins[0].NoteSearchFailure();
                        break;
                    }
                    else if (responses.Count(r => Equals(r.ReviewStatus, STATUS_REVIEWED)) == 1)
                    {
                        result = responses.First(r => Equals(r.ReviewStatus, STATUS_REVIEWED));
                    }
                    else
                    {
                        // Ambiguous - don't make uneducated guesses. But if all responses share species or gene etc note that
                        var common = ProteinSearchInfo.Intersection(responses);
                        if (common != null)
                        {
                            var old = proteins[0].GetProteinMetadata();
                            proteins[0].ChangeProteinMetadata(MergeSearchResult(common.GetProteinMetadata(), old));
                        }
                        proteins[0].SetWebSearchCompleted(); // We aren't going to get an answer
                        proteins[0].NoteSearchFailure();
                        break;
                    }
                }
                // prefer the data we got from web search to anything we parsed.
                var oldMetadata = proteins[0].GetProteinMetadata();
                proteins[0].ChangeProteinMetadata(MergeSearchResult(result.GetProteinMetadata(), oldMetadata)); // use the first, if more than one, as the primary
                proteins[0].Status = ProteinSearchInfo.SearchStatus.success;
                lookupCount++; // Succcess!
                if (Equals(searchType, proteins[0].GetProteinMetadata().GetSearchType())) // did we reassign search from Entrez to UniprotKB? If so don't mark as resolved yet.
                    proteins[0].SetWebSearchCompleted(); // no more need for lookup
            }
            else if ((searchType == ENTREZ_TAG) || (searchType == GENINFO_TAG))
            {
                // multiple proteins, but responses come in reliable order
                if (proteins.Count == responses.Count)
                {
                    int n = 0;
                    foreach (var response in responses)
                    {
                        // prefer the data we got from web search
                        var oldMetadata = proteins[n].GetProteinMetadata();
                        if (Equals(searchType, proteins[n].GetProteinMetadata().GetSearchType())) // did we reassign search from Entrez to UniprotKB?
                            oldMetadata = oldMetadata.SetWebSearchCompleted(); // no more need for lookup
                        // use oldMetadata to fill in any holes in response, then take oldMetadata name and description
                        proteins[n].Status = ProteinSearchInfo.SearchStatus.success;
                        proteins[n].ChangeProteinMetadata(MergeSearchResult(response.GetProteinMetadata(), oldMetadata));
                        n++;
                        lookupCount++; // Succcess!
                    }
                }
                else // but sometimes with gaps
                {
                    int n = 0;
                    foreach (var response in responses)
                    {
                        // each response should correspond to a protein, but some proteins won't have a response
                        while (n < proteins.Count)
                        {
                            var s = proteins[n].GetProteinMetadata().WebSearchInfo;
                            bool hit = (s.MatchesPendingSearchTerm(response.Accession) ||
                                        s.MatchesPendingSearchTerm(response.PreferredName));
                            // Protein is only created when alternative names were seen, so check it
                            // as well before walking its Names list
                            if (!hit && (response.ProteinDbInfo != null) && (response.Protein != null))
                            {
                                // we have a list of alternative names from the search, try those
                                foreach (var altName in response.Protein.Names)
                                {
                                    hit = s.MatchesPendingSearchTerm(altName.Name);
                                    if (hit)
                                        break;
                                }
                            }
                            if (hit)
                            {
                                // prefer the data we got from web search
                                var oldMetadata = proteins[n].GetProteinMetadata();
                                // Check the protein being updated (not the first one in the batch), as the equal-count case above does
                                if (Equals(searchType, proteins[n].GetProteinMetadata().GetSearchType())) // did we reassign search from Entrez to UniprotKB?
                                    oldMetadata = oldMetadata.SetWebSearchCompleted(); // no more need for lookup
                                // use oldMetadata to fill in any holes in response, then take oldMetadata name and description
                                proteins[n].ChangeProteinMetadata(MergeSearchResult(response.GetProteinMetadata(), oldMetadata));
                                proteins[n].Status = ProteinSearchInfo.SearchStatus.success;
                                lookupCount++; // Succcess!
                                break;
                            }
                            n++;
                        }
                    }
                }
            }
            else // (searchType == UNIPROTKB_TAG)
            {
                // Multiple proteins, responses come back in no particular order, and
                // possibly with alternatives thrown in
                foreach (var p in proteins)
                {
                    var seqLength = p.SeqLength;
                    var uniqueProteinLength = proteins.Count(pr => (pr.SeqLength == seqLength)) == 1;
                    for (var reviewedOnly = 0; reviewedOnly < 2; reviewedOnly++)
                    {
                        // Only look at responses with proper sequence length - narrowing to reviewed only if we have ambiguity
                        var likelyResponses = reviewedOnly == 0 ?
                            (from r in responses where (r.SeqLength == seqLength) select r).ToArray() :
                            (from r in responses where (r.SeqLength == seqLength && Equals(r.ReviewStatus, STATUS_REVIEWED)) select r).ToArray();

                        var results = (uniqueProteinLength && likelyResponses.Length == 1) ?
                            likelyResponses : // Unambiguous - single response that matches this length, and this protein is the only one with this length
                            (from r in likelyResponses where (p.GetProteinMetadata().WebSearchInfo.MatchesPendingSearchTerm(r.Accession)) select r).ToArray();
                        if (results.Length != 1)
                        {
                            // See if the search term is found in exactly one result's description field
                            var resultsDescription = (from r in likelyResponses
                                where ((!String.IsNullOrEmpty(r.Description) &&
                                        r.Description.ToUpperInvariant().Split(' ').Contains(p.GetProteinMetadata().GetPendingSearchTerm().ToUpperInvariant())))
                                select r).ToArray();
                            if (resultsDescription.Length == 1)
                                results = resultsDescription;
                        }
                        if (results.Length != 1)
                        {
                            // See if the search term is found in exactly one result's gene names field
                            var resultsGene = (from r in likelyResponses
                                where ((!String.IsNullOrEmpty(r.Gene) &&
                                        r.Gene.ToUpperInvariant().Split(' ').Contains(p.GetProteinMetadata().GetPendingSearchTerm().ToUpperInvariant())))
                                select r).ToArray();
                            if (resultsGene.Length == 1)
                                results = resultsGene;
                        }
                        if (results.Length != 1 && uniqueProteinLength)
                        {
                            // Didn't find an obvious match, but this is the only protein of this length in the search
                            results = likelyResponses;
                        }
                        // Make sure all matching responses have same accession, at a minimum
                        var common = ProteinSearchInfo.Intersection(results);
                        if (results.Any() && common.Accession != null)
                        {
                            // prefer the data we got from web search
                            var oldMetadata = p.GetProteinMetadata();
                            oldMetadata = oldMetadata.SetWebSearchCompleted(); // no more need for lookup
                            // use oldMetadata to fill in any holes in response, then take oldMetadata name and description
                            p.ChangeProteinMetadata(MergeSearchResult(common.GetProteinMetadata(), oldMetadata));
                            p.Status = ProteinSearchInfo.SearchStatus.success;
                            lookupCount++; // Succcess!
                            break;
                        }
                    }
                    if (p.GetProteinMetadata().NeedsSearch() && uniqueProteinLength)
                    {
                        p.SetWebSearchCompleted(); // No answer found, but we're done
                        p.NoteSearchFailure();
                        lookupCount++; // done with this one
                    }
                }
            }
        } // End if we got any respones
        else if (searchType == UNIPROTKB_TAG)
        {
            // None of the searches hit - Uniprot is our last search so just set these as complete
            foreach (var p in proteins.Where(pending => pending.GetProteinMetadata().NeedsSearch()))
            {
                p.SetWebSearchCompleted(); // No answer found, but we're done
                p.NoteSearchFailure();
                lookupCount++; // done with this one
            }
        }
        else if (proteins.Count == 1)
        {
            proteins[0].SetWebSearchCompleted(); // no response for a single protein - we aren't going to get an answer
            proteins[0].NoteSearchFailure();
            lookupCount++; // done with this one
        }
        break; // No need for retry
    }
    return lookupCount;
}
/// <summary>
/// Builds a ProteinSearchInfo holding only what every member of the list agrees on.
/// The result starts as a copy of the first member's protein and metadata (with web search info
/// cleared, and a sequence length of 0); every string property whose value differs in any
/// later member is then set to null.
/// </summary>
/// <param name="list">the search results to compare</param>
/// <returns>the combined info, or null when the list is null or empty</returns>
public static ProteinSearchInfo Intersection(IEnumerable<ProteinSearchInfo> list)
{
    if (list == null)
    {
        return null;
    }

    var candidates = list as ProteinSearchInfo[] ?? list.ToArray();
    if (candidates.Length == 0)
    {
        return null;
    }

    // Seed the result from the first member
    var seedName = new DbProteinName(candidates[0].ProteinDbInfo.Protein,
        candidates[0].ProteinDbInfo.GetProteinMetadata().ClearWebSearchInfo());
    var result = new ProteinSearchInfo(seedName, 0);
    var merged = result.ProteinDbInfo;

    for (int i = 1; i < candidates.Length; i++)
    {
        var other = candidates[i].ProteinDbInfo;
        // Compare every string property by name, nulling out those that disagree
        foreach (var mergedProperty in merged.GetType().GetProperties())
        {
            if (mergedProperty.PropertyType != typeof (string))
            {
                continue;
            }
            var otherProperty = other.GetType().GetProperties().First(candidate => Equals(candidate.Name, mergedProperty.Name));
            var mergedValue = mergedProperty.GetValue(merged, null);
            var otherValue = otherProperty.GetValue(other, null);
            if (!Equals(mergedValue, otherValue))
            {
                mergedProperty.SetValue(merged, null);
            }
        }
    }
    return result;
}