/// <summary>
/// Gets the output columns for the current mapping based on output column configuration
/// </summary>
/// <param name="mappedReference">
/// The mapping produced for the current reference, or <c>null</c> when the reference could not be mapped.
/// </param>
/// <returns>
/// A tab-delimited string containing exactly one value per entry of <c>configuration.OutputColumns</c>,
/// in configured order. Columns that have no value (or are not recognized) are emitted as empty strings
/// so that every row has the same number of columns.
/// </returns>
/// <remarks>
/// Side effect: when the mapped reference contains an exact (confidence 1) DOI match, the
/// <c>PercentOfReferenceMapped</c> of <paramref name="mappedReference"/> is overwritten with 1.0.
/// </remarks>
public static string GetOutputColumns(ReferenceMapping mappedReference)
{
    const string pubMedUrlPrefix = "https://www.ncbi.nlm.nih.gov/pubmed/";

    var outputColumns = new List<string>();

    if (mappedReference == null)
    {
        // Nothing was mapped: emit a zero score and blank placeholders for every other column
        foreach (var attribute in configuration.OutputColumns)
        {
            switch (attribute)
            {
                case "score":
                    outputColumns.Add("0");
                    break;

                default:
                    outputColumns.Add("");
                    break;
            }
        }
    }
    else
    {
        // SPECIAL CASE: If there is an exact match for DOI, override confidence score to a 1, as DOI is generally 1:1
        if (mappedReference.MappedReference.Contains("<attr confidence=\"1\" name=\"academic#DOI\">"))
        {
            mappedReference.PercentOfReferenceMapped = 1.0;
        }

        foreach (var attribute in configuration.OutputColumns)
        {
            switch (attribute)
            {
                case "score":
                    outputColumns.Add(mappedReference.PercentOfReferenceMapped.ToString());
                    break;

                case "mapping":
                    outputColumns.Add(mappedReference.MappedReference);
                    break;

                case "id":
                    outputColumns.Add(GetAttributeIfExists(mappedReference.MappedPaper, "Id"));
                    break;

                case "familyId":
                    outputColumns.Add(GetAttributeIfExists(mappedReference.MappedPaper, "FamId"));
                    break;

                case "pubmedId":
                {
                    // Always add a value (possibly empty); skipping the column would shift every
                    // following column of this row one position to the left.
                    var pubmedId = "";
                    var sources = mappedReference.MappedPaper["S"];

                    if (sources != null)
                    {
                        // Sources without a "U" value yield null here and are skipped
                        var pubmedUrl = sources
                            .Select(source => source.Value<string>("U"))
                            .FirstOrDefault(url => url != null && url.StartsWith(pubMedUrlPrefix, StringComparison.Ordinal));

                        if (pubmedUrl != null)
                        {
                            pubmedId = pubmedUrl.Substring(pubMedUrlPrefix.Length);
                        }
                    }

                    outputColumns.Add(pubmedId);
                    break;
                }

                case "title":
                    outputColumns.Add(GetAttributeIfExists(mappedReference.MappedPaper, "DN"));
                    break;

                case "authors":
                {
                    var authors = new List<string>();
                    var paperAuthors = mappedReference.MappedPaper["AA"];

                    if (paperAuthors != null)
                    {
                        foreach (var author in paperAuthors)
                        {
                            // "DAuN" is read from each author element, not from the paper itself
                            authors.Add(GetAttributeIfExists(author, "DAuN"));
                        }
                    }

                    outputColumns.Add(string.Join(", ", authors));
                    break;
                }

                case "year":
                    outputColumns.Add(GetAttributeIfExists(mappedReference.MappedPaper, "Y"));
                    break;

                case "venue":
                    outputColumns.Add(GetAttributeIfExists(mappedReference.MappedPaper, "VFN"));
                    break;

                case "volume":
                    outputColumns.Add(GetAttributeIfExists(mappedReference.MappedPaper, "V"));
                    break;

                case "issue":
                    outputColumns.Add(GetAttributeIfExists(mappedReference.MappedPaper, "I"));
                    break;

                case "firstPage":
                    outputColumns.Add(GetAttributeIfExists(mappedReference.MappedPaper, "FP"));
                    break;

                case "lastPage":
                    outputColumns.Add(GetAttributeIfExists(mappedReference.MappedPaper, "LP"));
                    break;

                case "doi":
                    outputColumns.Add(GetAttributeIfExists(mappedReference.MappedPaper, "DOI"));
                    break;

                default:
                    // Unrecognized column: emit a blank, matching the unmapped branch above,
                    // so mapped and unmapped rows keep the same column count
                    outputColumns.Add("");
                    break;
            }
        }
    }

    // Output mapping data + original row columns
    return string.Join("\t", outputColumns);
}
/// <summary>
/// Maps an academic reference string to a Microsoft Academic paper entity using Project Academic Knowledge Interpret method
/// </summary>
/// <param name="referenceString">The raw reference text to interpret.</param>
/// <param name="subscriptionKey">Subscription key forwarded to <c>GetQueryInterpretations</c>.</param>
/// <param name="attributesToMap">
/// Comma-separated entity attribute names (composite attributes as <c>Parent.Child</c>) whose values are
/// matched back against the interpreted query to compute the confidence score.
/// </param>
/// <param name="attributesToReturn">
/// Comma-separated entity attribute names that are requested in addition to <paramref name="attributesToMap"/>.
/// </param>
/// <returns>
/// A <c>ReferenceMapping</c> describing the top entity of the first interpretation, or <c>null</c> when the
/// response contains no interpretation or the first interpretation contains no entity.
/// </returns>
static ReferenceMapping MapReferenceToPaper(string referenceString, string subscriptionKey, string attributesToMap, string attributesToReturn)
{
    // Attribute matches scoring at or below this value are not considered confident
    const double minimumConfidence = 0.8;

    var resultString = GetQueryInterpretations(
        query: referenceString,
        subscriptionKey: subscriptionKey,
        complete: false,
        interpretationCount: 1,
        entityCount: 1,
        attributes: attributesToMap + "," + attributesToReturn,
        // NOTE: Longer timeouts can sometimes provide better quality interpretations, though generally 500ms is sufficient for most reference strings
        timeout: 500);

    var result = JObject.Parse(resultString);

    var interpretations = result["interpretations"];
    if (interpretations == null || !interpretations.Any())
    {
        return null;
    }

    var toMap = new HashSet<string>(attributesToMap.Split(','));

    // We only care about the first interpretation
    var firstInterpretation = interpretations.First();

    // Extract relevance fields from JSON response. A response without rules/entities is treated
    // the same as a response without interpretations: there is nothing to map.
    var entity = firstInterpretation["rules"]?.FirstOrDefault()?["output"]?["entities"]?.FirstOrDefault();
    if (entity == null)
    {
        return null;
    }

    var interpretedQuery = result.Value<string>("query") ?? string.Empty;

    // Interpretation of the reference string is not guaranteed to map all parts of said string, as it will stop
    // once it has "enough" to generate a matching entity and leave the rest of the query unprocessed.
    //
    // Filler, stop words, misspelled words or unknown synonyms are all possibilities for the remaining terms not matched
    //
    // To generate a "confidence" score we do a "re-mapping" of the top entity's values based on Jaccard distance

    // The whole interpreted query is handled as a single chunk:
    //  - queryChunk stays untouched and is what attribute values are matched against
    //  - reducedChunk has every confidently matched substring removed (what is left is "unmatched")
    //  - expandedChunk has every confidently matched substring wrapped in an <attr> tag
    var queryChunk = interpretedQuery.Trim();
    var reducedChunk = queryChunk;
    var expandedChunk = queryChunk;

    // Generate mappings for all of the entity's attributes.
    // Tuple layout: (score, attribute name, entity value, best matching substring of the query)
    var entityAttributeMapping = new List<Tuple<double, string, string, string>>();

    Action<string, object> mapAttribute = (name, rawValue) =>
    {
        if (!toMap.Contains(name))
        {
            return;
        }

        var value = rawValue.ToString();
        var nearestSubstring = FindBestSubstring(queryChunk, value);

        entityAttributeMapping.Add(new Tuple<double, string, string, string>(nearestSubstring.Item1, name, value, nearestSubstring.Item2));
    };

    foreach (var attribute in entity)
    {
        var prop = (JProperty)attribute;
        var arrayValue = prop.Value as JArray;
        var objectValue = prop.Value as JObject;

        if (arrayValue != null)
        {
            // Map composite attribute arrays. Arrays of plain values have no named inner
            // attributes to map, so non-object elements are skipped instead of failing the cast.
            foreach (var innerObject in arrayValue.OfType<JObject>())
            {
                foreach (var innerAttribute in innerObject)
                {
                    mapAttribute($"{prop.Name}.{innerAttribute.Key}", innerAttribute.Value);
                }
            }
        }
        else if (objectValue != null)
        {
            // Map composite attributes
            foreach (var innerAttribute in objectValue)
            {
                mapAttribute($"{prop.Name}.{innerAttribute.Key}", innerAttribute.Value);
            }
        }
        else
        {
            // Map simple value attributes
            mapAttribute(prop.Name, prop.Value);
        }
    }

    // Reduce the mapping down to only the most confident ones based on inverse Jaccard distance.
    // Best score first, longest match first on ties, so that larger matches claim their text before
    // smaller ones. Empty matches are dropped up front: they carry no text to tag, and
    // string.Replace throws when asked to replace an empty string.
    var rankedMappings = entityAttributeMapping
        .Where(a => !string.IsNullOrEmpty(a.Item4))
        .OrderByDescending(a => a.Item1)
        .ThenByDescending(a => a.Item4.Length);

    var confidentMappings = new List<Tuple<double, string, string, string>>();

    foreach (var candidate in rankedMappings)
    {
        // A candidate is only accepted while its text is still unclaimed by a better mapping
        if (candidate.Item1 > minimumConfidence && reducedChunk.Contains(candidate.Item4))
        {
            confidentMappings.Add(candidate);
            reducedChunk = reducedChunk.Replace(candidate.Item4, "");
        }
    }

    // For each mapping we're confident with, generate a mapping string with embedded XML tags for each match.
    // confidentMappings is already in ranked order because it was filled from rankedMappings.
    foreach (var mapping in confidentMappings)
    {
        // The canonical attribute is only emitted when the entity value differs from the matched text
        var canonical = mapping.Item3 == mapping.Item4 ? "" : $" canonical=\"{mapping.Item3}\"";
        var taggedMatch = $" <attr confidence=\"{mapping.Item1}\" name=\"academic#{mapping.Item2}\"{canonical}>{mapping.Item4}</attr> ";

        // The lookahead skips occurrences that sit inside an already generated tag or tag body.
        // A MatchEvaluator is used so the tagged text is inserted literally; passing it as a
        // replacement pattern would expand sequences such as "$1" or "$&" found in the reference text.
        expandedChunk = Regex.Replace(
            expandedChunk,
            $"({Regex.Escape(mapping.Item4)})(?![^<]*>|[^<>]*</)",
            match => taggedMatch);
    }

    // Collapse whitespace down to single space. Because the query is a single chunk, the text left in
    // reducedChunk is the unmatched remainder and the annotated chunk is the full mapped reference.
    var remainingUnmatched = Regex.Replace(reducedChunk, "\\s+", " ").Trim();
    var newParse = Regex.Replace(expandedChunk, "\\s+", " ").Trim();

    // Share of the interpreted query (by character count) that was mapped; an empty query maps nothing
    var percentMapped = interpretedQuery.Length == 0
        ? 0.0
        : 1.0 - ((double)remainingUnmatched.Length / interpretedQuery.Length);

    return new ReferenceMapping
    {
        PercentOfReferenceMapped = percentMapped,
        NormalizedReference = interpretedQuery,
        MappedPaper = entity,
        MappedReference = newParse,
        OriginalReference = referenceString,
        ReferenceNotMapped = remainingUnmatched
    };
}