コード例 #1
0
        /// <summary>
        /// Builds the tab-separated output column values for one reference, emitting exactly one value per entry
        /// of <c>configuration.OutputColumns</c>, in configuration order.
        /// </summary>
        /// <param name="mappedReference">
        /// The mapping result for the reference, or <c>null</c> when there is no mapping. For a <c>null</c> mapping the
        /// "score" column is "0" and every other column is empty.
        /// </param>
        /// <returns>The column values joined with tab characters.</returns>
        /// <remarks>
        /// Side effect: when the mapped reference markup contains a DOI attribute tagged with confidence 1,
        /// <c>mappedReference.PercentOfReferenceMapped</c> is overwritten with 1.0 before the columns are generated.
        /// </remarks>
        public static string GetOutputColumns(ReferenceMapping mappedReference)
        {
            const string pubmedUrlPrefix = "https://www.ncbi.nlm.nih.gov/pubmed/";

            if (mappedReference == null)
            {
                // No mapping: keep the row shape intact with a zero score and empty values everywhere else
                var placeholders = configuration.OutputColumns.Select(column => column == "score" ? "0" : "");

                return string.Join("\t", placeholders);
            }

            // SPECIAL CASE: If there is an exact match for DOI, override confidence score to a 1, as DOI is generally 1:1
            if (mappedReference.MappedReference != null &&
                mappedReference.MappedReference.Contains("<attr confidence=\"1\" name=\"academic#DOI\">"))
            {
                mappedReference.PercentOfReferenceMapped = 1.0;
            }

            var outputColumns = new List<string>();

            foreach (var attribute in configuration.OutputColumns)
            {
                switch (attribute)
                {
                    case "score":
                        // Invariant culture keeps the decimal separator stable in the machine-readable output,
                        // consistent with the literal "0" written for unmapped references
                        outputColumns.Add(System.Convert.ToString(mappedReference.PercentOfReferenceMapped, System.Globalization.CultureInfo.InvariantCulture));
                        break;

                    case "mapping":
                        outputColumns.Add(mappedReference.MappedReference);
                        break;

                    case "id":
                        outputColumns.Add(GetAttributeIfExists(mappedReference.MappedPaper, "Id"));
                        break;

                    case "familyId":
                        outputColumns.Add(GetAttributeIfExists(mappedReference.MappedPaper, "FamId"));
                        break;

                    case "pubmedId":
                        // Always emit a value (possibly empty) so that later columns do not shift left
                        var pubmedId = "";
                        var sources = mappedReference.MappedPaper["S"];

                        if (sources != null)
                        {
                            // Sources without a "U" value are skipped instead of causing a null dereference
                            var pubmedUrl = sources
                                .Select(source => source.Value<string>("U"))
                                .FirstOrDefault(url => url != null && url.StartsWith(pubmedUrlPrefix, System.StringComparison.Ordinal));

                            if (pubmedUrl != null)
                            {
                                pubmedId = pubmedUrl.Replace(pubmedUrlPrefix, "");
                            }
                        }

                        outputColumns.Add(pubmedId);
                        break;

                    case "title":
                        outputColumns.Add(GetAttributeIfExists(mappedReference.MappedPaper, "DN"));
                        break;

                    case "authors":
                        var authors = new List<string>();
                        var authorEntries = mappedReference.MappedPaper["AA"];

                        if (authorEntries != null)
                        {
                            foreach (var author in authorEntries)
                            {
                                // Read the name from the individual author entry, not from the paper itself
                                authors.Add(GetAttributeIfExists(author, "DAuN"));
                            }
                        }

                        outputColumns.Add(string.Join(", ", authors));
                        break;

                    case "year":
                        outputColumns.Add(GetAttributeIfExists(mappedReference.MappedPaper, "Y"));
                        break;

                    case "venue":
                        outputColumns.Add(GetAttributeIfExists(mappedReference.MappedPaper, "VFN"));
                        break;

                    case "volume":
                        outputColumns.Add(GetAttributeIfExists(mappedReference.MappedPaper, "V"));
                        break;

                    case "issue":
                        outputColumns.Add(GetAttributeIfExists(mappedReference.MappedPaper, "I"));
                        break;

                    case "firstPage":
                        outputColumns.Add(GetAttributeIfExists(mappedReference.MappedPaper, "FP"));
                        break;

                    case "lastPage":
                        outputColumns.Add(GetAttributeIfExists(mappedReference.MappedPaper, "LP"));
                        break;

                    case "doi":
                        outputColumns.Add(GetAttributeIfExists(mappedReference.MappedPaper, "DOI"));
                        break;

                    default:
                        // Unknown column names still occupy a slot, matching the unmapped-reference branch
                        outputColumns.Add("");
                        break;
                }
            }

            // Output mapping data + original row columns
            return string.Join("\t", outputColumns);
        }
コード例 #2
0
        /// <summary>
        /// Maps an academic reference string to a Microsoft Academic paper entity using Project Academic Knowledge Interpret method
        /// </summary>
        /// <param name="referenceString">Raw reference text; sent as the interpret query and stored as <c>OriginalReference</c>.</param>
        /// <param name="subscriptionKey">Subscription key forwarded to <c>GetQueryInterpretations</c>.</param>
        /// <param name="attributesToMap">
        /// Comma-separated entity attribute names to locate inside the reference text. Attributes nested in an object or
        /// array are addressed as "Parent.Child".
        /// </param>
        /// <param name="attributesToReturn">Comma-separated attribute names requested from the service in addition to <paramref name="attributesToMap"/>.</param>
        /// <returns>
        /// The mapping for the first entity of the first interpretation, or <c>null</c> when the response contains no
        /// interpretation or no entity.
        /// </returns>
        static ReferenceMapping MapReferenceToPaper(string referenceString, string subscriptionKey, string attributesToMap, string attributesToReturn)
        {
            var resultString = GetQueryInterpretations(
                query: referenceString,
                subscriptionKey: subscriptionKey,
                complete: false,
                interpretationCount: 1,
                entityCount: 1,
                attributes: attributesToMap + "," + attributesToReturn,

                // NOTE: Longer timeouts can sometimes provide better quality interpretations, though generally 500ms is sufficient for most reference strings
                timeout: 500);

            var result = JObject.Parse(resultString);

            var interpretations = result["interpretations"];
            if (interpretations == null || !interpretations.Any())
            {
                return null;
            }

            // We only care about the first interpretation, and within it the first rule's first entity
            var firstInterpretation = interpretations.First();
            var firstRule           = firstInterpretation["rules"]?.FirstOrDefault();
            var entity              = firstRule?["output"]?["entities"]?.FirstOrDefault();

            if (entity == null)
            {
                // An interpretation without a matching entity cannot be mapped
                return null;
            }

            var toMap            = new HashSet<string>(attributesToMap.Split(','));
            var interpretedQuery = result.Value<string>("query") ?? string.Empty;
            var queryText        = interpretedQuery.Trim();

            // Interpretation of the reference string is not guaranteed to map all parts of said string, as it will stop
            // once it has "enough" to generate a matching entity and leave the rest of the query unprocessed.
            //
            // Filler, stop words, misspelled words or unknown synonyms are all possibilities for the remaining terms not matched
            //
            // To generate a "confidence" score we do a "re-mapping" of the top entity's values against the query text.
            // Each candidate is (score, attribute name, entity value, best matching substring of the query).
            var candidates = new List<Tuple<double, string, string, string>>();

            Action<string, JToken> addCandidate = (name, token) =>
            {
                if (!toMap.Contains(name))
                {
                    return;
                }

                var value            = token.ToString();
                var nearestSubstring = FindBestSubstring(queryText, value);

                candidates.Add(new Tuple<double, string, string, string>(nearestSubstring.Item1, name, value, nearestSubstring.Item2));
            };

            foreach (var attribute in entity)
            {
                var prop        = (JProperty)attribute;
                var arrayValue  = prop.Value as JArray;
                var objectValue = prop.Value as JObject;

                if (arrayValue != null)
                {
                    // Map composite attribute arrays
                    foreach (var element in arrayValue)
                    {
                        var elementObject = element as JObject;
                        if (elementObject == null)
                        {
                            // Arrays of primitive values have no named inner attributes to map
                            continue;
                        }

                        foreach (var innerAttribute in elementObject)
                        {
                            addCandidate($"{prop.Name}.{innerAttribute.Key}", innerAttribute.Value);
                        }
                    }
                }
                else if (objectValue != null)
                {
                    // Map composite attributes
                    foreach (var innerAttribute in objectValue)
                    {
                        addCandidate($"{prop.Name}.{innerAttribute.Key}", innerAttribute.Value);
                    }
                }
                else
                {
                    // Map simple value attributes
                    addCandidate(prop.Name, prop.Value);
                }
            }

            // Reduce the candidates to the confident ones (score above 0.8), best score first and longest match first
            // on ties. Each accepted match is removed from the remaining text so a later candidate cannot claim it again.
            // Empty matches are dropped up front: string.Replace rejects an empty search string and an empty regex
            // pattern would match at every position.
            var unmatchedText     = queryText;
            var confidentMappings = new List<Tuple<double, string, string, string>>();
            var rankedCandidates  = candidates
                .Where(c => !string.IsNullOrEmpty(c.Item4))
                .OrderByDescending(c => c.Item1)
                .ThenByDescending(c => c.Item4.Length);

            foreach (var candidate in rankedCandidates)
            {
                if (candidate.Item1 > 0.8 && unmatchedText.Contains(candidate.Item4))
                {
                    confidentMappings.Add(candidate);
                    unmatchedText = unmatchedText.Replace(candidate.Item4, "");
                }
            }

            // For each mapping we're confident with, wrap the matched text in an <attr> tag. confidentMappings is
            // already in ranked order, so it is iterated as-is.
            var annotatedText = queryText;
            foreach (var mapping in confidentMappings)
            {
                // Invariant formatting keeps the confidence attribute independent of the machine's locale
                var confidence = mapping.Item1.ToString(System.Globalization.CultureInfo.InvariantCulture);

                // The canonical value is only emitted when it differs from the matched text
                var canonical = mapping.Item3 == mapping.Item4 ? "" : $" canonical=\"{mapping.Item3}\"";
                var tagged    = $" <attr confidence=\"{confidence}\" name=\"academic#{mapping.Item2}\"{canonical}>{mapping.Item4}</attr> ";

                // The lookahead skips occurrences that are inside a tag or inside an already tagged span
                var pattern = $"({Regex.Escape(mapping.Item4)})(?![^<]*>|[^<>]*</)";

                // A MatchEvaluator inserts the text literally; a replacement string would expand "$1", "$&", etc.
                // that happen to occur in the reference text
                annotatedText = Regex.Replace(annotatedText, pattern, match => tagged);
            }

            // Collapse whitespace down to single space
            var remainingUnmatched = Regex.Replace(unmatchedText, "\\s+", " ").Trim();
            var mappedReference    = Regex.Replace(annotatedText, "\\s+", " ").Trim();

            // Share of the interpreted query's characters that were covered by confident mappings; an empty query
            // counts as nothing mapped rather than producing NaN
            var percentMapped = interpretedQuery.Length == 0
                ? 0.0
                : 1.0 - ((double)remainingUnmatched.Length / (double)interpretedQuery.Length);

            return new ReferenceMapping
            {
                PercentOfReferenceMapped = percentMapped,
                NormalizedReference      = interpretedQuery,
                MappedPaper              = entity,
                MappedReference          = mappedReference,
                OriginalReference        = referenceString,
                ReferenceNotMapped       = remainingUnmatched
            };
        }