/// <summary>
/// Tokenizes one pre-processed name and records its tokens on <paramref name="matchable"/>.
/// </summary>
/// <param name="matchable">Bag that receives the tokens.</param>
/// <param name="pre_processed_name">Name to tokenize; passed to <c>Tokenize</c> unchanged.</param>
private void AddNameToMatchable(MatchableBagOfTokens matchable, string pre_processed_name)
{
    // Tokens of this single name, kept together so the name can later be compared as a unit.
    IList<string> name_tokens = new List<string>();

    foreach (var current in Tokenize(pre_processed_name))
    {
        // Every token goes both into the flat bag and into the per-name list.
        matchable.Tokens.Add(current);
        name_tokens.Add(current);
    }

    matchable.TokensList.Add(name_tokens);
}
/// <summary>
/// Returns the candidate bags that <c>FindNearestMatchesWithin</c> selects for the entity bag,
/// in the order that method returns them, with the scores stripped.
/// </summary>
/// <param name="matchable_entity_bag">Bag built from the entity being matched.</param>
/// <param name="matchable_names">Candidate bags to compare against.</param>
/// <param name="semantic_threshold">Threshold forwarded to <c>FindNearestMatchesWithin</c>.</param>
/// <returns>A new list holding the <c>Element</c> of every selected match.</returns>
private IList<MatchableBagOfTokens> FindSemanticMatches(
    MatchableBagOfTokens matchable_entity_bag,
    IList<MatchableBagOfTokens> matchable_names,
    double semantic_threshold)
{
    var scored_matches = FindNearestMatchesWithin(matchable_entity_bag, matchable_names, semantic_threshold);

    // Only the matched bags are of interest to callers; drop the scores.
    IEnumerable<MatchableBagOfTokens> matched_elements = scored_matches.Select(found => found.Element);
    return new List<MatchableBagOfTokens>(matched_elements);
}
/// <summary>
/// Builds a bag of tokens for a canonical setting/value pair from its canonical name and
/// its alternative names.
/// </summary>
/// <param name="pre_processed_canonical_name">Canonical name; used as-is (no further pre-processing here).</param>
/// <param name="alternative_names">Alternative names; each one is run through <c>PreProcessName</c> before being added.</param>
/// <param name="canonical_setting_name">Stored on the bag as <c>CanonicalSettingName</c>.</param>
/// <param name="canonical_value_name">Stored on the bag as <c>CanonicalValueName</c>.</param>
/// <returns>The populated bag.</returns>
private MatchableBagOfTokens MakeMatchableBagOfTokens(
    string pre_processed_canonical_name,
    IList<string> alternative_names,
    string canonical_setting_name,
    string canonical_value_name)
{
    var bag = new MatchableBagOfTokens
    {
        CanonicalSettingName = canonical_setting_name,
        CanonicalValueName = canonical_value_name
    };

    // The canonical name comes first, followed by the alternatives in their given order.
    AddNameToMatchable(bag, pre_processed_canonical_name);

    for (int index = 0; index < alternative_names.Count; ++index)
    {
        string processed_alternative = PreProcessName(alternative_names[index]);
        AddNameToMatchable(bag, processed_alternative);
    }

    return bag;
}
/// <summary>
/// Builds a bag of tokens from extracted entity values. Each value is run through
/// <c>PreProcessPartial</c>, the results are joined with single spaces, and the joined
/// string is added to the bag as one name.
/// </summary>
/// <param name="entityValues">Raw entity values, in the order they should appear in the joined name.</param>
/// <returns>A bag containing the tokens of the joined name; the canonical names are left unset.</returns>
private MatchableBagOfTokens MakeMatchableBagOfTokens(List<string> entityValues)
{
    var bag = new MatchableBagOfTokens();

    // Pre-process every value up front, then treat the whole sequence as a single name.
    List<string> cleaned_values = entityValues.ConvertAll(raw => PreProcessPartial(raw));
    string joined_name = string.Join(" ", cleaned_values);

    AddNameToMatchable(bag, joined_name);
    return bag;
}
/// <summary>
/// Compares every tokenized name of <paramref name="lhs"/> with every tokenized name of
/// <paramref name="rhs"/> and returns the best pairwise similarity.
/// </summary>
/// <remarks>
/// For one pair of names the score is the mean of (shared / lhs length) and (shared / rhs length),
/// where "shared" counts the lhs tokens that also occur in the rhs name. A repeated lhs token is
/// counted once per occurrence.
/// </remarks>
/// <param name="lhs">First bag; its <c>TokensList</c> supplies the left-hand names.</param>
/// <param name="rhs">Second bag; its <c>TokensList</c> supplies the right-hand names.</param>
/// <returns>The highest pairwise score, or -1 when no pair produced a score greater than -1.</returns>
private double ComputeSimilarityScore(MatchableBagOfTokens lhs, MatchableBagOfTokens rhs)
{
    // A single running maximum over all (lhs name, rhs name) pairs.
    double best_score = -1;

    foreach (var left_name in lhs.TokensList)
    {
        foreach (var right_name in rhs.TokensList)
        {
            int shared = 0;
            foreach (var token in left_name)
            {
                if (rhs_contains(right_name, token))
                {
                    ++shared;
                }
            }

            // Averaging the two ratios (hence the 0.5) keeps the score symmetric in name length.
            double left_ratio = (double)shared / left_name.Count();
            double right_ratio = (double)shared / right_name.Count();
            double pair_score = 0.5 * (left_ratio + right_ratio);

            // Deliberately a plain '>' comparison: an empty name yields NaN (0/0), which
            // compares false here and is therefore ignored.
            if (pair_score > best_score)
            {
                best_score = pair_score;
            }
        }
    }

    return best_score;

    // Membership test of a token in one tokenized name.
    bool rhs_contains(IEnumerable<string> name_tokens, string token)
    {
        return name_tokens.Contains(token);
    }
}
/// <summary>
/// Scores every candidate against the entity bag and returns the candidates whose score
/// reaches <paramref name="threshold"/>, best first.
/// </summary>
/// <param name="matchable_entity_bag">Bag built from the entity being matched.</param>
/// <param name="matchable_names">Candidate bags to score.</param>
/// <param name="threshold">Minimum score (inclusive) a candidate needs to be selected.</param>
/// <returns>
/// Selected matches ordered by descending score. Empty when there are no candidates or when
/// the best score is not greater than zero, regardless of the threshold.
/// </returns>
private IList<MatchResult> FindNearestMatchesWithin(
    MatchableBagOfTokens matchable_entity_bag,
    IList<MatchableBagOfTokens> matchable_names,
    double threshold)
{
    var scored = new List<MatchResult>();
    foreach (var candidate in matchable_names)
    {
        double similarity = ComputeSimilarityScore(matchable_entity_bag, candidate);
        scored.Add(new MatchResult { Element = candidate, Score = similarity });
    }

    List<MatchResult> ranked = scored.OrderByDescending(result => result.Score).ToList();

    // Nothing is selected unless at least one candidate shares something with the entity.
    bool has_positive_best = ranked.Count > 0 && ranked[0].Score > 0;
    if (!has_positive_best)
    {
        return new List<MatchResult>();
    }

    return ranked.Where(result => result.Score >= threshold).ToList();
}
/// <summary>
/// Disambiguates between antonyms using TF-IDF, with the names of each candidate forming a "document".
/// </summary>
/// <remarks>
/// Steps:
/// 1. Count, for every token, in how many candidate bags it occurs (document frequency).
/// 2. Score each candidate with the average binary-TF / DF over the entity tokens and keep
///    those scoring strictly above <paramref name="threshold"/>.
/// 3. Narrow the kept candidates with <c>SelectPercentageOfMax</c>.
/// 4. Optionally re-score the survivors by coverage (matched tokens / unique candidate tokens)
///    and narrow them again with <c>SelectPercentageOfMax</c>.
/// </remarks>
/// <param name="matchable_entity_bag">Bag built from the entity being matched.</param>
/// <param name="matchable_candidate_bags">Candidate bags to choose between.</param>
/// <param name="threshold">Exclusive lower bound on the average TF-IDF score.</param>
/// <param name="percentage_of_max">Forwarded to <c>SelectPercentageOfMax</c> for both selection passes.</param>
/// <param name="use_coverage_filter">When true, applies the coverage pass after the TF-IDF pass.</param>
/// <returns>The candidate bags that survive all selection passes.</returns>
private IList<MatchableBagOfTokens> DisambiguateAntonyms(
    MatchableBagOfTokens matchable_entity_bag,
    IList<MatchableBagOfTokens> matchable_candidate_bags,
    double threshold,
    double percentage_of_max,
    bool use_coverage_filter)
{
    // Precompute the document frequency.
    IDictionary<string, int> document_freq = new Dictionary<string, int>();
    foreach (var matchable_candidate_bag in matchable_candidate_bags)
    {
        foreach (var token in UniqueElements(matchable_candidate_bag.Tokens))
        {
            if (!document_freq.TryAdd(token, 1))
            {
                ++document_freq[token];
            }
        }
    }

    IList<ScoredMatchableBagOfTokens> tf_idf_scored_bags = new List<ScoredMatchableBagOfTokens>();
    foreach (var matchable_candidate_bag in matchable_candidate_bags)
    {
        double tf_idf_sum = 0.0;
        foreach (var token in matchable_entity_bag.Tokens)
        {
            // Using binary TF because the alternative names contain a lot of repeated tokens and are not
            // weighted by how frequent each name is in natural language, so the numeric TF of a token is
            // not representative of how much that token is associated with the setting/value.
            if (matchable_candidate_bag.Tokens.Contains(token)
                && document_freq.TryGetValue(token, out var freq))
            {
                tf_idf_sum += 1.0 / freq;
            }
        }

        // An entity without tokens gives 0/0 = NaN, which fails the comparison below,
        // so such an entity selects no candidates.
        var tf_idf_avg = tf_idf_sum / matchable_entity_bag.Tokens.Count();

        if (tf_idf_avg > threshold)
        {
            tf_idf_scored_bags.Add(new ScoredMatchableBagOfTokens
            {
                Option = matchable_candidate_bag,
                Score = tf_idf_avg
            });
        }
    }

    var selected_tf_idf_scored_bags = SelectPercentageOfMax(tf_idf_scored_bags, percentage_of_max);

    IList<ScoredMatchableBagOfTokens> selected_scored_bags;
    if (use_coverage_filter)
    {
        // Given two candidates with a similar score, we want to select the one that has the highest
        // percentage of matching tokens out of all its tokens. E.g., if one candidate is a substring of
        // the other and the entities match the shared substring, we want to select the shorter candidate
        // because it has a better coverage.
        //
        // Coverage is computed from each surviving bag directly instead of being stored in a map keyed by
        // (CanonicalSettingName, CanonicalValueName): a keyed map throws when two candidates share a
        // name pair or when a name is null, and needs scores for candidates that were already rejected.
        IList<ScoredMatchableBagOfTokens> coverage_scored_bags = new List<ScoredMatchableBagOfTokens>();
        foreach (var tf_idf_scored_bag in selected_tf_idf_scored_bags)
        {
            coverage_scored_bags.Add(new ScoredMatchableBagOfTokens
            {
                Option = tf_idf_scored_bag.Option,
                Score = CoverageOf(tf_idf_scored_bag.Option)
            });
        }

        selected_scored_bags = SelectPercentageOfMax(coverage_scored_bags, percentage_of_max);
    }
    else
    {
        selected_scored_bags = selected_tf_idf_scored_bags;
    }

    IList<MatchableBagOfTokens> selected_candidate_bags = new List<MatchableBagOfTokens>();
    foreach (var scored_bag in selected_scored_bags)
    {
        selected_candidate_bags.Add(scored_bag.Option);
    }

    return selected_candidate_bags;

    // Fraction of the candidate's unique tokens that also occur among the entity tokens.
    double CoverageOf(MatchableBagOfTokens candidate_bag)
    {
        ISet<string> matching_tokens = new HashSet<string>();
        foreach (var token in matchable_entity_bag.Tokens)
        {
            if (candidate_bag.Tokens.Contains(token))
            {
                matching_tokens.Add(token);
            }
        }

        ISet<string> unique_candidate_tokens = new HashSet<string>(UniqueElements(candidate_bag.Tokens));
        return (double)matching_tokens.Count / unique_candidate_tokens.Count;
    }
}