Пример #1
0
        // Builds a bag of tokens from the given entity types: the types are joined with single
        // spaces into one name, and that name is registered on a fresh bag via AddNameToMatchable.
        private MatchableBagOfTokens MakeMatchableBagOfTokens(List<string> entity_types)
        {
            var bag = new MatchableBagOfTokens();

            AddNameToMatchable(bag, string.Join(" ", entity_types));

            return bag;
        }
Пример #2
0
        // Tokenizes pre_processed_name and records the result on the bag: every token is appended to
        // matchable.Tokens, and the tokens of this one name are appended together, as a single list,
        // to matchable.TokensList.
        private void AddNameToMatchable(MatchableBagOfTokens matchable, string pre_processed_name)
        {
            IList<string> nameTokens = new List<string>();

            // The token sequence is walked exactly once; each token goes to both destinations.
            foreach (var token in Tokenize(pre_processed_name))
            {
                matchable.Tokens.Add(token);
                nameTokens.Add(token);
            }

            matchable.TokensList.Add(nameTokens);
        }
Пример #3
0
        // Returns the bags from matchable_names that FindNearestMatchesWithin selects for
        // matchable_entity_bag at semantic_threshold. The scores are dropped; the bags keep the
        // order in which the matches were returned.
        private IList<MatchableBagOfTokens> FindSemanticMatches(
            MatchableBagOfTokens matchable_entity_bag, IList<MatchableBagOfTokens> matchable_names,
            double semantic_threshold)
        {
            return FindNearestMatchesWithin(matchable_entity_bag, matchable_names, semantic_threshold)
                   .Select(result => result.Element)
                   .ToList();
        }
Пример #4
0
        // Builds a bag of tokens from the given entity values: each value is run through
        // PreProcessPartial, the results are joined with single spaces into one name, and that
        // name is registered on a fresh bag via AddNameToMatchable.
        private MatchableBagOfTokens MakeMatchableBagOfTokens(List<string> entityValues)
        {
            var bag = new MatchableBagOfTokens();

            List<string> preProcessed = entityValues.ConvertAll(value => PreProcessPartial(value));

            AddNameToMatchable(bag, string.Join(" ", preProcessed));

            return bag;
        }
Пример #5
0
        // Builds a bag of tokens for one setting/value pair. The bag is tagged with the canonical
        // setting and value names, then receives the already pre-processed canonical name followed
        // by every alternative name (each passed through PreProcessName first).
        private MatchableBagOfTokens MakeMatchableBagOfTokens(string pre_processed_canonical_name, IList<string> alternative_names,
                                                              string canonical_setting_name, string canonical_value_name)
        {
            var bag = new MatchableBagOfTokens
            {
                CanonicalSettingName = canonical_setting_name,
                CanonicalValueName = canonical_value_name
            };

            // The canonical name arrives pre-processed; only the alternatives still need it.
            AddNameToMatchable(bag, pre_processed_canonical_name);

            foreach (var alternative in alternative_names)
            {
                var preProcessedAlternative = PreProcessName(alternative);
                AddNameToMatchable(bag, preProcessedAlternative);
            }

            return bag;
        }
Пример #6
0
        // Scores how similar two bags are by comparing every tokenized name in lhs.TokensList with
        // every tokenized name in rhs.TokensList and keeping the best pair score.
        //
        // For one pair, the score is the mean of two ratios: the number of lhs-name tokens that also
        // occur in the rhs name, divided once by the lhs name's token count and once by the rhs
        // name's token count. Returns -1 when no pair produced a score above -1 (for example when
        // either TokensList is empty).
        private double ComputeSimilarityScore(MatchableBagOfTokens lhs, MatchableBagOfTokens rhs)
        {
            double best = -1;

            foreach (var lhsName in lhs.TokensList)
            {
                foreach (var rhsName in rhs.TokensList)
                {
                    double shared = lhsName.Count(token => rhsName.Contains(token));

                    // The 0.5 factor averages the two ratios (intended to keep the score in [0,1]).
                    double pairScore = 0.5 * ((shared / lhsName.Count()) + (shared / rhsName.Count()));

                    // Strict comparison on purpose: a NaN pair score (empty name) never wins.
                    if (pairScore > best)
                    {
                        best = pairScore;
                    }
                }
            }

            return best;
        }
Пример #7
0
        // Scores every bag in matchable_names against matchable_entity_bag with
        // ComputeSimilarityScore and returns, best score first, the results whose score is at least
        // threshold. Nothing is returned when there are no candidates or when even the best score
        // is not positive.
        private IList<MatchResult> FindNearestMatchesWithin(MatchableBagOfTokens matchable_entity_bag, IList<MatchableBagOfTokens> matchable_names, double threshold)
        {
            var scored = new List<MatchResult>();

            foreach (var candidate in matchable_names)
            {
                scored.Add(new MatchResult
                {
                    Element = candidate,
                    Score = ComputeSimilarityScore(matchable_entity_bag, candidate)
                });
            }

            // OrderByDescending is a stable sort, so equally scored candidates keep their input order.
            List<MatchResult> ranked = scored.OrderByDescending(result => result.Score).ToList();

            // A best score that is not positive means nothing matched at all.
            if (ranked.Count == 0 || !(ranked[0].Score > 0))
            {
                return new List<MatchResult>();
            }

            return ranked.Where(result => result.Score >= threshold).ToList();
        }
Пример #8
0
        // Disambiguates between antonym candidates using TF-IDF, where the names of each candidate
        // form one "document".
        //
        // Each candidate is scored with the average, over the entity tokens, of 1 / document
        // frequency for the tokens the candidate contains (0 for the others). Candidates scoring
        // strictly above threshold are passed to SelectPercentageOfMax with percentage_of_max. When
        // use_coverage_filter is set, the survivors are re-scored by coverage (distinct matching
        // tokens divided by the candidate's distinct tokens) and passed through
        // SelectPercentageOfMax a second time. Returns the bags of whatever remains.
        private IList<MatchableBagOfTokens> DisambiguateAntonyms(
            MatchableBagOfTokens matchable_entity_bag,
            IList<MatchableBagOfTokens> matchable_candidate_bags,
            double threshold,
            double percentage_of_max,
            bool use_coverage_filter)
        {
            // Document frequency: how often each token was seen across the candidates'
            // UniqueElements(...) sequences.
            IDictionary<string, int> documentFrequency = new Dictionary<string, int>();

            foreach (var candidate in matchable_candidate_bags)
            {
                foreach (var token in UniqueElements(candidate.Tokens))
                {
                    // On a miss TryGetValue leaves the count at 0, so the first sighting stores 1.
                    documentFrequency.TryGetValue(token, out int seen);
                    documentFrequency[token] = seen + 1;
                }
            }

            IList<ScoredMatchableBagOfTokens> tfIdfRanked = new List<ScoredMatchableBagOfTokens>();

            // Coverage scores, keyed by canonical setting name and then by canonical value name.
            IDictionary<string, IDictionary<string, double>> coverageBySetting =
                new Dictionary<string, IDictionary<string, double>>();

            foreach (var candidate in matchable_candidate_bags)
            {
                ISet<string> matched = new HashSet<string>();
                double tfIdfTotal = 0.0;

                foreach (var token in matchable_entity_bag.Tokens)
                {
                    // Binary TF: the alternative names repeat many tokens and are not weighted by how
                    // frequent each name is in natural language, so a numeric TF would not represent
                    // how strongly a token is associated with the setting/value. A token the candidate
                    // lacks simply contributes nothing to the sum.
                    if (!candidate.Tokens.Contains(token))
                    {
                        continue;
                    }

                    matched.Add(token);
                    if (documentFrequency.TryGetValue(token, out var frequency))
                    {
                        tfIdfTotal += 1.0 / frequency;
                    }
                }

                double tfIdfAverage = tfIdfTotal / matchable_entity_bag.Tokens.Count();

                // Between two candidates with a similar score we prefer the one whose matching tokens
                // make up the larger share of its own tokens. E.g., if one candidate is a substring of
                // the other and the entities match the shared part, the shorter candidate wins because
                // it is better covered.
                if (use_coverage_filter)
                {
                    ISet<string> distinctCandidateTokens = new HashSet<string>(UniqueElements(candidate.Tokens));
                    double coverage = (double)matched.Count / distinctCandidateTokens.Count;

                    if (!coverageBySetting.TryGetValue(candidate.CanonicalSettingName, out var coverageByValue))
                    {
                        coverageByValue = new Dictionary<string, double>();
                        coverageBySetting[candidate.CanonicalSettingName] = coverageByValue;
                    }

                    coverageByValue.Add(candidate.CanonicalValueName, coverage);
                }

                if (tfIdfAverage > threshold)
                {
                    tfIdfRanked.Add(new ScoredMatchableBagOfTokens
                    {
                        Option = candidate,
                        Score = tfIdfAverage
                    });
                }
            }

            IList<ScoredMatchableBagOfTokens> survivors = SelectPercentageOfMax(tfIdfRanked, percentage_of_max);

            if (use_coverage_filter)
            {
                // Second pass: re-score the TF-IDF survivors by coverage and filter them again.
                IList<ScoredMatchableBagOfTokens> coverageRanked = new List<ScoredMatchableBagOfTokens>();

                foreach (var survivor in survivors)
                {
                    var option = survivor.Option;
                    coverageRanked.Add(new ScoredMatchableBagOfTokens
                    {
                        Option = option,
                        Score = coverageBySetting[option.CanonicalSettingName][option.CanonicalValueName]
                    });
                }

                survivors = SelectPercentageOfMax(coverageRanked, percentage_of_max);
            }

            IList<MatchableBagOfTokens> selectedBags = new List<MatchableBagOfTokens>();

            foreach (var survivor in survivors)
            {
                selectedBags.Add(survivor.Option);
            }

            return selectedBags;
        }