/// <summary>
/// Expands a list of tag patterns into the set of concrete tags they refer to.
/// Patterns that are not wildcards are kept only when they are a key of
/// <paramref name="allTags"/>; wildcard patterns are resolved through the n-gram index
/// by <c>CollectPossibleNGramMatches</c>.
/// </summary>
/// <param name="allTags">Lookup whose keys are the known tags.</param>
/// <param name="tagsToExpand">Tag patterns to expand (plain tags and/or wildcard patterns).</param>
/// <param name="nGrams">N-gram index, passed through to <c>CollectPossibleNGramMatches</c>.</param>
/// <param name="printLoggingMessages">When true, a summary of the collected counters is logged.</param>
/// <returns>The set of tags that the patterns expanded to.</returns>
public static HashSet ExpandTagsNGrams(TagLookup allTags, List<string> tagsToExpand, NGrams nGrams, bool printLoggingMessages = false)
{
    // Background: for a query such as /Google.*Search/ we can build a query of ANDs and ORs
    // that gives the trigrams that must be present in any text matching the regular expression.
    // In this case, the query is
    //     Goo AND oog AND ogl AND gle AND Sea AND ear AND arc AND rch
    //
    // Applied to tag patterns:
    //     '*php*'       -> php
    //     '*.net*'      -> .ne AND net
    //     '*.net'       -> .ne AND net   (how do we distinguish this from '*.net*'?)
    //     '*hibernate*' -> hib AND ibe AND ber AND ern AND rna AND nat AND ate
    var expandedTags = new HashSet();

    // TODO is there a better way of doing this? We are creating a temporary list
    // just so the dictionary keys can be accessed by index.
    var allTagsList = allTags.Keys.ToList();
    var results = new NGramResults();

    foreach (var tagPattern in tagsToExpand)
    {
        if (!IsWildCard(tagPattern))
        {
            // Not a wildcard, leave it as is (but drop it if it is not a known tag).
            if (allTags.ContainsKey(tagPattern))
            {
                expandedTags.Add(tagPattern);
            }

            continue;
        }

        results.ActualWildcards++;
        var searches = CreateSearches(tagPattern);

        // The boolean result ("was any tag added?") is not needed here; the matches are
        // written straight into expandedTags and the counters into results.
        CollectPossibleNGramMatches(allTagsList, nGrams, searches, tagPattern, expandedTags, results);
    }

    if (printLoggingMessages)
    {
        Logger.Log("{0:N0} wildcards, {1:N0} searches processed, {2:N0} tag Ids collected, \n" +
                   "{3:N0} possible matches, {4:N0} IsActualMatch checks, {5:N0} tags added, {6:N0} false positives",
                   results.ActualWildcards, results.SearchesProcessed, results.TagIdsCollected,
                   results.PossibleMatches, results.ActualMatchChecks, results.TagsAdded, results.FalsePositives);
    }

    return expandedTags;
}
/// <summary>
/// Resolves one wildcard pattern: intersects the tag-id lists of every search n-gram that is
/// present in <paramref name="nGrams"/>, then verifies each surviving candidate with
/// <c>IsActualMatch</c> and adds the real matches to <paramref name="expandedTags"/>.
/// </summary>
/// <param name="allTagsList">Tags by position; the ids stored in <paramref name="nGrams"/> are used as indexes into this list.</param>
/// <param name="nGrams">N-gram index: n-gram -> list of tag ids (presumably the tags containing that n-gram).</param>
/// <param name="searches">The n-grams derived from <paramref name="tagPattern"/>.</param>
/// <param name="tagPattern">The wildcard pattern being expanded.</param>
/// <param name="expandedTags">Set that receives every tag that really matches the pattern.</param>
/// <param name="results">Counters updated as a side effect (searches, ids, checks, matches, false positives).</param>
/// <returns>True if at least one tag passed the <c>IsActualMatch</c> check, otherwise false.</returns>
private static bool CollectPossibleNGramMatches(List<string> allTagsList, NGrams nGrams, IEnumerable<string> searches, string tagPattern, HashSet expandedTags, NGramResults results)
{
    HashSet<int> expandedTagIds = null;

    // Sanity check, in case there is a tag in the exclusion list that is no longer a real tag
    // (searches with no entry in the index are skipped).
    // Also start with the search that has the least matches/hits, which makes the HashSet
    // intersections slightly faster. Each posting list is fetched once and reused below.
    var tagLocationsSmallestFirst = searches.Where(s => nGrams.ContainsKey(s))
                                            .Select(s => nGrams[s])
                                            .OrderBy(locations => locations.Count);

    foreach (var tagLocations in tagLocationsSmallestFirst)
    {
        results.SearchesProcessed++;
        results.TagIdsCollected += tagLocations.Count;

        if (expandedTagIds == null)
        {
            expandedTagIds = new HashSet<int>(tagLocations);
        }
        else
        {
            // Intersecting with the list directly was measured to be faster than wrapping it in
            // a new HashSet<int> first (probably the cost of building the extra set), even
            // though set-with-set intersection should be able to use a fast path.
            expandedTagIds.IntersectWith(tagLocations);
        }
    }

    if (expandedTagIds == null)
    {
        // None of the searches exist in the index, so there is nothing to test.
        Logger.Log("TagPattern={0} (Searches: {1}), produces NO Tag Ids to test", tagPattern, String.Join(", ", searches));
        return false;
    }

    // N-Grams can give false +ve, so we have to sanity check each match!
    // For example TagPattern: *php*, Searches: ph, hp, Tag: phonegap-pushplugin
    results.PossibleMatches += expandedTagIds.Count;
    var rawTagPattern = tagPattern.Replace("*", "");
    bool tagWasAdded = false;

    foreach (var expandedTagId in expandedTagIds)
    {
        var tagMatch = allTagsList[expandedTagId];
        results.ActualMatchChecks++;

        if (IsActualMatch(tagMatch, tagPattern, rawTagPattern))
        {
            expandedTags.Add(tagMatch);
            results.TagsAdded++;
            tagWasAdded = true;
        }
        else
        {
            results.FalsePositives++;
        }
    }

    return tagWasAdded;
}