/// <summary>
/// Produces city candidates for a hostname by matching the rules generated from its
/// subdomain parts against the pattern rules stored for its registrable domain.
/// </summary>
/// <param name="parsedHostname">The split hostname to evaluate; may be null.</param>
/// <returns>
/// A dictionary with at most one entry: the closest city of the matching rule with the
/// highest confidence, mapped to its feature values. The dictionary is empty when the
/// hostname is null, has no registrable domain or no subdomain parts, yields no rules,
/// has no stored rules for its domain, or no matching rule has a closest city.
/// </returns>
public override Dictionary<GeonamesCityEntity, Features> GenerateCandidatesAndFeatures(HostnameSplitterResult parsedHostname)
{
    var candidatesAndFeatures = new Dictionary<GeonamesCityEntity, Features>();

    // A null hostname cannot produce candidates; bail out before touching its members.
    if (parsedHostname == null)
    {
        return candidatesAndFeatures;
    }

    var domain = parsedHostname.DomainInfo?.RegistrableDomain;
    var subdomainParts = parsedHostname.SubdomainParts;

    // The rule lookup below is keyed by domain: a null key can never match, and a
    // standard Dictionary lookup would throw on it, so treat it like "nothing to match".
    if (domain == null || subdomainParts == null || subdomainParts.Count == 0)
    {
        return candidatesAndFeatures;
    }

    var ruleAtoms = this.miner.CreateRuleAtoms(subdomainParts);
    var rules = this.miner.GeneratePossibleRules(ruleAtoms);

    if (rules == null || rules.Count == 0)
    {
        return candidatesAndFeatures;
    }

    Dictionary<PatternRule, PatternMiningCoordinates> rulesToCoordinates;

    if (!this.hostnamePatternRules.TryGetValue(domain, out rulesToCoordinates))
    {
        return candidatesAndFeatures;
    }

    PatternMiningCoordinates bestCoordinates = null;

    foreach (var rule in rules)
    {
        PatternMiningCoordinates currentCoordinates;

        // Only rules that are known for this domain and resolved to a city are usable.
        if (!rulesToCoordinates.TryGetValue(rule, out currentCoordinates) || currentCoordinates.ClosestCity == null)
        {
            continue;
        }

        // Strict comparison: on equal confidence the first matching rule wins.
        if (bestCoordinates == null || currentCoordinates.Confidence > bestCoordinates.Confidence)
        {
            bestCoordinates = currentCoordinates;
        }
    }

    if (bestCoordinates != null)
    {
        var features = this.InitializeDefaultFeatureValues();
        features[CityFeatureType.HostnamePatternMatch] = true;
        features[CityFeatureType.HostnamePatternConfidence] = bestCoordinates.Confidence;

        candidatesAndFeatures[bestCoordinates.ClosestCity] = features;
    }

    return candidatesAndFeatures;
}
/// <summary>
/// Registers the location of a ground-truth item under every given rule of a domain,
/// updates the per-rule and per-domain occurrence counters, and prunes the domain's
/// rule data each time the domain's occurrence count reaches a multiple of
/// <paramref name="pruneIntervalCount"/>.
/// </summary>
/// <param name="domainCounts">Occurrence counter per domain; incremented for <paramref name="domain"/>.</param>
/// <param name="ruleCountsForDomains">Per-domain occurrence counters per rule.</param>
/// <param name="domainsToRulesToCoordinates">Per-domain map from rule to the set of coordinates seen for it.</param>
/// <param name="domain">The domain the rules belong to.</param>
/// <param name="rules">The rules generated for the current hostname.</param>
/// <param name="gtItem">Ground-truth item supplying the latitude and longitude.</param>
/// <param name="pruneIntervalCount">
/// Pruning runs whenever the domain's occurrence count is a multiple of this value.
/// A value of 0 disables pruning.
/// </param>
/// <param name="pruneMinKeepThreshold">Forwarded to the pruning step as its minimum keep threshold.</param>
/// <remarks>
/// Does nothing when <paramref name="domainsToRulesToCoordinates"/>, <paramref name="rules"/>
/// or <paramref name="gtItem"/> is null, when <paramref name="rules"/> is empty, or when
/// <paramref name="domain"/> is null or whitespace.
/// </remarks>
private void AddRulesCoordinatesToDomain(
    Dictionary<string, int> domainCounts,
    Dictionary<string, Dictionary<PatternRule, int>> ruleCountsForDomains,
    Dictionary<string, Dictionary<PatternRule, HashSet<PatternMiningCoordinates>>> domainsToRulesToCoordinates,
    string domain,
    List<PatternRule> rules,
    DatasetItem gtItem,
    int pruneIntervalCount,
    int pruneMinKeepThreshold)
{
    if (domainsToRulesToCoordinates == null
        || string.IsNullOrWhiteSpace(domain)
        || rules == null
        || rules.Count == 0
        || gtItem == null)
    {
        return;
    }

    var rulesToCoordinates = this.RetrieveRulesToCoordinatesSet(domainsToRulesToCoordinates, domain);

    // A single coordinates instance is shared by the sets of all rules of this item.
    var newCoordinates = new PatternMiningCoordinates()
    {
        Latitude = gtItem.Latitude,
        Longitude = gtItem.Longitude,
        Confidence = 0d
    };

    var rulesCountsForDomain = this.AddRetrieveDomainToRulesCounts(ruleCountsForDomains, domain);

    foreach (var rule in rules)
    {
        this.IncrementOccurrences(rulesCountsForDomain, rule);

        HashSet<PatternMiningCoordinates> coordinatesForRule;

        if (!rulesToCoordinates.TryGetValue(rule, out coordinatesForRule))
        {
            coordinatesForRule = new HashSet<PatternMiningCoordinates>();
            rulesToCoordinates[rule] = coordinatesForRule;
        }

        coordinatesForRule.Add(newCoordinates);
    }

    var domainOcc = this.IncrementOccurrences(domainCounts, domain);

    // Guard the remainder: an interval of 0 would otherwise throw DivideByZeroException
    // after all counters above have already been updated. Zero means "never prune".
    if (pruneIntervalCount != 0 && domainOcc % pruneIntervalCount == 0)
    {
        this.PruneCounts(rulesCountsForDomain, rulesToCoordinates, minKeepThreshold: pruneMinKeepThreshold);
    }
}
/// <summary>
/// Clusters the coordinates recorded for every (domain, rule) pair and collects the
/// centroids of the clusters that have enough support.
/// </summary>
/// <param name="domainCounts">Not read by this method.</param>
/// <param name="ruleCoordOccsForDomains">
/// Per domain and rule, the total occurrence count used as the denominator of the support
/// ratio. Must contain every domain and rule present in
/// <paramref name="domainsToRulesToCoordinateCounts"/>.
/// </param>
/// <param name="domainsToRulesToCoordinateCounts">Per domain and rule, the occurrence count of each coordinate.</param>
/// <param name="clusterThresholdKm">Passed to the clustering as its cluster diameter.</param>
/// <param name="minItemsPerCluster">Minimum summed occurrences a cluster needs to be accepted.</param>
/// <param name="minSupportRatioPerCluster">Minimum share of the rule's total occurrences a cluster needs to be accepted.</param>
/// <returns>
/// Per domain and rule, the accepted centroids; each centroid's confidence is its support
/// ratio. Domains and rules without an accepted cluster get no entry from this method.
/// </returns>
private Dictionary<string, Dictionary<PatternRule, List<PatternMiningCoordinates>>> FindClusterCentroids(
    Dictionary<string, int> domainCounts,
    Dictionary<string, Dictionary<PatternRule, int>> ruleCoordOccsForDomains,
    Dictionary<string, Dictionary<PatternRule, Dictionary<Coord, int>>> domainsToRulesToCoordinateCounts,
    double clusterThresholdKm,
    int minItemsPerCluster,
    double minSupportRatioPerCluster)
{
    var centroidsByDomain = new Dictionary<string, Dictionary<PatternRule, List<PatternMiningCoordinates>>>();

    var domainTotal = domainsToRulesToCoordinateCounts.Count;
    var domainIndex = 0;

    foreach (var domainEntry in domainsToRulesToCoordinateCounts)
    {
        domainIndex++;

        var domain = domainEntry.Key;
        var coordinateCountsByRule = domainEntry.Value;
        var occurrenceTotalsByRule = ruleCoordOccsForDomains[domain];
        var ruleTotal = coordinateCountsByRule.Count();

        // Created lazily so that domains without any accepted cluster add nothing to the result.
        Dictionary<PatternRule, List<PatternMiningCoordinates>> centroidsByRule = null;

        var ruleIndex = 0;

        foreach (var ruleEntry in coordinateCountsByRule)
        {
            ruleIndex++;

            var rule = ruleEntry.Key;

            // Created lazily as well: only rules with an accepted cluster get a list.
            List<PatternMiningCoordinates> ruleCentroids = null;

            Console.WriteLine($"{domainIndex}/{domainTotal} - {ruleIndex}/{ruleTotal} - Finding clusters for domain {domain} and rule {rule}");

            var countsByCoordinate = ruleEntry.Value;
            var ruleOccurrenceTotal = occurrenceTotalsByRule[rule];

            var weightedCoordinates = countsByCoordinate.Select(pair => new CoordWithOcc(pair.Key, pair.Value));
            var remaining = new HashSet<CoordWithOcc>(weightedCoordinates);

            var clustering = new QTClustering<CoordWithOcc>(
                distanceHelper: new CoordWithOccDistanceHelper(),
                clusterDiameter: clusterThresholdKm,
                itemsSet: remaining);

            while (true)
            {
                Cluster<CoordWithOcc> cluster = clustering.NextCluster();

                if (cluster == null)
                {
                    break;
                }

                var clusterOccurrences = cluster.Members.Sum(member => member.Occurrences);

                // The first cluster that is too small ends the search for this rule.
                // Presumably clusters are returned largest first - confirm against QTClustering.
                if (clusterOccurrences < minItemsPerCluster)
                {
                    break;
                }

                var supportRatio = clusterOccurrences / (1.0d * ruleOccurrenceTotal);

                if (supportRatio < minSupportRatioPerCluster)
                {
                    break;
                }

                centroidsByRule = centroidsByRule ?? this.RetrieveRulesToCentroidsList(centroidsByDomain, domain);
                ruleCentroids = ruleCentroids ?? this.RetrieveCentroidsList(centroidsByRule, rule);

                var centroid = new PatternMiningCoordinates()
                {
                    Latitude = cluster.Centroid.Coord.Latitude,
                    Longitude = cluster.Centroid.Coord.Longitude,
                    Confidence = supportRatio
                };

                ruleCentroids.Add(centroid);

                Console.WriteLine($" Found centroid: {centroid}");

                // Drop the clustered members from the local set. This is the same instance that
                // was handed to the clustering as itemsSet; whether the clustering observes the
                // removal depends on its implementation - confirm against QTClustering.
                remaining.ExceptWith(cluster.Members);

                // If everything that is left cannot reach the minimum support even when put
                // into a single cluster, further clustering for this rule is pointless.
                var maxRemainingSupportRatio = remaining.Sum(item => item.Occurrences) / (1.0d * ruleOccurrenceTotal);

                if (maxRemainingSupportRatio < minSupportRatioPerCluster)
                {
                    break;
                }
            }
        }
    }

    return centroidsByDomain;
}