/// <summary>
        /// Produces at most one city candidate for a hostname by matching the rules generated from its
        /// subdomain parts against the pattern rules stored for its registrable domain.
        /// Among the matching rules whose coordinates carry a closest city, the one with the highest
        /// confidence wins (the first one encountered wins ties).
        /// </summary>
        /// <param name="parsedHostname">
        /// The split hostname. A null value, a missing registrable domain, or missing/empty subdomain
        /// parts all yield an empty result.
        /// </param>
        /// <returns>
        /// A dictionary that is either empty or holds the single best city, with
        /// <c>HostnamePatternMatch</c> set to true and <c>HostnamePatternConfidence</c> set to the
        /// confidence of the winning rule on top of the default feature values.
        /// </returns>
        public override Dictionary <GeonamesCityEntity, Features> GenerateCandidatesAndFeatures(HostnameSplitterResult parsedHostname)
        {
            var candidatesAndFeatures = new Dictionary <GeonamesCityEntity, Features>();

            // Both reads are null-conditional so that a null parsedHostname yields "no candidates"
            // instead of a NullReferenceException.
            var domain         = parsedHostname?.DomainInfo?.RegistrableDomain;
            var subdomainParts = parsedHostname?.SubdomainParts;

            // Without a domain there is nothing to look up (and a null key must never reach the
            // rules lookup below); without subdomain parts there is nothing to build rules from.
            if (domain == null || subdomainParts == null || subdomainParts.Count == 0)
            {
                return(candidatesAndFeatures);
            }

            var ruleAtoms = this.miner.CreateRuleAtoms(subdomainParts);
            var rules     = this.miner.GeneratePossibleRules(ruleAtoms);

            if (rules == null || rules.Count == 0)
            {
                return(candidatesAndFeatures);
            }

            Dictionary <PatternRule, PatternMiningCoordinates> rulesToCoordinates;

            if (!this.hostnamePatternRules.TryGetValue(domain, out rulesToCoordinates))
            {
                // No rules are known for this domain.
                return(candidatesAndFeatures);
            }

            PatternMiningCoordinates bestCoordinates = null;

            foreach (var rule in rules)
            {
                PatternMiningCoordinates currentCoordinates;

                // Only rules known for this domain whose coordinates have a closest city can produce a candidate.
                if (!rulesToCoordinates.TryGetValue(rule, out currentCoordinates) || currentCoordinates.ClosestCity == null)
                {
                    continue;
                }

                // Strict comparison: on equal confidence the earlier rule is kept.
                if (bestCoordinates == null || currentCoordinates.Confidence > bestCoordinates.Confidence)
                {
                    bestCoordinates = currentCoordinates;
                }
            }

            if (bestCoordinates != null)
            {
                var features = this.InitializeDefaultFeatureValues();
                features[CityFeatureType.HostnamePatternMatch]      = true;
                features[CityFeatureType.HostnamePatternConfidence] = bestCoordinates.Confidence;
                candidatesAndFeatures[bestCoordinates.ClosestCity]  = features;
            }

            return(candidatesAndFeatures);
        }
        /// <summary>
        /// Records one ground-truth item for a domain: increments the occurrence count of every rule,
        /// adds the item's coordinates to the coordinate set of every rule, increments the domain's
        /// occurrence count, and prunes the domain's rule data each time that count reaches a multiple
        /// of <paramref name="pruneIntervalCount"/>.
        /// </summary>
        /// <param name="domainCounts">Occurrence counts per domain; updated through <c>IncrementOccurrences</c>.</param>
        /// <param name="ruleCountsForDomains">Per domain, occurrence counts per rule; accessed through <c>AddRetrieveDomainToRulesCounts</c>.</param>
        /// <param name="domainsToRulesToCoordinates">Per domain, the set of coordinates seen for each rule. Nothing happens when null.</param>
        /// <param name="domain">The domain the item belongs to. Nothing happens when null, empty or whitespace.</param>
        /// <param name="rules">The rules generated for the item's hostname. Nothing happens when null or empty.</param>
        /// <param name="gtItem">The ground-truth item supplying latitude and longitude. Nothing happens when null.</param>
        /// <param name="pruneIntervalCount">Pruning runs whenever the domain's count is a multiple of this value; 0 disables pruning.</param>
        /// <param name="pruneMinKeepThreshold">Forwarded to <c>PruneCounts</c> as <c>minKeepThreshold</c>.</param>
        private void AddRulesCoordinatesToDomain(
            Dictionary <string, int> domainCounts,
            Dictionary <string, Dictionary <PatternRule, int> > ruleCountsForDomains,
            Dictionary <string, Dictionary <PatternRule, HashSet <PatternMiningCoordinates> > > domainsToRulesToCoordinates,
            string domain,
            List <PatternRule> rules,
            DatasetItem gtItem,
            int pruneIntervalCount,
            int pruneMinKeepThreshold)
        {
            if (
                domainsToRulesToCoordinates == null ||
                string.IsNullOrWhiteSpace(domain) ||
                rules == null ||
                rules.Count == 0 ||
                gtItem == null)
            {
                return;
            }

            // NOTE(review): presumably a get-or-create of the domain's rule -> coordinates map; confirm in the helper.
            var rulesToCoordinates = this.RetrieveRulesToCoordinatesSet(domainsToRulesToCoordinates, domain);

            // A single instance is shared by the coordinate sets of all rules of this item.
            var newCoordinates = new PatternMiningCoordinates()
            {
                Latitude   = gtItem.Latitude,
                Longitude  = gtItem.Longitude,
                Confidence = 0d
            };

            var rulesCountsForDomain = this.AddRetrieveDomainToRulesCounts(ruleCountsForDomains, domain);

            foreach (var rule in rules)
            {
                this.IncrementOccurrences(rulesCountsForDomain, rule);

                HashSet <PatternMiningCoordinates> coordinatesForRule;

                if (!rulesToCoordinates.TryGetValue(rule, out coordinatesForRule))
                {
                    coordinatesForRule       = new HashSet <PatternMiningCoordinates>();
                    rulesToCoordinates[rule] = coordinatesForRule;
                }

                coordinatesForRule.Add(newCoordinates);
            }

            var domainOcc = this.IncrementOccurrences(domainCounts, domain);

            // A zero interval would make the modulo throw DivideByZeroException after all the state
            // above has already been updated; treat it as "never prune" instead.
            if (pruneIntervalCount != 0 && domainOcc % pruneIntervalCount == 0)
            {
                this.PruneCounts(rulesCountsForDomain, rulesToCoordinates, minKeepThreshold: pruneMinKeepThreshold);
            }
        }
Esempio n. 3
0
        /// <summary>
        /// For every domain and every rule of that domain, clusters the coordinates observed for the
        /// rule and collects the centroid of each cluster that passes the size and support thresholds.
        /// Progress and every accepted centroid are written to the console.
        /// </summary>
        /// <param name="domainCounts">Not read by this method.</param>
        /// <param name="ruleCoordOccsForDomains">
        /// Per domain and rule, the occurrence total used as the denominator of the support ratio.
        /// It is read with the indexer, so it must contain every domain and rule being processed.
        /// </param>
        /// <param name="domainsToRulesToCoordinateCounts">Per domain and rule, how often each coordinate was observed.</param>
        /// <param name="clusterThresholdKm">Passed to the clustering as its cluster diameter.</param>
        /// <param name="minItemsPerCluster">Minimum summed occurrences a cluster needs to be accepted.</param>
        /// <param name="minSupportRatioPerCluster">Minimum share of the rule's occurrence total a cluster needs to be accepted.</param>
        /// <returns>
        /// Per domain and rule, the accepted centroids, each carrying its support ratio as confidence.
        /// Entries are only requested from the retrieval helpers once a first centroid is accepted.
        /// </returns>
        private Dictionary <string, Dictionary <PatternRule, List <PatternMiningCoordinates> > > FindClusterCentroids(
            Dictionary <string, int> domainCounts,
            Dictionary <string, Dictionary <PatternRule, int> > ruleCoordOccsForDomains,
            Dictionary <string, Dictionary <PatternRule, Dictionary <Coord, int> > > domainsToRulesToCoordinateCounts,
            double clusterThresholdKm,
            int minItemsPerCluster,
            double minSupportRatioPerCluster)
        {
            var centroidsByDomain = new Dictionary <string, Dictionary <PatternRule, List <PatternMiningCoordinates> > >();

            var totalDomains       = domainsToRulesToCoordinateCounts.Count;
            var currentDomainCount = 0;

            foreach (var domainEntry in domainsToRulesToCoordinateCounts)
            {
                currentDomainCount++;

                var domain                 = domainEntry.Key;
                var coordinateCountsByRule = domainEntry.Value;
                var occurrenceTotalsByRule = ruleCoordOccsForDomains[domain];
                var totalRulesForDomain    = coordinateCountsByRule.Count();

                // Fetched lazily, on the first accepted centroid of this domain.
                Dictionary <PatternRule, List <PatternMiningCoordinates> > centroidsByRule = null;

                var currentRuleCount = 0;

                foreach (var ruleEntry in coordinateCountsByRule)
                {
                    currentRuleCount++;

                    var rule = ruleEntry.Key;

                    Console.WriteLine($"{currentDomainCount}/{totalDomains} - {currentRuleCount}/{totalRulesForDomain} - Finding clusters for domain {domain} and rule {rule}");

                    // Held as a double so that both ratios below use floating-point division.
                    double totalOccurrences = occurrenceTotalsByRule[rule];

                    var remainingCoordinates = new HashSet <CoordWithOcc>(
                        ruleEntry.Value.Select(pair => new CoordWithOcc(pair.Key, pair.Value)));

                    var clustering = new QTClustering <CoordWithOcc>(
                        distanceHelper: new CoordWithOccDistanceHelper(),
                        clusterDiameter: clusterThresholdKm,
                        itemsSet: remainingCoordinates);

                    // Fetched lazily, on the first accepted centroid of this rule.
                    List <PatternMiningCoordinates> ruleCentroids = null;

                    while (true)
                    {
                        Cluster <CoordWithOcc> cluster = clustering.NextCluster();

                        if (cluster == null)
                        {
                            break;
                        }

                        var clusterOccurrences = cluster.Members.Sum(member => member.Occurrences);
                        var supportRatio       = clusterOccurrences / totalOccurrences;

                        // The first cluster that is too small or too weakly supported ends the search for this rule.
                        // NOTE(review): presumably relies on NextCluster yielding clusters largest-first - confirm.
                        if (clusterOccurrences < minItemsPerCluster || supportRatio < minSupportRatioPerCluster)
                        {
                            break;
                        }

                        centroidsByRule = centroidsByRule ?? this.RetrieveRulesToCentroidsList(centroidsByDomain, domain);
                        ruleCentroids   = ruleCentroids ?? this.RetrieveCentroidsList(centroidsByRule, rule);

                        var centroidCoordinates = new PatternMiningCoordinates()
                        {
                            Latitude   = cluster.Centroid.Coord.Latitude,
                            Longitude  = cluster.Centroid.Coord.Longitude,
                            Confidence = supportRatio
                        };

                        ruleCentroids.Add(centroidCoordinates);

                        Console.WriteLine($"                                                Found centroid: {centroidCoordinates}");

                        // Drop the clustered items, then stop early when everything that is left,
                        // even taken together, could no longer reach the minimum support ratio.
                        remainingCoordinates.ExceptWith(cluster.Members);

                        var maxRemainingSupportRatio = remainingCoordinates.Sum(member => member.Occurrences) / totalOccurrences;

                        if (maxRemainingSupportRatio < minSupportRatioPerCluster)
                        {
                            break;
                        }
                    }
                }
            }

            return(centroidsByDomain);
        }