/// <summary>
        /// Convenience overload that accepts a raw hostname: it splits the hostname with
        /// <c>HostnameSplitter.Split</c> and forwards to the overload that takes the parsed form.
        /// </summary>
        /// <param name="hostname">Raw hostname to split; must not be null.</param>
        /// <param name="cityEntity">City candidate the features are computed for; must not be null.</param>
        /// <param name="features">Feature collection that receives the values; must not be null.</param>
        /// <exception cref="ArgumentNullException">Thrown when any argument is null.</exception>
        public virtual void AppendFeatures(string hostname, GeonamesCityEntity cityEntity, Features features)
        {
            // Argument validation, in declaration order.
            if (hostname == null)
            {
                throw new ArgumentNullException(nameof(hostname));
            }

            if (cityEntity == null)
            {
                throw new ArgumentNullException(nameof(cityEntity));
            }

            if (features == null)
            {
                throw new ArgumentNullException(nameof(features));
            }

            var splitHostname = HostnameSplitter.Split(hostname);

            // Optionally pre-populate the feature collection with defaults before the real values are appended.
            var seedDefaults = this.FeaturesConfig.InitializeDefaultFeatures;

            if (seedDefaults)
            {
                this.InitializeDefaultFeatureValues(features);
            }

            this.AppendFeatures(splitHostname, cityEntity, features);
        }
        /// <summary>
        /// Mines hostname pattern rules from a ground-truth dataset. For every usable hostname the
        /// subdomain parts are turned into candidate rules whose coordinates are accumulated per
        /// registrable domain; rare and equivalent rules are then removed and the remaining rule
        /// coordinates are reduced to cluster centroids.
        /// </summary>
        /// <param name="datasetParser">Parser used to enumerate ground-truth items from <paramref name="inPath"/>.</param>
        /// <param name="inPath">Path of the ground-truth dataset.</param>
        /// <param name="minRuleOcc">Passed to <c>DeleteRulesBelowOccThreshold</c>.</param>
        /// <param name="clusterThresholdKm">Passed to <c>FindClusterCentroids</c>.</param>
        /// <param name="minItemsPerCluster">Passed to <c>FindClusterCentroids</c>.</param>
        /// <param name="minSupportRatioPerCluster">Passed to <c>FindClusterCentroids</c>.</param>
        /// <param name="pruneIntervalCount">Passed to <c>AddRulesCoordinatesToDomain</c>.</param>
        /// <param name="pruneMinKeepThreshold">Passed to <c>AddRulesCoordinatesToDomain</c>.</param>
        /// <returns>The domain -> rule -> centroid list mapping produced by <c>FindClusterCentroids</c>.</returns>
        public Dictionary<string, Dictionary<PatternRule, List<PatternMiningCoordinates>>> MinePatternsFromGT(
            GroundTruthParser datasetParser,
            string inPath,
            int minRuleOcc,
            double clusterThresholdKm,
            int minItemsPerCluster,
            double minSupportRatioPerCluster,
            int pruneIntervalCount    = 10000,
            int pruneMinKeepThreshold = 10)
        {
            // domain -> count, e.g. frontiernet.net -> 435463
            var occurrencesPerDomain = new Dictionary<string, int>();

            // domain -> rule -> count, e.g. frontiernet.net -> wlfr|rtl1 -> 79
            var ruleOccurrencesPerDomain = new Dictionary<string, Dictionary<PatternRule, int>>();

            // domain -> rule -> coordinates, e.g. frontiernet.net -> wlfr|rtl1 -> X,Y
            var coordinatesPerDomainRule = new Dictionary<string, Dictionary<PatternRule, HashSet<PatternMiningCoordinates>>>();

            var acceptedItems = 0;

            foreach (var groundTruthItem in datasetParser.Parse(inPath, populateTextualLocationInfo: true))
            {
                var split = HostnameSplitter.Split(groundTruthItem.Hostname);

                // Only hostnames with a registrable domain and at least one subdomain part can yield rules.
                var isUsable = split != null
                    && split.DomainInfo?.RegistrableDomain != null
                    && split.SubdomainParts != null
                    && split.SubdomainParts.Count != 0;

                if (!isUsable)
                {
                    continue;
                }

                acceptedItems++;

                // Progress indicator.
                if (acceptedItems % 100000 == 0)
                {
                    Console.WriteLine(acceptedItems);
                }

                var registrableDomain = split.DomainInfo.RegistrableDomain;
                var candidateRules    = this.GeneratePossibleRules(this.CreateRuleAtoms(split.SubdomainParts));

                this.AddRulesCoordinatesToDomain(
                    occurrencesPerDomain,
                    ruleOccurrencesPerDomain,
                    coordinatesPerDomainRule,
                    registrableDomain,
                    candidateRules,
                    groundTruthItem,
                    pruneIntervalCount,
                    pruneMinKeepThreshold);
            }

            // Post-processing: drop rare rules, collapse equivalent ones, then cluster what is left.
            this.DeleteRulesBelowOccThreshold(occurrencesPerDomain, ruleOccurrencesPerDomain, coordinatesPerDomainRule, minRuleOcc);
            this.DeleteEquivalentRules(ruleOccurrencesPerDomain, coordinatesPerDomainRule);

            return this.FindClusterCentroids(
                occurrencesPerDomain,
                ruleOccurrencesPerDomain,
                coordinatesPerDomainRule,
                clusterThresholdKm,
                minItemsPerCluster,
                minSupportRatioPerCluster);
        }
        /// <summary>
        /// Convenience overload that accepts a raw hostname: it splits the hostname with
        /// <c>HostnameSplitter.Split</c> and forwards to the overload that takes the parsed form.
        /// </summary>
        /// <param name="hostname">Raw hostname to split; must not be null.</param>
        /// <returns>Whatever the parsed-hostname overload returns: city candidates mapped to their features.</returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="hostname"/> is null.</exception>
        public virtual Dictionary<GeonamesCityEntity, Features> GenerateCandidatesAndFeatures(string hostname)
        {
            if (hostname == null)
            {
                throw new ArgumentNullException(nameof(hostname));
            }

            var splitHostname = HostnameSplitter.Split(hostname);
            var candidates    = this.GenerateCandidatesAndFeatures(splitHostname);

            return candidates;
        }
        /// <summary>
        /// Picks the classification result whose city lies closest to any mined pattern location
        /// that matches the hostname, provided that distance does not exceed
        /// <c>distanceThresholdKm</c>.
        /// </summary>
        /// <param name="hostname">Hostname whose subdomain parts are matched against the mined rules.</param>
        /// <param name="results">Candidate results; entries that are null or have no city are ignored.</param>
        /// <returns>
        /// The closest result within the threshold. Null is returned when there are no results, the
        /// hostname cannot be split into a registrable domain plus subdomain parts, the domain has no
        /// mined rules, no rule matches, or the closest match is further away than the threshold.
        /// </returns>
        private ClassificationResult PickBestByPattern(string hostname, List<ClassificationResult> results)
        {
            // Nothing to choose from (also avoids a NullReferenceException on a null list).
            if (results == null || results.Count == 0)
            {
                return null;
            }

            var splitResults = HostnameSplitter.Split(hostname);

            if (splitResults == null || splitResults.DomainInfo?.RegistrableDomain == null || splitResults.SubdomainParts == null || splitResults.SubdomainParts.Count == 0)
            {
                return null;
            }

            Dictionary<PatternRule, PatternMiningCoordinates> rulesToCoordinates;

            if (!this.reducedRules.TryGetValue(splitResults.DomainInfo.RegistrableDomain, out rulesToCoordinates))
            {
                return null;
            }

            var ruleAtoms = this.miner.CreateRuleAtoms(splitResults.SubdomainParts);

            if (ruleAtoms == null || ruleAtoms.Count == 0)
            {
                return null;
            }

            var rules = this.miner.GeneratePossibleRules(ruleAtoms);

            if (rules == null || rules.Count == 0)
            {
                return null;
            }

            // Only the coordinates of the matching rules are needed; the rule itself is never reported.
            var matchedCoordinates = new List<PatternMiningCoordinates>();

            foreach (var rule in rules)
            {
                PatternMiningCoordinates coordinates;

                if (rulesToCoordinates.TryGetValue(rule, out coordinates))
                {
                    matchedCoordinates.Add(coordinates);
                }
            }

            if (matchedCoordinates.Count == 0)
            {
                return null;
            }

            ClassificationResult closestResult = null;
            double smallestDistanceKm = double.MaxValue;

            foreach (var result in results)
            {
                if (result == null || result.City == null)
                {
                    continue;
                }

                foreach (var coordinates in matchedCoordinates)
                {
                    var distance = DistanceHelper.Distance(result.City.Latitude, result.City.Longitude, coordinates.Latitude, coordinates.Longitude, DistanceUnit.Kilometer);

                    // Strict comparison: on ties the earliest result in the list wins.
                    if (distance < smallestDistanceKm)
                    {
                        closestResult      = result;
                        smallestDistanceKm = distance;
                    }
                }
            }

            if (closestResult != null && smallestDistanceKm <= this.distanceThresholdKm)
            {
                return closestResult;
            }

            return null;
        }
Esempio n. 5
0
        /// <summary>
        /// Builds a training set from a tab-separated ground-truth file. For every accepted hostname,
        /// city candidates are generated and each candidate is labeled positive when it lies within
        /// <c>featuresConfig.TruePositiveMaximumDistanceKilometers</c> of the true location, negative
        /// otherwise. Negatives are randomly sub-sampled and only stored while the stored negative
        /// count is below <c>storedTruePositivesCount * trueNegativesMultiplier</c>.
        /// </summary>
        /// <remarks>
        /// Side effects: assigns <c>this.Aggregator</c> and <c>this.TrainingData</c>, and writes the
        /// positive-feature distribution to the console every 1000 processed hostnames.
        /// The seven data-file paths are passed unchanged to the <c>CityFeaturesAggregator</c> constructor.
        /// </remarks>
        /// <param name="citiesPath">Passed to the aggregator constructor.</param>
        /// <param name="alternateNamesPath">Passed to the aggregator constructor.</param>
        /// <param name="admin1Path">Passed to the aggregator constructor.</param>
        /// <param name="admin2Path">Passed to the aggregator constructor.</param>
        /// <param name="countriesPath">Passed to the aggregator constructor.</param>
        /// <param name="clliPath">Passed to the aggregator constructor.</param>
        /// <param name="unlocodePath">Passed to the aggregator constructor.</param>
        /// <param name="groundTruthPath">Tab-separated file with 13 columns per line; lines starting with '#' are skipped.</param>
        /// <param name="trueNegativesMultiplier">Upper bound on stored negatives, expressed as a multiple of stored positives.</param>
        /// <param name="shouldProcessHostname">Optional predicate; a hostname is skipped when it returns false.</param>
        /// <param name="shouldContinueIngestingNewHostnames">Optional predicate; reading stops when it returns false.</param>
        /// <param name="showConsoleStats">Optional progress callback, invoked per processed hostname and once more with <c>lastRow: true</c> when reading is stopped early.</param>
        /// <param name="featuresConfig">Feature configuration; a default instance is created when null.</param>
        /// <returns>The populated training data, after <c>FinalizeData()</c> has been called.</returns>
        public TrainingData Sample(
            string citiesPath,
            string alternateNamesPath,
            string admin1Path,
            string admin2Path,
            string countriesPath,
            string clliPath,
            string unlocodePath,
            string groundTruthPath,
            int trueNegativesMultiplier,
            ShouldProcessHostname shouldProcessHostname = null,
            ShouldContinueIngestingNewHostnames shouldContinueIngestingNewHostnames = null,
            ShowConsoleStats showConsoleStats = null,
            FeaturesConfig featuresConfig     = null)
        {
            if (featuresConfig == null)
            {
                featuresConfig = new FeaturesConfig();
            }

            var aggregator = new CityFeaturesAggregator(citiesPath, alternateNamesPath, admin1Path, admin2Path, countriesPath, clliPath, unlocodePath, featuresConfig: featuresConfig);

            this.Aggregator = aggregator;

            var trainingData = new TrainingData(tableName: "ReverseDNSGeolocation Training", featuresAggregator: aggregator);

            this.TrainingData = trainingData;

            string line;
            var    counter = 0;

            var storedTruePositivesCount = 0;
            var storedTrueNegativesCount = 0;

            var positivesFeaturesDistribution = new Dictionary<CityFeatureType, int>();

            var rand = new Random();

            // The ground-truth file is machine-written, so coordinates are parsed culture-independently.
            // Thousands separators are deliberately not accepted: they are meaningless for coordinates.
            var coordinateStyle   = System.Globalization.NumberStyles.Float;
            var coordinateCulture = System.Globalization.CultureInfo.InvariantCulture;

            using (var file = new StreamReader(groundTruthPath))
            {
                while ((line = file.ReadLine()) != null)
                {
                    // Skip blank lines and '#' comments (a non-blank line always has at least one character).
                    if (string.IsNullOrWhiteSpace(line) || line[0] == '#')
                    {
                        continue;
                    }

                    var parts = line.Split('\t');

                    if (parts.Length != 13)
                    {
                        continue;
                    }

                    /*
                     * 0  RawIP   string
                     * 1  NumericIP   uint
                     * 2  Bucket  uint?
                     * 3  ReverseDNSHostname  string
                     * 4  RealtimeLatitude    double?
                     * 5  RealtimeLongitude   double?
                     * 6  RealtimeCountryISO  string
                     * 7  RealtimeCountryConfidence   byte
                     * 8  RealtimeState   string
                     * 9  RealtimeStateConfidence byte
                     * 10 RealtimeCity    string
                     * 11 RealtimeCityConfidence  byte
                     * 12 RealtimeAccuracyKm  double?
                     */

                    var hostname = parts[3];

                    var trueLatitudeStr  = parts[4];
                    var trueLongitudeStr = parts[5];

                    if (string.IsNullOrWhiteSpace(hostname) ||
                        string.IsNullOrWhiteSpace(trueLatitudeStr) ||
                        string.IsNullOrWhiteSpace(trueLongitudeStr))
                    {
                        continue;
                    }

                    double trueLatitude;
                    double trueLongitude;

                    if (!double.TryParse(trueLatitudeStr, coordinateStyle, coordinateCulture, out trueLatitude) ||
                        !double.TryParse(trueLongitudeStr, coordinateStyle, coordinateCulture, out trueLongitude))
                    {
                        continue;
                    }

                    var parsedHostname = HostnameSplitter.Split(hostname);

                    if (shouldProcessHostname != null && !shouldProcessHostname(hostname, parsedHostname, storedTruePositivesCount, storedTrueNegativesCount))
                    {
                        continue;
                    }

                    counter++;

                    if (showConsoleStats != null)
                    {
                        showConsoleStats(hostname, parsedHostname, storedTruePositivesCount, storedTrueNegativesCount);
                    }

                    var candidatesAndFeatures = aggregator.GenerateCandidatesForHostname(parsedHostname);

                    var truesPositives = new List<DataRow>();
                    var trueNegatives  = new List<DataRow>();

                    foreach (var candidateEntry in candidatesAndFeatures)
                    {
                        var locationCandidate = candidateEntry.Key;
                        var locationFeatures  = candidateEntry.Value;

                        // The threshold is expressed in kilometers, so the distance must be measured in kilometers too.
                        var distance = DistanceHelper.Distance(trueLatitude, trueLongitude, locationCandidate.Latitude, locationCandidate.Longitude, DistanceUnit.Kilometer);

                        if (distance <= featuresConfig.TruePositiveMaximumDistanceKilometers)
                        {
                            if (this.ShouldAddPositiveExample(positivesFeaturesDistribution, locationFeatures))
                            {
                                var newRow = trainingData.CreateTrainingRow(locationFeatures, isValidLocation: true);
                                truesPositives.Add(newRow);

                                this.AddToFeaturesDistribution(positivesFeaturesDistribution, locationFeatures);
                            }
                        }
                        else
                        {
                            var newRow = trainingData.CreateTrainingRow(locationFeatures, isValidLocation: false);
                            trueNegatives.Add(newRow);
                        }
                    }

                    // WARNING: Do not move this above the true negatives selection, as it will bias the data (even more)
                    if (truesPositives.Count > 0)
                    {
                        truesPositives.ForEach(tp => trainingData.AddTrainingRow(tp));
                        storedTruePositivesCount += truesPositives.Count;
                    }

                    var neededTrueNegativeItemsCount = storedTruePositivesCount * trueNegativesMultiplier;

                    if (trueNegatives.Count > 0 && storedTrueNegativesCount < neededTrueNegativeItemsCount)
                    {
                        var neededItemsCount = Math.Min(trueNegatives.Count, neededTrueNegativeItemsCount);

                        // When this hostname produced positives, take at most as many negatives as positives.
                        if (truesPositives.Count > 0)
                        {
                            neededItemsCount = Math.Min(truesPositives.Count, neededItemsCount);
                        }

                        // Random sub-sample of this hostname's negatives.
                        var extractedRandTrueNegativeItems = trueNegatives.OrderBy(x => rand.Next()).Take(neededItemsCount);

                        foreach (var trueNegativeItem in extractedRandTrueNegativeItems)
                        {
                            trainingData.AddTrainingRow(trueNegativeItem);
                            storedTrueNegativesCount++;
                        }
                    }

                    if (counter % 1000 == 0)
                    {
                        Console.WriteLine("------------------------------------");

                        foreach (var entry in positivesFeaturesDistribution)
                        {
                            Console.WriteLine($"Positive: {entry.Key}\t{entry.Value}");
                        }

                        Console.WriteLine("------------------------------------");
                    }

                    if (shouldContinueIngestingNewHostnames != null && !shouldContinueIngestingNewHostnames(hostname, parsedHostname, storedTruePositivesCount, storedTrueNegativesCount))
                    {
                        // The stats callback is optional; only report the final row when one was supplied.
                        if (showConsoleStats != null)
                        {
                            showConsoleStats(hostname, parsedHostname, storedTruePositivesCount, storedTrueNegativesCount, lastRow: true);
                        }

                        break;
                    }
                }
            }

            trainingData.FinalizeData();

            return trainingData;
        }
Esempio n. 6
0
        /// <summary>
        /// Generates city candidates for a hostname, runs the classifier on each candidate's features
        /// and returns the candidates labeled 1, ordered by descending probability.
        /// </summary>
        /// <remarks>
        /// The timing members (<c>TotalExtractCitiesTime</c>, <c>HostnameSplittingTime</c>,
        /// <c>CandidateGenerationTime</c>, <c>ClassificationTime</c>) are started and stopped around
        /// the corresponding phases. The total timer is stopped in a <c>finally</c> block so that it
        /// does not keep running after an early return or an exception.
        /// </remarks>
        /// <param name="hostname">Hostname to extract cities from.</param>
        /// <returns>
        /// Accepted results sorted by score (highest first); an empty list when the hostname cannot
        /// be split or no candidate is accepted.
        /// </returns>
        public List<ClassificationResult> ExtractCities(string hostname)
        {
            this.TotalExtractCitiesTime.Start();

            try
            {
                this.HostnameSplittingTime.Start();
                var parsedHostname = HostnameSplitter.Split(hostname);

                this.HostnameSplittingTime.Stop();

                var results = new List<ClassificationResult>();

                if (parsedHostname == null)
                {
                    return results;
                }

                this.CandidateGenerationTime.Start();
                var candidatesAndFeatures = this.Aggregator.GenerateCandidatesForHostname(parsedHostname);

                this.CandidateGenerationTime.Stop();

                foreach (var candidateAndFeatures in candidatesAndFeatures)
                {
                    var entity   = candidateAndFeatures.Key;
                    var features = candidateAndFeatures.Value;

                    var featuresRow    = this.TrainingData.CreateTrainingRow(features);
                    var featuresRowArr = featuresRow.ToArray<double>(this.TrainingData.InputColumnNames);

                    int label;

                    this.ClassificationTime.Start();
                    var probability = this.Classifier.Probability(featuresRowArr, out label);
                    this.ClassificationTime.Stop();

                    // Only candidates the classifier labels as 1 are reported.
                    if (label != 1)
                    {
                        continue;
                    }

                    if (this.DebugMode)
                    {
                        Console.WriteLine(string.Format(CultureInfo.InvariantCulture, "{0}\t\t\t\t{1}\t\t\t\t{2}", hostname, entity, Math.Round(probability, 3)));
                    }

                    var nonDefaultFeatures = new Features();

                    foreach (var entry in features)
                    {
                        var featureName  = entry.Key;
                        var featureValue = entry.Value;

                        var featureDefaultValue = this.TrainingData.FeatureDefaults[featureName];

                        // Keep a feature only when its value differs from the default. Value equality is
                        // required here (the != operator did not work for these values), and the static
                        // object.Equals additionally tolerates a null feature value.
                        if (!object.Equals(featureValue, featureDefaultValue))
                        {
                            nonDefaultFeatures[featureName] = featureValue;
                        }
                    }

                    results.Add(new ClassificationResult()
                    {
                        City               = entity,
                        AllFeatures        = features,
                        NonDefaultFeatures = nonDefaultFeatures,
                        Score              = probability
                    });
                }

                if (results.Count > 1)
                {
                    results = results.OrderByDescending(r => r.Score).ToList<ClassificationResult>();
                }

                return results;
            }
            finally
            {
                this.TotalExtractCitiesTime.Stop();
            }
        }
Esempio n. 7
0
        /// <summary>
        /// Lazily reads the tab-separated ground-truth file at <c>GroundTruthPath</c> and yields one
        /// training sample per generated city candidate. A candidate is a positive example when it
        /// lies within <c>FeaturesConfig.TruePositiveMaximumDistanceKilometers</c> of the true
        /// location, a negative example otherwise. For each hostname all positives are yielded
        /// before the negatives.
        /// </summary>
        /// <remarks>
        /// Side effects while enumerating: updates <c>ValidLinesCounter</c>,
        /// <c>ProcessingAttemptedCounter</c>, <c>SelectedTruePositivesCount</c>,
        /// <c>SelectedTrueNegativesCount</c> and <c>NegativeSamplesFeatureDistribution</c>, and prints
        /// a progress line every 100000 valid lines. Enumeration ends once
        /// <c>ProcessingAttemptedCounter</c> exceeds <c>maxSamples</c>.
        /// </remarks>
        /// <returns>A deferred sequence of training samples.</returns>
        public virtual IEnumerable<TrainingDataSample> Sample()
        {
            string line;

            // The ground-truth file is machine-written, so coordinates are parsed culture-independently.
            // Thousands separators are deliberately not accepted: they are meaningless for coordinates.
            var coordinateStyle   = System.Globalization.NumberStyles.Float;
            var coordinateCulture = System.Globalization.CultureInfo.InvariantCulture;

            using (var file = new StreamReader(this.GroundTruthPath))
            {
                while ((line = file.ReadLine()) != null)
                {
                    // Skip blank lines and '#' comments (a non-blank line always has at least one character).
                    if (string.IsNullOrWhiteSpace(line) || line[0] == '#')
                    {
                        continue;
                    }

                    var parts = line.Split('\t');

                    if (parts.Length != 13)
                    {
                        continue;
                    }

                    /*
                     * 0  RawIP   string
                     * 1  NumericIP   uint
                     * 2  Bucket  uint?
                     * 3  ReverseDNSHostname  string
                     * 4  RealtimeLatitude    double?
                     * 5  RealtimeLongitude   double?
                     * 6  RealtimeCountryISO  string
                     * 7  RealtimeCountryConfidence   byte
                     * 8  RealtimeState   string
                     * 9  RealtimeStateConfidence byte
                     * 10 RealtimeCity    string
                     * 11 RealtimeCityConfidence  byte
                     * 12 RealtimeAccuracyKm  double?
                     */

                    var hostname = parts[3];

                    var trueLatitudeStr  = parts[4];
                    var trueLongitudeStr = parts[5];

                    if (string.IsNullOrWhiteSpace(hostname) ||
                        string.IsNullOrWhiteSpace(trueLatitudeStr) ||
                        string.IsNullOrWhiteSpace(trueLongitudeStr))
                    {
                        continue;
                    }

                    double trueLatitude;
                    double trueLongitude;

                    if (!double.TryParse(trueLatitudeStr, coordinateStyle, coordinateCulture, out trueLatitude) ||
                        !double.TryParse(trueLongitudeStr, coordinateStyle, coordinateCulture, out trueLongitude))
                    {
                        continue;
                    }

                    this.ValidLinesCounter++;

                    if (this.ValidLinesCounter % 100000 == 0)
                    {
                        Console.WriteLine($"ValidLinesCounter = {this.ValidLinesCounter}, ProcessingAttemptedCounter = {this.ProcessingAttemptedCounter}, SelectedTruePositivesCount = {this.SelectedTruePositivesCount}, SelectedTrueNegativesCount = {this.SelectedTrueNegativesCount}");
                    }

                    var parsedHostname = HostnameSplitter.Split(hostname);

                    if (!this.ShouldAcceptDomain(hostname, parsedHostname))
                    {
                        continue;
                    }

                    this.ProcessingAttemptedCounter++;

                    if (this.ProcessingAttemptedCounter > this.maxSamples)
                    {
                        break;
                    }

                    var candidatesAndFeatures = this.Aggregator.GenerateCandidatesForHostname(parsedHostname);

                    var truesPositives = new List<TrainingDataSample>();
                    var trueNegatives  = new List<TrainingDataSample>();

                    foreach (var candidateEntry in candidatesAndFeatures)
                    {
                        var locationCandidate = candidateEntry.Key;
                        var locationFeatures  = candidateEntry.Value;

                        // Distance and threshold are both expressed in kilometers.
                        var distance = DistanceHelper.Distance(trueLatitude, trueLongitude, locationCandidate.Latitude, locationCandidate.Longitude, DistanceUnit.Kilometer);

                        if (distance <= this.FeaturesConfig.TruePositiveMaximumDistanceKilometers)
                        {
                            var positiveSampleFeaturesSignature = this.GenerateFeaturesBoolSignature(locationFeatures, isPositiveSample: true);

                            truesPositives.Add(new TrainingDataSample()
                            {
                                Hostname          = hostname,
                                City              = locationCandidate,
                                FeaturesSignature = positiveSampleFeaturesSignature,
                                Features          = locationFeatures,
                                IsPositiveExample = true
                            });
                        }
                        else
                        {
                            var negativeSampleFeaturesSignature = this.GenerateFeaturesBoolSignature(locationFeatures, isPositiveSample: false);

                            trueNegatives.Add(new TrainingDataSample()
                            {
                                Hostname          = hostname,
                                City              = locationCandidate,
                                FeaturesSignature = negativeSampleFeaturesSignature,
                                Features          = locationFeatures,
                                IsPositiveExample = false
                            });
                        }
                    }

                    // WARNING: Do not move this above the true negatives selection, as it will bias the data (even more)
                    if (truesPositives.Count > 0)
                    {
                        foreach (var truePositive in truesPositives)
                        {
                            yield return truePositive;
                        }

                        // Counted only after every positive of this hostname has been consumed.
                        this.SelectedTruePositivesCount += truesPositives.Count;
                    }

                    foreach (var trueNegativeItem in trueNegatives)
                    {
                        yield return trueNegativeItem;

                        // Bookkeeping runs after the consumer has pulled the item.
                        this.SelectedTrueNegativesCount++;
                        this.IncrementFeatureDistribution(this.NegativeSamplesFeatureDistribution, trueNegativeItem.FeaturesSignature);
                    }
                }
            }
        }
Esempio n. 8
0
        /// <summary>
        /// Validates a classification result against the mined pattern rules of the hostname's
        /// registrable domain. The result is rejected when any matching rule's location is further
        /// than <c>distanceThresholdKm</c> from the result's city.
        /// </summary>
        /// <param name="hostname">Hostname whose subdomain parts are matched against the mined rules.</param>
        /// <param name="result">Result to validate; null yields null.</param>
        /// <returns>
        /// <paramref name="result"/> when no matching rule contradicts it. Null is returned when the
        /// result is null, the hostname cannot be split into a registrable domain plus subdomain parts,
        /// the domain has no mined rules, no rules can be generated for the hostname, or a matching
        /// rule lies beyond the distance threshold.
        /// </returns>
        private ClassificationResult FilterByPatterns(string hostname, ClassificationResult result)
        {
            if (result == null)
            {
                return null;
            }

            var splitResults = HostnameSplitter.Split(hostname);

            if (splitResults == null || splitResults.DomainInfo?.RegistrableDomain == null || splitResults.SubdomainParts == null || splitResults.SubdomainParts.Count == 0)
            {
                return null;
            }

            Dictionary<PatternRule, PatternMiningCoordinates> rulesToCoordinates;

            if (!this.reducedRules.TryGetValue(splitResults.DomainInfo.RegistrableDomain, out rulesToCoordinates))
            {
                return null;
            }

            var ruleAtoms = this.miner.CreateRuleAtoms(splitResults.SubdomainParts);

            if (ruleAtoms == null || ruleAtoms.Count == 0)
            {
                return null;
            }

            var rules = this.miner.GeneratePossibleRules(ruleAtoms);

            if (rules == null || rules.Count == 0)
            {
                return null;
            }

            foreach (var rule in rules)
            {
                PatternMiningCoordinates coordinates;

                if (!rulesToCoordinates.TryGetValue(rule, out coordinates))
                {
                    continue;
                }

                var distance = DistanceHelper.Distance(result.City.Latitude, result.City.Longitude, coordinates.Latitude, coordinates.Longitude, DistanceUnit.Kilometer);

                // A single contradicting rule is enough to reject the result.
                if (distance > this.distanceThresholdKm)
                {
                    return null;
                }
            }

            return result;
        }
        /// <summary>
        /// Counts, per registrable domain and per hostname pattern rule, how often each geohash
        /// (4 characters, roughly +/- 20 km) occurs in a ground-truth dataset.
        /// </summary>
        /// <remarks>
        /// Conceptual example for static-32-213-114-101.wlfr.ct.frontiernet.net:
        /// domain "frontiernet.net" -> rule "wlfr|rtl1" (the string "wlfr" located at right-to-left
        /// index 1) -> geohash -> number of times that geohash was seen for the rule.
        /// Geohash precision reference (characters -> +/- km): 1 -> 2500, 2 -> 630, 3 -> 78, 4 -> 20,
        /// 5 -> 2.4, 6 -> 0.61, 7 -> 0.076, 8 -> 0.019.
        /// </remarks>
        /// <param name="datasetParser">Parser used to enumerate ground-truth items from <paramref name="inPath"/>.</param>
        /// <param name="inPath">Path of the ground-truth dataset.</param>
        /// <param name="pruneIntervalCount">The counts of a domain are pruned each time its occurrence count is a multiple of this value.</param>
        /// <param name="pruneMinKeepThreshold">Passed to <c>PruneCounts</c> as <c>minKeepThreshold</c>.</param>
        /// <returns>A result holding the per-domain rule/geohash counts and the per-domain rule counts.</returns>
        public PatternMiningResult MineCommonStringGeohashesFromGT(
            GroundTruthParser datasetParser,
            string inPath,
            int pruneIntervalCount    = 10000,
            int pruneMinKeepThreshold = 10)
        {
            // domain -> rule -> geohash -> count, e.g. frontiernet.net -> wlfr|rtl1 -> drkh7 -> 15
            var geohashCountsByDomain = new Dictionary<string, Dictionary<PatternRule, Dictionary<string, int>>>();

            // domain -> rule -> count, e.g. frontiernet.net -> wlfr|rtl1 -> 79
            var ruleCountsByDomain = new Dictionary<string, Dictionary<PatternRule, int>>();

            // domain -> count, e.g. frontiernet.net -> 435463
            var occurrencesByDomain = new Dictionary<string, int>();

            var acceptedItems = 0;

            foreach (var item in datasetParser.Parse(inPath, populateTextualLocationInfo: true))
            {
                var split = HostnameSplitter.Split(item.Hostname);

                // Only hostnames with a registrable domain and at least one subdomain part can yield rules.
                var isUsable = split != null
                    && split.DomainInfo?.RegistrableDomain != null
                    && split.SubdomainParts != null
                    && split.SubdomainParts.Count != 0;

                if (!isUsable)
                {
                    continue;
                }

                acceptedItems++;

                // Progress indicator.
                if (acceptedItems % 100000 == 0)
                {
                    Console.WriteLine(acceptedItems);
                }

                var registrableDomain = split.DomainInfo.RegistrableDomain;
                var hostnameParts     = split.SubdomainParts;

                var geohashCountsForDomain = this.AddRetrieveDomainToRulesGeohashCounts(geohashCountsByDomain, registrableDomain);
                var ruleCountsForDomain    = this.AddRetrieveDomainToRulesCounts(ruleCountsByDomain, registrableDomain);

                // A single precision is used: 4 characters = +/- 20 km.
                var geohashes = new HashSet<string>
                {
                    GeoHash.Encode(item.Latitude, item.Longitude, numberOfChars: 4)
                };

                var candidateRules = this.GeneratePossibleRules(this.CreateRuleAtoms(hostnameParts));

                this.IncrementGeohashCounts(geohashCountsForDomain, ruleCountsForDomain, geohashes, candidateRules);

                // Periodically prune this domain's counts, based on how often the domain has been seen.
                if (this.IncrementOccurrences(occurrencesByDomain, registrableDomain) % pruneIntervalCount == 0)
                {
                    this.PruneCounts(geohashCountsForDomain, ruleCountsForDomain, minKeepThreshold: pruneMinKeepThreshold);
                }
            }

            var miningResult = new PatternMiningResult();

            miningResult.RulesGeohashCounts = geohashCountsByDomain;
            miningResult.RulesCounts        = ruleCountsByDomain;

            return miningResult;
        }