/// <summary>
/// Mines hostname pattern rules from a ground-truth dataset: collects, per registrable domain,
/// the coordinates observed for every candidate rule, filters the rules, and finally hands the
/// survivors to <c>FindClusterCentroids</c>.
/// </summary>
/// <param name="datasetParser">Parser used to enumerate the ground-truth items.</param>
/// <param name="inPath">Path handed to <paramref name="datasetParser"/>.</param>
/// <param name="minRuleOcc">Forwarded to <c>DeleteRulesBelowOccThreshold</c>.</param>
/// <param name="clusterThresholdKm">Forwarded to <c>FindClusterCentroids</c>.</param>
/// <param name="minItemsPerCluster">Forwarded to <c>FindClusterCentroids</c>.</param>
/// <param name="minSupportRatioPerCluster">Forwarded to <c>FindClusterCentroids</c>.</param>
/// <param name="pruneIntervalCount">Forwarded to <c>AddRulesCoordinatesToDomain</c>.</param>
/// <param name="pruneMinKeepThreshold">Forwarded to <c>AddRulesCoordinatesToDomain</c>.</param>
/// <returns>The domain -> rule -> coordinates mapping produced by <c>FindClusterCentroids</c>.</returns>
public Dictionary<string, Dictionary<PatternRule, List<PatternMiningCoordinates>>> MinePatternsFromGT(
    GroundTruthParser datasetParser,
    string inPath,
    int minRuleOcc,
    double clusterThresholdKm,
    int minItemsPerCluster,
    double minSupportRatioPerCluster,
    int pruneIntervalCount = 10000,
    int pruneMinKeepThreshold = 10)
{
    // How many accepted items pass between two progress lines on the console.
    const int progressReportInterval = 100000;

    // domain -> number of items seen, e.g. frontiernet.net -> 435463
    var occurrencesPerDomain = new Dictionary<string, int>();

    // domain -> rule -> number of items, e.g. frontiernet.net -> wlfr|rtl1 -> 79
    var ruleOccurrencesPerDomain = new Dictionary<string, Dictionary<PatternRule, int>>();

    // domain -> rule -> coordinates, e.g. frontiernet.net -> wlfr|rtl1 -> X,Y
    var coordinatesPerDomainRule = new Dictionary<string, Dictionary<PatternRule, HashSet<PatternMiningCoordinates>>>();

    var acceptedItems = 0;

    foreach (var gtItem in datasetParser.Parse(inPath, populateTextualLocationInfo: true))
    {
        var split = HostnameSplitter.Split(gtItem.Hostname);

        // Nothing to mine without a registrable domain...
        if (split == null || split.DomainInfo?.RegistrableDomain == null)
        {
            continue;
        }

        // ...or without at least one subdomain part.
        var labels = split.SubdomainParts;

        if (labels == null || labels.Count == 0)
        {
            continue;
        }

        if (++acceptedItems % progressReportInterval == 0)
        {
            Console.WriteLine(acceptedItems);
        }

        var registrableDomain = split.DomainInfo.RegistrableDomain;
        var candidateRules = this.GeneratePossibleRules(this.CreateRuleAtoms(labels));

        this.AddRulesCoordinatesToDomain(
            occurrencesPerDomain,
            ruleOccurrencesPerDomain,
            coordinatesPerDomainRule,
            registrableDomain,
            candidateRules,
            gtItem,
            pruneIntervalCount,
            pruneMinKeepThreshold);
    }

    // Post-processing order matters: threshold filter, then equivalent-rule removal, then clustering.
    this.DeleteRulesBelowOccThreshold(occurrencesPerDomain, ruleOccurrencesPerDomain, coordinatesPerDomainRule, minRuleOcc);
    this.DeleteEquivalentRules(ruleOccurrencesPerDomain, coordinatesPerDomainRule);

    return this.FindClusterCentroids(
        occurrencesPerDomain,
        ruleOccurrencesPerDomain,
        coordinatesPerDomainRule,
        clusterThresholdKm,
        minItemsPerCluster,
        minSupportRatioPerCluster);
}
/// <summary>
/// Walks a ground-truth dataset and counts, per registrable domain and candidate hostname rule,
/// how often each 4-character geohash of the item's coordinates was observed.
/// </summary>
/// <remarks>
/// Conceptual example for static-32-213-114-101.wlfr.ct.frontiernet.net:
/// key frontiernet.net -> key wlfr|rtl1 (the string "wlfr" at right-to-left index 1)
/// -> key geohash -> value number of times it was found in the dataset for this key.
/// </remarks>
/// <param name="datasetParser">Parser used to enumerate the ground-truth items.</param>
/// <param name="inPath">Path handed to <paramref name="datasetParser"/>.</param>
/// <param name="pruneIntervalCount">
/// <c>PruneCounts</c> is invoked for a domain whenever that domain's occurrence count is a
/// multiple of this value. Must not be zero.
/// </param>
/// <param name="pruneMinKeepThreshold">Passed to <c>PruneCounts</c> as <c>minKeepThreshold</c>.</param>
/// <returns>A <see cref="PatternMiningResult"/> carrying the geohash counts and the rule counts.</returns>
/// <exception cref="ArgumentNullException"><paramref name="datasetParser"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="pruneIntervalCount"/> is zero.</exception>
public PatternMiningResult MineCommonStringGeohashesFromGT(
    GroundTruthParser datasetParser,
    string inPath,
    int pruneIntervalCount = 10000,
    int pruneMinKeepThreshold = 10)
{
    if (datasetParser == null)
    {
        throw new ArgumentNullException(nameof(datasetParser));
    }

    // A zero interval would otherwise blow up with DivideByZeroException in the modulo below,
    // and only after the first item had already been processed. Fail fast instead.
    // Negative values are deliberately still accepted to stay compatible with existing callers.
    if (pruneIntervalCount == 0)
    {
        throw new ArgumentOutOfRangeException(
            nameof(pruneIntervalCount),
            pruneIntervalCount,
            "The prune interval must not be zero.");
    }

    /*
     * Geohash precision (number of characters -> approximate error in km):
     *   1 ±2500, 2 ±630, 3 ±78, 4 ±20, 5 ±2.4, 6 ±0.61, 7 ±0.076, 8 ±0.019
     */
    const int geohashPrecision = 4; // 4 = ±20 km
    const int progressReportInterval = 100000;

    // Keys example: frontiernet.net wlfr|rtl1 drkh7 15
    var rulesGeohashCounts = new Dictionary<string, Dictionary<PatternRule, Dictionary<string, int>>>();

    // Keys example: frontiernet.net wlfr|rtl1 79
    var rulesCounts = new Dictionary<string, Dictionary<PatternRule, int>>();

    // Example: frontiernet.net 435463
    var domainCounts = new Dictionary<string, int>();

    var processCount = 0;

    foreach (var item in datasetParser.Parse(inPath, populateTextualLocationInfo: true))
    {
        var splitResults = HostnameSplitter.Split(item.Hostname);

        // Skip hostnames without a registrable domain or without any subdomain part.
        if (splitResults == null
            || splitResults.DomainInfo?.RegistrableDomain == null
            || splitResults.SubdomainParts == null
            || splitResults.SubdomainParts.Count == 0)
        {
            continue;
        }

        processCount++;

        // Progress indicator on the console.
        if (processCount % progressReportInterval == 0)
        {
            Console.WriteLine(processCount);
        }

        var domain = splitResults.DomainInfo.RegistrableDomain;
        var subdomainParts = splitResults.SubdomainParts;

        // Per-domain slices of the two result dictionaries
        // (presumably created on first use by these helpers - confirm against their definitions).
        var rulesToGeohashCounts = this.AddRetrieveDomainToRulesGeohashCounts(rulesGeohashCounts, domain);
        var rulesToCounts = this.AddRetrieveDomainToRulesCounts(rulesCounts, domain);

        // Kept as a set so additional precisions can be added alongside the current one.
        var geohashes = new HashSet<string>
        {
            GeoHash.Encode(item.Latitude, item.Longitude, numberOfChars: geohashPrecision)
        };

        var ruleAtoms = this.CreateRuleAtoms(subdomainParts);
        var rules = this.GeneratePossibleRules(ruleAtoms);

        this.IncrementGeohashCounts(rulesToGeohashCounts, rulesToCounts, geohashes, rules);

        // Prune this domain's counters each time its occurrence count hits a multiple of the interval.
        var domainOcc = this.IncrementOccurrences(domainCounts, domain);

        if (domainOcc % pruneIntervalCount == 0)
        {
            this.PruneCounts(rulesToGeohashCounts, rulesToCounts, minKeepThreshold: pruneMinKeepThreshold);
        }
    }

    return new PatternMiningResult
    {
        RulesGeohashCounts = rulesGeohashCounts,
        RulesCounts = rulesCounts
    };
}