/// <summary>
/// Splits <paramref name="hostname"/> with <c>HostnameSplitter.Split</c> and forwards the parsed
/// hostname, together with the city and the feature collection, to the parsed-hostname overload
/// of <c>AppendFeatures</c>.
/// </summary>
/// <param name="hostname">Raw hostname to split. Must not be null.</param>
/// <param name="cityEntity">City passed through to the parsed-hostname overload. Must not be null.</param>
/// <param name="features">
/// Feature collection passed through to the parsed-hostname overload. When
/// <c>FeaturesConfig.InitializeDefaultFeatures</c> is set it is first handed to
/// <c>InitializeDefaultFeatureValues</c>. Must not be null.
/// </param>
/// <exception cref="ArgumentNullException">Thrown when any argument is null.</exception>
public virtual void AppendFeatures(string hostname, GeonamesCityEntity cityEntity, Features features)
{
    // nameof keeps the reported parameter names in sync with the signature under refactoring.
    if (hostname == null)
    {
        throw new ArgumentNullException(nameof(hostname));
    }

    if (cityEntity == null)
    {
        throw new ArgumentNullException(nameof(cityEntity));
    }

    if (features == null)
    {
        throw new ArgumentNullException(nameof(features));
    }

    var parsedHostname = HostnameSplitter.Split(hostname);

    if (this.FeaturesConfig.InitializeDefaultFeatures)
    {
        this.InitializeDefaultFeatureValues(features);
    }

    this.AppendFeatures(parsedHostname, cityEntity, features);
}
/// <summary>
/// Mines hostname pattern rules from a ground-truth dataset, grouped by registrable domain.
/// Every parsed item whose hostname splits into a registrable domain plus at least one subdomain
/// part contributes its generated rules via <c>AddRulesCoordinatesToDomain</c>. Afterwards the
/// collected rules go through <c>DeleteRulesBelowOccThreshold</c>, <c>DeleteEquivalentRules</c>
/// and <c>FindClusterCentroids</c>, whose result is returned.
/// </summary>
/// <param name="datasetParser">Parser used to enumerate the ground-truth items.</param>
/// <param name="inPath">Path handed to <paramref name="datasetParser"/>.</param>
/// <param name="minRuleOcc">Occurrence threshold forwarded to <c>DeleteRulesBelowOccThreshold</c>.</param>
/// <param name="clusterThresholdKm">Forwarded to <c>FindClusterCentroids</c>.</param>
/// <param name="minItemsPerCluster">Forwarded to <c>FindClusterCentroids</c>.</param>
/// <param name="minSupportRatioPerCluster">Forwarded to <c>FindClusterCentroids</c>.</param>
/// <param name="pruneIntervalCount">Forwarded to <c>AddRulesCoordinatesToDomain</c>.</param>
/// <param name="pruneMinKeepThreshold">Forwarded to <c>AddRulesCoordinatesToDomain</c>.</param>
/// <returns>The domain -> rule -> coordinates list mapping produced by <c>FindClusterCentroids</c>.</returns>
public Dictionary<string, Dictionary<PatternRule, List<PatternMiningCoordinates>>> MinePatternsFromGT(
    GroundTruthParser datasetParser,
    string inPath,
    int minRuleOcc,
    double clusterThresholdKm,
    int minItemsPerCluster,
    double minSupportRatioPerCluster,
    int pruneIntervalCount = 10000,
    int pruneMinKeepThreshold = 10)
{
    // Example entry: frontiernet.net -> 435463
    var occurrencesPerDomain = new Dictionary<string, int>();

    // Example entry: frontiernet.net -> wlfr|rtl1 -> 79
    var ruleOccurrencesPerDomain = new Dictionary<string, Dictionary<PatternRule, int>>();

    // Example entry: frontiernet.net -> wlfr|rtl1 -> { (X, Y) coordinates }
    var coordinatesPerRulePerDomain = new Dictionary<string, Dictionary<PatternRule, HashSet<PatternMiningCoordinates>>>();

    var acceptedItems = 0;

    foreach (var groundTruthItem in datasetParser.Parse(inPath, populateTextualLocationInfo: true))
    {
        var split = HostnameSplitter.Split(groundTruthItem.Hostname);
        var registrableDomain = split?.DomainInfo?.RegistrableDomain;
        var subdomainParts = split?.SubdomainParts;

        // Only hostnames with a known registrable domain and at least one subdomain part are mined.
        if (registrableDomain == null || subdomainParts == null || subdomainParts.Count == 0)
        {
            continue;
        }

        // Progress indicator, printed once per 100000 accepted items.
        if (++acceptedItems % 100000 == 0)
        {
            Console.WriteLine(acceptedItems);
        }

        var candidateRules = this.GeneratePossibleRules(this.CreateRuleAtoms(subdomainParts));

        this.AddRulesCoordinatesToDomain(
            occurrencesPerDomain,
            ruleOccurrencesPerDomain,
            coordinatesPerRulePerDomain,
            registrableDomain,
            candidateRules,
            groundTruthItem,
            pruneIntervalCount,
            pruneMinKeepThreshold);
    }

    this.DeleteRulesBelowOccThreshold(occurrencesPerDomain, ruleOccurrencesPerDomain, coordinatesPerRulePerDomain, minRuleOcc);
    this.DeleteEquivalentRules(ruleOccurrencesPerDomain, coordinatesPerRulePerDomain);

    var centroidsPerRulePerDomain = this.FindClusterCentroids(
        occurrencesPerDomain,
        ruleOccurrencesPerDomain,
        coordinatesPerRulePerDomain,
        clusterThresholdKm,
        minItemsPerCluster,
        minSupportRatioPerCluster);

    return centroidsPerRulePerDomain;
}
/// <summary>
/// Splits <paramref name="hostname"/> with <c>HostnameSplitter.Split</c> and returns whatever the
/// parsed-hostname overload of <c>GenerateCandidatesAndFeatures</c> produces for it.
/// </summary>
/// <param name="hostname">Raw hostname to split. Must not be null.</param>
/// <returns>The city -> features mapping returned by the parsed-hostname overload.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="hostname"/> is null.</exception>
public virtual Dictionary<GeonamesCityEntity, Features> GenerateCandidatesAndFeatures(string hostname)
{
    if (hostname == null)
    {
        // nameof keeps the reported parameter name in sync with the signature under refactoring.
        throw new ArgumentNullException(nameof(hostname));
    }

    var parsedHostname = HostnameSplitter.Split(hostname);

    return this.GenerateCandidatesAndFeatures(parsedHostname);
}
/// <summary>
/// Picks, among <paramref name="results"/>, the classification result whose city lies closest to
/// any coordinates stored in <c>reducedRules</c> for a rule that matches <paramref name="hostname"/>.
/// </summary>
/// <param name="hostname">Hostname whose registrable domain and subdomain parts select the rules.</param>
/// <param name="results">Candidate results; entries that are null or have no <c>City</c> are ignored.</param>
/// <returns>
/// The closest result when its distance is at most <c>distanceThresholdKm</c>; otherwise null.
/// Null is also returned when there are no results, the hostname cannot be split into a
/// registrable domain plus subdomain parts, the domain has no stored rules, or no generated rule
/// matches a stored rule.
/// </returns>
private ClassificationResult PickBestByPattern(string hostname, List<ClassificationResult> results)
{
    // Without candidates there is nothing to pick (previously a null list caused a NullReferenceException).
    if (results == null || results.Count == 0)
    {
        return null;
    }

    var splitResults = HostnameSplitter.Split(hostname);
    var domain = splitResults?.DomainInfo?.RegistrableDomain;
    var subdomainParts = splitResults?.SubdomainParts;

    if (domain == null || subdomainParts == null || subdomainParts.Count == 0)
    {
        return null;
    }

    Dictionary<PatternRule, PatternMiningCoordinates> rulesToCoordinates;

    if (!this.reducedRules.TryGetValue(domain, out rulesToCoordinates) || rulesToCoordinates == null)
    {
        return null;
    }

    var ruleAtoms = this.miner.CreateRuleAtoms(subdomainParts);

    if (ruleAtoms == null || ruleAtoms.Count == 0)
    {
        return null;
    }

    var rules = this.miner.GeneratePossibleRules(ruleAtoms);

    if (rules == null || rules.Count == 0)
    {
        return null;
    }

    // Only the coordinates of the matching rules matter for the distance comparison below.
    var matchedCoordinates = new List<PatternMiningCoordinates>();

    foreach (var rule in rules)
    {
        PatternMiningCoordinates coordinates;

        if (rulesToCoordinates.TryGetValue(rule, out coordinates))
        {
            matchedCoordinates.Add(coordinates);
        }
    }

    if (matchedCoordinates.Count == 0)
    {
        return null;
    }

    ClassificationResult closestResult = null;
    var smallestDistanceKm = double.PositiveInfinity;

    foreach (var result in results)
    {
        if (result?.City == null)
        {
            continue;
        }

        foreach (var coordinates in matchedCoordinates)
        {
            var distance = DistanceHelper.Distance(
                result.City.Latitude,
                result.City.Longitude,
                coordinates.Latitude,
                coordinates.Longitude,
                DistanceUnit.Kilometer);

            // Strict comparison: on ties the earliest result in the list wins.
            if (distance < smallestDistanceKm)
            {
                closestResult = result;
                smallestDistanceKm = distance;
            }
        }
    }

    if (closestResult != null && smallestDistanceKm <= this.distanceThresholdKm)
    {
        return closestResult;
    }

    return null;
}
/// <summary>
/// Builds a training set from a tab-separated ground-truth file. For every accepted hostname the
/// city candidates generated by the aggregator are labelled positive when they lie within
/// <c>FeaturesConfig.TruePositiveMaximumDistanceKilometers</c> of the true location and negative
/// otherwise; positives and a random subset of negatives are added to the training data.
/// </summary>
/// <remarks>
/// Side effects: assigns <c>this.Aggregator</c> and <c>this.TrainingData</c>, and prints the
/// positive feature distribution to the console after every 1000 processed hostnames.
/// </remarks>
/// <param name="citiesPath">Forwarded to the <c>CityFeaturesAggregator</c> constructor.</param>
/// <param name="alternateNamesPath">Forwarded to the <c>CityFeaturesAggregator</c> constructor.</param>
/// <param name="admin1Path">Forwarded to the <c>CityFeaturesAggregator</c> constructor.</param>
/// <param name="admin2Path">Forwarded to the <c>CityFeaturesAggregator</c> constructor.</param>
/// <param name="countriesPath">Forwarded to the <c>CityFeaturesAggregator</c> constructor.</param>
/// <param name="clliPath">Forwarded to the <c>CityFeaturesAggregator</c> constructor.</param>
/// <param name="unlocodePath">Forwarded to the <c>CityFeaturesAggregator</c> constructor.</param>
/// <param name="groundTruthPath">Path of the ground-truth file (13 tab-separated columns per line, '#' starts a comment line).</param>
/// <param name="trueNegativesMultiplier">Negatives are only added while fewer than (stored positives x this value) have been stored.</param>
/// <param name="shouldProcessHostname">Optional filter; a hostname is skipped when it returns false.</param>
/// <param name="shouldContinueIngestingNewHostnames">Optional stop condition; reading ends when it returns false.</param>
/// <param name="showConsoleStats">Optional progress callback.</param>
/// <param name="featuresConfig">Feature configuration; a default <c>FeaturesConfig</c> is created when null.</param>
/// <returns>The training data after <c>FinalizeData</c> has been called on it.</returns>
public TrainingData Sample(
    string citiesPath,
    string alternateNamesPath,
    string admin1Path,
    string admin2Path,
    string countriesPath,
    string clliPath,
    string unlocodePath,
    string groundTruthPath,
    int trueNegativesMultiplier,
    ShouldProcessHostname shouldProcessHostname = null,
    ShouldContinueIngestingNewHostnames shouldContinueIngestingNewHostnames = null,
    ShowConsoleStats showConsoleStats = null,
    FeaturesConfig featuresConfig = null)
{
    if (featuresConfig == null)
    {
        featuresConfig = new FeaturesConfig();
    }

    var aggregator = new CityFeaturesAggregator(citiesPath, alternateNamesPath, admin1Path, admin2Path, countriesPath, clliPath, unlocodePath, featuresConfig: featuresConfig);
    this.Aggregator = aggregator;

    var trainingData = new TrainingData(tableName: "ReverseDNSGeolocation Training", featuresAggregator: aggregator);
    this.TrainingData = trainingData;

    string line;
    var counter = 0;
    var storedTruePositivesCount = 0;
    var storedTrueNegativesCount = 0;

    var positivesFeaturesDistribution = new Dictionary<CityFeatureType, int>();
    var rand = new Random();

    // The ground-truth file is machine-generated, so coordinates are parsed with the invariant
    // culture (same number styles as the default double.TryParse overload) instead of the
    // culture of the machine running the sampler.
    const System.Globalization.NumberStyles CoordinateStyles =
        System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;
    var invariantCulture = System.Globalization.CultureInfo.InvariantCulture;

    using (var file = new StreamReader(groundTruthPath))
    {
        while ((line = file.ReadLine()) != null)
        {
            // Skip blank lines and '#' comment lines.
            if (string.IsNullOrWhiteSpace(line) || line[0] == '#')
            {
                continue;
            }

            var parts = line.Split('\t');

            if (parts.Length != 13)
            {
                continue;
            }

            /*
             * Column layout:
             *  0  RawIP                      string
             *  1  NumericIP                  uint
             *  2  Bucket                     uint?
             *  3  ReverseDNSHostname         string
             *  4  RealtimeLatitude           double?
             *  5  RealtimeLongitude          double?
             *  6  RealtimeCountryISO         string
             *  7  RealtimeCountryConfidence  byte
             *  8  RealtimeState              string
             *  9  RealtimeStateConfidence    byte
             * 10  RealtimeCity               string
             * 11  RealtimeCityConfidence     byte
             * 12  RealtimeAccuracyKm         double?
             */
            var hostname = parts[3];
            var trueLatitudeStr = parts[4];
            var trueLongitudeStr = parts[5];

            if (string.IsNullOrWhiteSpace(hostname)
                || string.IsNullOrWhiteSpace(trueLatitudeStr)
                || string.IsNullOrWhiteSpace(trueLongitudeStr))
            {
                continue;
            }

            double trueLatitude;
            double trueLongitude;

            if (!double.TryParse(trueLatitudeStr, CoordinateStyles, invariantCulture, out trueLatitude)
                || !double.TryParse(trueLongitudeStr, CoordinateStyles, invariantCulture, out trueLongitude))
            {
                continue;
            }

            var parsedHostname = HostnameSplitter.Split(hostname);

            if (shouldProcessHostname != null
                && !shouldProcessHostname(hostname, parsedHostname, storedTruePositivesCount, storedTrueNegativesCount))
            {
                continue;
            }

            counter++;

            if (showConsoleStats != null)
            {
                showConsoleStats(hostname, parsedHostname, storedTruePositivesCount, storedTrueNegativesCount);
            }

            var candidatesAndFeatures = aggregator.GenerateCandidatesForHostname(parsedHostname);

            var truesPositives = new List<DataRow>();
            var trueNegatives = new List<DataRow>();

            foreach (var candidateEntry in candidatesAndFeatures)
            {
                var locationCandidate = candidateEntry.Key;
                var locationFeatures = candidateEntry.Value;

                // The threshold is expressed in kilometers, so the distance must be too
                // (it was previously computed in miles and compared against the km threshold).
                var distance = DistanceHelper.Distance(
                    trueLatitude,
                    trueLongitude,
                    locationCandidate.Latitude,
                    locationCandidate.Longitude,
                    DistanceUnit.Kilometer);

                if (distance <= featuresConfig.TruePositiveMaximumDistanceKilometers)
                {
                    if (this.ShouldAddPositiveExample(positivesFeaturesDistribution, locationFeatures))
                    {
                        var newRow = trainingData.CreateTrainingRow(locationFeatures, isValidLocation: true);
                        truesPositives.Add(newRow);

                        this.AddToFeaturesDistribution(positivesFeaturesDistribution, locationFeatures);
                    }
                }
                else
                {
                    var newRow = trainingData.CreateTrainingRow(locationFeatures, isValidLocation: false);
                    trueNegatives.Add(newRow);
                }
            }

            // WARNING: Do not move this above the true negatives selection, as it will bias the data (even more)
            if (truesPositives.Count > 0)
            {
                foreach (var truePositive in truesPositives)
                {
                    trainingData.AddTrainingRow(truePositive);
                }

                storedTruePositivesCount += truesPositives.Count;
            }

            var neededTrueNegativeItemsCount = storedTruePositivesCount * trueNegativesMultiplier;

            if (trueNegatives.Count > 0 && storedTrueNegativesCount < neededTrueNegativeItemsCount)
            {
                // Never take more negatives than are available or targeted; when this hostname
                // produced positives, also take no more negatives than positives.
                var neededItemsCount = Math.Min(trueNegatives.Count, neededTrueNegativeItemsCount);

                if (truesPositives.Count > 0)
                {
                    neededItemsCount = Math.Min(truesPositives.Count, neededItemsCount);
                }

                // Random subset: order by a random key and keep the first neededItemsCount rows.
                var extractedRandTrueNegativeItems = trueNegatives.OrderBy(x => rand.Next()).Take(neededItemsCount);

                foreach (var trueNegativeItem in extractedRandTrueNegativeItems)
                {
                    trainingData.AddTrainingRow(trueNegativeItem);
                    storedTrueNegativesCount++;
                }
            }

            if (counter % 1000 == 0)
            {
                Console.WriteLine("------------------------------------");

                foreach (var entry in positivesFeaturesDistribution)
                {
                    Console.WriteLine($"Positive: {entry.Key}\t{entry.Value}");
                }

                Console.WriteLine("------------------------------------");
            }

            if (shouldContinueIngestingNewHostnames != null
                && !shouldContinueIngestingNewHostnames(hostname, parsedHostname, storedTruePositivesCount, storedTrueNegativesCount))
            {
                // showConsoleStats is optional; it was previously invoked here without a null check.
                if (showConsoleStats != null)
                {
                    showConsoleStats(hostname, parsedHostname, storedTruePositivesCount, storedTrueNegativesCount, lastRow: true);
                }

                break;
            }
        }
    }

    trainingData.FinalizeData();

    return trainingData;
}
/// <summary>
/// Generates city candidates for <paramref name="hostname"/>, classifies each one with
/// <c>this.Classifier</c> and returns the candidates labelled 1, ordered by descending score.
/// </summary>
/// <remarks>
/// The timer members (<c>TotalExtractCitiesTime</c>, <c>HostnameSplittingTime</c>,
/// <c>CandidateGenerationTime</c>, <c>ClassificationTime</c>) are started and stopped around the
/// corresponding phases. NOTE(review): they look like <c>System.Diagnostics.Stopwatch</c>
/// instances; confirm against the class definition.
/// </remarks>
/// <param name="hostname">Hostname to split and classify.</param>
/// <returns>
/// The accepted candidates, or an empty list when <c>HostnameSplitter.Split</c> returns null or
/// no candidate is labelled 1.
/// </returns>
public List<ClassificationResult> ExtractCities(string hostname)
{
    this.TotalExtractCitiesTime.Start();

    try
    {
        this.HostnameSplittingTime.Start();
        var subdomainParts = HostnameSplitter.Split(hostname);
        this.HostnameSplittingTime.Stop();

        var results = new List<ClassificationResult>();

        if (subdomainParts == null)
        {
            // The total timer is stopped by the finally block; this early return used to leave it running.
            return results;
        }

        this.CandidateGenerationTime.Start();
        var candidatesAndFeatures = this.Aggregator.GenerateCandidatesForHostname(subdomainParts);
        this.CandidateGenerationTime.Stop();

        foreach (var candidateAndFeatures in candidatesAndFeatures)
        {
            var entity = candidateAndFeatures.Key;
            var features = candidateAndFeatures.Value;

            var featuresRow = this.TrainingData.CreateTrainingRow(features);
            var featuresRowArr = featuresRow.ToArray<double>(this.TrainingData.InputColumnNames);

            int label;

            this.ClassificationTime.Start();
            var probability = this.Classifier.Probability(featuresRowArr, out label);
            this.ClassificationTime.Stop();

            // Only candidates labelled 1 are kept.
            if (label != 1)
            {
                continue;
            }

            if (this.DebugMode)
            {
                Console.WriteLine(string.Format(CultureInfo.InvariantCulture, "{0}\t\t\t\t{1}\t\t\t\t{2}", hostname, entity, Math.Round(probability, 3)));
            }

            var nonDefaultFeatures = new Features();

            foreach (var entry in features)
            {
                var featureName = entry.Key;
                var featureValue = entry.Value;
                var featureDefaultValue = this.TrainingData.FeatureDefaults[featureName];

                // Only keep a feature whose value differs from its default. The original author
                // noted that `!=` does not work here and Equals does; the static object.Equals
                // keeps the value comparison and additionally tolerates null values.
                if (!object.Equals(featureValue, featureDefaultValue))
                {
                    nonDefaultFeatures[featureName] = featureValue;
                }
            }

            results.Add(new ClassificationResult()
            {
                City = entity,
                AllFeatures = features,
                NonDefaultFeatures = nonDefaultFeatures,
                Score = probability
            });
        }

        if (results.Count > 1)
        {
            results = results.OrderByDescending(r => r.Score).ToList<ClassificationResult>();
        }

        return results;
    }
    finally
    {
        // Always stop the total timer, including on the early return above and on exceptions.
        this.TotalExtractCitiesTime.Stop();
    }
}
/// <summary>
/// Lazily reads the tab-separated ground-truth file at <c>this.GroundTruthPath</c> and yields one
/// <c>TrainingDataSample</c> per generated city candidate. A candidate is a positive example when
/// it lies within <c>FeaturesConfig.TruePositiveMaximumDistanceKilometers</c> of the true
/// location, and a negative example otherwise. For each hostname all positives are yielded before
/// the negatives.
/// </summary>
/// <remarks>
/// Side effects while enumerating: updates <c>ValidLinesCounter</c>,
/// <c>ProcessingAttemptedCounter</c>, <c>SelectedTruePositivesCount</c>,
/// <c>SelectedTrueNegativesCount</c> and <c>NegativeSamplesFeatureDistribution</c>, and prints a
/// progress line to the console every 100000 valid lines. Enumeration stops once
/// <c>ProcessingAttemptedCounter</c> exceeds <c>maxSamples</c>.
/// </remarks>
/// <returns>A deferred sequence of positive and negative training samples.</returns>
public virtual IEnumerable<TrainingDataSample> Sample()
{
    // The ground-truth file is machine-generated, so coordinates are parsed with the invariant
    // culture (same number styles as the default double.TryParse overload) instead of the
    // culture of the machine running the sampler.
    const System.Globalization.NumberStyles CoordinateStyles =
        System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;
    var invariantCulture = System.Globalization.CultureInfo.InvariantCulture;

    string line;

    using (var file = new StreamReader(this.GroundTruthPath))
    {
        while ((line = file.ReadLine()) != null)
        {
            // Skip blank lines and '#' comment lines.
            if (string.IsNullOrWhiteSpace(line) || line[0] == '#')
            {
                continue;
            }

            var parts = line.Split('\t');

            if (parts.Length != 13)
            {
                continue;
            }

            /*
             * Column layout:
             *  0  RawIP                      string
             *  1  NumericIP                  uint
             *  2  Bucket                     uint?
             *  3  ReverseDNSHostname         string
             *  4  RealtimeLatitude           double?
             *  5  RealtimeLongitude          double?
             *  6  RealtimeCountryISO         string
             *  7  RealtimeCountryConfidence  byte
             *  8  RealtimeState              string
             *  9  RealtimeStateConfidence    byte
             * 10  RealtimeCity               string
             * 11  RealtimeCityConfidence     byte
             * 12  RealtimeAccuracyKm         double?
             */
            var hostname = parts[3];
            var trueLatitudeStr = parts[4];
            var trueLongitudeStr = parts[5];

            if (string.IsNullOrWhiteSpace(hostname)
                || string.IsNullOrWhiteSpace(trueLatitudeStr)
                || string.IsNullOrWhiteSpace(trueLongitudeStr))
            {
                continue;
            }

            double trueLatitude;
            double trueLongitude;

            if (!double.TryParse(trueLatitudeStr, CoordinateStyles, invariantCulture, out trueLatitude)
                || !double.TryParse(trueLongitudeStr, CoordinateStyles, invariantCulture, out trueLongitude))
            {
                continue;
            }

            this.ValidLinesCounter++;

            if (this.ValidLinesCounter % 100000 == 0)
            {
                Console.WriteLine($"ValidLinesCounter = {this.ValidLinesCounter}, ProcessingAttemptedCounter = {this.ProcessingAttemptedCounter}, SelectedTruePositivesCount = {this.SelectedTruePositivesCount}, SelectedTrueNegativesCount = {this.SelectedTrueNegativesCount}");
            }

            var parsedHostname = HostnameSplitter.Split(hostname);

            if (!this.ShouldAcceptDomain(hostname, parsedHostname))
            {
                continue;
            }

            this.ProcessingAttemptedCounter++;

            if (this.ProcessingAttemptedCounter > this.maxSamples)
            {
                break;
            }

            // NOTE: per-hostname console stats reporting (ShowConsoleStats) is disabled in this sampler.

            var candidatesAndFeatures = this.Aggregator.GenerateCandidatesForHostname(parsedHostname);

            var truesPositives = new List<TrainingDataSample>();
            var trueNegatives = new List<TrainingDataSample>();

            foreach (var candidateEntry in candidatesAndFeatures)
            {
                var locationCandidate = candidateEntry.Key;
                var locationFeatures = candidateEntry.Value;

                // Distance and threshold are both expressed in kilometers.
                var distance = DistanceHelper.Distance(
                    trueLatitude,
                    trueLongitude,
                    locationCandidate.Latitude,
                    locationCandidate.Longitude,
                    DistanceUnit.Kilometer);

                if (distance <= this.FeaturesConfig.TruePositiveMaximumDistanceKilometers)
                {
                    var positiveSampleFeaturesSignature = this.GenerateFeaturesBoolSignature(locationFeatures, isPositiveSample: true);

                    truesPositives.Add(new TrainingDataSample()
                    {
                        Hostname = hostname,
                        City = locationCandidate,
                        FeaturesSignature = positiveSampleFeaturesSignature,
                        Features = locationFeatures,
                        IsPositiveExample = true
                    });
                }
                else
                {
                    var negativeSampleFeaturesSignature = this.GenerateFeaturesBoolSignature(locationFeatures, isPositiveSample: false);

                    trueNegatives.Add(new TrainingDataSample()
                    {
                        Hostname = hostname,
                        City = locationCandidate,
                        FeaturesSignature = negativeSampleFeaturesSignature,
                        Features = locationFeatures,
                        IsPositiveExample = false
                    });
                }
            }

            // WARNING: Do not move this above the true negatives selection, as it will bias the data (even more)
            if (truesPositives.Count > 0)
            {
                foreach (var truePositive in truesPositives)
                {
                    yield return truePositive;
                }

                // Counted only after the consumer has pulled every positive for this hostname.
                this.SelectedTruePositivesCount += truesPositives.Count;
            }

            foreach (var trueNegativeItem in trueNegatives)
            {
                yield return trueNegativeItem;

                // Bookkeeping runs when the consumer resumes the iterator after this item.
                this.SelectedTrueNegativesCount++;
                this.IncrementFeatureDistribution(this.NegativeSamplesFeatureDistribution, trueNegativeItem.FeaturesSignature);
            }
        }
    }
}
/// <summary>
/// Validates a single classification result against the pattern rules stored in
/// <c>reducedRules</c> for the hostname's registrable domain.
/// </summary>
/// <param name="hostname">Hostname whose registrable domain and subdomain parts select the rules.</param>
/// <param name="result">Result to validate; may be null.</param>
/// <returns>
/// <paramref name="result"/> when no matching rule contradicts it. Null when
/// <paramref name="result"/> is null, the hostname cannot be split into a registrable domain plus
/// subdomain parts, the domain has no stored rules, no rule atoms or rules can be generated, or
/// any matching rule's coordinates are farther than <c>distanceThresholdKm</c> from the result's
/// city.
/// </returns>
private ClassificationResult FilterByPatterns(string hostname, ClassificationResult result)
{
    if (result == null)
    {
        return null;
    }

    var splitResults = HostnameSplitter.Split(hostname);
    var domain = splitResults?.DomainInfo?.RegistrableDomain;
    var subdomainParts = splitResults?.SubdomainParts;

    if (domain == null || subdomainParts == null || subdomainParts.Count == 0)
    {
        return null;
    }

    Dictionary<PatternRule, PatternMiningCoordinates> rulesToCoordinates;

    if (!this.reducedRules.TryGetValue(domain, out rulesToCoordinates) || rulesToCoordinates == null)
    {
        return null;
    }

    var ruleAtoms = this.miner.CreateRuleAtoms(subdomainParts);

    if (ruleAtoms == null || ruleAtoms.Count == 0)
    {
        return null;
    }

    var rules = this.miner.GeneratePossibleRules(ruleAtoms);

    if (rules == null || rules.Count == 0)
    {
        return null;
    }

    var city = result.City;

    foreach (var rule in rules)
    {
        PatternMiningCoordinates coordinates;

        if (!rulesToCoordinates.TryGetValue(rule, out coordinates))
        {
            continue;
        }

        // A matching rule exists but the result has no city to compare with it, so the result
        // cannot be confirmed (this path previously threw a NullReferenceException).
        if (city == null)
        {
            return null;
        }

        var distance = DistanceHelper.Distance(
            city.Latitude,
            city.Longitude,
            coordinates.Latitude,
            coordinates.Longitude,
            DistanceUnit.Kilometer);

        // A single matching rule that places the hostname too far away rejects the result.
        if (distance > this.distanceThresholdKm)
        {
            return null;
        }
    }

    return result;
}
/// <summary>
/// Counts, per registrable domain and per hostname pattern rule, how often each geohash occurs in
/// a ground-truth dataset.
/// </summary>
/// <remarks>
/// Conceptual example for static-32-213-114-101.wlfr.ct.frontiernet.net:
///   domain key:  frontiernet.net
///   rule key:    wlfr|rtl1 (the string "wlfr", located at right-to-left index 1)
///   geohash key: a geohash of the item's coordinates (4 characters, about +/- 20 km)
///   value:       how many times that geohash was seen for the rule
/// </remarks>
/// <param name="datasetParser">Parser used to enumerate the ground-truth items.</param>
/// <param name="inPath">Path handed to <paramref name="datasetParser"/>.</param>
/// <param name="pruneIntervalCount">
/// <c>PruneCounts</c> is called for a domain each time its occurrence count is a multiple of this value.
/// </param>
/// <param name="pruneMinKeepThreshold">Passed to <c>PruneCounts</c> as <c>minKeepThreshold</c>.</param>
/// <returns>A <c>PatternMiningResult</c> holding the geohash counts and the rule counts.</returns>
public PatternMiningResult MineCommonStringGeohashesFromGT(
    GroundTruthParser datasetParser,
    string inPath,
    int pruneIntervalCount = 10000,
    int pruneMinKeepThreshold = 10)
{
    // Example entry: frontiernet.net -> wlfr|rtl1 -> drkh7 -> 15
    var geohashCountsPerRulePerDomain = new Dictionary<string, Dictionary<PatternRule, Dictionary<string, int>>>();

    // Example entry: frontiernet.net -> wlfr|rtl1 -> 79
    var ruleCountsPerDomain = new Dictionary<string, Dictionary<PatternRule, int>>();

    // Example entry: frontiernet.net -> 435463
    var occurrencesPerDomain = new Dictionary<string, int>();

    var acceptedItems = 0;

    foreach (var groundTruthItem in datasetParser.Parse(inPath, populateTextualLocationInfo: true))
    {
        var split = HostnameSplitter.Split(groundTruthItem.Hostname);
        var registrableDomain = split?.DomainInfo?.RegistrableDomain;
        var subdomainParts = split?.SubdomainParts;

        // Only hostnames with a known registrable domain and at least one subdomain part are mined.
        if (registrableDomain == null || subdomainParts == null || subdomainParts.Count == 0)
        {
            continue;
        }

        // Progress indicator, printed once per 100000 accepted items.
        if (++acceptedItems % 100000 == 0)
        {
            Console.WriteLine(acceptedItems);
        }

        var geohashCountsPerRule = this.AddRetrieveDomainToRulesGeohashCounts(geohashCountsPerRulePerDomain, registrableDomain);
        var countsPerRule = this.AddRetrieveDomainToRulesCounts(ruleCountsPerDomain, registrableDomain);

        /*
         * Geohash precision (number of characters -> approximate error in km):
         *   1  ±2500
         *   2  ±630
         *   3  ±78
         *   4  ±20
         *   5  ±2.4
         *   6  ±0.61
         *   7  ±0.076
         *   8  ±0.019
         */
        var itemGeohashes = new HashSet<string>
        {
            GeoHash.Encode(groundTruthItem.Latitude, groundTruthItem.Longitude, numberOfChars: 4) // 4 = ±20 km
        };

        var candidateRules = this.GeneratePossibleRules(this.CreateRuleAtoms(subdomainParts));

        this.IncrementGeohashCounts(geohashCountsPerRule, countsPerRule, itemGeohashes, candidateRules);

        var domainOccurrences = this.IncrementOccurrences(occurrencesPerDomain, registrableDomain);

        if (domainOccurrences % pruneIntervalCount == 0)
        {
            this.PruneCounts(geohashCountsPerRule, countsPerRule, minKeepThreshold: pruneMinKeepThreshold);
        }
    }

    var miningResult = new PatternMiningResult()
    {
        RulesGeohashCounts = geohashCountsPerRulePerDomain,
        RulesCounts = ruleCountsPerDomain
    };

    return miningResult;
}