/// <summary>
/// Creates a sampler that keeps the supplied dependencies and starts with
/// zeroed counters and empty feature-distribution tallies.
/// </summary>
/// <param name="aggregator">Features aggregator stored on <c>Aggregator</c>.</param>
/// <param name="groundTruthPath">Ground-truth path stored on <c>GroundTruthPath</c>.</param>
/// <param name="trueNegativesMultiplier">Value stored on <c>TrueNegativesMultiplier</c>.</param>
/// <param name="featuresConfig">Features configuration stored on <c>FeaturesConfig</c>.</param>
/// <param name="maxSamples">Value stored in the <c>maxSamples</c> field.</param>
public DummySampler(
    CityFeaturesAggregator aggregator,
    string groundTruthPath,
    int trueNegativesMultiplier,
    FeaturesConfig featuresConfig,
    int maxSamples)
{
    // Caller-supplied dependencies and settings.
    this.Aggregator = aggregator;
    this.GroundTruthPath = groundTruthPath;
    this.FeaturesConfig = featuresConfig;
    this.TrueNegativesMultiplier = trueNegativesMultiplier;
    this.maxSamples = maxSamples;

    // Progress counters all start from zero.
    this.ValidLinesCounter = 0;
    this.ProcessingAttemptedCounter = 0;
    this.SelectedTruePositivesCount = 0;
    this.SelectedTrueNegativesCount = 0;

    // Per-feature tallies for the positive and negative samples.
    this.PositiveSamplesFeatureDistribution = new Dictionary<string, int>();
    this.NegativeSamplesFeatureDistribution = new Dictionary<string, int>();

    // A fixed seed makes the generated random sequence identical on every run.
    this.Rand = new Random(Seed: 7);
}
/// <summary>
/// Creates an empty training table: one input column per feature exposed by the
/// aggregator, followed by a boolean output column.
/// </summary>
/// <param name="tableName">Name given to the underlying <see cref="DataTable"/>.</param>
/// <param name="featuresAggregator">
/// Aggregator whose feature value types, defaults and granularities are copied onto this instance.
/// </param>
public TrainingData(string tableName, CityFeaturesAggregator featuresAggregator)
{
    this.Table = new DataTable(tableName);

    this.FeaturesAggregator = featuresAggregator;
    this.FeatureDefaultsValueTypes = featuresAggregator.FeatureDefaultsValueTypes;
    this.FeatureDefaults = featuresAggregator.FeatureDefaults;
    this.FeatureGranularities = featuresAggregator.FeatureGranularities;

    var columnNames = new List<string>();

    foreach (var featureEntry in this.FeatureDefaultsValueTypes)
    {
        var columnName = featureEntry.Key.ToString();
        var declaredType = featureEntry.Value;

        var isNullableValueType = declaredType.IsGenericType
            && declaredType.GetGenericTypeDefinition() == typeof(Nullable<>);

        // Every DataTable cell can already hold a null, but the column itself has to be
        // declared with the underlying non-nullable type, so Nullable<T> is unwrapped to T.
        var columnType = isNullableValueType
            ? Nullable.GetUnderlyingType(declaredType)
            : declaredType;

        columnNames.Add(columnName);
        this.Table.Columns.Add(columnName, columnType);
    }

    this.InputColumnNames = columnNames.ToArray();

    // The label column is appended last, after all feature columns.
    this.Table.Columns.Add(outputColumnName, typeof(bool));
}
/// <summary>
/// Creates a sampler that keeps the supplied dependencies and tuning values and starts
/// with zeroed counters, empty tallies and empty recent-city caches.
/// </summary>
/// <param name="aggregator">Features aggregator stored on <c>Aggregator</c>.</param>
/// <param name="groundTruthPath">Ground-truth path stored on <c>GroundTruthPath</c>.</param>
/// <param name="trueNegativesMultiplier">Value stored on <c>TrueNegativesMultiplier</c>.</param>
/// <param name="featuresConfig">Features configuration stored on <c>FeaturesConfig</c>.</param>
/// <param name="maxPositiveSamplesPerDomain">Value stored in the <c>maxPositiveSamplesPerDomain</c> field.</param>
/// <param name="maxDiffMagnitude">Value stored in the <c>maxDiffMagnitude</c> field (defaults to 10).</param>
/// <param name="forceAddMinPercentile">Value stored in the <c>forceAddMinPercentile</c> field (defaults to 0.2).</param>
public TrainingDataSampler(
    CityFeaturesAggregator aggregator,
    string groundTruthPath,
    int trueNegativesMultiplier,
    FeaturesConfig featuresConfig,
    int maxPositiveSamplesPerDomain,
    double maxDiffMagnitude = 10d,
    double forceAddMinPercentile = 0.2d)
{
    // Caller-supplied dependencies and settings.
    this.Aggregator = aggregator;
    this.GroundTruthPath = groundTruthPath;
    this.FeaturesConfig = featuresConfig;
    this.TrueNegativesMultiplier = trueNegativesMultiplier;
    this.maxPositiveSamplesPerDomain = maxPositiveSamplesPerDomain;
    this.maxDiffMagnitude = maxDiffMagnitude;
    this.forceAddMinPercentile = forceAddMinPercentile;

    // Progress counters all start from zero.
    this.ValidLinesCounter = 0;
    this.ProcessingAttemptedCounter = 0;
    this.SelectedTruePositivesCount = 0;
    this.SelectedTrueNegativesCount = 0;

    // Per-hostname and per-feature tallies start out empty.
    this.HostnameCounts = new Dictionary<string, int>();
    this.PositiveSamplesFeatureDistribution = new Dictionary<string, int>();
    this.NegativeSamplesFeatureDistribution = new Dictionary<string, int>();

    // Caches of recently sampled cities, each constructed with an argument of 10.
    this.LatestPositiveSampleCities = new LRUCache<GeonamesCityEntity>(10);
    this.LatestNegativeSampleCities = new LRUCache<GeonamesCityEntity>(10);

    // A fixed seed makes the generated random sequence identical on every run.
    this.Rand = new Random(Seed: 7);
}
/// <summary>
/// Creates a runner around the given aggregator, training data and classifier, and
/// allocates the stopwatches used for timing.
/// </summary>
/// <param name="aggregator">Features aggregator stored on <c>Aggregator</c>.</param>
/// <param name="trainingData">Training data stored on <c>TrainingData</c>.</param>
/// <param name="classifier">Classifier stored on <c>Classifier</c>.</param>
/// <param name="debugMode">Value stored on <c>DebugMode</c>; defaults to <c>false</c>.</param>
public ModelRunner(
    CityFeaturesAggregator aggregator,
    TrainingData trainingData,
    IClassifier classifier,
    bool debugMode = false)
{
    // Collaborators and flags supplied by the caller.
    this.Aggregator = aggregator;
    this.TrainingData = trainingData;
    this.Classifier = classifier;
    this.DebugMode = debugMode;

    // One stopwatch per timed phase; they are created stopped and are not started here.
    this.HostnameSplittingTime = new Stopwatch();
    this.CandidateGenerationTime = new Stopwatch();
    this.ClassificationTime = new Stopwatch();
    this.TotalExtractCitiesTime = new Stopwatch();
}
/// <summary>
/// Reads a <see cref="TrainingData"/> instance from a file written with
/// <see cref="BinaryFormatter"/> and attaches the supplied aggregator to it.
/// </summary>
/// <param name="inPath">Path of the serialized file; opened read-only with shared read access.</param>
/// <param name="aggregator">Aggregator assigned to the deserialized instance's backing field.</param>
/// <returns>The deserialized training data with its aggregator set.</returns>
public static TrainingData DeserializeFrom(string inPath, CityFeaturesAggregator aggregator)
{
    // SECURITY NOTE: BinaryFormatter can instantiate arbitrary types named in the payload and is
    // unsafe on untrusted input. Only load files produced by this application. Moving to a
    // different serializer would change the on-disk format, so it is flagged here, not replaced.
    var formatter = new BinaryFormatter();

    // The using block disposes (and therefore closes) the stream on every exit path,
    // so no explicit Close() call is needed.
    using (var stream = new FileStream(path: inPath, mode: FileMode.Open, access: FileAccess.Read, share: FileShare.Read))
    {
        var trainingData = (TrainingData)formatter.Deserialize(stream);

        // The aggregator is supplied by the caller and set after deserialization.
        trainingData.backingFeaturesAggregator = aggregator;

        return trainingData;
    }
}
/// <summary>
/// Drains the sampler and turns every sample it yields into a row of a new
/// <see cref="TrainingData"/> table, which is finalized before being returned.
/// </summary>
/// <param name="aggregator">Aggregator handed to the new <see cref="TrainingData"/> instance.</param>
/// <param name="sampler">Source of samples; its <c>Sample()</c> sequence is enumerated once.</param>
/// <returns>The populated and finalized training data.</returns>
public TrainingData Convert(
    CityFeaturesAggregator aggregator,
    ITrainingDataSampler sampler)
{
    var result = new TrainingData("ReverseDNSGeolocation Training", aggregator);

    foreach (var example in sampler.Sample())
    {
        // The sample's positive/negative flag becomes the row's label.
        var row = result.CreateTrainingRow(example.Features, isValidLocation: example.IsPositiveExample);
        result.AddTrainingRow(row);
    }

    result.FinalizeData();

    return result;
}
/// <summary>
/// Builds training data from a tab-separated ground-truth file. For every usable row the
/// hostname is split, location candidates are generated, and each candidate is labelled
/// positive or negative depending on its distance from the row's true coordinates.
/// </summary>
/// <param name="citiesPath">Forwarded to the <see cref="CityFeaturesAggregator"/> constructor.</param>
/// <param name="alternateNamesPath">Forwarded to the <see cref="CityFeaturesAggregator"/> constructor.</param>
/// <param name="admin1Path">Forwarded to the <see cref="CityFeaturesAggregator"/> constructor.</param>
/// <param name="admin2Path">Forwarded to the <see cref="CityFeaturesAggregator"/> constructor.</param>
/// <param name="countriesPath">Forwarded to the <see cref="CityFeaturesAggregator"/> constructor.</param>
/// <param name="clliPath">Forwarded to the <see cref="CityFeaturesAggregator"/> constructor.</param>
/// <param name="unlocodePath">Forwarded to the <see cref="CityFeaturesAggregator"/> constructor.</param>
/// <param name="groundTruthPath">
/// Path of the ground-truth file. Blank lines, lines starting with '#', lines that do not have
/// exactly 13 tab-separated columns, and lines with a missing or unparsable hostname, latitude
/// or longitude are skipped.
/// </param>
/// <param name="trueNegativesMultiplier">
/// Multiplied by the number of stored positives to get the ceiling on stored negatives.
/// </param>
/// <param name="shouldProcessHostname">Optional filter; when it returns <c>false</c> the row is skipped.</param>
/// <param name="shouldContinueIngestingNewHostnames">
/// Optional stop condition evaluated after each processed row; when it returns <c>false</c> reading stops.
/// </param>
/// <param name="showConsoleStats">Optional progress callback invoked for each processed row and once more when reading stops early.</param>
/// <param name="featuresConfig">Features configuration; a default instance is created when <c>null</c>.</param>
/// <returns>The finalized training data, which is also stored on <c>this.TrainingData</c>.</returns>
public TrainingData Sample(
    string citiesPath,
    string alternateNamesPath,
    string admin1Path,
    string admin2Path,
    string countriesPath,
    string clliPath,
    string unlocodePath,
    string groundTruthPath,
    int trueNegativesMultiplier,
    ShouldProcessHostname shouldProcessHostname = null,
    ShouldContinueIngestingNewHostnames shouldContinueIngestingNewHostnames = null,
    ShowConsoleStats showConsoleStats = null,
    FeaturesConfig featuresConfig = null)
{
    // International mile expressed in kilometers; used to bring the computed distance
    // into the same unit as FeaturesConfig.TruePositiveMaximumDistanceKilometers.
    const double kilometersPerMile = 1.609344d;
    const int expectedColumnCount = 13;

    if (featuresConfig == null)
    {
        featuresConfig = new FeaturesConfig();
    }

    var aggregator = new CityFeaturesAggregator(citiesPath, alternateNamesPath, admin1Path, admin2Path, countriesPath, clliPath, unlocodePath, featuresConfig: featuresConfig);
    this.Aggregator = aggregator;

    var trainingData = new TrainingData(tableName: "ReverseDNSGeolocation Training", featuresAggregator: aggregator);
    this.TrainingData = trainingData;

    // The ground-truth file is machine-generated, so coordinates are parsed with the invariant
    // culture (same number styles double.TryParse uses by default) instead of the current one.
    var coordinateStyles = System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;
    var coordinateCulture = System.Globalization.CultureInfo.InvariantCulture;

    string line;
    var counter = 0;
    var storedTruePositivesCount = 0;
    var storedTrueNegativesCount = 0;
    var positivesFeaturesDistribution = new Dictionary<CityFeatureType, int>();

    // Unseeded: which negatives get picked differs from run to run.
    var rand = new Random();

    using (var file = new StreamReader(groundTruthPath))
    {
        while ((line = file.ReadLine()) != null)
        {
            // Skip blank lines and '#' comment lines.
            if (string.IsNullOrWhiteSpace(line) || line[0] == '#')
            {
                continue;
            }

            var parts = line.Split('\t');

            if (parts.Length != expectedColumnCount)
            {
                continue;
            }

            /*
             * 0  RawIP                     string
             * 1  NumericIP                 uint
             * 2  Bucket                    uint?
             * 3  ReverseDNSHostname        string
             * 4  RealtimeLatitude          double?
             * 5  RealtimeLongitude         double?
             * 6  RealtimeCountryISO        string
             * 7  RealtimeCountryConfidence byte
             * 8  RealtimeState             string
             * 9  RealtimeStateConfidence   byte
             * 10 RealtimeCity              string
             * 11 RealtimeCityConfidence    byte
             * 12 RealtimeAccuracyKm        double?
             */
            var hostname = parts[3];
            var trueLatitudeStr = parts[4];
            var trueLongitudeStr = parts[5];

            if (string.IsNullOrWhiteSpace(hostname)
                || string.IsNullOrWhiteSpace(trueLatitudeStr)
                || string.IsNullOrWhiteSpace(trueLongitudeStr))
            {
                continue;
            }

            double trueLatitude;
            double trueLongitude;

            if (!double.TryParse(trueLatitudeStr, coordinateStyles, coordinateCulture, out trueLatitude)
                || !double.TryParse(trueLongitudeStr, coordinateStyles, coordinateCulture, out trueLongitude))
            {
                continue;
            }

            var parsedHostname = HostnameSplitter.Split(hostname);

            if (shouldProcessHostname != null
                && !shouldProcessHostname(hostname, parsedHostname, storedTruePositivesCount, storedTrueNegativesCount))
            {
                continue;
            }

            counter++;

            if (showConsoleStats != null)
            {
                showConsoleStats(hostname, parsedHostname, storedTruePositivesCount, storedTrueNegativesCount);
            }

            var candidatesAndFeatures = aggregator.GenerateCandidatesForHostname(parsedHostname);

            var truePositives = new List<DataRow>();
            var trueNegatives = new List<DataRow>();

            foreach (var candidateEntry in candidatesAndFeatures)
            {
                var locationCandidate = candidateEntry.Key;
                var locationFeatures = candidateEntry.Value;

                // The helper is asked for miles, while the configured threshold is in kilometers,
                // so convert before comparing.
                var distanceMiles = DistanceHelper.Distance(trueLatitude, trueLongitude, locationCandidate.Latitude, locationCandidate.Longitude, DistanceUnit.Mile);
                var distanceKilometers = distanceMiles * kilometersPerMile;

                if (distanceKilometers <= featuresConfig.TruePositiveMaximumDistanceKilometers)
                {
                    // Positives can still be rejected based on the feature distribution seen so far.
                    if (this.ShouldAddPositiveExample(positivesFeaturesDistribution, locationFeatures))
                    {
                        var newRow = trainingData.CreateTrainingRow(locationFeatures, isValidLocation: true);
                        truePositives.Add(newRow);
                        this.AddToFeaturesDistribution(positivesFeaturesDistribution, locationFeatures);
                    }
                }
                else
                {
                    var newRow = trainingData.CreateTrainingRow(locationFeatures, isValidLocation: false);
                    trueNegatives.Add(newRow);
                }
            }

            // WARNING: Do not move this above the true negatives selection, as it will bias the data (even more)
            foreach (var truePositive in truePositives)
            {
                trainingData.AddTrainingRow(truePositive);
            }

            storedTruePositivesCount += truePositives.Count;

            var neededTrueNegativeItemsCount = storedTruePositivesCount * trueNegativesMultiplier;

            if (trueNegatives.Count > 0 && storedTrueNegativesCount < neededTrueNegativeItemsCount)
            {
                // Never take more negatives than are available or than the overall ceiling;
                // when this row produced positives, also take no more negatives than positives.
                var neededItemsCount = Math.Min(trueNegatives.Count, neededTrueNegativeItemsCount);

                if (truePositives.Count > 0)
                {
                    neededItemsCount = Math.Min(truePositives.Count, neededItemsCount);
                }

                // Shuffle by random sort key, then keep the first few.
                var extractedRandTrueNegativeItems = trueNegatives.OrderBy(x => rand.Next()).Take(neededItemsCount);

                foreach (var trueNegativeItem in extractedRandTrueNegativeItems)
                {
                    trainingData.AddTrainingRow(trueNegativeItem);
                    storedTrueNegativesCount++;
                }
            }

            // Periodically dump the positives' feature distribution.
            if (counter % 1000 == 0)
            {
                Console.WriteLine("------------------------------------");

                foreach (var entry in positivesFeaturesDistribution)
                {
                    Console.WriteLine($"Positive: {entry.Key}\t{entry.Value}");
                }

                Console.WriteLine("------------------------------------");
            }

            if (shouldContinueIngestingNewHostnames != null
                && !shouldContinueIngestingNewHostnames(hostname, parsedHostname, storedTruePositivesCount, storedTrueNegativesCount))
            {
                // The stats callback is optional, so it must be null-checked here as well.
                if (showConsoleStats != null)
                {
                    showConsoleStats(hostname, parsedHostname, storedTruePositivesCount, storedTrueNegativesCount, lastRow: true);
                }

                break;
            }
        }
    }

    trainingData.FinalizeData();

    return trainingData;
}