Exemple #1
0
        /// <summary>
        /// Creates a sampler that keeps the supplied collaborators and settings and starts
        /// with zeroed counters and empty feature-distribution tallies.
        /// </summary>
        /// <param name="aggregator">Stored in <c>Aggregator</c>.</param>
        /// <param name="groundTruthPath">Stored in <c>GroundTruthPath</c>; nothing is read from it here.</param>
        /// <param name="trueNegativesMultiplier">Stored in <c>TrueNegativesMultiplier</c>.</param>
        /// <param name="featuresConfig">Stored in <c>FeaturesConfig</c>.</param>
        /// <param name="maxSamples">Stored in the <c>maxSamples</c> member.</param>
        public DummySampler(
            CityFeaturesAggregator aggregator,
            string groundTruthPath,
            int trueNegativesMultiplier,
            FeaturesConfig featuresConfig,
            int maxSamples)
        {
            // Collaborators and settings supplied by the caller.
            this.Aggregator = aggregator;
            this.GroundTruthPath = groundTruthPath;
            this.TrueNegativesMultiplier = trueNegativesMultiplier;
            this.FeaturesConfig = featuresConfig;
            this.maxSamples = maxSamples;

            // Fixed seed: the pseudo-random sequence is the same on every run.
            this.Rand = new Random(Seed: 7);

            // Bookkeeping starts from a clean slate.
            this.ValidLinesCounter = 0;
            this.ProcessingAttemptedCounter = 0;
            this.SelectedTruePositivesCount = 0;
            this.SelectedTrueNegativesCount = 0;

            this.PositiveSamplesFeatureDistribution = new Dictionary<string, int>();
            this.NegativeSamplesFeatureDistribution = new Dictionary<string, int>();
        }
        /// <summary>
        /// Creates an empty training table named <paramref name="tableName"/> containing one input
        /// column per entry of the aggregator's <c>FeatureDefaultsValueTypes</c>, followed by a
        /// boolean output column.
        /// </summary>
        /// <param name="tableName">Name given to the underlying <see cref="DataTable"/>.</param>
        /// <param name="featuresAggregator">
        /// Source of the feature value types, defaults and granularities copied onto this instance.
        /// </param>
        public TrainingData(string tableName, CityFeaturesAggregator featuresAggregator)
        {
            this.Table = new DataTable(tableName);
            this.FeaturesAggregator = featuresAggregator;
            this.FeatureDefaultsValueTypes = featuresAggregator.FeatureDefaultsValueTypes;
            this.FeatureDefaults = featuresAggregator.FeatureDefaults;
            this.FeatureGranularities = featuresAggregator.FeatureGranularities;

            var columns = this.Table.Columns;
            var featureNames = new List<string>();

            foreach (var featureEntry in this.FeatureDefaultsValueTypes)
            {
                var name = featureEntry.Key.ToString();
                var declaredType = featureEntry.Value;

                featureNames.Add(name);

                // Every DataTable cell can already hold a null, but a column has to be declared
                // with the underlying non-nullable type, so Nullable<T> is unwrapped to T here.
                var isNullableWrapper = declaredType.IsGenericType
                    && declaredType.GetGenericTypeDefinition() == typeof(Nullable<>);

                var columnType = isNullableWrapper
                    ? Nullable.GetUnderlyingType(declaredType)
                    : declaredType;

                columns.Add(name, columnType);
            }

            this.InputColumnNames = featureNames.ToArray();

            // The label column is added last, after all feature columns.
            columns.Add(outputColumnName, typeof(bool));
        }
        /// <summary>
        /// Creates a sampler that keeps the supplied collaborators and tuning values and starts
        /// with zeroed counters, empty tallies and two empty 10-entry city caches.
        /// </summary>
        /// <param name="aggregator">Stored in <c>Aggregator</c>.</param>
        /// <param name="groundTruthPath">Stored in <c>GroundTruthPath</c>; nothing is read from it here.</param>
        /// <param name="trueNegativesMultiplier">Stored in <c>TrueNegativesMultiplier</c>.</param>
        /// <param name="featuresConfig">Stored in <c>FeaturesConfig</c>.</param>
        /// <param name="maxPositiveSamplesPerDomain">Stored in the <c>maxPositiveSamplesPerDomain</c> member.</param>
        /// <param name="maxDiffMagnitude">Stored in the <c>maxDiffMagnitude</c> member; defaults to 10.</param>
        /// <param name="forceAddMinPercentile">Stored in the <c>forceAddMinPercentile</c> member; defaults to 0.2.</param>
        public TrainingDataSampler(
            CityFeaturesAggregator aggregator,
            string groundTruthPath,
            int trueNegativesMultiplier,
            FeaturesConfig featuresConfig,
            int maxPositiveSamplesPerDomain,
            double maxDiffMagnitude      = 10d,
            double forceAddMinPercentile = 0.2d)
        {
            // Collaborators and settings supplied by the caller.
            this.Aggregator = aggregator;
            this.GroundTruthPath = groundTruthPath;
            this.TrueNegativesMultiplier = trueNegativesMultiplier;
            this.FeaturesConfig = featuresConfig;

            // Tuning values.
            this.maxPositiveSamplesPerDomain = maxPositiveSamplesPerDomain;
            this.maxDiffMagnitude = maxDiffMagnitude;
            this.forceAddMinPercentile = forceAddMinPercentile;

            // Fixed seed: the pseudo-random sequence is the same on every run.
            this.Rand = new Random(Seed: 7);

            // Counters start at zero.
            this.ValidLinesCounter = 0;
            this.ProcessingAttemptedCounter = 0;
            this.SelectedTruePositivesCount = 0;
            this.SelectedTrueNegativesCount = 0;

            // Tallies and caches start empty.
            this.HostnameCounts = new Dictionary<string, int>();
            this.PositiveSamplesFeatureDistribution = new Dictionary<string, int>();
            this.NegativeSamplesFeatureDistribution = new Dictionary<string, int>();
            this.LatestPositiveSampleCities = new LRUCache<GeonamesCityEntity>(10);
            this.LatestNegativeSampleCities = new LRUCache<GeonamesCityEntity>(10);
        }
Exemple #4
0
        /// <summary>
        /// Creates a runner around the given aggregator, training data and classifier, and
        /// allocates one (not yet started) stopwatch per timed stage.
        /// </summary>
        /// <param name="aggregator">Stored in <c>Aggregator</c>.</param>
        /// <param name="trainingData">Stored in <c>TrainingData</c>.</param>
        /// <param name="classifier">Stored in <c>Classifier</c>.</param>
        /// <param name="debugMode">Stored in <c>DebugMode</c>; defaults to <c>false</c>.</param>
        public ModelRunner(CityFeaturesAggregator aggregator, TrainingData trainingData, IClassifier classifier, bool debugMode = false)
        {
            // Collaborators.
            this.Aggregator = aggregator;
            this.TrainingData = trainingData;
            this.Classifier = classifier;
            this.DebugMode = debugMode;

            // Timers are created stopped; nothing is measured until they are started elsewhere.
            this.TotalExtractCitiesTime = new Stopwatch();
            this.HostnameSplittingTime = new Stopwatch();
            this.CandidateGenerationTime = new Stopwatch();
            this.ClassificationTime = new Stopwatch();
        }
        /// <summary>
        /// Loads a binary-serialized <c>TrainingData</c> instance from <paramref name="inPath"/>
        /// and attaches <paramref name="aggregator"/> to it.
        /// </summary>
        /// <param name="inPath">Path of the file to read; opened read-only with shared read access.</param>
        /// <param name="aggregator">
        /// Aggregator assigned to the loaded instance's <c>backingFeaturesAggregator</c>
        /// (presumably because it is not part of the serialized payload; verify against the field's attributes).
        /// </param>
        /// <returns>The deserialized training data with the aggregator attached.</returns>
        /// <remarks>
        /// SECURITY: <see cref="BinaryFormatter"/> can execute arbitrary code while deserializing a
        /// crafted payload and is obsolete in current .NET. Only call this with files produced by a
        /// trusted run of this tool. TODO: migrate the on-disk format to a safe serializer.
        /// </remarks>
        public static TrainingData DeserializeFrom(string inPath, CityFeaturesAggregator aggregator)
        {
            var formatter = new BinaryFormatter();

            // The using block disposes (and therefore closes) the stream on every exit path,
            // including when deserialization throws, so no explicit Close() call is needed.
            using (var stream = new FileStream(path: inPath, mode: FileMode.Open, access: FileAccess.Read, share: FileShare.Read))
            {
                var trainingData = (TrainingData)formatter.Deserialize(stream);

                trainingData.backingFeaturesAggregator = aggregator;

                return(trainingData);
            }
        }
Exemple #6
0
        /// <summary>
        /// Drains <paramref name="sampler"/> into a new, finalized <c>TrainingData</c> table.
        /// </summary>
        /// <param name="aggregator">Aggregator handed to the new <c>TrainingData</c> instance.</param>
        /// <param name="sampler">Source of samples; each one contributes exactly one training row.</param>
        /// <returns>The populated training data, after <c>FinalizeData</c> has been called on it.</returns>
        public TrainingData Convert(
            CityFeaturesAggregator aggregator,
            ITrainingDataSampler sampler)
        {
            var result = new TrainingData(tableName: "ReverseDNSGeolocation Training", featuresAggregator: aggregator);

            foreach (var example in sampler.Sample())
            {
                // The sample's positive/negative flag becomes the row's label.
                result.AddTrainingRow(
                    result.CreateTrainingRow(example.Features, isValidLocation: example.IsPositiveExample));
            }

            result.FinalizeData();

            return result;
        }
Exemple #7
0
        /// <summary>
        /// Builds a training set by streaming the tab-separated ground-truth file at
        /// <paramref name="groundTruthPath"/>, generating location candidates for every usable
        /// hostname and labelling each candidate by its distance from the hostname's known coordinates.
        /// </summary>
        /// <param name="citiesPath">Passed through to the <c>CityFeaturesAggregator</c> constructor.</param>
        /// <param name="alternateNamesPath">Passed through to the <c>CityFeaturesAggregator</c> constructor.</param>
        /// <param name="admin1Path">Passed through to the <c>CityFeaturesAggregator</c> constructor.</param>
        /// <param name="admin2Path">Passed through to the <c>CityFeaturesAggregator</c> constructor.</param>
        /// <param name="countriesPath">Passed through to the <c>CityFeaturesAggregator</c> constructor.</param>
        /// <param name="clliPath">Passed through to the <c>CityFeaturesAggregator</c> constructor.</param>
        /// <param name="unlocodePath">Passed through to the <c>CityFeaturesAggregator</c> constructor.</param>
        /// <param name="groundTruthPath">
        /// Text file with 13 tab-separated columns per line. Blank lines, lines starting with '#',
        /// lines with a different column count, and lines without a hostname or parsable
        /// latitude/longitude are skipped.
        /// </param>
        /// <param name="trueNegativesMultiplier">
        /// Negatives are only added while their stored count is below the stored positives count
        /// times this value.
        /// </param>
        /// <param name="shouldProcessHostname">Optional filter; a line is skipped when it returns <c>false</c>.</param>
        /// <param name="shouldContinueIngestingNewHostnames">
        /// Optional stop condition evaluated after each processed line; reading stops when it returns <c>false</c>.
        /// </param>
        /// <param name="showConsoleStats">Optional progress callback, invoked per processed line and once more on early stop.</param>
        /// <param name="featuresConfig">Feature settings; a default <c>FeaturesConfig</c> is created when <c>null</c>.</param>
        /// <returns>
        /// The finalized training data. The same instance is stored in <c>TrainingData</c>, and the
        /// aggregator built here is stored in <c>Aggregator</c>.
        /// </returns>
        public TrainingData Sample(
            string citiesPath,
            string alternateNamesPath,
            string admin1Path,
            string admin2Path,
            string countriesPath,
            string clliPath,
            string unlocodePath,
            string groundTruthPath,
            int trueNegativesMultiplier,
            ShouldProcessHostname shouldProcessHostname = null,
            ShouldContinueIngestingNewHostnames shouldContinueIngestingNewHostnames = null,
            ShowConsoleStats showConsoleStats = null,
            FeaturesConfig featuresConfig     = null)
        {
            // The candidate distance is computed in miles, while the threshold it is compared
            // against is expressed in kilometers; this factor brings both to the same unit.
            const double KilometersPerMile = 1.609344d;
            const int ExpectedColumnCount = 13;

            // The ground truth is machine-generated, so numbers are parsed with the invariant
            // culture. The styles match what double.TryParse(string, out double) uses by default.
            var numberStyles = System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;
            var numberCulture = System.Globalization.CultureInfo.InvariantCulture;

            if (featuresConfig == null)
            {
                featuresConfig = new FeaturesConfig();
            }

            var aggregator = new CityFeaturesAggregator(citiesPath, alternateNamesPath, admin1Path, admin2Path, countriesPath, clliPath, unlocodePath, featuresConfig: featuresConfig);

            this.Aggregator = aggregator;

            var trainingData = new TrainingData(tableName: "ReverseDNSGeolocation Training", featuresAggregator: aggregator);

            this.TrainingData = trainingData;

            string line;
            var    counter = 0;

            var storedTruePositivesCount = 0;
            var storedTrueNegativesCount = 0;

            var positivesFeaturesDistribution = new Dictionary <CityFeatureType, int>();

            // Unseeded: which negatives get picked differs from run to run.
            var rand = new Random();

            using (var file = new StreamReader(groundTruthPath))
            {
                while ((line = file.ReadLine()) != null)
                {
                    // Skip blank lines and '#' comment lines (a non-blank line always has a first character).
                    if (string.IsNullOrWhiteSpace(line) || line[0] == '#')
                    {
                        continue;
                    }

                    var parts = line.Split('\t');

                    if (parts.Length != ExpectedColumnCount)
                    {
                        continue;
                    }

                    /*
                     * 0  RawIP   string
                     * 1  NumericIP   uint
                     * 2  Bucket  uint?
                     * 3  ReverseDNSHostname  string
                     * 4  RealtimeLatitude    double?
                     * 5  RealtimeLongitude   double?
                     * 6  RealtimeCountryISO  string
                     * 7  RealtimeCountryConfidence   byte
                     * 8  RealtimeState   string
                     * 9  RealtimeStateConfidence byte
                     * 10 RealtimeCity    string
                     * 11 RealtimeCityConfidence  byte
                     * 12 RealtimeAccuracyKm  double?
                     */

                    var hostname = parts[3];

                    var trueLatitudeStr  = parts[4];
                    var trueLongitudeStr = parts[5];

                    if (string.IsNullOrWhiteSpace(hostname) ||
                        string.IsNullOrWhiteSpace(trueLatitudeStr) ||
                        string.IsNullOrWhiteSpace(trueLongitudeStr))
                    {
                        continue;
                    }

                    double trueLatitude;
                    double trueLongitude;

                    if (!double.TryParse(trueLatitudeStr, numberStyles, numberCulture, out trueLatitude) ||
                        !double.TryParse(trueLongitudeStr, numberStyles, numberCulture, out trueLongitude))
                    {
                        continue;
                    }

                    var parsedHostname = HostnameSplitter.Split(hostname);

                    if (shouldProcessHostname != null && !shouldProcessHostname(hostname, parsedHostname, storedTruePositivesCount, storedTrueNegativesCount))
                    {
                        continue;
                    }

                    counter++;

                    if (showConsoleStats != null)
                    {
                        showConsoleStats(hostname, parsedHostname, storedTruePositivesCount, storedTrueNegativesCount);
                    }

                    var candidatesAndFeatures = aggregator.GenerateCandidatesForHostname(parsedHostname);

                    var truePositives = new List <DataRow>();
                    var trueNegatives = new List <DataRow>();

                    foreach (var candidateEntry in candidatesAndFeatures)
                    {
                        var locationCandidate = candidateEntry.Key;
                        var locationFeatures  = candidateEntry.Value;

                        var distanceMiles = DistanceHelper.Distance(trueLatitude, trueLongitude, locationCandidate.Latitude, locationCandidate.Longitude, DistanceUnit.Mile);
                        var distanceKilometers = distanceMiles * KilometersPerMile;

                        if (distanceKilometers <= featuresConfig.TruePositiveMaximumDistanceKilometers)
                        {
                            // A close-enough candidate is only kept when the distribution check accepts it;
                            // rejected positives are dropped rather than turned into negatives.
                            if (this.ShouldAddPositiveExample(positivesFeaturesDistribution, locationFeatures))
                            {
                                var newRow = trainingData.CreateTrainingRow(locationFeatures, isValidLocation: true);
                                truePositives.Add(newRow);

                                this.AddToFeaturesDistribution(positivesFeaturesDistribution, locationFeatures);
                            }
                        }
                        else
                        {
                            var newRow = trainingData.CreateTrainingRow(locationFeatures, isValidLocation: false);
                            trueNegatives.Add(newRow);
                        }
                    }

                    // WARNING: Do not move this above the true negatives selection, as it will bias the data (even more)
                    if (truePositives.Count > 0)
                    {
                        truePositives.ForEach(tp => trainingData.AddTrainingRow(tp));
                        storedTruePositivesCount += truePositives.Count;
                    }

                    var neededTrueNegativeItemsCount = storedTruePositivesCount * trueNegativesMultiplier;

                    if (trueNegatives.Count > 0 && storedTrueNegativesCount < neededTrueNegativeItemsCount)
                    {
                        // NOTE(review): the cap below uses the total target, not the remaining
                        // (target - stored) amount, so the stored negatives can overshoot the target.
                        // Left unchanged because the intended quota is not clear from this method.
                        var neededItemsCount = Math.Min(trueNegatives.Count, neededTrueNegativeItemsCount);

                        if (truePositives.Count > 0)
                        {
                            // When this hostname produced positives, take at most as many negatives as positives.
                            neededItemsCount = Math.Min(truePositives.Count, neededItemsCount);
                        }

                        // Random subset: shuffle by a random sort key, then take the first few.
                        var extractedRandTrueNegativeItems = trueNegatives.OrderBy(x => rand.Next()).Take(neededItemsCount);

                        foreach (var trueNegativeItem in extractedRandTrueNegativeItems)
                        {
                            trainingData.AddTrainingRow(trueNegativeItem);
                            storedTrueNegativesCount++;
                        }
                    }

                    if (counter % 1000 == 0)
                    {
                        Console.WriteLine("------------------------------------");

                        foreach (var entry in positivesFeaturesDistribution)
                        {
                            Console.WriteLine($"Positive: {entry.Key}\t{entry.Value}");
                        }

                        Console.WriteLine("------------------------------------");
                    }

                    if (shouldContinueIngestingNewHostnames != null && !shouldContinueIngestingNewHostnames(hostname, parsedHostname, storedTruePositivesCount, storedTrueNegativesCount))
                    {
                        // The stats callback is optional, so it must be null-checked here as well.
                        if (showConsoleStats != null)
                        {
                            showConsoleStats(hostname, parsedHostname, storedTruePositivesCount, storedTrueNegativesCount, lastRow: true);
                        }

                        break;
                    }
                }
            }

            trainingData.FinalizeData();

            return(trainingData);
        }