Esempio n. 1
0
        /// <summary>
        /// Looks up the city nearest to the given coordinates, considering only cities indexed
        /// under the 3-character geohash cell of the coordinates and that cell's neighbors.
        /// </summary>
        /// <param name="latitude">Latitude of the point to look up.</param>
        /// <param name="longitude">Longitude of the point to look up.</param>
        /// <returns>
        /// The nearest candidate city whose distance to the point is at most 50 kilometers,
        /// or <c>null</c> when no candidate is that close.
        /// </returns>
        public GeonamesCityEntity FindClosestCityForCoordinates(double latitude, double longitude)
        {
            // Candidates further away than this are never returned.
            const double MaxDistanceKilometers = 50;

            // 3 characters = ±78km
            var centerCell = GeoHash.Encode(latitude, longitude, numberOfChars: 3);

            // Search the cell containing the point plus every neighboring cell.
            var cellsToSearch = new HashSet<string>(GeoHash.Neighbors(centerCell));
            cellsToSearch.Add(centerCell);

            // Phase 1: gather every city indexed under one of the cells.
            var candidates = new List<GeonamesCityEntity>();

            foreach (var cell in cellsToSearch)
            {
                List<GeonamesCityEntity> citiesInCell;

                if (!this.GeohashesToCities.TryGetValue(cell, out citiesInCell))
                {
                    continue;
                }

                candidates.AddRange(citiesInCell);
            }

            // Phase 2: keep the closest candidate that is within the distance limit.
            GeonamesCityEntity nearestCity = null;
            double nearestDistance = double.MaxValue;

            foreach (var candidate in candidates)
            {
                var distance = DistanceHelper.Distance(
                    candidate.Latitude,
                    candidate.Longitude,
                    latitude,
                    longitude,
                    DistanceUnit.Kilometer);

                // Strict '<' means the first candidate encountered wins a tie.
                if (distance <= MaxDistanceKilometers && distance < nearestDistance)
                {
                    nearestDistance = distance;
                    nearestCity = candidate;
                }
            }

            return nearestCity;
        }
Esempio n. 2
0
        /// <summary>
        /// Builds a training data set from a tab-separated ground truth file: for every accepted
        /// hostname, location candidates are generated and each candidate is labelled as a positive
        /// or negative example depending on its distance from the true coordinates.
        /// </summary>
        /// <param name="citiesPath">Forwarded to the <see cref="CityFeaturesAggregator"/> constructor.</param>
        /// <param name="alternateNamesPath">Forwarded to the <see cref="CityFeaturesAggregator"/> constructor.</param>
        /// <param name="admin1Path">Forwarded to the <see cref="CityFeaturesAggregator"/> constructor.</param>
        /// <param name="admin2Path">Forwarded to the <see cref="CityFeaturesAggregator"/> constructor.</param>
        /// <param name="countriesPath">Forwarded to the <see cref="CityFeaturesAggregator"/> constructor.</param>
        /// <param name="clliPath">Forwarded to the <see cref="CityFeaturesAggregator"/> constructor.</param>
        /// <param name="unlocodePath">Forwarded to the <see cref="CityFeaturesAggregator"/> constructor.</param>
        /// <param name="groundTruthPath">
        /// Path of the ground truth file. Blank lines, lines starting with <c>#</c>, lines that do not
        /// have exactly 13 tab-separated fields, and lines with a missing hostname or unparsable
        /// coordinates are skipped.
        /// </param>
        /// <param name="trueNegativesMultiplier">
        /// Negatives are only stored while their running total is below the running total of stored
        /// positives multiplied by this value.
        /// </param>
        /// <param name="shouldProcessHostname">Optional filter; when it returns <c>false</c> the hostname is skipped.</param>
        /// <param name="shouldContinueIngestingNewHostnames">
        /// Optional stop condition evaluated after each processed hostname; when it returns
        /// <c>false</c> no further lines are read.
        /// </param>
        /// <param name="showConsoleStats">
        /// Optional progress callback, invoked for every processed hostname and once more with
        /// <c>lastRow: true</c> when ingestion is stopped early.
        /// </param>
        /// <param name="featuresConfig">Feature configuration; a default <see cref="FeaturesConfig"/> is created when <c>null</c>.</param>
        /// <returns>The finalized training data (also stored in <c>this.TrainingData</c>).</returns>
        public TrainingData Sample(
            string citiesPath,
            string alternateNamesPath,
            string admin1Path,
            string admin2Path,
            string countriesPath,
            string clliPath,
            string unlocodePath,
            string groundTruthPath,
            int trueNegativesMultiplier,
            ShouldProcessHostname shouldProcessHostname = null,
            ShouldContinueIngestingNewHostnames shouldContinueIngestingNewHostnames = null,
            ShowConsoleStats showConsoleStats = null,
            FeaturesConfig featuresConfig     = null)
        {
            // The ground truth file is machine-readable, so coordinates must be parsed with the
            // invariant culture; otherwise a locale with a decimal comma silently misreads them.
            // The styles are the ones double.TryParse(string, out double) applies by default.
            const System.Globalization.NumberStyles CoordinateStyles =
                System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;
            var invariantCulture = System.Globalization.CultureInfo.InvariantCulture;

            if (featuresConfig == null)
            {
                featuresConfig = new FeaturesConfig();
            }

            var aggregator = new CityFeaturesAggregator(citiesPath, alternateNamesPath, admin1Path, admin2Path, countriesPath, clliPath, unlocodePath, featuresConfig: featuresConfig);

            this.Aggregator = aggregator;

            var trainingData = new TrainingData(tableName: "ReverseDNSGeolocation Training", featuresAggregator: aggregator);

            this.TrainingData = trainingData;

            string line;
            var    counter = 0;

            var storedTruePositivesCount = 0;
            var storedTrueNegativesCount = 0;

            var positivesFeaturesDistribution = new Dictionary<CityFeatureType, int>();

            var rand = new Random();

            using (var file = new StreamReader(groundTruthPath))
            {
                while ((line = file.ReadLine()) != null)
                {
                    // Skip blank lines and comment lines. A non-whitespace line has at least one
                    // character, so indexing position 0 is safe here.
                    if (string.IsNullOrWhiteSpace(line) || line[0] == '#')
                    {
                        continue;
                    }

                    var parts = line.Split('\t');

                    if (parts.Length != 13)
                    {
                        continue;
                    }

                    /*
                     * 0  RawIP   string
                     * 1  NumericIP   uint
                     * 2  Bucket  uint?
                     * 3  ReverseDNSHostname  string
                     * 4  RealtimeLatitude    double?
                     * 5  RealtimeLongitude   double?
                     * 6  RealtimeCountryISO  string
                     * 7  RealtimeCountryConfidence   byte
                     * 8  RealtimeState   string
                     * 9  RealtimeStateConfidence byte
                     * 10 RealtimeCity    string
                     * 11 RealtimeCityConfidence  byte
                     * 12 RealtimeAccuracyKm  double?
                     */

                    var hostname = parts[3];

                    var trueLatitudeStr  = parts[4];
                    var trueLongitudeStr = parts[5];

                    if (string.IsNullOrWhiteSpace(hostname) ||
                        string.IsNullOrWhiteSpace(trueLatitudeStr) ||
                        string.IsNullOrWhiteSpace(trueLongitudeStr))
                    {
                        continue;
                    }

                    double trueLatitude;
                    double trueLongitude;

                    if (!double.TryParse(trueLatitudeStr, CoordinateStyles, invariantCulture, out trueLatitude) ||
                        !double.TryParse(trueLongitudeStr, CoordinateStyles, invariantCulture, out trueLongitude))
                    {
                        continue;
                    }

                    var parsedHostname = HostnameSplitter.Split(hostname);

                    if (shouldProcessHostname != null && !shouldProcessHostname(hostname, parsedHostname, storedTruePositivesCount, storedTrueNegativesCount))
                    {
                        continue;
                    }

                    counter++;

                    if (showConsoleStats != null)
                    {
                        showConsoleStats(hostname, parsedHostname, storedTruePositivesCount, storedTrueNegativesCount);
                    }

                    var candidatesAndFeatures = aggregator.GenerateCandidatesForHostname(parsedHostname);

                    var truesPositives = new List<DataRow>();
                    var trueNegatives  = new List<DataRow>();

                    foreach (var candidateEntry in candidatesAndFeatures)
                    {
                        var locationCandidate = candidateEntry.Key;
                        var locationFeatures  = candidateEntry.Value;

                        // The distance unit must match the unit of the threshold it is compared
                        // against (kilometers); measuring in miles here widened the accepted radius.
                        var distance = DistanceHelper.Distance(trueLatitude, trueLongitude, locationCandidate.Latitude, locationCandidate.Longitude, DistanceUnit.Kilometer);

                        if (distance <= featuresConfig.TruePositiveMaximumDistanceKilometers)
                        {
                            if (this.ShouldAddPositiveExample(positivesFeaturesDistribution, locationFeatures))
                            {
                                var newRow = trainingData.CreateTrainingRow(locationFeatures, isValidLocation: true);
                                truesPositives.Add(newRow);

                                this.AddToFeaturesDistribution(positivesFeaturesDistribution, locationFeatures);
                            }
                        }
                        else
                        {
                            var newRow = trainingData.CreateTrainingRow(locationFeatures, isValidLocation: false);
                            trueNegatives.Add(newRow);
                        }
                    }

                    // WARNING: Do not move this above the true negatives selection, as it will bias the data (even more)
                    // The negatives quota computed below reads storedTruePositivesCount, so the
                    // positives of this hostname have to be counted first.
                    if (truesPositives.Count > 0)
                    {
                        truesPositives.ForEach(tp => trainingData.AddTrainingRow(tp));
                        storedTruePositivesCount += truesPositives.Count;
                    }

                    var neededTrueNegativeItemsCount = storedTruePositivesCount * trueNegativesMultiplier;

                    if (trueNegatives.Count > 0 && storedTrueNegativesCount < neededTrueNegativeItemsCount)
                    {
                        // NOTE(review): the cap uses the total quota rather than the remaining quota
                        // (quota minus already stored negatives) - confirm this is intended.
                        var neededItemsCount = Math.Min(trueNegatives.Count, neededTrueNegativeItemsCount);

                        if (truesPositives.Count > 0)
                        {
                            // Never take more negatives than positives found for the same hostname.
                            neededItemsCount = Math.Min(truesPositives.Count, neededItemsCount);
                        }

                        // Shuffle by a random sort key, then take the first N negatives.
                        var extractedRandTrueNegativeItems = trueNegatives.OrderBy(x => rand.Next()).Take(neededItemsCount);

                        foreach (var trueNegativeItem in extractedRandTrueNegativeItems)
                        {
                            trainingData.AddTrainingRow(trueNegativeItem);
                            storedTrueNegativesCount++;
                        }
                    }

                    if (counter % 1000 == 0)
                    {
                        Console.WriteLine("------------------------------------");

                        foreach (var entry in positivesFeaturesDistribution)
                        {
                            Console.WriteLine($"Positive: {entry.Key}\t{entry.Value}");
                        }

                        Console.WriteLine("------------------------------------");
                    }

                    if (shouldContinueIngestingNewHostnames != null && !shouldContinueIngestingNewHostnames(hostname, parsedHostname, storedTruePositivesCount, storedTrueNegativesCount))
                    {
                        // showConsoleStats is optional, so it must be null-checked here as well.
                        if (showConsoleStats != null)
                        {
                            showConsoleStats(hostname, parsedHostname, storedTruePositivesCount, storedTrueNegativesCount, lastRow: true);
                        }

                        break;
                    }
                }
            }

            trainingData.FinalizeData();

            return trainingData;
        }
Esempio n. 3
0
        /// <summary>
        /// Lazily reads the ground truth file at <c>this.GroundTruthPath</c> and yields one
        /// <see cref="TrainingDataSample"/> per location candidate generated for each accepted hostname.
        /// </summary>
        /// <remarks>
        /// Blank lines, lines starting with <c>#</c>, lines that do not have exactly 13 tab-separated
        /// fields, and lines with a missing hostname or unparsable coordinates are skipped.
        /// Because this is an iterator, the counters on this instance only advance as the caller
        /// enumerates the result. Reading stops once <c>ProcessingAttemptedCounter</c> exceeds
        /// <c>maxSamples</c>.
        /// </remarks>
        /// <returns>
        /// For every accepted hostname, first all candidates within
        /// <c>FeaturesConfig.TruePositiveMaximumDistanceKilometers</c> of the true coordinates
        /// (positive samples), then all remaining candidates (negative samples).
        /// </returns>
        public virtual IEnumerable<TrainingDataSample> Sample()
        {
            // The ground truth file is machine-readable, so coordinates must be parsed with the
            // invariant culture; otherwise a locale with a decimal comma silently misreads them.
            // The styles are the ones double.TryParse(string, out double) applies by default.
            const System.Globalization.NumberStyles CoordinateStyles =
                System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;
            var invariantCulture = System.Globalization.CultureInfo.InvariantCulture;

            string line;

            using (var file = new StreamReader(this.GroundTruthPath))
            {
                while ((line = file.ReadLine()) != null)
                {
                    // Skip blank lines and comment lines. A non-whitespace line has at least one
                    // character, so indexing position 0 is safe here.
                    if (string.IsNullOrWhiteSpace(line) || line[0] == '#')
                    {
                        continue;
                    }

                    var parts = line.Split('\t');

                    if (parts.Length != 13)
                    {
                        continue;
                    }

                    /*
                     * 0  RawIP   string
                     * 1  NumericIP   uint
                     * 2  Bucket  uint?
                     * 3  ReverseDNSHostname  string
                     * 4  RealtimeLatitude    double?
                     * 5  RealtimeLongitude   double?
                     * 6  RealtimeCountryISO  string
                     * 7  RealtimeCountryConfidence   byte
                     * 8  RealtimeState   string
                     * 9  RealtimeStateConfidence byte
                     * 10 RealtimeCity    string
                     * 11 RealtimeCityConfidence  byte
                     * 12 RealtimeAccuracyKm  double?
                     */

                    var hostname = parts[3];

                    var trueLatitudeStr  = parts[4];
                    var trueLongitudeStr = parts[5];

                    if (string.IsNullOrWhiteSpace(hostname) ||
                        string.IsNullOrWhiteSpace(trueLatitudeStr) ||
                        string.IsNullOrWhiteSpace(trueLongitudeStr))
                    {
                        continue;
                    }

                    double trueLatitude;
                    double trueLongitude;

                    if (!double.TryParse(trueLatitudeStr, CoordinateStyles, invariantCulture, out trueLatitude) ||
                        !double.TryParse(trueLongitudeStr, CoordinateStyles, invariantCulture, out trueLongitude))
                    {
                        continue;
                    }

                    this.ValidLinesCounter++;

                    // Periodic progress report.
                    if (this.ValidLinesCounter % 100000 == 0)
                    {
                        Console.WriteLine($"ValidLinesCounter = {this.ValidLinesCounter}, ProcessingAttemptedCounter = {this.ProcessingAttemptedCounter}, SelectedTruePositivesCount = {this.SelectedTruePositivesCount}, SelectedTrueNegativesCount = {this.SelectedTrueNegativesCount}");
                    }

                    var parsedHostname = HostnameSplitter.Split(hostname);

                    if (!this.ShouldAcceptDomain(hostname, parsedHostname))
                    {
                        continue;
                    }

                    this.ProcessingAttemptedCounter++;

                    if (this.ProcessingAttemptedCounter > this.maxSamples)
                    {
                        break;
                    }

                    var candidatesAndFeatures = this.Aggregator.GenerateCandidatesForHostname(parsedHostname);

                    var truesPositives = new List<TrainingDataSample>();
                    var trueNegatives  = new List<TrainingDataSample>();

                    foreach (var candidateEntry in candidatesAndFeatures)
                    {
                        var locationCandidate = candidateEntry.Key;
                        var locationFeatures  = candidateEntry.Value;

                        // The distance unit must match the unit of the threshold it is compared against.
                        var distance = DistanceHelper.Distance(trueLatitude, trueLongitude, locationCandidate.Latitude, locationCandidate.Longitude, DistanceUnit.Kilometer);

                        if (distance <= this.FeaturesConfig.TruePositiveMaximumDistanceKilometers)
                        {
                            var positiveSampleFeaturesSignature = this.GenerateFeaturesBoolSignature(locationFeatures, isPositiveSample: true);

                            truesPositives.Add(new TrainingDataSample()
                            {
                                Hostname          = hostname,
                                City              = locationCandidate,
                                FeaturesSignature = positiveSampleFeaturesSignature,
                                Features          = locationFeatures,
                                IsPositiveExample = true
                            });
                        }
                        else
                        {
                            var negativeSampleFeaturesSignature = this.GenerateFeaturesBoolSignature(locationFeatures, isPositiveSample: false);

                            trueNegatives.Add(new TrainingDataSample()
                            {
                                Hostname          = hostname,
                                City              = locationCandidate,
                                FeaturesSignature = negativeSampleFeaturesSignature,
                                Features          = locationFeatures,
                                IsPositiveExample = false
                            });
                        }
                    }

                    // WARNING: Do not move this above the true negatives selection, as it will bias the data (even more)
                    // NOTE(review): no negatives selection happens inside this method; the warning
                    // presumably refers to selection done by the consumer - confirm.
                    if (truesPositives.Count > 0)
                    {
                        foreach (var truePositive in truesPositives)
                        {
                            yield return truePositive;
                        }

                        // Updated only after every positive of this hostname has been consumed.
                        this.SelectedTruePositivesCount += truesPositives.Count;
                    }

                    foreach (var trueNegativeItem in trueNegatives)
                    {
                        yield return trueNegativeItem;

                        // Bookkeeping runs when the caller asks for the next item.
                        this.SelectedTrueNegativesCount++;
                        this.IncrementFeatureDistribution(this.NegativeSamplesFeatureDistribution, trueNegativeItem.FeaturesSignature);
                    }
                }
            }
        }