/// <summary>
        /// Registers <paramref name="entity"/> under every variation produced by
        /// <c>GenerateVariationsForNames</c> for its alternate names, storing one
        /// feature set per variation.
        /// </summary>
        /// <param name="entity">City to index. It is dereferenced without a null check.</param>
        public override void IngestCityEntity(GeonamesCityEntity entity)
        {
            foreach (var abbreviation in this.GenerateVariationsForNames(entity.AlternateNames))
            {
                var abbreviationFeatures = this.InitializeDefaultFeatureValues();

                abbreviationFeatures[CityFeatureType.AlternateCityAbbreviationMatch] = true;

                // Population is only written when positive; otherwise whatever
                // InitializeDefaultFeatureValues produced is left untouched.
                if (entity.Population > 0)
                {
                    abbreviationFeatures[CityFeatureType.AlternateCityAbbreviationPopulation] = (uint?)entity.Population;
                }

                abbreviationFeatures[CityFeatureType.AlternateCityAbbreviationLetters] = (byte?)abbreviation.Length;

                // Look up the bucket for this variation, creating it the first time the variation is seen.
                EntitiesToFeatures bucket;
                var bucketExists = variationsToEntitiesToFeatures.TryGetValue(abbreviation, out bucket);

                if (!bucketExists)
                {
                    bucket = new EntitiesToFeatures();
                    variationsToEntitiesToFeatures[abbreviation] = bucket;
                }

                // An entry already stored for this entity under the same variation is replaced.
                bucket[entity] = abbreviationFeatures;
            }
        }
        /// <summary>
        /// Sets <c>CityFeatureType.TLDMatch</c> when the last label of the hostname's TLD equals the
        /// TLD of the city's country, either directly or through the <c>DomainsToCountryTlds</c>
        /// lookup keyed by the hostname's registrable domain.
        /// </summary>
        /// <param name="parsedHostname">Parsed hostname; ignored when null or when its TLD is blank.</param>
        /// <param name="cityEntity">City whose country TLD is compared; ignored when the city, its country, or the country TLD is missing.</param>
        /// <param name="features">Feature set to write into; nothing happens when null.</param>
        public override void AppendFeatures(HostnameSplitterResult parsedHostname, GeonamesCityEntity cityEntity, Features features)
        {
            if (features == null || string.IsNullOrWhiteSpace(parsedHostname?.TLD))
            {
                return;
            }

            if (string.IsNullOrWhiteSpace(cityEntity?.CountryEntity?.TLD))
            {
                return;
            }

            var expectedTld = cityEntity.CountryEntity.TLD.ToLowerInvariant();

            // Only the right-most label of the hostname TLD takes part in the comparison.
            var tldLabels  = parsedHostname.TLD.Split('.');
            var finalLabel = tldLabels[tldLabels.Length - 1];

            if (string.IsNullOrWhiteSpace(finalLabel))
            {
                return;
            }

            // The country TLD is compared against the label with a leading dot prepended.
            var dottedHostnameTld = "." + finalLabel.ToLowerInvariant();

            var isMatch = expectedTld == dottedHostnameTld;

            if (!isMatch)
            {
                // Fall back to the per-domain mapping when the TLD itself does not match.
                var registrableDomain = parsedHostname.DomainInfo?.RegistrableDomain;
                string mappedTld;

                isMatch = registrableDomain != null &&
                          DomainsToCountryTlds.TryGetValue(registrableDomain, out mappedTld) &&
                          expectedTld == mappedTld;
            }

            if (isMatch)
            {
                features[CityFeatureType.TLDMatch] = true;
            }
        }
Example #3
0
        /// <summary>
        /// Registers <paramref name="entity"/> under every variation generated from each of its
        /// alternate names. When the same entity is reached again through the same variation and
        /// alternate-name categories are enabled, the category flags are OR-ed into the stored
        /// feature set instead of replacing it.
        /// </summary>
        /// <param name="entity">City to index; a null entity or a null alternate-name list is ignored.</param>
        public override void IngestCityEntity(GeonamesCityEntity entity)
        {
            if (entity?.AlternateNames == null)
            {
                return;
            }

            foreach (var alternateName in entity.AlternateNames)
            {
                foreach (var variation in this.GenerateVariationsForName(alternateName))
                {
                    var candidate = this.InitializeDefaultFeatureValues();

                    candidate[CityFeatureType.AlternateCityNameMatch] = true;

                    // Population is only written when positive.
                    if (entity.Population > 0)
                    {
                        candidate[CityFeatureType.AlternateCityNamePopulation] = (uint?)entity.Population;
                    }

                    candidate[CityFeatureType.AlternateCityNameLetters] = (byte?)variation.Length;

                    if (this.FeaturesConfig.UseAlternateNamesCount)
                    {
                        candidate[CityFeatureType.AlternateCityNameAlternateNamesCount] = (uint?)(entity.AlternateNames?.Count ?? 0);
                    }

                    var useCategories = this.FeaturesConfig.UseAlternateNameCategories;

                    if (useCategories)
                    {
                        candidate[CityFeatureType.AlternateCityNameIsPreferredName] = alternateName.IsPreferredName;
                        candidate[CityFeatureType.AlternateCityNameIsShortName]     = alternateName.IsShortName;
                        candidate[CityFeatureType.AlternateCityNameIsColloquial]    = alternateName.IsColloquial;
                        candidate[CityFeatureType.AlternateCityNameIsHistoric]      = alternateName.IsHistoric;
                    }

                    // Look up the bucket for this variation, creating it on first sight.
                    EntitiesToFeatures bucket;
                    var bucketExists = variationsToEntitiesToFeatures.TryGetValue(variation, out bucket);

                    if (!bucketExists)
                    {
                        bucket = new EntitiesToFeatures();
                        variationsToEntitiesToFeatures[variation] = bucket;
                    }

                    Features stored;
                    var alreadyStored = bucket.TryGetValue(entity, out stored);

                    if (!alreadyStored || !useCategories)
                    {
                        // First sighting, or categories disabled: the fresh feature set wins.
                        bucket[entity] = candidate;
                        continue;
                    }

                    // Same entity, same variation, different alternate name: a category flag stays
                    // set once any contributing alternate name has set it.
                    stored[CityFeatureType.AlternateCityNameIsPreferredName] = alternateName.IsPreferredName || (bool)stored[CityFeatureType.AlternateCityNameIsPreferredName];
                    stored[CityFeatureType.AlternateCityNameIsShortName]     = alternateName.IsShortName || (bool)stored[CityFeatureType.AlternateCityNameIsShortName];
                    stored[CityFeatureType.AlternateCityNameIsColloquial]    = alternateName.IsColloquial || (bool)stored[CityFeatureType.AlternateCityNameIsColloquial];
                    stored[CityFeatureType.AlternateCityNameIsHistoric]      = alternateName.IsHistoric || (bool)stored[CityFeatureType.AlternateCityNameIsHistoric];
                }
            }
        }
Example #4
0
        /// <summary>
        /// Registers <paramref name="entity"/> under the full name of every compound variation
        /// returned by <c>GenerateVariationsForCityAdmin1Name</c>, recording the lengths of the
        /// full name and of its first and second components.
        /// </summary>
        /// <param name="entity">City to index. It is dereferenced without a null check.</param>
        public override void IngestCityEntity(GeonamesCityEntity entity)
        {
            var compounds = this.GenerateVariationsForCityAdmin1Name(
                entity.Name,
                entity.AsciiName,
                entity.AlternateNames,
                entity.Admin1Code,
                entity.Admin1Entity);

            foreach (var compound in compounds)
            {
                var compoundFeatures = this.InitializeDefaultFeatureValues();

                compoundFeatures[CityFeatureType.CityAdmin1NameMatch] = true;

                // Population is only written when positive.
                if (entity.Population > 0)
                {
                    compoundFeatures[CityFeatureType.CityAdmin1NamePopulation] = (uint?)entity.Population;
                }

                var fullName = compound.FullName;

                compoundFeatures[CityFeatureType.CityAdmin1LettersBoth]   = (byte?)fullName.Length;
                compoundFeatures[CityFeatureType.CityAdmin1LettersCity]   = (byte?)compound.FirstComponent.Length;
                compoundFeatures[CityFeatureType.CityAdmin1LettersAdmin1] = (byte?)compound.SecondComponent.Length;

                // The index is keyed by the full compound name.
                EntitiesToFeatures bucket;
                var bucketExists = variationsToEntitiesToFeatures.TryGetValue(fullName, out bucket);

                if (!bucketExists)
                {
                    bucket = new EntitiesToFeatures();
                    variationsToEntitiesToFeatures[fullName] = bucket;
                }

                bucket[entity] = compoundFeatures;
            }
        }
        /// <summary>
        /// Convenience overload: splits <paramref name="hostname"/> with
        /// <c>HostnameSplitter.Split</c>, optionally seeds <paramref name="features"/> with default
        /// values, and then delegates to the overload that takes a parsed hostname.
        /// </summary>
        /// <param name="hostname">Raw hostname to split.</param>
        /// <param name="cityEntity">City the features are computed against.</param>
        /// <param name="features">Feature set that receives the results.</param>
        /// <exception cref="ArgumentNullException">
        /// Thrown when <paramref name="hostname"/>, <paramref name="cityEntity"/> or
        /// <paramref name="features"/> is null.
        /// </exception>
        public virtual void AppendFeatures(string hostname, GeonamesCityEntity cityEntity, Features features)
        {
            // nameof keeps the reported parameter names in sync with the signature under renames.
            if (hostname == null)
            {
                throw new ArgumentNullException(nameof(hostname));
            }

            if (cityEntity == null)
            {
                throw new ArgumentNullException(nameof(cityEntity));
            }

            if (features == null)
            {
                throw new ArgumentNullException(nameof(features));
            }

            var parsedHostname = HostnameSplitter.Split(hostname);

            // Defaults are only written when the configuration asks for them.
            if (this.FeaturesConfig.InitializeDefaultFeatures)
            {
                this.InitializeDefaultFeatureValues(features);
            }

            this.AppendFeatures(parsedHostname, cityEntity, features);
        }
        /// <summary>
        /// Flags <c>ExactCountryNameMatch</c> for every subdomain part whose text is contained in the
        /// lower-cased name variations of the city's country, and tracks the longest matching length
        /// in <c>ExactCountryLetters</c>.
        /// </summary>
        /// <param name="parsedHostname">Parsed hostname; ignored when null or without subdomain parts.</param>
        /// <param name="cityEntity">City whose country is matched; ignored when the city, its country, or the country name is missing.</param>
        /// <param name="features">Feature set to write into; nothing happens when null.</param>
        public override void AppendFeatures(HostnameSplitterResult parsedHostname, GeonamesCityEntity cityEntity, Features features)
        {
            if (parsedHostname?.SubdomainParts == null || features == null)
            {
                return;
            }

            if (string.IsNullOrWhiteSpace(cityEntity?.CountryEntity?.Name))
            {
                return;
            }

            var knownCountryNames = cityEntity.CountryEntity.NameVariationsLower;

            foreach (var part in parsedHostname.SubdomainParts)
            {
                var label = part.Substring;

                if (!knownCountryNames.Contains(label))
                {
                    continue;
                }

                features[CityFeatureType.ExactCountryNameMatch] = true;

                // Keep the largest matched length seen so far.
                var hasRecordedLength = features.ContainsKey(CityFeatureType.ExactCountryLetters) &&
                                        features[CityFeatureType.ExactCountryLetters] != null;

                if (!hasRecordedLength || ((byte)features[CityFeatureType.ExactCountryLetters]) < label.Length)
                {
                    features[CityFeatureType.ExactCountryLetters] = Convert.ToByte(label.Length);
                }

                // NOTE(review): slot indices are overwritten on every match, so they describe the
                // last matching part even when an earlier part supplied the longest length - confirm
                // this is intended.
                if (this.FeaturesConfig.UseSlotIndex)
                {
                    features[CityFeatureType.ExactCountryRTLSlotIndex] = part.RTLSlotIndex;
                    features[CityFeatureType.ExactCountryLTRSlotIndex] = part.LTRSlotIndex;
                }
            }
        }
        /// <summary>
        /// Registers <paramref name="entity"/> under each entry of its <c>UNLOCODECodes</c>
        /// collection in <c>codesToEntitiesToFeatures</c>.
        /// </summary>
        /// <param name="entity">City to index; nothing is stored when it has no codes. It is dereferenced without a null check.</param>
        public override void IngestCityEntity(GeonamesCityEntity entity)
        {
            var unlocodes = entity.UNLOCODECodes;
            var hasCodes  = unlocodes != null && unlocodes.Count != 0;

            if (!hasCodes)
            {
                return;
            }

            foreach (var unlocode in unlocodes)
            {
                var codeFeatures = this.InitializeDefaultFeatureValues();

                codeFeatures[CityFeatureType.ExactUNLOCODECodeMatch] = true;

                // Population is only written when positive.
                if (entity.Population > 0)
                {
                    codeFeatures[CityFeatureType.ExactUNLOCODECodePopulation] = (uint?)entity.Population;
                }

                // Look up the bucket for this code, creating it on first sight.
                EntitiesToFeatures bucket;
                var bucketExists = codesToEntitiesToFeatures.TryGetValue(unlocode, out bucket);

                if (!bucketExists)
                {
                    bucket = new EntitiesToFeatures();
                    codesToEntitiesToFeatures[unlocode] = bucket;
                }

                bucket[entity] = codeFeatures;
            }
        }
        /// <summary>
        /// Flags <c>FirstLettersAdmin1NameMatch</c> for every subdomain part whose text is among the
        /// variations generated from the city's admin1 name, and records the longest matching length
        /// together with its ratio to the full admin1 name length.
        /// </summary>
        /// <param name="parsedHostname">Parsed hostname; ignored when null or without subdomain parts.</param>
        /// <param name="cityEntity">City whose admin1 entity is matched; ignored when the city, its admin1 entity, or the admin1 name is missing.</param>
        /// <param name="features">Feature set to write into; nothing happens when null.</param>
        public override void AppendFeatures(HostnameSplitterResult parsedHostname, GeonamesCityEntity cityEntity, Features features)
        {
            if (parsedHostname?.SubdomainParts == null || features == null)
            {
                return;
            }

            var admin1Name = cityEntity?.Admin1Entity?.Name;

            if (string.IsNullOrWhiteSpace(admin1Name))
            {
                return;
            }

            var admin1Variations = this.GenerateVariationsForName(admin1Name);

            foreach (var part in parsedHostname.SubdomainParts)
            {
                var label = part.Substring;

                if (!admin1Variations.Contains(label))
                {
                    continue;
                }

                features[CityFeatureType.FirstLettersAdmin1NameMatch] = true;

                // Length and ratio are updated together, and only when this match is longer
                // than the one recorded so far.
                var hasRecordedLength = features.ContainsKey(CityFeatureType.FirstLettersAdmin1Letters) &&
                                        features[CityFeatureType.FirstLettersAdmin1Letters] != null;

                if (!hasRecordedLength || ((byte)features[CityFeatureType.FirstLettersAdmin1Letters]) < label.Length)
                {
                    features[CityFeatureType.FirstLettersAdmin1Letters]      = Convert.ToByte(label.Length);
                    features[CityFeatureType.FirstLettersAdmin1LettersRatio] = (float?)(label.Length / (1.0f * admin1Name.Length));
                }

                // Slot indices are written for every matching part, not only the longest one.
                if (this.FeaturesConfig.UseSlotIndex)
                {
                    features[CityFeatureType.FirstLettersAdmin1RTLSlotIndex] = part.RTLSlotIndex;
                    features[CityFeatureType.FirstLettersAdmin1LTRSlotIndex] = part.LTRSlotIndex;
                }
            }
        }
Example #9
0
        /// <summary>
        /// Registers <paramref name="entity"/> under the lower-cased alternate name of each of its
        /// airport code entries, skipping entries whose alternate name is blank.
        /// </summary>
        /// <param name="entity">City to index; nothing is stored when <c>AirportCodes</c> is null. It is dereferenced without a null check.</param>
        public override void IngestCityEntity(GeonamesCityEntity entity)
        {
            if (entity.AirportCodes == null)
            {
                return;
            }

            foreach (var airport in entity.AirportCodes)
            {
                var rawCode = airport.AlternateName;

                if (string.IsNullOrWhiteSpace(rawCode))
                {
                    continue;
                }

                // The index key is the lower-cased code.
                var normalizedCode = rawCode.ToLowerInvariant();

                var airportFeatures = this.InitializeDefaultFeatureValues();

                airportFeatures[CityFeatureType.AirportCodeMatch] = true;

                // Population is only written when positive.
                if (entity.Population > 0)
                {
                    airportFeatures[CityFeatureType.AirportCodeCityPopulation] = (uint?)entity.Population;
                }

                airportFeatures[CityFeatureType.AirportCodeLetters] = (byte?)normalizedCode.Length;

                // Look up the bucket for this code, creating it on first sight.
                EntitiesToFeatures bucket;
                var bucketExists = variationsToEntitiesToFeatures.TryGetValue(normalizedCode, out bucket);

                if (!bucketExists)
                {
                    bucket = new EntitiesToFeatures();
                    variationsToEntitiesToFeatures[normalizedCode] = bucket;
                }

                bucket[entity] = airportFeatures;
            }
        }
Example #10
0
        /// <summary>
        /// Sets <c>HostnamePatternMatch</c> when any rule generated from the hostname's subdomain
        /// parts is known for the hostname's registrable domain and the coordinates stored for that
        /// rule lie within 100 km of <paramref name="cityEntity"/>.
        /// </summary>
        /// <param name="parsedHostname">Parsed hostname; ignored when null, without a registrable domain, or without subdomain parts.</param>
        /// <param name="cityEntity">City whose latitude/longitude are compared; ignored when null.</param>
        /// <param name="features">Feature set to write into; nothing happens when null.</param>
        public override void AppendFeatures(HostnameSplitterResult parsedHostname, GeonamesCityEntity cityEntity, Features features)
        {
            // Largest distance, in kilometers, between the city and a rule's coordinates that
            // still counts as a match.
            // TODO: Make this configurable
            // TODO: Distance should vary depending on geohash length?
            const int maxMatchDistanceKilometers = 100;

            // features is guarded here like the other inputs, so a missing feature set is
            // ignored instead of failing at the point where a match is written.
            if (parsedHostname == null ||
                parsedHostname.DomainInfo?.RegistrableDomain == null ||
                parsedHostname.SubdomainParts == null ||
                parsedHostname.SubdomainParts.Count == 0 ||
                cityEntity == null ||
                features == null)
            {
                return;
            }

            var domain = parsedHostname.DomainInfo.RegistrableDomain;

            Dictionary <PatternRule, PatternMiningCoordinates> rulesToCoordinates;

            if (!this.hostnamePatternRules.TryGetValue(domain, out rulesToCoordinates))
            {
                return;
            }

            // SubdomainParts was already verified as non-null and non-empty above.
            var ruleAtoms = this.miner.CreateRuleAtoms(parsedHostname.SubdomainParts);

            if (ruleAtoms == null || ruleAtoms.Count == 0)
            {
                return;
            }

            var rules = this.miner.GeneratePossibleRules(ruleAtoms);

            if (rules == null || rules.Count == 0)
            {
                return;
            }

            foreach (var rule in rules)
            {
                PatternMiningCoordinates coordinates;

                if (!rulesToCoordinates.TryGetValue(rule, out coordinates))
                {
                    continue;
                }

                var distance = DistanceHelper.Distance(cityEntity.Latitude, cityEntity.Longitude, coordinates.Latitude, coordinates.Longitude, DistanceUnit.Kilometer);

                if (distance <= maxMatchDistanceKilometers)
                {
                    features[CityFeatureType.HostnamePatternMatch] = true;

                    // The flag is only ever set to true, so the remaining rules cannot change the outcome.
                    break;
                }
            }
        }
        /// <summary>
        /// When <c>FeaturesConfig.UseDomainAsFeature</c> is enabled, stores the hash code of the
        /// hostname's registrable domain under <c>CityFeatureType.Domain</c>.
        /// </summary>
        /// <param name="parsedHostname">Parsed hostname; ignored when it has no registrable domain.</param>
        /// <param name="cityEntity">Not used by this implementation.</param>
        /// <param name="features">Feature set to write into; nothing happens when null.</param>
        public override void AppendFeatures(HostnameSplitterResult parsedHostname, GeonamesCityEntity cityEntity, Features features)
        {
            if (!this.FeaturesConfig.UseDomainAsFeature || features == null)
            {
                return;
            }

            var registrableDomain = parsedHostname?.DomainInfo?.RegistrableDomain;

            if (registrableDomain == null)
            {
                return;
            }

            // NOTE(review): if RegistrableDomain is a System.String, GetHashCode() is not guaranteed
            // to be stable across processes, platforms or runtime versions - confirm the value is
            // never persisted or compared between runs.
            features[CityFeatureType.Domain] = registrableDomain.GetHashCode();
        }
        /// <summary>
        /// Registers <paramref name="entity"/> under every variation generated from its name and
        /// ASCII name: the simple variations always, plus the complex variations when
        /// <c>FeaturesConfig.UseComplexNoVowelsFeature</c> is enabled.
        /// </summary>
        /// <param name="entity">City to index. It is dereferenced without a null check.</param>
        public override void IngestCityEntity(GeonamesCityEntity entity)
        {
            var nameVariations      = this.GenerateSimpleVariationsForName(entity.Name);
            var asciiNameVariations = this.GenerateSimpleVariationsForName(entity.AsciiName);

            nameVariations.UnionWith(asciiNameVariations);

            if (this.FeaturesConfig.UseComplexNoVowelsFeature)
            {
                var complexNameVariations = this.GenerateComplexVariationsForName(entity.Name, minLetters: MinLetters);
                nameVariations.UnionWith(complexNameVariations);

                var complexAsciiNameVariations = this.GenerateComplexVariationsForName(entity.AsciiName, minLetters: MinLetters);
                nameVariations.UnionWith(complexAsciiNameVariations);
            }

            // Denominator of the letters ratio. Variations can also originate from AsciiName, so
            // Name itself may be null or empty here; 0 then means "no ratio can be computed".
            var cityNameLength = entity.Name?.Length ?? 0;

            foreach (var nameVariation in nameVariations)
            {
                var features = this.InitializeDefaultFeatureValues();

                features[CityFeatureType.NoVowelsCityNameMatch] = true;

                // Population is only written when positive.
                if (entity.Population > 0)
                {
                    features[CityFeatureType.NoVowelsCityNamePopulation] = (uint?)entity.Population;
                }

                features[CityFeatureType.NoVowelsCityNameLetters] = (byte?)nameVariation.Length;

                // Guard the denominator as well as the numerator: a missing Name would otherwise
                // throw, and an empty one would store Infinity as the ratio.
                if (nameVariation.Length > 0 && cityNameLength > 0)
                {
                    features[CityFeatureType.NoVowelsCityNameLettersRatio] = (float?)(nameVariation.Length / ((1.0f) * cityNameLength));
                }

                EntitiesToFeatures entitiesToFeatures;

                if (!variationsToEntitiesToFeatures.TryGetValue(nameVariation, out entitiesToFeatures))
                {
                    entitiesToFeatures = new EntitiesToFeatures();
                    variationsToEntitiesToFeatures[nameVariation] = entitiesToFeatures;
                }

                entitiesToFeatures[entity] = features;
            }
        }
Example #13
0
        /// <summary>
        /// Finds the city nearest to the given coordinates among the cities registered in
        /// <c>GeohashesToCities</c> for the 3-character geohash cell of the point and for the cells
        /// returned by <c>GeoHash.Neighbors</c>.
        /// </summary>
        /// <param name="latitude">Latitude of the query point.</param>
        /// <param name="longitude">Longitude of the query point.</param>
        /// <returns>
        /// The closest candidate whose distance (requested in kilometers) is at most 50, or
        /// <c>null</c> when no candidate qualifies.
        /// </returns>
        public GeonamesCityEntity FindClosestCityForCoordinates(double latitude, double longitude)
        {
            var centerCell = GeoHash.Encode(latitude, longitude, numberOfChars: 3); // 3 = ±78km

            // Search the cell containing the point together with its neighbors.
            var cellsToSearch = new HashSet <string>(GeoHash.Neighbors(centerCell));
            cellsToSearch.Add(centerCell);

            // Collect every city registered in any of those cells.
            var candidates = new List <GeonamesCityEntity>();

            foreach (var cell in cellsToSearch)
            {
                List <GeonamesCityEntity> citiesInCell;
                var cellHasCities = this.GeohashesToCities.TryGetValue(cell, out citiesInCell);

                if (cellHasCities)
                {
                    candidates.AddRange(citiesInCell);
                }
            }

            GeonamesCityEntity nearest = null;
            var nearestDistance = double.MaxValue;

            foreach (var candidate in candidates)
            {
                var distance = DistanceHelper.Distance(candidate.Latitude, candidate.Longitude, latitude, longitude, DistanceUnit.Kilometer);

                // A candidate must be within the 50 cut-off and strictly closer than the best so
                // far, so the first of two equally distant candidates is kept.
                var withinCutoff = distance <= 50;

                if (withinCutoff && distance < nearestDistance)
                {
                    nearest         = candidate;
                    nearestDistance = distance;
                }
            }

            return nearest;
        }
        /// <summary>
        /// Registers <paramref name="entity"/> under every variation generated from its name and
        /// its ASCII name, storing one feature set per variation.
        /// </summary>
        /// <param name="entity">City to index. It is dereferenced without a null check.</param>
        public override void IngestCityEntity(GeonamesCityEntity entity)
        {
            // Variations of the ASCII name are folded into the set built from the primary name.
            var allVariations = this.GenerateVariationsForName(entity.Name);
            allVariations.UnionWith(this.GenerateVariationsForName(entity.AsciiName));

            foreach (var variation in allVariations)
            {
                var variationFeatures = this.InitializeDefaultFeatureValues();

                variationFeatures[CityFeatureType.ExactCityNameMatch] = true;

                // Population is only written when positive.
                if (entity.Population > 0)
                {
                    variationFeatures[CityFeatureType.ExactCityNamePopulation] = (uint?)entity.Population;
                }

                variationFeatures[CityFeatureType.ExactCityNameLetters] = (byte?)variation.Length;

                if (this.FeaturesConfig.UseAlternateNamesCount)
                {
                    // A missing alternate-name list counts as zero.
                    variationFeatures[CityFeatureType.ExactCityNameAlternateNamesCount] = (uint?)(entity.AlternateNames?.Count ?? 0);
                }

                // Look up the bucket for this variation, creating it on first sight.
                EntitiesToFeatures bucket;
                var bucketExists = variationsToEntitiesToFeatures.TryGetValue(variation, out bucket);

                if (!bucketExists)
                {
                    bucket = new EntitiesToFeatures();
                    variationsToEntitiesToFeatures[variation] = bucket;
                }

                bucket[entity] = variationFeatures;
            }
        }
 /// <summary>
 /// Ingests a single city entity. The declaration is abstract, so the behavior is supplied
 /// entirely by the derived class.
 /// </summary>
 /// <param name="entity">The city entity handed to the implementation.</param>
 public abstract void IngestCityEntity(GeonamesCityEntity entity);
 /// <summary>
 /// Appends features for a parsed hostname and a city. The declaration is abstract, so the
 /// behavior is supplied entirely by the derived class.
 /// </summary>
 /// <param name="parsedHostname">The already-split hostname.</param>
 /// <param name="cityEntity">The city the hostname is evaluated against.</param>
 /// <param name="features">The feature collection passed to the implementation; presumably the place results are written - confirm against implementations.</param>
 public abstract void AppendFeatures(HostnameSplitterResult parsedHostname, GeonamesCityEntity cityEntity, Features features);
 /// <summary>
 /// Empty override: performs no work for the given entity.
 /// </summary>
 /// <param name="entity">Unused.</param>
 public override void IngestCityEntity(GeonamesCityEntity entity)
 {
     // No-op: the body is empty.
 }