/// <summary>
/// Indexes <paramref name="entity"/> under every variation produced by
/// <c>GenerateVariationsForNames</c> for its alternate names.
/// </summary>
/// <remarks>
/// Each variation gets its own feature set: the abbreviation-match flag, the population
/// (only when it is positive) and the length of the variation. An entry already stored for
/// the same variation and entity is overwritten.
/// </remarks>
/// <param name="entity">The city entity to index.</param>
public override void IngestCityEntity(GeonamesCityEntity entity)
{
    foreach (var abbreviation in this.GenerateVariationsForNames(entity.AlternateNames))
    {
        var abbreviationFeatures = this.InitializeDefaultFeatureValues();
        abbreviationFeatures[CityFeatureType.AlternateCityAbbreviationMatch] = true;

        if (entity.Population > 0)
        {
            abbreviationFeatures[CityFeatureType.AlternateCityAbbreviationPopulation] = (uint?)entity.Population;
        }

        abbreviationFeatures[CityFeatureType.AlternateCityAbbreviationLetters] = (byte?)abbreviation.Length;

        // Fetch the per-variation bucket, creating it the first time the variation is seen.
        EntitiesToFeatures bucket;
        if (!variationsToEntitiesToFeatures.TryGetValue(abbreviation, out bucket))
        {
            bucket = new EntitiesToFeatures();
            variationsToEntitiesToFeatures[abbreviation] = bucket;
        }

        bucket[entity] = abbreviationFeatures;
    }
}
/// <summary>
/// Sets <c>CityFeatureType.TLDMatch</c> when the hostname's top-level domain corresponds to the
/// TLD of the city's country.
/// </summary>
/// <remarks>
/// The match succeeds either directly (last label of the hostname TLD, lower-cased and prefixed
/// with a dot, equals the lower-cased country TLD) or indirectly, when the registrable domain
/// has an entry in <c>DomainsToCountryTlds</c> equal to the country TLD. The method returns
/// without touching <paramref name="features"/> when any required input is missing.
/// </remarks>
/// <param name="parsedHostname">The split hostname; its <c>TLD</c> must be non-blank.</param>
/// <param name="cityEntity">The candidate city; its country entity must carry a non-blank <c>TLD</c>.</param>
/// <param name="features">The feature set to update.</param>
public override void AppendFeatures(HostnameSplitterResult parsedHostname, GeonamesCityEntity cityEntity, Features features)
{
    if (parsedHostname == null || string.IsNullOrWhiteSpace(parsedHostname.TLD))
    {
        return;
    }

    if (cityEntity == null
        || cityEntity.CountryEntity == null
        || string.IsNullOrWhiteSpace(cityEntity.CountryEntity.TLD))
    {
        return;
    }

    if (features == null)
    {
        return;
    }

    var expectedTld = cityEntity.CountryEntity.TLD.ToLowerInvariant();

    // Only the right-most label of the hostname TLD takes part in the comparison.
    var tldLabels = parsedHostname.TLD.Split('.');
    var finalLabel = tldLabels[tldLabels.Length - 1];
    if (string.IsNullOrWhiteSpace(finalLabel))
    {
        return;
    }

    var dottedHostnameTld = "." + finalLabel.ToLowerInvariant();

    // The lookup by registrable domain is only attempted when the direct comparison fails.
    string mappedTld;
    var isMatch = expectedTld == dottedHostnameTld
        || (parsedHostname.DomainInfo?.RegistrableDomain != null
            && DomainsToCountryTlds.TryGetValue(parsedHostname.DomainInfo.RegistrableDomain, out mappedTld)
            && expectedTld == mappedTld);

    if (isMatch)
    {
        features[CityFeatureType.TLDMatch] = true;
    }
}
/// <summary>
/// Indexes <paramref name="entity"/> under every variation of each of its alternate names.
/// </summary>
/// <remarks>
/// Does nothing when the entity or its alternate-name collection is null. For each variation a
/// fresh feature set is built (match flag, positive population, variation length, and, depending
/// on <c>FeaturesConfig</c>, the alternate-name count and the four name-category flags).
/// If the entity is already stored under the variation and name categories are enabled, only the
/// four category flags of the stored feature set are OR-ed with the current alternate name's
/// flags; otherwise the fresh feature set is stored, replacing any previous one.
/// </remarks>
/// <param name="entity">The city entity to index.</param>
public override void IngestCityEntity(GeonamesCityEntity entity)
{
    if (entity?.AlternateNames == null)
    {
        return;
    }

    foreach (var alternateName in entity.AlternateNames)
    {
        foreach (var variation in this.GenerateVariationsForName(alternateName))
        {
            var freshFeatures = this.InitializeDefaultFeatureValues();
            freshFeatures[CityFeatureType.AlternateCityNameMatch] = true;

            if (entity.Population > 0)
            {
                freshFeatures[CityFeatureType.AlternateCityNamePopulation] = (uint?)entity.Population;
            }

            freshFeatures[CityFeatureType.AlternateCityNameLetters] = (byte?)variation.Length;

            if (this.FeaturesConfig.UseAlternateNamesCount)
            {
                freshFeatures[CityFeatureType.AlternateCityNameAlternateNamesCount] = (uint?)(entity.AlternateNames?.Count ?? 0);
            }

            if (this.FeaturesConfig.UseAlternateNameCategories)
            {
                freshFeatures[CityFeatureType.AlternateCityNameIsPreferredName] = alternateName.IsPreferredName;
                freshFeatures[CityFeatureType.AlternateCityNameIsShortName] = alternateName.IsShortName;
                freshFeatures[CityFeatureType.AlternateCityNameIsColloquial] = alternateName.IsColloquial;
                freshFeatures[CityFeatureType.AlternateCityNameIsHistoric] = alternateName.IsHistoric;
            }

            // Fetch the per-variation bucket, creating it the first time the variation is seen.
            EntitiesToFeatures bucket;
            if (!variationsToEntitiesToFeatures.TryGetValue(variation, out bucket))
            {
                bucket = new EntitiesToFeatures();
                variationsToEntitiesToFeatures[variation] = bucket;
            }

            Features storedFeatures;
            var mergeIntoStored = bucket.TryGetValue(entity, out storedFeatures)
                && this.FeaturesConfig.UseAlternateNameCategories;

            if (!mergeIntoStored)
            {
                bucket[entity] = freshFeatures;
                continue;
            }

            // Same entity reached through another alternate name: a category flag stays set
            // once any contributing alternate name has it.
            storedFeatures[CityFeatureType.AlternateCityNameIsPreferredName] =
                alternateName.IsPreferredName || (bool)storedFeatures[CityFeatureType.AlternateCityNameIsPreferredName];
            storedFeatures[CityFeatureType.AlternateCityNameIsShortName] =
                alternateName.IsShortName || (bool)storedFeatures[CityFeatureType.AlternateCityNameIsShortName];
            storedFeatures[CityFeatureType.AlternateCityNameIsColloquial] =
                alternateName.IsColloquial || (bool)storedFeatures[CityFeatureType.AlternateCityNameIsColloquial];
            storedFeatures[CityFeatureType.AlternateCityNameIsHistoric] =
                alternateName.IsHistoric || (bool)storedFeatures[CityFeatureType.AlternateCityNameIsHistoric];
        }
    }
}
/// <summary>
/// Indexes <paramref name="entity"/> under every compound variation returned by
/// <c>GenerateVariationsForCityAdmin1Name</c> for its names and admin1 data.
/// </summary>
/// <remarks>
/// The lookup key is the compound's <c>FullName</c>. The stored feature set records the match
/// flag, the population (only when positive) and the lengths of the full name, its first
/// component and its second component. An entry already stored for the same key and entity is
/// overwritten.
/// </remarks>
/// <param name="entity">The city entity to index.</param>
public override void IngestCityEntity(GeonamesCityEntity entity)
{
    var compounds = this.GenerateVariationsForCityAdmin1Name(
        entity.Name,
        entity.AsciiName,
        entity.AlternateNames,
        entity.Admin1Code,
        entity.Admin1Entity);

    foreach (var compound in compounds)
    {
        var compoundFeatures = this.InitializeDefaultFeatureValues();
        compoundFeatures[CityFeatureType.CityAdmin1NameMatch] = true;

        if (entity.Population > 0)
        {
            compoundFeatures[CityFeatureType.CityAdmin1NamePopulation] = (uint?)entity.Population;
        }

        compoundFeatures[CityFeatureType.CityAdmin1LettersBoth] = (byte?)compound.FullName.Length;
        compoundFeatures[CityFeatureType.CityAdmin1LettersCity] = (byte?)compound.FirstComponent.Length;
        compoundFeatures[CityFeatureType.CityAdmin1LettersAdmin1] = (byte?)compound.SecondComponent.Length;

        // Fetch the per-key bucket, creating it the first time the full name is seen.
        EntitiesToFeatures bucket;
        if (!variationsToEntitiesToFeatures.TryGetValue(compound.FullName, out bucket))
        {
            bucket = new EntitiesToFeatures();
            variationsToEntitiesToFeatures[compound.FullName] = bucket;
        }

        bucket[entity] = compoundFeatures;
    }
}
/// <summary>
/// Splits <paramref name="hostname"/> with <c>HostnameSplitter.Split</c> and forwards the result
/// to the overload that takes a parsed hostname.
/// </summary>
/// <remarks>
/// When <c>FeaturesConfig.InitializeDefaultFeatures</c> is set, <paramref name="features"/> is
/// first passed to <c>InitializeDefaultFeatureValues</c>.
/// </remarks>
/// <param name="hostname">The raw hostname.</param>
/// <param name="cityEntity">The candidate city.</param>
/// <param name="features">The feature set to update.</param>
/// <exception cref="ArgumentNullException">Thrown when any argument is null.</exception>
public virtual void AppendFeatures(string hostname, GeonamesCityEntity cityEntity, Features features)
{
    if (hostname == null)
    {
        throw new ArgumentNullException(nameof(hostname));
    }

    if (cityEntity == null)
    {
        throw new ArgumentNullException(nameof(cityEntity));
    }

    if (features == null)
    {
        throw new ArgumentNullException(nameof(features));
    }

    var splitHostname = HostnameSplitter.Split(hostname);

    if (this.FeaturesConfig.InitializeDefaultFeatures)
    {
        this.InitializeDefaultFeatureValues(features);
    }

    this.AppendFeatures(splitHostname, cityEntity, features);
}
/// <summary>
/// Flags subdomain parts whose text is one of the lower-cased name variations of the city's
/// country.
/// </summary>
/// <remarks>
/// For every matching part the match flag is set, and <c>ExactCountryLetters</c> is raised to
/// the part's length when it is absent, null or smaller. When <c>FeaturesConfig.UseSlotIndex</c>
/// is set, the slot indexes are written for every matching part, so the last matching part
/// determines their final values. The method returns without changes when a required input is
/// missing.
/// </remarks>
/// <param name="parsedHostname">The split hostname; its <c>SubdomainParts</c> must be non-null.</param>
/// <param name="cityEntity">The candidate city; its country entity must have a non-blank name.</param>
/// <param name="features">The feature set to update.</param>
public override void AppendFeatures(HostnameSplitterResult parsedHostname, GeonamesCityEntity cityEntity, Features features)
{
    if (parsedHostname?.SubdomainParts == null
        || cityEntity == null
        || cityEntity.CountryEntity == null
        || string.IsNullOrWhiteSpace(cityEntity.CountryEntity.Name)
        || features == null)
    {
        return;
    }

    var knownCountryNames = cityEntity.CountryEntity.NameVariationsLower;

    foreach (var part in parsedHostname.SubdomainParts)
    {
        var label = part.Substring;
        if (!knownCountryNames.Contains(label))
        {
            continue;
        }

        features[CityFeatureType.ExactCountryNameMatch] = true;

        // Keep the longest matched length seen so far.
        var recordLength = !features.ContainsKey(CityFeatureType.ExactCountryLetters)
            || features[CityFeatureType.ExactCountryLetters] == null
            || ((byte)features[CityFeatureType.ExactCountryLetters]) < label.Length;

        if (recordLength)
        {
            features[CityFeatureType.ExactCountryLetters] = Convert.ToByte(label.Length);
        }

        if (this.FeaturesConfig.UseSlotIndex)
        {
            features[CityFeatureType.ExactCountryRTLSlotIndex] = part.RTLSlotIndex;
            features[CityFeatureType.ExactCountryLTRSlotIndex] = part.LTRSlotIndex;
        }
    }
}
/// <summary>
/// Indexes <paramref name="entity"/> under each of its <c>UNLOCODECodes</c>.
/// </summary>
/// <remarks>
/// Does nothing when the code collection is null or empty. The stored feature set records the
/// match flag and the population (only when positive). An entry already stored for the same
/// code and entity is overwritten.
/// </remarks>
/// <param name="entity">The city entity to index.</param>
public override void IngestCityEntity(GeonamesCityEntity entity)
{
    var unlocodes = entity.UNLOCODECodes;
    if (unlocodes == null || unlocodes.Count == 0)
    {
        return;
    }

    foreach (var unlocode in unlocodes)
    {
        var codeFeatures = this.InitializeDefaultFeatureValues();
        codeFeatures[CityFeatureType.ExactUNLOCODECodeMatch] = true;

        if (entity.Population > 0)
        {
            codeFeatures[CityFeatureType.ExactUNLOCODECodePopulation] = (uint?)entity.Population;
        }

        // Fetch the per-code bucket, creating it the first time the code is seen.
        EntitiesToFeatures bucket;
        if (!codesToEntitiesToFeatures.TryGetValue(unlocode, out bucket))
        {
            bucket = new EntitiesToFeatures();
            codesToEntitiesToFeatures[unlocode] = bucket;
        }

        bucket[entity] = codeFeatures;
    }
}
/// <summary>
/// Flags subdomain parts whose text is one of the variations generated by
/// <c>GenerateVariationsForName</c> for the name of the city's admin1 entity.
/// </summary>
/// <remarks>
/// For every matching part the match flag is set. When the stored letter count is absent, null
/// or smaller than the part's length, both the letter count and the ratio of that length to the
/// admin1 name length are updated. When <c>FeaturesConfig.UseSlotIndex</c> is set, the slot
/// indexes are written for every matching part, so the last matching part determines their
/// final values. The method returns without changes when a required input is missing.
/// </remarks>
/// <param name="parsedHostname">The split hostname; its <c>SubdomainParts</c> must be non-null.</param>
/// <param name="cityEntity">The candidate city; its admin1 entity must have a non-blank name.</param>
/// <param name="features">The feature set to update.</param>
public override void AppendFeatures(HostnameSplitterResult parsedHostname, GeonamesCityEntity cityEntity, Features features)
{
    if (parsedHostname?.SubdomainParts == null
        || cityEntity == null
        || cityEntity.Admin1Entity == null
        || string.IsNullOrWhiteSpace(cityEntity.Admin1Entity.Name)
        || features == null)
    {
        return;
    }

    var admin1Variations = this.GenerateVariationsForName(cityEntity.Admin1Entity.Name);

    foreach (var part in parsedHostname.SubdomainParts)
    {
        var label = part.Substring;
        if (!admin1Variations.Contains(label))
        {
            continue;
        }

        features[CityFeatureType.FirstLettersAdmin1NameMatch] = true;

        // Keep the longest matched length seen so far, together with its ratio.
        var recordLength = !features.ContainsKey(CityFeatureType.FirstLettersAdmin1Letters)
            || features[CityFeatureType.FirstLettersAdmin1Letters] == null
            || ((byte)features[CityFeatureType.FirstLettersAdmin1Letters]) < label.Length;

        if (recordLength)
        {
            features[CityFeatureType.FirstLettersAdmin1Letters] = Convert.ToByte(label.Length);
            features[CityFeatureType.FirstLettersAdmin1LettersRatio] =
                (float?)(label.Length / ((1.0f) * cityEntity.Admin1Entity.Name.Length));
        }

        if (this.FeaturesConfig.UseSlotIndex)
        {
            features[CityFeatureType.FirstLettersAdmin1RTLSlotIndex] = part.RTLSlotIndex;
            features[CityFeatureType.FirstLettersAdmin1LTRSlotIndex] = part.LTRSlotIndex;
        }
    }
}
/// <summary>
/// Indexes <paramref name="entity"/> under the lower-cased text of each of its airport codes.
/// </summary>
/// <remarks>
/// Does nothing when <c>AirportCodes</c> is null; entries whose <c>AlternateName</c> is blank
/// are skipped. The stored feature set records the match flag, the population (only when
/// positive) and the length of the code. An entry already stored for the same code and entity
/// is overwritten.
/// </remarks>
/// <param name="entity">The city entity to index.</param>
public override void IngestCityEntity(GeonamesCityEntity entity)
{
    if (entity.AirportCodes == null)
    {
        return;
    }

    foreach (var airportCodeEntry in entity.AirportCodes)
    {
        if (string.IsNullOrWhiteSpace(airportCodeEntry.AlternateName))
        {
            continue;
        }

        var normalizedCode = airportCodeEntry.AlternateName.ToLowerInvariant();

        var codeFeatures = this.InitializeDefaultFeatureValues();
        codeFeatures[CityFeatureType.AirportCodeMatch] = true;

        if (entity.Population > 0)
        {
            codeFeatures[CityFeatureType.AirportCodeCityPopulation] = (uint?)entity.Population;
        }

        codeFeatures[CityFeatureType.AirportCodeLetters] = (byte?)normalizedCode.Length;

        // Fetch the per-code bucket, creating it the first time the code is seen.
        EntitiesToFeatures bucket;
        if (!variationsToEntitiesToFeatures.TryGetValue(normalizedCode, out bucket))
        {
            bucket = new EntitiesToFeatures();
            variationsToEntitiesToFeatures[normalizedCode] = bucket;
        }

        bucket[entity] = codeFeatures;
    }
}
/// <summary>
/// Sets <c>CityFeatureType.HostnamePatternMatch</c> when a pattern rule generated from the
/// hostname's subdomain parts is known for the registrable domain and its stored coordinates
/// lie within 100 km of the city.
/// </summary>
/// <remarks>
/// Returns without touching <paramref name="features"/> when any input is missing, when the
/// domain has no stored rules, or when no rule atoms or rules can be generated.
/// A null <paramref name="features"/> is treated as "nothing to do" rather than being
/// dereferenced.
/// </remarks>
/// <param name="parsedHostname">The split hostname; needs a registrable domain and at least one subdomain part.</param>
/// <param name="cityEntity">The candidate city whose coordinates are compared against the rule coordinates.</param>
/// <param name="features">The feature set to update.</param>
public override void AppendFeatures(HostnameSplitterResult parsedHostname, GeonamesCityEntity cityEntity, Features features)
{
    if (parsedHostname == null
        || parsedHostname.DomainInfo?.RegistrableDomain == null
        || parsedHostname.SubdomainParts == null
        || parsedHostname.SubdomainParts.Count == 0
        || cityEntity == null
        || features == null)
    {
        return;
    }

    // TODO: Make this configurable
    // TODO: Distance should vary depending on geohash length?
    const int MaxMatchDistanceKilometers = 100;

    var domain = parsedHostname.DomainInfo.RegistrableDomain;

    Dictionary<PatternRule, PatternMiningCoordinates> rulesToCoordinates;
    if (!this.hostnamePatternRules.TryGetValue(domain, out rulesToCoordinates))
    {
        return;
    }

    // SubdomainParts was already verified to be non-null and non-empty by the guard above.
    var ruleAtoms = this.miner.CreateRuleAtoms(parsedHostname.SubdomainParts);
    if (ruleAtoms == null || ruleAtoms.Count == 0)
    {
        return;
    }

    var rules = this.miner.GeneratePossibleRules(ruleAtoms);
    if (rules == null || rules.Count == 0)
    {
        return;
    }

    foreach (var rule in rules)
    {
        PatternMiningCoordinates coordinates;
        if (!rulesToCoordinates.TryGetValue(rule, out coordinates))
        {
            continue;
        }

        var distance = DistanceHelper.Distance(
            cityEntity.Latitude,
            cityEntity.Longitude,
            coordinates.Latitude,
            coordinates.Longitude,
            DistanceUnit.Kilometer);

        if (distance <= MaxMatchDistanceKilometers)
        {
            features[CityFeatureType.HostnamePatternMatch] = true;

            // The flag is the only output, so further rules cannot change the result.
            return;
        }
    }
}
/// <summary>
/// Stores the hash code of the hostname's registrable domain as
/// <c>CityFeatureType.Domain</c>.
/// </summary>
/// <remarks>
/// Nothing is written when <c>FeaturesConfig.UseDomainAsFeature</c> is off, when the registrable
/// domain is unavailable, or when <paramref name="features"/> is null.
/// </remarks>
/// <param name="parsedHostname">The split hostname providing the registrable domain.</param>
/// <param name="cityEntity">Not used by this method.</param>
/// <param name="features">The feature set to update.</param>
public override void AppendFeatures(HostnameSplitterResult parsedHostname, GeonamesCityEntity cityEntity, Features features)
{
    if (!this.FeaturesConfig.UseDomainAsFeature)
    {
        return;
    }

    var canWrite = parsedHostname?.DomainInfo?.RegistrableDomain != null && features != null;
    if (!canWrite)
    {
        return;
    }

    // NOTE(review): if RegistrableDomain is a System.String, GetHashCode() is not guaranteed to
    // return the same value across processes, platforms or runtime versions. Confirm that this
    // feature value is never persisted or compared between separate runs (e.g. training vs. scoring).
    features[CityFeatureType.Domain] = parsedHostname.DomainInfo.RegistrableDomain.GetHashCode();
}
/// <summary>
/// Indexes <paramref name="entity"/> under the variations generated from its name and ASCII
/// name by <c>GenerateSimpleVariationsForName</c> and, when
/// <c>FeaturesConfig.UseComplexNoVowelsFeature</c> is set, by
/// <c>GenerateComplexVariationsForName</c>.
/// </summary>
/// <remarks>
/// The stored feature set records the match flag, the population (only when positive), the
/// variation length and, for non-empty variations, the ratio of that length to the length of
/// <c>entity.Name</c>. An entry already stored for the same variation and entity is overwritten.
/// </remarks>
/// <param name="entity">The city entity to index.</param>
public override void IngestCityEntity(GeonamesCityEntity entity)
{
    var allVariations = this.GenerateSimpleVariationsForName(entity.Name);
    allVariations.UnionWith(this.GenerateSimpleVariationsForName(entity.AsciiName));

    if (this.FeaturesConfig.UseComplexNoVowelsFeature)
    {
        allVariations.UnionWith(this.GenerateComplexVariationsForName(entity.Name, minLetters: MinLetters));
        allVariations.UnionWith(this.GenerateComplexVariationsForName(entity.AsciiName, minLetters: MinLetters));
    }

    foreach (var variation in allVariations)
    {
        var variationFeatures = this.InitializeDefaultFeatureValues();
        variationFeatures[CityFeatureType.NoVowelsCityNameMatch] = true;

        if (entity.Population > 0)
        {
            variationFeatures[CityFeatureType.NoVowelsCityNamePopulation] = (uint?)entity.Population;
        }

        variationFeatures[CityFeatureType.NoVowelsCityNameLetters] = (byte?)variation.Length;

        if (variation.Length > 0)
        {
            // The ratio is always taken against entity.Name, including for variations
            // that were generated from entity.AsciiName.
            variationFeatures[CityFeatureType.NoVowelsCityNameLettersRatio] =
                (float?)(variation.Length / ((1.0f) * entity.Name.Length));
        }

        // Fetch the per-variation bucket, creating it the first time the variation is seen.
        EntitiesToFeatures bucket;
        if (!variationsToEntitiesToFeatures.TryGetValue(variation, out bucket))
        {
            bucket = new EntitiesToFeatures();
            variationsToEntitiesToFeatures[variation] = bucket;
        }

        bucket[entity] = variationFeatures;
    }
}
/// <summary>
/// Finds the nearest known city within 50 km of the given coordinates.
/// </summary>
/// <remarks>
/// Candidates are the cities registered in <c>GeohashesToCities</c> under the 3-character
/// geohash of the coordinates and under the geohashes returned by <c>GeoHash.Neighbors</c> for
/// it. Among candidates at equal distance, the first one encountered is kept.
/// </remarks>
/// <param name="latitude">Latitude of the point to look up.</param>
/// <param name="longitude">Longitude of the point to look up.</param>
/// <returns>The closest candidate city no farther than 50 km, or null when there is none.</returns>
public GeonamesCityEntity FindClosestCityForCoordinates(double latitude, double longitude)
{
    var centerGeohash = GeoHash.Encode(latitude, longitude, numberOfChars: 3); // 3 = ±78km

    // Search the cell containing the point plus its surrounding cells.
    var searchGeohashes = new HashSet<string>(GeoHash.Neighbors(centerGeohash));
    searchGeohashes.Add(centerGeohash);

    var candidates = new List<GeonamesCityEntity>();
    foreach (var geohash in searchGeohashes)
    {
        List<GeonamesCityEntity> citiesInCell;
        if (!this.GeohashesToCities.TryGetValue(geohash, out citiesInCell))
        {
            continue;
        }

        candidates.AddRange(citiesInCell);
    }

    GeonamesCityEntity nearestCity = null;
    double nearestDistance = double.MaxValue;

    foreach (var candidate in candidates)
    {
        var candidateDistance = DistanceHelper.Distance(
            candidate.Latitude,
            candidate.Longitude,
            latitude,
            longitude,
            DistanceUnit.Kilometer);

        if (candidateDistance <= 50 && candidateDistance < nearestDistance)
        {
            nearestCity = candidate;
            nearestDistance = candidateDistance;
        }
    }

    return nearestCity;
}
/// <summary>
/// Indexes <paramref name="entity"/> under every variation generated by
/// <c>GenerateVariationsForName</c> for its name and its ASCII name.
/// </summary>
/// <remarks>
/// The stored feature set records the match flag, the population (only when positive), the
/// variation length and, when <c>FeaturesConfig.UseAlternateNamesCount</c> is set, the number of
/// alternate names (0 when the collection is null). An entry already stored for the same
/// variation and entity is overwritten.
/// </remarks>
/// <param name="entity">The city entity to index.</param>
public override void IngestCityEntity(GeonamesCityEntity entity)
{
    var allVariations = this.GenerateVariationsForName(entity.Name);
    allVariations.UnionWith(this.GenerateVariationsForName(entity.AsciiName));

    foreach (var variation in allVariations)
    {
        var variationFeatures = this.InitializeDefaultFeatureValues();
        variationFeatures[CityFeatureType.ExactCityNameMatch] = true;

        if (entity.Population > 0)
        {
            variationFeatures[CityFeatureType.ExactCityNamePopulation] = (uint?)entity.Population;
        }

        variationFeatures[CityFeatureType.ExactCityNameLetters] = (byte?)variation.Length;

        if (this.FeaturesConfig.UseAlternateNamesCount)
        {
            variationFeatures[CityFeatureType.ExactCityNameAlternateNamesCount] = (uint?)(entity.AlternateNames?.Count ?? 0);
        }

        // Fetch the per-variation bucket, creating it the first time the variation is seen.
        EntitiesToFeatures bucket;
        if (!variationsToEntitiesToFeatures.TryGetValue(variation, out bucket))
        {
            bucket = new EntitiesToFeatures();
            variationsToEntitiesToFeatures[variation] = bucket;
        }

        bucket[entity] = variationFeatures;
    }
}
/// <summary>
/// Receives a single city entity for ingestion. Derived classes supply the implementation.
/// </summary>
/// <param name="entity">The city entity to ingest.</param>
public abstract void IngestCityEntity(GeonamesCityEntity entity);
/// <summary>
/// Receives a parsed hostname, a city entity and a feature set. Derived classes supply the
/// implementation.
/// </summary>
/// <param name="parsedHostname">The hostname after splitting.</param>
/// <param name="cityEntity">The city entity under consideration.</param>
/// <param name="features">The feature set passed in by the caller; presumably updated in place by implementations (no return value) — confirm against overrides.</param>
public abstract void AppendFeatures(HostnameSplitterResult parsedHostname, GeonamesCityEntity cityEntity, Features features);
/// <summary>
/// Empty override: this implementation performs no work for the ingested entity.
/// </summary>
/// <param name="entity">The city entity; ignored.</param>
// NOTE(review): presumably this class needs no per-city index — confirm the empty body is intended.
public override void IngestCityEntity(GeonamesCityEntity entity) { }