/// <summary>
/// Registers the default value, declared value type and granularity of every
/// ExactUNLOCODECode* feature produced by this generator.
/// </summary>
/// <param name="featuresConfig">
/// Configuration forwarded to the base constructor. Its <c>NullDefaultsAllowed</c> flag selects
/// between null and non-null defaults, and <c>UseSlotIndex</c> controls whether the two
/// slot-index features are registered at all.
/// </param>
public ExactUNLOCODEFeaturesGenerator(FeaturesConfig featuresConfig) : base(featuresConfig)
{
    var nullDefaults = this.FeaturesConfig.NullDefaultsAllowed;
    var useSlotIndex = this.FeaturesConfig.UseSlotIndex;

    // Default values.
    FeatureDefaults = new Features()
    {
        { CityFeatureType.ExactUNLOCODECodeMatch, false },
        { CityFeatureType.ExactUNLOCODECodePopulation, nullDefaults ? (uint?)null : (uint?)0 }
    };

    if (useSlotIndex)
    {
        // The slot indexes are declared as byte? further down, so the null default is typed as
        // byte? as well (it used to be written as (uint?)null, disagreeing with that declaration).
        FeatureDefaults[CityFeatureType.ExactUNLOCODECodeRTLSlotIndex] = nullDefaults ? (byte?)null : (byte?)byte.MaxValue;
        FeatureDefaults[CityFeatureType.ExactUNLOCODECodeLTRSlotIndex] = nullDefaults ? (byte?)null : (byte?)byte.MaxValue;
    }

    // Declared value types.
    FeatureDefaultsValueTypes = new FeatureValueTypes()
    {
        { CityFeatureType.ExactUNLOCODECodeMatch, typeof(bool) },
        { CityFeatureType.ExactUNLOCODECodePopulation, typeof(uint?) }
    };

    if (useSlotIndex)
    {
        FeatureDefaultsValueTypes[CityFeatureType.ExactUNLOCODECodeRTLSlotIndex] = typeof(byte?);
        FeatureDefaultsValueTypes[CityFeatureType.ExactUNLOCODECodeLTRSlotIndex] = typeof(byte?);
    }

    // Granularities.
    FeatureGranularities = new FeatureGranularities()
    {
        { CityFeatureType.ExactUNLOCODECodeMatch, FeatureGranularity.Discrete },
        { CityFeatureType.ExactUNLOCODECodePopulation, FeatureGranularity.Continuous }
    };

    if (useSlotIndex)
    {
        FeatureGranularities[CityFeatureType.ExactUNLOCODECodeRTLSlotIndex] = FeatureGranularity.Discrete;
        FeatureGranularities[CityFeatureType.ExactUNLOCODECodeLTRSlotIndex] = FeatureGranularity.Discrete;
    }
}
/// <summary>
/// Creates the generator and keeps the configuration it was handed.
/// </summary>
/// <param name="featuresConfig">
/// Stored as-is in <c>FeaturesConfig</c>; no null check is performed here.
/// </param>
public CityFeaturesGenerator(FeaturesConfig featuresConfig)
{
    FeaturesConfig = featuresConfig;
}
/// <summary>
/// Sets the aggregator up through the generator-list constructor, then parses the data files
/// and feeds every city entity that passes the population filter to each primary feature generator.
/// </summary>
/// <param name="citiesPath">Path handed to <c>GeonamesCitiesParser.Parse</c>.</param>
/// <param name="alternateNamesPath">Path handed to <c>GeonamesAlternateNamesParser.ParseToDict</c>.</param>
/// <param name="admin1Path">Path handed to <c>GeonamesAdminParser.ParseToDict</c> (first call).</param>
/// <param name="admin2Path">Path handed to <c>GeonamesAdminParser.ParseToDict</c> (second call).</param>
/// <param name="countriesPath">Path handed to <c>GeonamesCountriesParser.ParseToList</c>.</param>
/// <param name="clliPath">Path handed to <c>CLLICodesParser.ParseToDict</c>.</param>
/// <param name="unlocodePath">
/// Currently unused: the UNLOCODE parsing call is commented out, so a null dictionary
/// is passed to the cities parser.
/// </param>
/// <param name="cityFeatureGenerators">Forwarded unchanged to the generator-list constructor.</param>
/// <param name="cityAddOnFeatureGenerators">Forwarded unchanged to the generator-list constructor.</param>
/// <param name="externalCityFeatureGenerators">Forwarded unchanged to the generator-list constructor.</param>
/// <param name="externalCityAddOnFeatureGenerators">Forwarded unchanged to the generator-list constructor.</param>
/// <param name="featuresConfig">Forwarded unchanged to the generator-list constructor.</param>
public CityFeaturesAggregator(
    string citiesPath,
    string alternateNamesPath,
    string admin1Path,
    string admin2Path,
    string countriesPath,
    string clliPath,
    string unlocodePath,
    List<CityFeaturesGenerator> cityFeatureGenerators = null,
    List<AddOnCityFeaturesGenerator> cityAddOnFeatureGenerators = null,
    List<CityFeaturesGenerator> externalCityFeatureGenerators = null,
    List<AddOnCityFeaturesGenerator> externalCityAddOnFeatureGenerators = null,
    FeaturesConfig featuresConfig = null)
    : this(cityFeatureGenerators, cityAddOnFeatureGenerators, externalCityFeatureGenerators, externalCityAddOnFeatureGenerators, featuresConfig)
{
    // How often (in parsed cities) a progress line is written to the console.
    const int ProgressInterval = 1000;

    var alternateNames = GeonamesAlternateNamesParser.ParseToDict(alternateNamesPath);
    var admin1Lookup = GeonamesAdminParser.ParseToDict(admin1Path);
    var admin2Lookup = GeonamesAdminParser.ParseToDict(admin2Path);
    var countryCodes = GeonamesCountriesParser.ListToISOCodeDict(GeonamesCountriesParser.ParseToList(countriesPath));
    var clliCodesById = CLLICodesParser.ParseToDict(clliPath);

    // Stays null while UNLOCODE support is disabled.
    Dictionary<int, HashSet<string>> unlocodeCodesById = null;

    // Needed if enabling ExactUNLOCODEFeaturesGenerator:
    /*
     * if (unlocodePath != null)
     * {
     *     unlocodeCodesById = UNLOCODECodesParser.ParseToDict(unlocodePath);
     * }
     */

    var parsedCount = 0;
    var ingestedCount = 0;

    var cities = GeonamesCitiesParser.Parse(
        citiesPath,
        alternateNames,
        admin1Lookup,
        admin2Lookup,
        countryCodes,
        clliCodesById,
        unlocodeCodesById);

    foreach (var city in cities)
    {
        parsedCount++;

        // A minimum population of 0 disables the filter.
        var minimumPopulation = this.featuresConfig.MinimumPopulation;
        var passesPopulationFilter = minimumPopulation == 0 || city.Population >= minimumPopulation;

        if (passesPopulationFilter)
        {
            ingestedCount++;

            foreach (var generator in this.cityFeatureGenerators)
            {
                generator.IngestCityEntity(city);
            }
        }

        if (parsedCount % ProgressInterval == 0)
        {
            Console.WriteLine($"Loading cities - total: {parsedCount}, withPop: {ingestedCount}");
        }
    }

    // Disabled diagnostic: per-generator size estimate.
    /*
     * foreach (var cityFeatureGenerator in this.cityFeatureGenerators)
     * {
     *     try
     *     {
     *         Console.WriteLine($"Estimated bytes for {cityFeatureGenerator.GetType().FullName}: {FeatureUtils.EstimateObjectSizeInBytes(cityFeatureGenerator)}");
     *     }
     *     catch (Exception)
     *     {
     *         Console.WriteLine($"Could not estimate bytes for {cityFeatureGenerator.GetType().FullName}");
     *     }
     * }
     */
}
// Serialization fails due to too much data
/*
 * public void SerializeTo(string outPath)
 * {
 *     var formatter = new BinaryFormatter();
 *
 *     using (var stream = new FileStream(path: outPath, mode: FileMode.Create, access: FileAccess.Write, share: FileShare.None))
 *     {
 *         formatter.Serialize(stream, this);
 *         stream.Close();
 *     }
 * }
 *
 * public static CityFeaturesAggregator DeserializeFrom(string inPath)
 * {
 *     var formatter = new BinaryFormatter();
 *     using (var stream = new FileStream(path: inPath, mode: FileMode.Open, access: FileAccess.Read, share: FileShare.Read))
 *     {
 *         var aggregator = (CityFeaturesAggregator)formatter.Deserialize(stream);
 *         stream.Close();
 *
 *         return aggregator;
 *     }
 * }
 */

/// <summary>
/// Initializes the counters and stopwatches, resolves the configuration and the two generator
/// lists (falling back to built-in defaults when an argument is null), appends any external
/// generators, and registers one stopwatch and one hit counter per primary generator type.
/// </summary>
/// <param name="cityFeatureGenerators">
/// Primary generators to use; when null a built-in default set is created. The list is copied,
/// so the caller's list is never modified.
/// </param>
/// <param name="cityAddOnFeatureGenerators">
/// Add-on generators to use; when null a built-in default set is created. The list is copied,
/// so the caller's list is never modified.
/// </param>
/// <param name="externalCityFeatureGenerators">Optional generators appended after the primary ones.</param>
/// <param name="externalCityAddOnFeatureGenerators">Optional generators appended after the add-on ones.</param>
/// <param name="featuresConfig">Configuration to use; when null a new <c>FeaturesConfig</c> is created.</param>
public CityFeaturesAggregator(
    List<CityFeaturesGenerator> cityFeatureGenerators = null,
    List<AddOnCityFeaturesGenerator> cityAddOnFeatureGenerators = null,
    List<CityFeaturesGenerator> externalCityFeatureGenerators = null,
    List<AddOnCityFeaturesGenerator> externalCityAddOnFeatureGenerators = null,
    FeaturesConfig featuresConfig = null)
{
    this.PrimaryFeatureStopWatches = new Dictionary<string, Stopwatch>();
    this.SecondaryFeaturesStopWatch = new Stopwatch();
    this.FeatureHostnameHitCounts = new Dictionary<string, int>();
    this.TotalHostnamesSeen = 0;
    this.TotalCandidatesSeen = 0;

    this.featuresConfig = featuresConfig ?? new FeaturesConfig();

    if (cityFeatureGenerators == null)
    {
        this.cityFeatureGenerators = new List<CityFeaturesGenerator>()
        {
            new AlternateCityAbbreviationsFeaturesGenerator(this.featuresConfig),
            new AlternateCityFeaturesGenerator(this.featuresConfig),
            new CityAbbreviationsFeaturesGenerator(this.featuresConfig),
            new CityAdmin1FeaturesGenerator(this.featuresConfig),
            new CityCountryFeaturesGenerator(this.featuresConfig),
            new ExactCityFeaturesGenerator(this.featuresConfig),
            new FirstLettersCityFeaturesGenerator(this.featuresConfig),
            new NoVowelsCityFeaturesGenerator(this.featuresConfig),
            new AirportCodeFeaturesGenerator(this.featuresConfig),
            new ExactCLLIFeaturesGenerator(this.featuresConfig),

            //// UNLOCODE disabled. If you want to uncomment this, also make sure to uncomment the
            //// loading of its dictionary in the path-based constructor of this class!
            ////new ExactUNLOCODEFeaturesGenerator(this.featuresConfig)
        };
    }
    else
    {
        // Defensive copy: external generators are appended below, and that must not
        // silently grow the list owned by the caller.
        this.cityFeatureGenerators = new List<CityFeaturesGenerator>(cityFeatureGenerators);
    }

    if (cityAddOnFeatureGenerators == null)
    {
        this.cityAddOnFeatureGenerators = new List<AddOnCityFeaturesGenerator>()
        {
            new ExactAdmin1AddOnFeaturesGenerator(this.featuresConfig),
            new ExactCountryAddOnFeaturesGenerator(this.featuresConfig),
            new TLDAddOnFeaturesGenerator(this.featuresConfig),
            new FirstLettersAdmin1AddOnFeaturesGenerator(this.featuresConfig),
            new DomainNameAddOnFeaturesGenerator(this.featuresConfig)
        };
    }
    else
    {
        // Same defensive copy as for the primary generators.
        this.cityAddOnFeatureGenerators = new List<AddOnCityFeaturesGenerator>(cityAddOnFeatureGenerators);
    }

    if (externalCityFeatureGenerators != null)
    {
        this.cityFeatureGenerators.AddRange(externalCityFeatureGenerators);
    }

    if (externalCityAddOnFeatureGenerators != null)
    {
        this.cityAddOnFeatureGenerators.AddRange(externalCityAddOnFeatureGenerators);
    }

    // One stopwatch and one hit counter per primary generator, keyed by the generator's type name.
    // Generators sharing a type share the entry, because the indexer overwrites.
    foreach (var generator in this.cityFeatureGenerators)
    {
        var generatorKey = generator.GetType().ToString();
        this.PrimaryFeatureStopWatches[generatorKey] = new Stopwatch();
        this.FeatureHostnameHitCounts[generatorKey] = 0;
    }
}
/// <summary>
/// Registers the default value, declared value type and granularity of every
/// AlternateCityName* feature produced by this generator.
/// </summary>
/// <param name="featuresConfig">
/// Configuration forwarded to the base constructor. <c>NullDefaultsAllowed</c> selects null versus
/// zero defaults for the numeric features; <c>UseAlternateNameCategories</c> and
/// <c>UseSlotIndex</c> control whether the category and slot-index features are registered.
/// </param>
public AlternateCityFeaturesGenerator(FeaturesConfig featuresConfig) : base(featuresConfig)
{
    var nullDefaults = this.FeaturesConfig.NullDefaultsAllowed;
    var useCategories = this.FeaturesConfig.UseAlternateNameCategories;
    var useSlotIndex = this.FeaturesConfig.UseSlotIndex;

    // Default values. Only the numeric features depend on NullDefaultsAllowed.
    FeatureDefaults = new Features()
    {
        { CityFeatureType.AlternateCityNameMatch, false },
        { CityFeatureType.AlternateCityNamePopulation, nullDefaults ? (uint?)null : (uint?)0 },
        { CityFeatureType.AlternateCityNameLetters, nullDefaults ? (byte?)null : (byte?)0 },
        { CityFeatureType.AlternateCityNameAlternateNamesCount, nullDefaults ? (uint?)null : (uint?)0 }
    };

    if (useCategories)
    {
        FeatureDefaults[CityFeatureType.AlternateCityNameIsPreferredName] = false;
        FeatureDefaults[CityFeatureType.AlternateCityNameIsShortName] = false;
        FeatureDefaults[CityFeatureType.AlternateCityNameIsColloquial] = false;
        FeatureDefaults[CityFeatureType.AlternateCityNameIsHistoric] = false;
    }

    if (useSlotIndex)
    {
        // NOTE(review): the slot indexes are registered here as bool / false in both modes.
        // Confirm against the code that fills these features that bool (rather than a
        // nullable byte index) is really the intended type.
        FeatureDefaults[CityFeatureType.AlternateCityNameRTLSlotIndex] = false;
        FeatureDefaults[CityFeatureType.AlternateCityNameLTRSlotIndex] = false;
    }

    // Declared value types.
    FeatureDefaultsValueTypes = new FeatureValueTypes()
    {
        { CityFeatureType.AlternateCityNameMatch, typeof(bool) },
        { CityFeatureType.AlternateCityNamePopulation, typeof(uint?) },
        { CityFeatureType.AlternateCityNameLetters, typeof(byte?) },
        { CityFeatureType.AlternateCityNameAlternateNamesCount, typeof(uint?) }
    };

    if (useCategories)
    {
        FeatureDefaultsValueTypes[CityFeatureType.AlternateCityNameIsPreferredName] = typeof(bool);
        FeatureDefaultsValueTypes[CityFeatureType.AlternateCityNameIsShortName] = typeof(bool);
        FeatureDefaultsValueTypes[CityFeatureType.AlternateCityNameIsColloquial] = typeof(bool);
        FeatureDefaultsValueTypes[CityFeatureType.AlternateCityNameIsHistoric] = typeof(bool);
    }

    if (useSlotIndex)
    {
        FeatureDefaultsValueTypes[CityFeatureType.AlternateCityNameRTLSlotIndex] = typeof(bool);
        FeatureDefaultsValueTypes[CityFeatureType.AlternateCityNameLTRSlotIndex] = typeof(bool);
    }

    // Granularities.
    FeatureGranularities = new FeatureGranularities()
    {
        { CityFeatureType.AlternateCityNameMatch, FeatureGranularity.Discrete },
        { CityFeatureType.AlternateCityNamePopulation, FeatureGranularity.Continuous },
        { CityFeatureType.AlternateCityNameLetters, FeatureGranularity.Continuous },
        { CityFeatureType.AlternateCityNameAlternateNamesCount, FeatureGranularity.Continuous }
    };

    if (useCategories)
    {
        FeatureGranularities[CityFeatureType.AlternateCityNameIsPreferredName] = FeatureGranularity.Discrete;
        FeatureGranularities[CityFeatureType.AlternateCityNameIsShortName] = FeatureGranularity.Discrete;
        FeatureGranularities[CityFeatureType.AlternateCityNameIsColloquial] = FeatureGranularity.Discrete;
        FeatureGranularities[CityFeatureType.AlternateCityNameIsHistoric] = FeatureGranularity.Discrete;
    }

    if (useSlotIndex)
    {
        FeatureGranularities[CityFeatureType.AlternateCityNameRTLSlotIndex] = FeatureGranularity.Discrete;
        FeatureGranularities[CityFeatureType.AlternateCityNameLTRSlotIndex] = FeatureGranularity.Discrete;
    }
}
/// <summary>
/// Registers the default value, declared value type and granularity of every
/// CityAdmin1* feature produced by this generator.
/// </summary>
/// <param name="featuresConfig">
/// Configuration forwarded to the base constructor. <c>NullDefaultsAllowed</c> selects null versus
/// non-null defaults; <c>UseSlotIndex</c> controls whether the slot-index features are registered.
/// </param>
public CityAdmin1FeaturesGenerator(FeaturesConfig featuresConfig) : base(featuresConfig)
{
    var nullDefaults = this.FeaturesConfig.NullDefaultsAllowed;
    var useSlotIndex = this.FeaturesConfig.UseSlotIndex;

    // Default values: null when allowed, otherwise 0 for the numeric features.
    FeatureDefaults = new Features()
    {
        { CityFeatureType.CityAdmin1NameMatch, false },
        { CityFeatureType.CityAdmin1NamePopulation, nullDefaults ? (uint?)null : (uint?)0 },
        { CityFeatureType.CityAdmin1LettersBoth, nullDefaults ? (byte?)null : (byte?)0 },
        { CityFeatureType.CityAdmin1LettersCity, nullDefaults ? (byte?)null : (byte?)0 },
        { CityFeatureType.CityAdmin1LettersAdmin1, nullDefaults ? (byte?)null : (byte?)0 }
    };

    if (useSlotIndex)
    {
        // Without null defaults the slot indexes default to byte.MaxValue rather than 0.
        FeatureDefaults[CityFeatureType.CityAdmin1RTLSlotIndex] = nullDefaults ? (byte?)null : (byte?)byte.MaxValue;
        FeatureDefaults[CityFeatureType.CityAdmin1LTRSlotIndex] = nullDefaults ? (byte?)null : (byte?)byte.MaxValue;
    }

    // Declared value types.
    FeatureDefaultsValueTypes = new FeatureValueTypes()
    {
        { CityFeatureType.CityAdmin1NameMatch, typeof(bool) },
        { CityFeatureType.CityAdmin1NamePopulation, typeof(uint?) },
        { CityFeatureType.CityAdmin1LettersBoth, typeof(byte?) },
        { CityFeatureType.CityAdmin1LettersCity, typeof(byte?) },
        { CityFeatureType.CityAdmin1LettersAdmin1, typeof(byte?) }
    };

    if (useSlotIndex)
    {
        FeatureDefaultsValueTypes[CityFeatureType.CityAdmin1RTLSlotIndex] = typeof(byte?);
        FeatureDefaultsValueTypes[CityFeatureType.CityAdmin1LTRSlotIndex] = typeof(byte?);
    }

    // Granularities.
    FeatureGranularities = new FeatureGranularities()
    {
        { CityFeatureType.CityAdmin1NameMatch, FeatureGranularity.Discrete },
        { CityFeatureType.CityAdmin1NamePopulation, FeatureGranularity.Continuous },
        { CityFeatureType.CityAdmin1LettersBoth, FeatureGranularity.Continuous },
        { CityFeatureType.CityAdmin1LettersCity, FeatureGranularity.Continuous },
        { CityFeatureType.CityAdmin1LettersAdmin1, FeatureGranularity.Continuous }
    };

    if (useSlotIndex)
    {
        FeatureGranularities[CityFeatureType.CityAdmin1RTLSlotIndex] = FeatureGranularity.Discrete;
        FeatureGranularities[CityFeatureType.CityAdmin1LTRSlotIndex] = FeatureGranularity.Discrete;
    }
}
/// <summary>
/// Registers the default value, declared value type and granularity of every
/// NoVowelsCity* feature produced by this generator.
/// </summary>
/// <param name="featuresConfig">
/// Configuration forwarded to the base constructor. <c>NullDefaultsAllowed</c> selects null versus
/// non-null defaults; <c>UseSlotIndex</c> controls whether the slot-index features are registered.
/// </param>
public NoVowelsCityFeaturesGenerator(FeaturesConfig featuresConfig) : base(featuresConfig)
{
    var nullDefaults = this.FeaturesConfig.NullDefaultsAllowed;
    var useSlotIndex = this.FeaturesConfig.UseSlotIndex;

    // Default values: null when allowed, otherwise 0 for the numeric features.
    FeatureDefaults = new Features()
    {
        { CityFeatureType.NoVowelsCityNameMatch, false },
        { CityFeatureType.NoVowelsCityNamePopulation, nullDefaults ? (uint?)null : (uint?)0 },
        { CityFeatureType.NoVowelsCityNameLetters, nullDefaults ? (byte?)null : (byte?)0 },
        { CityFeatureType.NoVowelsCityNameLettersRatio, nullDefaults ? (float?)null : (float?)0 }
    };

    if (useSlotIndex)
    {
        // Without null defaults the slot indexes default to byte.MaxValue rather than 0.
        FeatureDefaults[CityFeatureType.NoVowelsCityRTLSlotIndex] = nullDefaults ? (byte?)null : (byte?)byte.MaxValue;
        FeatureDefaults[CityFeatureType.NoVowelsCityLTRSlotIndex] = nullDefaults ? (byte?)null : (byte?)byte.MaxValue;
    }

    // Declared value types.
    FeatureDefaultsValueTypes = new FeatureValueTypes()
    {
        { CityFeatureType.NoVowelsCityNameMatch, typeof(bool) },
        { CityFeatureType.NoVowelsCityNamePopulation, typeof(uint?) },
        { CityFeatureType.NoVowelsCityNameLetters, typeof(byte?) },
        { CityFeatureType.NoVowelsCityNameLettersRatio, typeof(float?) }
    };

    if (useSlotIndex)
    {
        FeatureDefaultsValueTypes[CityFeatureType.NoVowelsCityRTLSlotIndex] = typeof(byte?);
        FeatureDefaultsValueTypes[CityFeatureType.NoVowelsCityLTRSlotIndex] = typeof(byte?);
    }

    // Granularities.
    FeatureGranularities = new FeatureGranularities()
    {
        { CityFeatureType.NoVowelsCityNameMatch, FeatureGranularity.Discrete },
        { CityFeatureType.NoVowelsCityNamePopulation, FeatureGranularity.Continuous },
        { CityFeatureType.NoVowelsCityNameLetters, FeatureGranularity.Continuous },
        { CityFeatureType.NoVowelsCityNameLettersRatio, FeatureGranularity.Continuous }
    };

    if (useSlotIndex)
    {
        FeatureGranularities[CityFeatureType.NoVowelsCityRTLSlotIndex] = FeatureGranularity.Discrete;
        FeatureGranularities[CityFeatureType.NoVowelsCityLTRSlotIndex] = FeatureGranularity.Discrete;
    }
}