/// <summary>
/// Registers the default value, declared value type and granularity of every
/// feature produced by the exact UNLOCODE generator.
/// </summary>
/// <param name="featuresConfig">
/// Configuration forwarded to the base constructor. <c>NullDefaultsAllowed</c> selects
/// null versus non-null defaults, and <c>UseSlotIndex</c> adds the two slot-index features.
/// </param>
public ExactUNLOCODEFeaturesGenerator(FeaturesConfig featuresConfig) : base(featuresConfig)
        {
            if (this.FeaturesConfig.NullDefaultsAllowed)
            {
                FeatureDefaults = new Features()
                {
                    { CityFeatureType.ExactUNLOCODECodeMatch, false },
                    { CityFeatureType.ExactUNLOCODECodePopulation, (uint?)null }
                };

                if (this.FeaturesConfig.UseSlotIndex)
                {
                    // Slot indexes are declared as byte? below, so the null defaults are
                    // typed as byte? too (previously uint?), matching the non-null branch.
                    FeatureDefaults[CityFeatureType.ExactUNLOCODECodeRTLSlotIndex] = (byte?)null;
                    FeatureDefaults[CityFeatureType.ExactUNLOCODECodeLTRSlotIndex] = (byte?)null;
                }
            }
            else
            {
                FeatureDefaults = new Features()
                {
                    { CityFeatureType.ExactUNLOCODECodeMatch, false },
                    { CityFeatureType.ExactUNLOCODECodePopulation, (uint?)0 }
                };

                if (this.FeaturesConfig.UseSlotIndex)
                {
                    // Non-null default for a slot index is the largest byte value.
                    FeatureDefaults[CityFeatureType.ExactUNLOCODECodeRTLSlotIndex] = (byte?)byte.MaxValue;
                    FeatureDefaults[CityFeatureType.ExactUNLOCODECodeLTRSlotIndex] = (byte?)byte.MaxValue;
                }
            }

            // Declared value type of each feature registered above.
            FeatureDefaultsValueTypes = new FeatureValueTypes()
            {
                { CityFeatureType.ExactUNLOCODECodeMatch, typeof(bool) },
                { CityFeatureType.ExactUNLOCODECodePopulation, typeof(uint?) }
            };

            if (this.FeaturesConfig.UseSlotIndex)
            {
                FeatureDefaultsValueTypes[CityFeatureType.ExactUNLOCODECodeRTLSlotIndex] = typeof(byte?);
                FeatureDefaultsValueTypes[CityFeatureType.ExactUNLOCODECodeLTRSlotIndex] = typeof(byte?);
            }

            // Granularity of each feature: population is continuous, the rest are discrete.
            FeatureGranularities = new FeatureGranularities()
            {
                { CityFeatureType.ExactUNLOCODECodeMatch, FeatureGranularity.Discrete },
                { CityFeatureType.ExactUNLOCODECodePopulation, FeatureGranularity.Continuous }
            };

            if (this.FeaturesConfig.UseSlotIndex)
            {
                FeatureGranularities[CityFeatureType.ExactUNLOCODECodeRTLSlotIndex] = FeatureGranularity.Discrete;
                FeatureGranularities[CityFeatureType.ExactUNLOCODECodeLTRSlotIndex] = FeatureGranularity.Discrete;
            }
        }
 /// <summary>
 /// Stores the supplied configuration on the generator.
 /// </summary>
 /// <param name="featuresConfig">Configuration kept in the <c>FeaturesConfig</c> member.</param>
 public CityFeaturesGenerator(FeaturesConfig featuresConfig)
 {
     FeaturesConfig = featuresConfig;
 }
示例#3
0
        /// <summary>
        /// Builds the aggregator through the generator-list constructor, then parses the
        /// data files and feeds every sufficiently populated city to each city feature generator.
        /// </summary>
        /// <param name="citiesPath">Cities file handed to <c>GeonamesCitiesParser.Parse</c>.</param>
        /// <param name="alternateNamesPath">File parsed by <c>GeonamesAlternateNamesParser.ParseToDict</c>.</param>
        /// <param name="admin1Path">First file parsed by <c>GeonamesAdminParser.ParseToDict</c>.</param>
        /// <param name="admin2Path">Second file parsed by <c>GeonamesAdminParser.ParseToDict</c>.</param>
        /// <param name="countriesPath">File parsed by <c>GeonamesCountriesParser.ParseToList</c>.</param>
        /// <param name="clliPath">File parsed by <c>CLLICodesParser.ParseToDict</c>.</param>
        /// <param name="unlocodePath">Currently unused because the UNLOCODE loading code is commented out.</param>
        /// <param name="cityFeatureGenerators">Forwarded to the generator-list constructor.</param>
        /// <param name="cityAddOnFeatureGenerators">Forwarded to the generator-list constructor.</param>
        /// <param name="externalCityFeatureGenerators">Forwarded to the generator-list constructor.</param>
        /// <param name="externalCityAddOnFeatureGenerators">Forwarded to the generator-list constructor.</param>
        /// <param name="featuresConfig">Forwarded to the generator-list constructor.</param>
        public CityFeaturesAggregator(
            string citiesPath,
            string alternateNamesPath,
            string admin1Path,
            string admin2Path,
            string countriesPath,
            string clliPath,
            string unlocodePath,
            List<CityFeaturesGenerator> cityFeatureGenerators = null,
            List<AddOnCityFeaturesGenerator> cityAddOnFeatureGenerators = null,
            List<CityFeaturesGenerator> externalCityFeatureGenerators = null,
            List<AddOnCityFeaturesGenerator> externalCityAddOnFeatureGenerators = null,
            FeaturesConfig featuresConfig = null)
            : this(cityFeatureGenerators, cityAddOnFeatureGenerators, externalCityFeatureGenerators, externalCityAddOnFeatureGenerators, featuresConfig)
        {
            // Lookup tables are parsed in the same order as before: alternate names,
            // admin1, admin2, countries, then CLLI codes.
            var alternateNames = GeonamesAlternateNamesParser.ParseToDict(alternateNamesPath);
            var admin1Lookup = GeonamesAdminParser.ParseToDict(admin1Path);
            var admin2Lookup = GeonamesAdminParser.ParseToDict(admin2Path);
            var isoCodeLookup = GeonamesCountriesParser.ListToISOCodeDict(GeonamesCountriesParser.ParseToList(countriesPath));
            var clliCodesById = CLLICodesParser.ParseToDict(clliPath);

            // Stays null while UNLOCODE support is disabled. To enable
            // ExactUNLOCODEFeaturesGenerator, restore the loading code:
            //
            // if (unlocodePath != null)
            // {
            //     unlocodeCodesById = UNLOCODECodesParser.ParseToDict(unlocodePath);
            // }
            Dictionary<int, HashSet<string>> unlocodeCodesById = null;

            var cities = GeonamesCitiesParser.Parse(citiesPath, alternateNames, admin1Lookup, admin2Lookup, isoCodeLookup, clliCodesById, unlocodeCodesById);

            var total = 0;
            var withPop = 0;

            foreach (var city in cities)
            {
                total++;

                // A minimum population of 0 disables the population filter.
                var minimumPopulation = this.featuresConfig.MinimumPopulation;

                if (minimumPopulation == 0 || city.Population >= minimumPopulation)
                {
                    withPop++;

                    foreach (var generator in this.cityFeatureGenerators)
                    {
                        generator.IngestCityEntity(city);
                    }
                }

                // Report progress once every 1000 parsed cities.
                if (total % 1000 != 0)
                {
                    continue;
                }

                Console.WriteLine($"Loading cities - total: {total}, withPop: {withPop}");
            }

            // Optional diagnostic, disabled: estimate the memory held by each generator.
            //
            // foreach (var generator in this.cityFeatureGenerators)
            // {
            //     try
            //     {
            //         Console.WriteLine($"Estimated bytes for {generator.GetType().FullName}: {FeatureUtils.EstimateObjectSizeInBytes(generator)}");
            //     }
            //     catch (Exception)
            //     {
            //         Console.WriteLine($"Could not estimate bytes for  {generator.GetType().FullName}");
            //     }
            // }
        }
示例#4
0
        // Serialization fails due to too much data

        /*
         * public void SerializeTo(string outPath)
         * {
         *  var formatter = new BinaryFormatter();
         *
         *  using (var stream = new FileStream(path: outPath, mode: FileMode.Create, access: FileAccess.Write, share: FileShare.None))
         *  {
         *      formatter.Serialize(stream, this);
         *      stream.Close();
         *  }
         * }
         *
         * public static CityFeaturesAggregator DeserializeFrom(string inPath)
         * {
         *  var formatter = new BinaryFormatter();
         *  using (var stream = new FileStream(path: inPath, mode: FileMode.Open, access: FileAccess.Read, share: FileShare.Read))
         *  {
         *      var aggregator = (CityFeaturesAggregator)formatter.Deserialize(stream);
         *      stream.Close();
         *
         *      return aggregator;
         *  }
         * }
         */

        /// <summary>
        /// Initializes counters and stopwatches and selects the feature generators to use.
        /// </summary>
        /// <param name="cityFeatureGenerators">
        /// City feature generators to use; when null a built-in default set is created.
        /// The list is copied, so the caller's list is never modified.
        /// </param>
        /// <param name="cityAddOnFeatureGenerators">
        /// Add-on feature generators to use; when null a built-in default set is created.
        /// The list is copied, so the caller's list is never modified.
        /// </param>
        /// <param name="externalCityFeatureGenerators">Optional extra city generators appended after the main set.</param>
        /// <param name="externalCityAddOnFeatureGenerators">Optional extra add-on generators appended after the main set.</param>
        /// <param name="featuresConfig">Configuration shared with the default generators; a new <c>FeaturesConfig</c> is used when null.</param>
        public CityFeaturesAggregator(
            List <CityFeaturesGenerator> cityFeatureGenerators                   = null,
            List <AddOnCityFeaturesGenerator> cityAddOnFeatureGenerators         = null,
            List <CityFeaturesGenerator> externalCityFeatureGenerators           = null,
            List <AddOnCityFeaturesGenerator> externalCityAddOnFeatureGenerators = null,
            FeaturesConfig featuresConfig = null)
        {
            this.PrimaryFeatureStopWatches  = new Dictionary <string, Stopwatch>();
            this.SecondaryFeaturesStopWatch = new Stopwatch();
            this.FeatureHostnameHitCounts   = new Dictionary <string, int>();
            this.TotalHostnamesSeen         = 0;
            this.TotalCandidatesSeen        = 0;

            this.featuresConfig = featuresConfig ?? new FeaturesConfig();

            if (cityFeatureGenerators == null)
            {
                this.cityFeatureGenerators = new List <CityFeaturesGenerator>()
                {
                    new AlternateCityAbbreviationsFeaturesGenerator(this.featuresConfig),
                    new AlternateCityFeaturesGenerator(this.featuresConfig),
                    new CityAbbreviationsFeaturesGenerator(this.featuresConfig),
                    new CityAdmin1FeaturesGenerator(this.featuresConfig),
                    new CityCountryFeaturesGenerator(this.featuresConfig),
                    new ExactCityFeaturesGenerator(this.featuresConfig),
                    new FirstLettersCityFeaturesGenerator(this.featuresConfig),
                    new NoVowelsCityFeaturesGenerator(this.featuresConfig),
                    new AirportCodeFeaturesGenerator(this.featuresConfig),
                    new ExactCLLIFeaturesGenerator(this.featuresConfig),

                    //// If you want to uncomment this, also make sure to uncomment loading its dictionary above in this file!
                    ////new ExactUNLOCODEFeaturesGenerator(this.featuresConfig) //// UNLOCODE disabled
                };
            }
            else
            {
                // Defensive copy: the AddRange below must not grow the caller's own list.
                this.cityFeatureGenerators = new List <CityFeaturesGenerator>(cityFeatureGenerators);
            }

            if (cityAddOnFeatureGenerators == null)
            {
                this.cityAddOnFeatureGenerators = new List <AddOnCityFeaturesGenerator>()
                {
                    new ExactAdmin1AddOnFeaturesGenerator(this.featuresConfig),
                    new ExactCountryAddOnFeaturesGenerator(this.featuresConfig),
                    new TLDAddOnFeaturesGenerator(this.featuresConfig),
                    new FirstLettersAdmin1AddOnFeaturesGenerator(this.featuresConfig),
                    new DomainNameAddOnFeaturesGenerator(this.featuresConfig)
                };
            }
            else
            {
                // Defensive copy for the same reason as the city generators.
                this.cityAddOnFeatureGenerators = new List <AddOnCityFeaturesGenerator>(cityAddOnFeatureGenerators);
            }

            if (externalCityFeatureGenerators != null)
            {
                this.cityFeatureGenerators.AddRange(externalCityFeatureGenerators);
            }

            if (externalCityAddOnFeatureGenerators != null)
            {
                this.cityAddOnFeatureGenerators.AddRange(externalCityAddOnFeatureGenerators);
            }

            // One stopwatch and one hit counter per city generator, keyed by its type name.
            // Generators sharing a runtime type share a single entry.
            foreach (var generator in this.cityFeatureGenerators)
            {
                var stopwatchName = generator.GetType().ToString();
                PrimaryFeatureStopWatches[stopwatchName] = new Stopwatch();
                FeatureHostnameHitCounts[stopwatchName]  = 0;
            }
        }
示例#5
0
        /// <summary>
        /// Registers the default value, declared value type and granularity of every
        /// feature produced by the alternate-city-name generator.
        /// </summary>
        /// <param name="featuresConfig">
        /// Configuration forwarded to the base constructor. <c>NullDefaultsAllowed</c> selects
        /// null versus non-null defaults, <c>UseAlternateNameCategories</c> adds the four
        /// name-category flags, and <c>UseSlotIndex</c> adds the two slot-index features.
        /// </param>
        public AlternateCityFeaturesGenerator(FeaturesConfig featuresConfig) : base(featuresConfig)
        {
            var nullDefaults  = this.FeaturesConfig.NullDefaultsAllowed;
            var useCategories = this.FeaturesConfig.UseAlternateNameCategories;
            var useSlotIndex  = this.FeaturesConfig.UseSlotIndex;

            // Entries are added in the same order as before: base features,
            // then name categories, then slot indexes.
            FeatureDefaults = new Features()
            {
                { CityFeatureType.AlternateCityNameMatch, false },
                { CityFeatureType.AlternateCityNamePopulation, nullDefaults ? (uint?)null : (uint?)0 },
                { CityFeatureType.AlternateCityNameLetters, nullDefaults ? (byte?)null : (byte?)0 },
                { CityFeatureType.AlternateCityNameAlternateNamesCount, nullDefaults ? (uint?)null : (uint?)0 }
            };

            if (useCategories)
            {
                FeatureDefaults[CityFeatureType.AlternateCityNameIsPreferredName] = false;
                FeatureDefaults[CityFeatureType.AlternateCityNameIsShortName]     = false;
                FeatureDefaults[CityFeatureType.AlternateCityNameIsColloquial]    = false;
                FeatureDefaults[CityFeatureType.AlternateCityNameIsHistoric]      = false;
            }

            if (useSlotIndex)
            {
                // Slot indexes were previously registered as bool with a false default.
                // They now use byte? with null / byte.MaxValue defaults, the convention
                // the other generators (e.g. CityAdmin1FeaturesGenerator) use.
                // NOTE(review): assumes the feature-generation code emits byte slot indexes - confirm.
                var slotIndexDefault = nullDefaults ? (byte?)null : (byte?)byte.MaxValue;

                FeatureDefaults[CityFeatureType.AlternateCityNameRTLSlotIndex] = slotIndexDefault;
                FeatureDefaults[CityFeatureType.AlternateCityNameLTRSlotIndex] = slotIndexDefault;
            }

            // Declared value type of each feature registered above.
            FeatureDefaultsValueTypes = new FeatureValueTypes()
            {
                { CityFeatureType.AlternateCityNameMatch, typeof(bool) },
                { CityFeatureType.AlternateCityNamePopulation, typeof(uint?) },
                { CityFeatureType.AlternateCityNameLetters, typeof(byte?) },
                { CityFeatureType.AlternateCityNameAlternateNamesCount, typeof(uint?) }
            };

            if (useCategories)
            {
                FeatureDefaultsValueTypes[CityFeatureType.AlternateCityNameIsPreferredName] = typeof(bool);
                FeatureDefaultsValueTypes[CityFeatureType.AlternateCityNameIsShortName]     = typeof(bool);
                FeatureDefaultsValueTypes[CityFeatureType.AlternateCityNameIsColloquial]    = typeof(bool);
                FeatureDefaultsValueTypes[CityFeatureType.AlternateCityNameIsHistoric]      = typeof(bool);
            }

            if (useSlotIndex)
            {
                FeatureDefaultsValueTypes[CityFeatureType.AlternateCityNameRTLSlotIndex] = typeof(byte?);
                FeatureDefaultsValueTypes[CityFeatureType.AlternateCityNameLTRSlotIndex] = typeof(byte?);
            }

            // Granularity of each feature.
            FeatureGranularities = new FeatureGranularities()
            {
                { CityFeatureType.AlternateCityNameMatch, FeatureGranularity.Discrete },
                { CityFeatureType.AlternateCityNamePopulation, FeatureGranularity.Continuous },
                { CityFeatureType.AlternateCityNameLetters, FeatureGranularity.Continuous },
                { CityFeatureType.AlternateCityNameAlternateNamesCount, FeatureGranularity.Continuous }
            };

            if (useCategories)
            {
                FeatureGranularities[CityFeatureType.AlternateCityNameIsPreferredName] = FeatureGranularity.Discrete;
                FeatureGranularities[CityFeatureType.AlternateCityNameIsShortName]     = FeatureGranularity.Discrete;
                FeatureGranularities[CityFeatureType.AlternateCityNameIsColloquial]    = FeatureGranularity.Discrete;
                FeatureGranularities[CityFeatureType.AlternateCityNameIsHistoric]      = FeatureGranularity.Discrete;
            }

            if (useSlotIndex)
            {
                FeatureGranularities[CityFeatureType.AlternateCityNameRTLSlotIndex] = FeatureGranularity.Discrete;
                FeatureGranularities[CityFeatureType.AlternateCityNameLTRSlotIndex] = FeatureGranularity.Discrete;
            }
        }
示例#6
0
        /// <summary>
        /// Registers the default value, declared value type and granularity of every
        /// feature produced by the city + admin1 generator.
        /// </summary>
        /// <param name="featuresConfig">
        /// Configuration forwarded to the base constructor. <c>NullDefaultsAllowed</c> selects
        /// null versus non-null defaults, and <c>UseSlotIndex</c> adds the two slot-index features.
        /// </param>
        public CityAdmin1FeaturesGenerator(FeaturesConfig featuresConfig) : base(featuresConfig)
        {
            var nullDefaults = this.FeaturesConfig.NullDefaultsAllowed;
            var useSlotIndex = this.FeaturesConfig.UseSlotIndex;

            // Defaults are either all null or zero (byte.MaxValue for slot indexes).
            uint? populationDefault = nullDefaults ? (uint?)null : (uint?)0;
            byte? lettersDefault    = nullDefaults ? (byte?)null : (byte?)0;
            byte? slotIndexDefault  = nullDefaults ? (byte?)null : (byte?)byte.MaxValue;

            FeatureDefaults = new Features()
            {
                { CityFeatureType.CityAdmin1NameMatch, false },
                { CityFeatureType.CityAdmin1NamePopulation, populationDefault },
                { CityFeatureType.CityAdmin1LettersBoth, lettersDefault },
                { CityFeatureType.CityAdmin1LettersCity, lettersDefault },
                { CityFeatureType.CityAdmin1LettersAdmin1, lettersDefault }
            };

            if (useSlotIndex)
            {
                FeatureDefaults[CityFeatureType.CityAdmin1RTLSlotIndex] = slotIndexDefault;
                FeatureDefaults[CityFeatureType.CityAdmin1LTRSlotIndex] = slotIndexDefault;
            }

            // Declared value type of each feature registered above.
            FeatureDefaultsValueTypes = new FeatureValueTypes()
            {
                { CityFeatureType.CityAdmin1NameMatch, typeof(bool) },
                { CityFeatureType.CityAdmin1NamePopulation, typeof(uint?) },
                { CityFeatureType.CityAdmin1LettersBoth, typeof(byte?) },
                { CityFeatureType.CityAdmin1LettersCity, typeof(byte?) },
                { CityFeatureType.CityAdmin1LettersAdmin1, typeof(byte?) }
            };

            if (useSlotIndex)
            {
                FeatureDefaultsValueTypes[CityFeatureType.CityAdmin1RTLSlotIndex] = typeof(byte?);
                FeatureDefaultsValueTypes[CityFeatureType.CityAdmin1LTRSlotIndex] = typeof(byte?);
            }

            // Granularity of each feature: the match flag and slot indexes are discrete.
            FeatureGranularities = new FeatureGranularities()
            {
                { CityFeatureType.CityAdmin1NameMatch, FeatureGranularity.Discrete },
                { CityFeatureType.CityAdmin1NamePopulation, FeatureGranularity.Continuous },
                { CityFeatureType.CityAdmin1LettersBoth, FeatureGranularity.Continuous },
                { CityFeatureType.CityAdmin1LettersCity, FeatureGranularity.Continuous },
                { CityFeatureType.CityAdmin1LettersAdmin1, FeatureGranularity.Continuous }
            };

            if (useSlotIndex)
            {
                FeatureGranularities[CityFeatureType.CityAdmin1RTLSlotIndex] = FeatureGranularity.Discrete;
                FeatureGranularities[CityFeatureType.CityAdmin1LTRSlotIndex] = FeatureGranularity.Discrete;
            }
        }
        /// <summary>
        /// Registers the default value, declared value type and granularity of every
        /// feature produced by the no-vowels city name generator.
        /// </summary>
        /// <param name="featuresConfig">
        /// Configuration forwarded to the base constructor. <c>NullDefaultsAllowed</c> selects
        /// null versus non-null defaults, and <c>UseSlotIndex</c> adds the two slot-index features.
        /// </param>
        public NoVowelsCityFeaturesGenerator(FeaturesConfig featuresConfig) : base(featuresConfig)
        {
            var nullDefaults = this.FeaturesConfig.NullDefaultsAllowed;
            var useSlotIndex = this.FeaturesConfig.UseSlotIndex;

            // Defaults are either all null or zero (byte.MaxValue for slot indexes).
            uint? populationDefault = nullDefaults ? (uint?)null : (uint?)0;
            byte? lettersDefault    = nullDefaults ? (byte?)null : (byte?)0;
            float? ratioDefault     = nullDefaults ? (float?)null : (float?)0;
            byte? slotIndexDefault  = nullDefaults ? (byte?)null : (byte?)byte.MaxValue;

            FeatureDefaults = new Features()
            {
                { CityFeatureType.NoVowelsCityNameMatch, false },
                { CityFeatureType.NoVowelsCityNamePopulation, populationDefault },
                { CityFeatureType.NoVowelsCityNameLetters, lettersDefault },
                { CityFeatureType.NoVowelsCityNameLettersRatio, ratioDefault }
            };

            if (useSlotIndex)
            {
                FeatureDefaults[CityFeatureType.NoVowelsCityRTLSlotIndex] = slotIndexDefault;
                FeatureDefaults[CityFeatureType.NoVowelsCityLTRSlotIndex] = slotIndexDefault;
            }

            // Declared value type of each feature registered above.
            FeatureDefaultsValueTypes = new FeatureValueTypes()
            {
                { CityFeatureType.NoVowelsCityNameMatch, typeof(bool) },
                { CityFeatureType.NoVowelsCityNamePopulation, typeof(uint?) },
                { CityFeatureType.NoVowelsCityNameLetters, typeof(byte?) },
                { CityFeatureType.NoVowelsCityNameLettersRatio, typeof(float?) }
            };

            if (useSlotIndex)
            {
                FeatureDefaultsValueTypes[CityFeatureType.NoVowelsCityRTLSlotIndex] = typeof(byte?);
                FeatureDefaultsValueTypes[CityFeatureType.NoVowelsCityLTRSlotIndex] = typeof(byte?);
            }

            // Granularity of each feature: the match flag and slot indexes are discrete.
            FeatureGranularities = new FeatureGranularities()
            {
                { CityFeatureType.NoVowelsCityNameMatch, FeatureGranularity.Discrete },
                { CityFeatureType.NoVowelsCityNamePopulation, FeatureGranularity.Continuous },
                { CityFeatureType.NoVowelsCityNameLetters, FeatureGranularity.Continuous },
                { CityFeatureType.NoVowelsCityNameLettersRatio, FeatureGranularity.Continuous }
            };

            if (useSlotIndex)
            {
                FeatureGranularities[CityFeatureType.NoVowelsCityRTLSlotIndex] = FeatureGranularity.Discrete;
                FeatureGranularities[CityFeatureType.NoVowelsCityLTRSlotIndex] = FeatureGranularity.Discrete;
            }
        }