/// <summary>
/// Marks a city candidate when a subdomain part of the hostname is contained in the
/// <c>NameVariationsLower</c> collection of the candidate's country.
/// </summary>
/// <param name="parsedHostname">Split hostname whose subdomain parts are inspected.</param>
/// <param name="cityEntity">Candidate city; its country entity supplies the name variations.</param>
/// <param name="features">Feature collection of the candidate; updated in place.</param>
public override void AppendFeatures(HostnameSplitterResult parsedHostname, GeonamesCityEntity cityEntity, Features features)
        {
            if (parsedHostname == null || parsedHostname.SubdomainParts == null)
            {
                return;
            }

            var country = cityEntity?.CountryEntity;

            if (country == null || string.IsNullOrWhiteSpace(country.Name) || features == null)
            {
                return;
            }

            var knownCountryNames = country.NameVariationsLower;

            foreach (var part in parsedHostname.SubdomainParts)
            {
                if (!knownCountryNames.Contains(part.Substring))
                {
                    continue;
                }

                features[CityFeatureType.ExactCountryNameMatch] = true;

                // Only overwrite the stored letter count when this match is longer than the one already recorded.
                var matchedLength   = part.Substring.Length;
                var hasStoredLength = features.ContainsKey(CityFeatureType.ExactCountryLetters) && features[CityFeatureType.ExactCountryLetters] != null;

                if (!hasStoredLength || ((byte)features[CityFeatureType.ExactCountryLetters]) < matchedLength)
                {
                    features[CityFeatureType.ExactCountryLetters] = Convert.ToByte(matchedLength);
                }

                // Slot indexes are written for every matching part, so the last matching part wins.
                if (this.FeaturesConfig.UseSlotIndex)
                {
                    features[CityFeatureType.ExactCountryRTLSlotIndex] = part.RTLSlotIndex;
                    features[CityFeatureType.ExactCountryLTRSlotIndex] = part.LTRSlotIndex;
                }
            }
        }
        /// <summary>
        /// Sets <c>CityFeatureType.TLDMatch</c> when the hostname's top-level domain corresponds to
        /// the TLD of the candidate city's country, either directly or through the
        /// <c>DomainsToCountryTlds</c> lookup keyed by registrable domain.
        /// </summary>
        /// <param name="parsedHostname">Split hostname providing the TLD and domain info.</param>
        /// <param name="cityEntity">Candidate city; its country entity supplies the expected TLD.</param>
        /// <param name="features">Feature collection of the candidate; updated in place.</param>
        public override void AppendFeatures(HostnameSplitterResult parsedHostname, GeonamesCityEntity cityEntity, Features features)
        {
            if (parsedHostname == null || string.IsNullOrWhiteSpace(parsedHostname.TLD))
            {
                return;
            }

            var country = cityEntity?.CountryEntity;

            if (country == null || string.IsNullOrWhiteSpace(country.TLD) || features == null)
            {
                return;
            }

            var expectedTld = country.TLD.ToLowerInvariant();

            // Only the right-most dot-separated label of the hostname TLD takes part in the comparison.
            var tldLabels  = parsedHostname.TLD.Split('.');
            var finalLabel = tldLabels[tldLabels.Length - 1];

            if (string.IsNullOrWhiteSpace(finalLabel))
            {
                return;
            }

            // The country TLD is compared against the label with a leading dot prepended.
            var dottedHostnameTld = string.Format(CultureInfo.InvariantCulture, ".{0}", finalLabel.ToLowerInvariant());

            string mappedTld;

            var matchesDirectly = expectedTld == dottedHostnameTld;

            // The registrable-domain mapping is consulted only when the direct comparison fails.
            var matchesViaDomainMapping =
                !matchesDirectly &&
                parsedHostname.DomainInfo?.RegistrableDomain != null &&
                DomainsToCountryTlds.TryGetValue(parsedHostname.DomainInfo.RegistrableDomain, out mappedTld) &&
                expectedTld == mappedTld;

            if (matchesDirectly || matchesViaDomainMapping)
            {
                features[CityFeatureType.TLDMatch] = true;
            }
        }
        /// <summary>
        /// Marks a city candidate when a subdomain part of the hostname is one of the variations
        /// produced by <c>GenerateVariationsForName</c> for the candidate's Admin1 entity name.
        /// </summary>
        /// <param name="parsedHostname">Split hostname whose subdomain parts are inspected.</param>
        /// <param name="cityEntity">Candidate city; its Admin1 entity supplies the name.</param>
        /// <param name="features">Feature collection of the candidate; updated in place.</param>
        public override void AppendFeatures(HostnameSplitterResult parsedHostname, GeonamesCityEntity cityEntity, Features features)
        {
            if (parsedHostname == null || parsedHostname.SubdomainParts == null)
            {
                return;
            }

            if (cityEntity == null || cityEntity.Admin1Entity == null)
            {
                return;
            }

            if (string.IsNullOrWhiteSpace(cityEntity.Admin1Entity.Name) || features == null)
            {
                return;
            }

            var admin1Variations = this.GenerateVariationsForName(cityEntity.Admin1Entity.Name);

            foreach (var part in parsedHostname.SubdomainParts)
            {
                if (!admin1Variations.Contains(part.Substring))
                {
                    continue;
                }

                features[CityFeatureType.FirstLettersAdmin1NameMatch] = true;

                // The letter count and ratio are replaced only by a longer match than the one already stored.
                var matchedLength   = part.Substring.Length;
                var hasStoredLength = features.ContainsKey(CityFeatureType.FirstLettersAdmin1Letters) && features[CityFeatureType.FirstLettersAdmin1Letters] != null;

                if (!hasStoredLength || ((byte)features[CityFeatureType.FirstLettersAdmin1Letters]) < matchedLength)
                {
                    features[CityFeatureType.FirstLettersAdmin1Letters] = Convert.ToByte(matchedLength);

                    // Ratio of matched letters to the full Admin1 name length, computed in single precision.
                    features[CityFeatureType.FirstLettersAdmin1LettersRatio] = (float?)(matchedLength / ((1.0f) * cityEntity.Admin1Entity.Name.Length));
                }

                if (this.FeaturesConfig.UseSlotIndex)
                {
                    features[CityFeatureType.FirstLettersAdmin1RTLSlotIndex] = part.RTLSlotIndex;
                    features[CityFeatureType.FirstLettersAdmin1LTRSlotIndex] = part.LTRSlotIndex;
                }
            }
        }
        /// <summary>
        /// Decides whether a hostname is accepted: it must have a registrable domain, contain a dot,
        /// contain no space or comma, and its domain must not have exceeded the per-domain sample cap.
        /// </summary>
        /// <param name="hostname">Raw hostname text.</param>
        /// <param name="parsedHostname">Split form of the hostname, providing the registrable domain.</param>
        /// <returns><c>true</c> when the hostname is accepted; otherwise <c>false</c>.</returns>
        private bool ShouldAcceptDomain(string hostname, HostnameSplitterResult parsedHostname)
        {
            var registrableDomain = parsedHostname?.DomainInfo?.RegistrableDomain;

            if (registrableDomain == null)
            {
                return false;
            }

            if (string.IsNullOrEmpty(hostname))
            {
                return false;
            }

            var looksLikeHostname = hostname.Contains(".") && !hostname.Contains(" ") && !hostname.Contains(",");

            if (!looksLikeHostname)
            {
                return false;
            }

            int countForDomain;

            // Domains without a recorded count are always accepted.
            // NOTE(review): `<=` still accepts a hostname when the stored count already equals the cap — confirm this is intended.
            return !this.HostnameCounts.TryGetValue(registrableDomain, out countForDomain)
                || countForDomain <= this.maxPositiveSamplesPerDomain;
        }
Beispiel #5
0
        /// <summary>
        /// Sets <c>CityFeatureType.HostnamePatternMatch</c> when a pattern rule stored for the hostname's
        /// registrable domain maps to coordinates within 100 km of the candidate city.
        /// </summary>
        /// <param name="parsedHostname">Split hostname providing the registrable domain and subdomain parts.</param>
        /// <param name="cityEntity">Candidate city whose latitude/longitude are compared against rule coordinates.</param>
        /// <param name="features">Feature collection of the candidate; updated in place. Nothing happens when it is null.</param>
        public override void AppendFeatures(HostnameSplitterResult parsedHostname, GeonamesCityEntity cityEntity, Features features)
        {
            // TODO: Make this configurable
            // TODO: Distance should vary depending on geohash length?
            const int MaxMatchDistanceKilometers = 100;

            // `features` is checked here as well: without it the first matching rule would
            // dereference a null feature collection.
            if (parsedHostname == null ||
                parsedHostname.DomainInfo?.RegistrableDomain == null ||
                parsedHostname.SubdomainParts == null ||
                parsedHostname.SubdomainParts.Count == 0 ||
                cityEntity == null ||
                features == null)
            {
                return;
            }

            var domain = parsedHostname.DomainInfo.RegistrableDomain;

            Dictionary <PatternRule, PatternMiningCoordinates> rulesToCoordinates;

            // Without stored rules for this domain there is nothing to match against.
            if (!this.hostnamePatternRules.TryGetValue(domain, out rulesToCoordinates))
            {
                return;
            }

            var ruleAtoms = this.miner.CreateRuleAtoms(parsedHostname.SubdomainParts);

            if (ruleAtoms == null || ruleAtoms.Count == 0)
            {
                return;
            }

            var rules = this.miner.GeneratePossibleRules(ruleAtoms);

            if (rules == null || rules.Count == 0)
            {
                return;
            }

            foreach (var rule in rules)
            {
                PatternMiningCoordinates coordinates;

                if (!rulesToCoordinates.TryGetValue(rule, out coordinates))
                {
                    continue;
                }

                var distance = DistanceHelper.Distance(cityEntity.Latitude, cityEntity.Longitude, coordinates.Latitude, coordinates.Longitude, DistanceUnit.Kilometer);

                if (distance <= MaxMatchDistanceKilometers)
                {
                    features[CityFeatureType.HostnamePatternMatch] = true;

                    // The feature is a plain flag, so further rules cannot change the outcome.
                    break;
                }
            }
        }
Beispiel #6
0
        /// <summary>
        /// Accepts a hostname only when it has a registrable domain, is non-empty, contains a dot,
        /// and contains neither a space nor a comma.
        /// </summary>
        /// <param name="hostname">Raw hostname text.</param>
        /// <param name="parsedHostname">Split form of the hostname, providing the registrable domain.</param>
        /// <returns><c>true</c> when all checks pass; otherwise <c>false</c>.</returns>
        private bool ShouldAcceptDomain(string hostname, HostnameSplitterResult parsedHostname)
        {
            var hasRegistrableDomain = parsedHostname?.DomainInfo?.RegistrableDomain != null;

            return hasRegistrableDomain
                && !string.IsNullOrEmpty(hostname)
                && hostname.Contains(".")
                && !hostname.Contains(" ")
                && !hostname.Contains(",");
        }
        /// <summary>
        /// Reports whether ingestion of further hostnames should continue. This implementation
        /// ignores all of its arguments and always answers <c>true</c>.
        /// </summary>
        /// <param name="hostname">Raw hostname text (unused).</param>
        /// <param name="parsedHostname">Split form of the hostname (unused).</param>
        /// <param name="storedTruePositivesCount">Number of stored true positives (unused).</param>
        /// <param name="storedTrueNegativesCount">Number of stored true negatives (unused).</param>
        /// <returns>Always <c>true</c>.</returns>
        private bool ShouldContinueIngestingNewHostnames(string hostname, HostnameSplitterResult parsedHostname, int storedTruePositivesCount, int storedTrueNegativesCount)
        {
            // A cut-off based on this.ValidLinesCounter (thresholds of 10000000 and 30000000 were
            // tried) used to live here as commented-out code; no limit is currently enforced.
            return true;
        }
        /// <summary>
        /// Stores the hash code of the hostname's registrable domain as the
        /// <c>CityFeatureType.Domain</c> feature when <c>FeaturesConfig.UseDomainAsFeature</c> is enabled.
        /// </summary>
        /// <param name="parsedHostname">Split hostname providing the registrable domain.</param>
        /// <param name="cityEntity">Candidate city (not read by this implementation).</param>
        /// <param name="features">Feature collection of the candidate; updated in place.</param>
        public override void AppendFeatures(HostnameSplitterResult parsedHostname, GeonamesCityEntity cityEntity, Features features)
        {
            if (!this.FeaturesConfig.UseDomainAsFeature)
            {
                return;
            }

            var registrableDomain = parsedHostname?.DomainInfo?.RegistrableDomain;

            if (registrableDomain != null && features != null)
            {
                // NOTE(review): string.GetHashCode() is not guaranteed to be stable across processes,
                // platforms or runtime versions — confirm the feature value is never persisted or
                // compared between runs.
                features[CityFeatureType.Domain] = registrableDomain.GetHashCode();
            }
        }
        /// <summary>
        /// Increases the counter kept in <c>HostnameCounts</c> for the hostname's registrable domain by one,
        /// starting from zero when the domain has no entry yet.
        /// </summary>
        /// <param name="parsedHostname">Split hostname providing the registrable domain; ignored when the domain is missing or blank.</param>
        private void IncrementHostnameCount(HostnameSplitterResult parsedHostname)
        {
            var registrableDomain = parsedHostname?.DomainInfo?.RegistrableDomain;

            if (!string.IsNullOrWhiteSpace(registrableDomain))
            {
                int previousCount;

                // A domain seen for the first time gets a count of 1.
                var updatedCount = this.HostnameCounts.TryGetValue(registrableDomain, out previousCount)
                    ? previousCount + 1
                    : 1;

                this.HostnameCounts[registrableDomain] = updatedCount;
            }
        }
 /// <summary>
 /// Produces candidate cities for a parsed hostname together with the features computed for each candidate.
 /// </summary>
 /// <param name="parsedHostname">The split hostname to generate candidates for.</param>
 /// <returns>A dictionary keyed by candidate city entity, mapping each candidate to its features.</returns>
 public abstract Dictionary <GeonamesCityEntity, Features> GenerateCandidatesAndFeatures(HostnameSplitterResult parsedHostname);
        /// <summary>
        /// Looks up every distinct subdomain part of the hostname in <c>codesToEntitiesToFeatures</c> and
        /// returns the cities found, each with a copy of its stored features.
        /// </summary>
        /// <param name="parsedHostname">Split hostname; a null value or null subdomain parts yields an empty result.</param>
        /// <returns>
        /// A dictionary from candidate city to its features. When several parts resolve to the same
        /// city, the entry produced by the part enumerated last replaces the earlier one.
        /// </returns>
        public override Dictionary <GeonamesCityEntity, Features> GenerateCandidatesAndFeatures(HostnameSplitterResult parsedHostname)
        {
            var result = new Dictionary <GeonamesCityEntity, Features>();

            // Collect the parts in a set so that each distinct part is looked up once.
            var distinctParts = new HashSet <SubdomainPart>();

            if (parsedHostname != null && parsedHostname.SubdomainParts != null)
            {
                distinctParts.UnionWith(parsedHostname.SubdomainParts);
            }

            foreach (var part in distinctParts)
            {
                EntitiesToFeatures matchesForCode;

                if (!codesToEntitiesToFeatures.TryGetValue(part.Substring, out matchesForCode))
                {
                    continue;
                }

                foreach (var match in matchesForCode)
                {
                    // Copy the stored features so the shared lookup table is never mutated.
                    var copiedFeatures = new Features();

                    foreach (var stored in match.Value)
                    {
                        copiedFeatures[stored.Key] = stored.Value;
                    }

                    if (this.FeaturesConfig.UseSlotIndex)
                    {
                        copiedFeatures[CityFeatureType.ExactUNLOCODECodeRTLSlotIndex] = part.RTLSlotIndex;
                        copiedFeatures[CityFeatureType.ExactUNLOCODECodeLTRSlotIndex] = part.LTRSlotIndex;
                    }

                    result[match.Key] = copiedFeatures;
                }
            }

            return result;
        }
Beispiel #12
0
        /// <summary>
        /// Runs every primary feature generator on the hostname, merges their candidates per city,
        /// lets every add-on generator append features to each candidate, and optionally fills in
        /// default values for features that are still missing.
        /// </summary>
        /// <param name="parsedHostname">Split hostname to generate candidates for.</param>
        /// <returns>A dictionary from candidate city to its merged features.</returns>
        public Dictionary <GeonamesCityEntity, Features> GenerateCandidatesForHostname(HostnameSplitterResult parsedHostname)
        {
            this.TotalHostnamesSeen++;

            var merged = new Dictionary <GeonamesCityEntity, Features>();

            // Step 1: collect candidates from the primary generators, timing each generator by type name.
            foreach (var primaryGenerator in this.cityFeatureGenerators)
            {
                var generatorKey = primaryGenerator.GetType().ToString();

                PrimaryFeatureStopWatches[generatorKey].Start();
                var generated = primaryGenerator.GenerateCandidatesAndFeatures(parsedHostname);
                PrimaryFeatureStopWatches[generatorKey].Stop();

                if (generated.Count > 0)
                {
                    FeatureHostnameHitCounts[generatorKey]++;
                }

                foreach (var generatedEntry in generated)
                {
                    Features accumulated;

                    if (!merged.TryGetValue(generatedEntry.Key, out accumulated))
                    {
                        accumulated = new Features();
                        merged[generatedEntry.Key] = accumulated;
                    }

                    // Later generators overwrite values written by earlier ones for the same feature key.
                    foreach (var generatedFeature in generatedEntry.Value)
                    {
                        accumulated[generatedFeature.Key] = generatedFeature.Value;
                    }
                }
            }

            // Step 2: let each add-on generator extend the features of every merged candidate.
            foreach (var candidate in merged)
            {
                foreach (var addOn in this.cityAddOnFeatureGenerators)
                {
                    this.SecondaryFeaturesStopWatch.Start();
                    addOn.AppendFeatures(parsedHostname, candidate.Key, candidate.Value);
                    this.SecondaryFeaturesStopWatch.Stop();
                }
            }

            // Step 3: defaults are applied outside of feature generation because a feature that
            // did not match at all may still need its default value filled in.
            if (this.featuresConfig.InitializeDefaultFeatures)
            {
                foreach (var candidate in merged)
                {
                    var candidateFeatures = candidate.Value;

                    foreach (var primaryGenerator in this.cityFeatureGenerators)
                    {
                        foreach (var primaryDefault in primaryGenerator.FeatureDefaults)
                        {
                            if (candidateFeatures.ContainsKey(primaryDefault.Key))
                            {
                                continue;
                            }

                            candidateFeatures[primaryDefault.Key] = primaryDefault.Value;
                        }
                    }

                    foreach (var addOn in this.cityAddOnFeatureGenerators)
                    {
                        foreach (var addOnDefault in addOn.FeatureDefaults)
                        {
                            if (candidateFeatures.ContainsKey(addOnDefault.Key))
                            {
                                continue;
                            }

                            candidateFeatures[addOnDefault.Key] = addOnDefault.Value;
                        }
                    }
                }
            }

            TotalCandidatesSeen += merged.Count;

            return merged;
        }
Beispiel #13
0
        /// <summary>
        /// Looks up every subdomain part of the hostname in <c>variationsToEntitiesToFeatures</c> and
        /// returns the cities found, each with a copy of its stored features.
        /// </summary>
        /// <param name="parsedHostname">Split hostname; a null value or null subdomain parts yields an empty result.</param>
        /// <returns>
        /// A dictionary from candidate city to its features. When several parts resolve to the same
        /// city, the entry produced by the later part replaces the earlier one.
        /// </returns>
        public override Dictionary <GeonamesCityEntity, Features> GenerateCandidatesAndFeatures(HostnameSplitterResult parsedHostname)
        {
            var result = new Dictionary <GeonamesCityEntity, Features>();

            if (parsedHostname == null || parsedHostname.SubdomainParts == null)
            {
                return result;
            }

            foreach (var part in parsedHostname.SubdomainParts)
            {
                EntitiesToFeatures matchesForVariation;

                if (!variationsToEntitiesToFeatures.TryGetValue(part.Substring, out matchesForVariation))
                {
                    continue;
                }

                foreach (var match in matchesForVariation)
                {
                    // Copy the stored features so the shared lookup table is never mutated.
                    var copiedFeatures = new Features();

                    foreach (var stored in match.Value)
                    {
                        copiedFeatures[stored.Key] = stored.Value;
                    }

                    if (this.FeaturesConfig.UseSlotIndex)
                    {
                        copiedFeatures[CityFeatureType.AlternateCityNameRTLSlotIndex] = part.RTLSlotIndex;
                        copiedFeatures[CityFeatureType.AlternateCityNameLTRSlotIndex] = part.LTRSlotIndex;
                    }

                    result[match.Key] = copiedFeatures;
                }
            }

            return result;
        }
 /// <summary>
 /// Appends features for an already generated candidate city to the supplied feature collection.
 /// </summary>
 /// <param name="parsedHostname">The split hostname the candidate was generated for.</param>
 /// <param name="cityEntity">The candidate city the features describe.</param>
 /// <param name="features">The candidate's feature collection; implementations write into it in place.</param>
 public abstract void AppendFeatures(HostnameSplitterResult parsedHostname, GeonamesCityEntity cityEntity, Features features);
        /// <summary>
        /// Proposes at most one candidate city for the hostname: the closest city of the
        /// highest-confidence pattern rule stored for the hostname's registrable domain.
        /// </summary>
        /// <param name="parsedHostname">
        /// Split hostname. A null value, a missing registrable domain, or missing/empty subdomain
        /// parts yield an empty result instead of an exception.
        /// </param>
        /// <returns>
        /// An empty dictionary when no rule applies; otherwise a single entry mapping the best rule's
        /// closest city to features carrying the match flag and the rule confidence.
        /// </returns>
        public override Dictionary <GeonamesCityEntity, Features> GenerateCandidatesAndFeatures(HostnameSplitterResult parsedHostname)
        {
            var candidatesAndFeatures = new Dictionary <GeonamesCityEntity, Features>();

            var domain         = parsedHostname?.DomainInfo?.RegistrableDomain;
            var subdomainParts = parsedHostname?.SubdomainParts;

            // A null domain must never reach TryGetValue below, and a null hostname must not be dereferenced.
            if (domain == null || subdomainParts == null || subdomainParts.Count == 0)
            {
                return candidatesAndFeatures;
            }

            Dictionary <PatternRule, PatternMiningCoordinates> rulesToCoordinates;

            // Look the domain up first: without stored rules there is no point in mining the hostname.
            if (!this.hostnamePatternRules.TryGetValue(domain, out rulesToCoordinates))
            {
                return candidatesAndFeatures;
            }

            var ruleAtoms = this.miner.CreateRuleAtoms(subdomainParts);

            if (ruleAtoms == null || ruleAtoms.Count == 0)
            {
                return candidatesAndFeatures;
            }

            var rules = this.miner.GeneratePossibleRules(ruleAtoms);

            if (rules == null || rules.Count == 0)
            {
                return candidatesAndFeatures;
            }

            PatternMiningCoordinates bestCoordinates = null;

            foreach (var rule in rules)
            {
                PatternMiningCoordinates currentCoordinates;

                if (!rulesToCoordinates.TryGetValue(rule, out currentCoordinates))
                {
                    continue;
                }

                // Rules without a closest city cannot produce a candidate.
                if (currentCoordinates.ClosestCity == null)
                {
                    continue;
                }

                // Strict comparison: on equal confidence the rule found first is kept.
                if (bestCoordinates == null || currentCoordinates.Confidence > bestCoordinates.Confidence)
                {
                    bestCoordinates = currentCoordinates;
                }
            }

            if (bestCoordinates != null)
            {
                var features = this.InitializeDefaultFeatureValues();
                features[CityFeatureType.HostnamePatternMatch]      = true;
                features[CityFeatureType.HostnamePatternConfidence] = bestCoordinates.Confidence;
                candidatesAndFeatures[bestCoordinates.ClosestCity]  = features;
            }

            return candidatesAndFeatures;
        }