/// <summary>
/// Finds every "in"/"at" preposition (optionally followed by "the") in the question text and
/// adds a <see cref="CityToken"/> for the text that follows it, cut off at the earliest
/// terminating word, digit or question mark when one is found.
/// </summary>
/// <param name="question">The question whose text is scanned and which receives the tokens.</param>
public void TokenizeTheQuestion(Question question)
        {
            var prepositionMatches = Regex.Matches(question.QuestionText, @"(^|\s)(in|at)\s(the\s)?", RegexOptions.IgnoreCase);

            foreach (Match prepositionMatch in prepositionMatches)
            {
                // Groups[0] is the whole match, so the candidate name begins right after it.
                Group wholeMatch = prepositionMatch.Groups[0];
                int nameStart = wholeMatch.Index + wholeMatch.Length;
                string remainder = question.QuestionText.Substring(nameStart);

                // Words, digits or a question mark that end the candidate name before the end of the text.
                var terminators = Regex.Matches(remainder, @"(^|\s)(when|what('?s?)|\d|(on|in|at|a|right|now|if|then|\?)(\s|$))", RegexOptions.IgnoreCase);

                // Terminator indexes are relative to the remainder, so shift them back into
                // question-text coordinates; with no terminator the name runs to the end of the text.
                int nameEnd = terminators.Count > 0
                    ? nameStart + GetEarlierOccurrenceOfGroup(terminators).Index
                    : question.QuestionText.Length;

                string candidateName = question.QuestionText.Substring(nameStart, nameEnd - nameStart);

                // Only candidates that contain something other than spaces become tokens.
                bool hasNonSpaceCharacter = candidateName.Replace(" ", "").Length != 0;
                if (hasNonSpaceCharacter)
                {
                    question.AddToken(new CityToken(candidateName, nameStart));
                }
            }
        }
// Example #2
0
        /// <summary>
        /// Looks up the city described by <paramref name="cityToken"/> in the Lucene city index.
        /// </summary>
        /// <remarks>
        /// The token's possible city details are tried in enumeration order. For the first detail
        /// whose query produces any hits, the result is chosen from the top five hits by priority:
        /// a hit whose country name equals the searched name, then a hit whose name (or ASCII name)
        /// equals the searched name within the requested country, then the first hit.
        /// </remarks>
        /// <param name="cityToken">Token supplying the possible city/country/division combinations.</param>
        /// <returns>The selected <see cref="City"/>, or <c>null</c> when no detail produces a hit.</returns>
        public static City FindCity(CityToken cityToken)
        {
            var analyzer = new Lucene.Net.Analysis.Standard.StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
            var queryParser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, CityFieldNames.Name, analyzer);

            // Largest population first (reverse sort), then by relevance score.
            var sort = new Sort(new[] { new SortField(CityFieldNames.Population, SortField.LONG, true), SortField.FIELD_SCORE });

            // The directory and the searcher hold handles on the index; dispose both on every
            // exit path (including the early returns below) so repeated calls do not leak them.
            using (var indexDirectory = FSDirectory.Open(ApplicationSettings.CityIndexDirectory))
            using (var searcher = new IndexSearcher(indexDirectory, true))
            {
                foreach (var possibleCityDetail in cityToken.GetPossibleCityDetails())
                {
                    var countryCode = string.Empty;
                    if (!string.IsNullOrEmpty(possibleCityDetail.CountryName))
                    {
                        countryCode = CountryCodes.LookupCountryCode(possibleCityDetail.CountryName);

                        // A country was named but could not be resolved: skip this candidate.
                        if (string.IsNullOrEmpty(countryCode))
                        {
                            continue;
                        }
                    }

                    var queryText = GetQueryText(possibleCityDetail.CityName, countryCode, possibleCityDetail.AdministrativeDivisionName);
                    var query = queryParser.Parse(queryText);

                    var topScoreDocCollector = TopFieldCollector.Create(sort, 5, true, false, false, false);
                    searcher.Search(query, topScoreDocCollector);
                    var results = topScoreDocCollector.TopDocs().ScoreDocs;

                    if (topScoreDocCollector.TotalHits <= 0)
                    {
                        continue;
                    }

                    // Materialize the cities while the searcher is still open.
                    var cities = results.Select(x => new City(searcher.Doc(x.Doc))).ToList();

                    // if the name being searched for matches a country return it
                    foreach (var city in cities)
                    {
                        if (FindCityNamesMatch(possibleCityDetail.CityName, city.CountryName))
                        {
                            return city;
                        }
                    }

                    // if the name matches then return it first
                    foreach (var city in cities)
                    {
                        var nameMatches = FindCityNamesMatch(possibleCityDetail.CityName, city.Name)
                                          || FindCityNamesMatch(possibleCityDetail.CityName, city.AsciiName);
                        var countryMatches = String.IsNullOrEmpty(possibleCityDetail.CountryName)
                                             || FindCityNamesMatch(possibleCityDetail.CountryName, city.CountryName);
                        if (nameMatches && countryMatches)
                        {
                            return city;
                        }
                    }

                    // if there were no direct city name or country name matches then just return
                    // the first result (the sort above puts the largest population first)
                    return cities[0];
                }
            }

            return null;
        }

        /// <summary>
        /// Compares two names with the invariant culture, ignoring case and nonspacing
        /// (combining) characters such as diacritics.
        /// </summary>
        private static bool FindCityNamesMatch(string left, string right)
        {
            return String.Compare(left, right, CultureInfo.InvariantCulture, CompareOptions.IgnoreNonSpace | CompareOptions.IgnoreCase) == 0;
        }