/// <summary>
/// Scans the question text for "in"/"at" (optionally followed by "the") and
/// registers the text that follows each occurrence as a <see cref="CityToken"/>.
/// </summary>
/// <remarks>
/// The candidate name runs from the end of the preposition match to the end of
/// the question, unless a terminating word, digit or "?" is found first; the
/// cut-off point is the index of the group chosen by
/// <c>GetEarlierOccurrenceOfGroup</c>. Candidates that are empty or made up
/// solely of space characters are skipped.
/// </remarks>
/// <param name="question">The question to read text from and add tokens to.</param>
public void TokenizeTheQuestion(Question question)
{
    var prepositionMatches = Regex.Matches(
        question.QuestionText,
        @"(^|\s)(in|at)\s(the\s)?",
        RegexOptions.IgnoreCase);

    foreach (Match prepositionMatch in prepositionMatches)
    {
        var text = question.QuestionText;

        // Group 0 is the whole match, so the candidate starts right after it.
        Group wholeMatch = prepositionMatch.Groups[0];
        int candidateStart = wholeMatch.Index + wholeMatch.Length;
        string remainder = text.Substring(candidateStart);

        var terminators = Regex.Matches(
            remainder,
            @"(^|\s)(when|what('?s?)|\d|(on|in|at|a|right|now|if|then|\?)(\s|$))",
            RegexOptions.IgnoreCase);

        // Terminator indices are relative to the remainder, so the index of the
        // selected group is also the length of the candidate name.
        int candidateLength = terminators.Count > 0
            ? GetEarlierOccurrenceOfGroup(terminators).Index
            : remainder.Length;

        string candidate = text.Substring(candidateStart, candidateLength);

        // Only the space character is stripped here; other whitespace counts as content.
        if (candidate.Replace(" ", "").Length == 0)
        {
            continue;
        }

        question.AddToken(new CityToken(candidate, candidateStart));
    }
}
/// <summary>
/// Looks up the city described by <paramref name="cityToken"/> in the Lucene
/// index located at <c>ApplicationSettings.CityIndexDirectory</c>.
/// </summary>
/// <remarks>
/// The candidates returned by <c>cityToken.GetPossibleCityDetails()</c> are tried
/// in order. A candidate that names a country for which no country code can be
/// looked up is skipped. The first candidate whose query produces any hits
/// decides the result, using at most the top five hits:
/// <list type="number">
/// <item>a hit whose country name equals the candidate's city name;</item>
/// <item>a hit whose name or ASCII name equals the candidate's city name and,
/// when the candidate specifies a country, whose country name equals it;</item>
/// <item>otherwise the first hit under the sort (population with the reverse
/// flag set, then score).</item>
/// </list>
/// The analyzer, index directory and searcher are opened per call and disposed
/// before returning so that index file handles are not leaked.
/// </remarks>
/// <param name="cityToken">Token supplying the candidate city details to search for.</param>
/// <returns>The selected <see cref="City"/>, or <c>null</c> when no candidate produced a hit.</returns>
public static City FindCity(CityToken cityToken)
{
    using (var analyzer = new Lucene.Net.Analysis.Standard.StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30))
    using (var indexDirectory = FSDirectory.Open(ApplicationSettings.CityIndexDirectory))
    using (var searcher = new IndexSearcher(indexDirectory, true))
    {
        var queryParser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, CityFieldNames.Name, analyzer);
        var sort = new Sort(new[]
        {
            new SortField(CityFieldNames.Population, SortField.LONG, true),
            SortField.FIELD_SCORE
        });

        var possibleCityDetails = cityToken.GetPossibleCityDetails();
        foreach (var possibleCityDetail in possibleCityDetails)
        {
            var topScoreDocCollector = TopFieldCollector.Create(sort, 5, true, false, false, false);

            var countryCode = string.Empty;
            if (!string.IsNullOrEmpty(possibleCityDetail.CountryName))
            {
                countryCode = CountryCodes.LookupCountryCode(possibleCityDetail.CountryName);

                // A country was named but could not be resolved: skip this candidate.
                if (string.IsNullOrEmpty(countryCode))
                {
                    continue;
                }
            }

            var queryText = GetQueryText(possibleCityDetail.CityName, countryCode, possibleCityDetail.AdministrativeDivisionName);
            var query = queryParser.Parse(queryText);
            searcher.Search(query, topScoreDocCollector);

            var results = topScoreDocCollector.TopDocs().ScoreDocs;
            if (topScoreDocCollector.TotalHits <= 0)
            {
                continue;
            }

            // Materialize the cities now, while the searcher is still open.
            var cities = results.Select(x => new City(searcher.Doc(x.Doc))).ToList();

            // if the name being searched for matches a country return it
            foreach (var city in cities)
            {
                if (NamesMatchIgnoringCaseAndNonSpace(possibleCityDetail.CityName, city.CountryName))
                {
                    return city;
                }
            }

            // if the name matches then return it first
            foreach (var city in cities)
            {
                var cityNameMatches = NamesMatchIgnoringCaseAndNonSpace(possibleCityDetail.CityName, city.Name);
                var asciiNameMatches = NamesMatchIgnoringCaseAndNonSpace(possibleCityDetail.CityName, city.AsciiName);
                var countryNameMatches = String.IsNullOrEmpty(possibleCityDetail.CountryName)
                    || NamesMatchIgnoringCaseAndNonSpace(possibleCityDetail.CountryName, city.CountryName);

                if ((cityNameMatches || asciiNameMatches) && countryNameMatches)
                {
                    return city;
                }
            }

            // if there were no direct city name or country name matches then just return
            // the first result (which by default is sorted by population)
            var firstDocId = results[0].Doc;
            var firstDocument = searcher.Doc(firstDocId);
            return new City(firstDocument);
        }

        return null;
    }
}

/// <summary>
/// Compares two names under the invariant culture, ignoring case and
/// nonspacing combining characters (<see cref="CompareOptions.IgnoreNonSpace"/>).
/// </summary>
/// <param name="left">First name to compare; may be <c>null</c>.</param>
/// <param name="right">Second name to compare; may be <c>null</c>.</param>
/// <returns><c>true</c> when the two names compare as equal.</returns>
private static bool NamesMatchIgnoringCaseAndNonSpace(string left, string right)
{
    return String.Compare(
        left,
        right,
        CultureInfo.InvariantCulture,
        CompareOptions.IgnoreNonSpace | CompareOptions.IgnoreCase) == 0;
}