// "habho" is "abo" with one extra letter at the start and one in the middle;
// with a maximum of 2 the reported distance is expected to be exactly 2.
public void EditDistance_SingleInsertionAtStartAndMiddle_2()
{
    const int maxDistance = 2;

    int actual = ApproximateMatcher.EditDistance("abo", "habho", maxDistance, false);

    Assert.AreEqual(2, actual);
}
// Same pair of strings as the two-insertion case, but the maximum is only 1,
// so the matcher is expected to report the EditDistanceLargerThanMax sentinel.
public void EditDistance_SingleInsertionAtStartAndMiddle_1Max_EditDistanceLargerThanMax()
{
    const int maxDistance = 1;

    int actual = ApproximateMatcher.EditDistance("abo", "habho", maxDistance, false);

    Assert.AreEqual(ApproximateMatcher.EditDistanceLargerThanMax, actual);
}
// "aboh" is "abo" with one extra trailing letter; expected distance is 1.
public void EditDistance_SingleInsertionAtEnd_1()
{
    const int maxDistance = 1;

    int actual = ApproximateMatcher.EditDistance("abo", "aboh", maxDistance, false);

    Assert.AreEqual(1, actual);
}
// Identical strings with a maximum of 0: expected distance is 0.
public void EditDistance_Same_0()
{
    const int maxDistance = 0;

    int actual = ApproximateMatcher.EditDistance("abo", "abo", maxDistance, false);

    Assert.AreEqual(0, actual);
}
/// <summary>
/// Ranks two languages relative to the current search string. Criteria are applied in order:
/// an exact (case-insensitive) match of the first or second name, an exact match of the whole
/// language tag, an exact match of the tag's language part, the editing distance between the
/// lower-cased search string and the primary name, the primary name itself, and finally the tag.
/// </summary>
/// <param name="x">The first language to compare.</param>
/// <param name="y">The second language to compare.</param>
/// <returns>
/// A negative value if <paramref name="x"/> sorts before <paramref name="y"/>, a positive value
/// if it sorts after, and zero if the tags are identical or every criterion ties.
/// </returns>
public int Compare(LanguageInfo x, LanguageInfo y)
{
    const StringComparison ignoreCase = StringComparison.InvariantCultureIgnoreCase;
    const int maxUsefulDistance = 25;

    if (x.LanguageTag == y.LanguageTag)
    {
        return 0;
    }

    string xPrimary = x.Names[0];
    string yPrimary = y.Names[0];

    if (!xPrimary.Equals(yPrimary, ignoreCase))
    {
        // Favor ones where some language matches to solve BL-1141
        if (xPrimary.Equals(_searchString, ignoreCase))
        {
            return -1;
        }
        if (yPrimary.Equals(_searchString, ignoreCase))
        {
            return 1;
        }
        if (x.Names.Count > 1 && x.Names[1].Equals(_searchString, ignoreCase))
        {
            return -1;
        }
        if (y.Names.Count > 1 && y.Names[1].Equals(_searchString, ignoreCase))
        {
            return 1;
        }
    }

    // Next preference: the complete tag is what was typed.
    if (x.LanguageTag.Equals(_searchString, ignoreCase))
    {
        return -1;
    }
    if (y.LanguageTag.Equals(_searchString, ignoreCase))
    {
        return 1;
    }

    // Then: the language part of the tag is what was typed. The two lookups are deliberately
    // kept sequential so that y's tag is only examined when x's did not already decide.
    if (IetfLanguageTag.GetLanguagePart(x.LanguageTag).Equals(_searchString, ignoreCase))
    {
        return -1;
    }
    if (IetfLanguageTag.GetLanguagePart(y.LanguageTag).Equals(_searchString, ignoreCase))
    {
        return 1;
    }

    // Use the "editing distance" relative to the search string to sort by the primary name.
    // (But we don't really care once the editing distance gets very large.)
    // See https://silbloom.myjetbrains.com/youtrack/issue/BL-5847 for motivation.
    // Timing tests indicate that 1) calculating these distances doesn't slow down the sorting noticeably
    // and 2) caching these distances in a dictionary also doesn't speed up the sorting noticeably.
    int xDistance = ApproximateMatcher.EditDistance(_lowerSearch, xPrimary.ToLowerInvariant(), maxUsefulDistance, false);
    int yDistance = ApproximateMatcher.EditDistance(_lowerSearch, yPrimary.ToLowerInvariant(), maxUsefulDistance, false);
    int byDistance = xDistance - yDistance;
    if (byDistance != 0)
    {
        return byDistance;
    }

    // If the editing distances for the primary names are the same, sort by the primary name,
    // and break any remaining tie with the language tag.
    int byName = string.Compare(xPrimary, yPrimary, ignoreCase);
    return byName != 0
        ? byName
        : string.Compare(x.LanguageTag, y.LanguageTag, ignoreCase);
}
/// <summary>
/// Sorting the languages for display is tricky: we want the most relevant languages at the
/// top of the list, so we can't simply sort alphabetically by language name or by language tag,
/// but need to take both items into account together with the current search string. Ordering
/// by relevance is clearly impossible since we'd have to read the user's mind and apply that
/// knowledge to the data. But the heuristics we use here may be better than nothing...
/// </summary>
/// <param name="x">The first language to compare.</param>
/// <param name="y">The second language to compare.</param>
/// <returns>
/// A negative value if <paramref name="x"/> should be listed before <paramref name="y"/>,
/// a positive value if it should be listed after, and zero if the tags are identical or no
/// criterion distinguishes the two.
/// </returns>
public int Compare(LanguageInfo x, LanguageInfo y)
{
    const StringComparison ignoreCase = StringComparison.InvariantCultureIgnoreCase;

    if (x.LanguageTag == y.LanguageTag)
    {
        return 0;
    }

    // Favor ones where some language name matches the search string to solve BL-1141
    // We restrict this to the top 2 names of each language, and to cases where the
    // corresponding names of the two languages are different. (If both language names
    // match the search string, there's no good reason to favor one over the other!)
    if (!x.Names[0].Equals(y.Names[0], ignoreCase))
    {
        if (x.Names[0].Equals(_searchString, ignoreCase))
        {
            return -1;
        }
        if (y.Names[0].Equals(_searchString, ignoreCase))
        {
            return 1;
        }
    }
    else if (x.Names.Count == 1 || y.Names.Count == 1 || !x.Names[1].Equals(y.Names[1], ignoreCase))
    {
        // If we get here, x.Names[0] == y.Names[0]. If both equal the search string, then neither x.Names[1]
        // nor y.Names[1] should equal the search string since the code adding to Names checks for redundancy.
        // Also it's possible that neither x.Names[1] nor y.Names[1] exists at this point in the code, or that
        // only one of them exists, or that both of them exist (in which case they are not equal).
        if (x.Names.Count > 1 && x.Names[1].Equals(_searchString, ignoreCase))
        {
            return -1;
        }
        if (y.Names.Count > 1 && y.Names[1].Equals(_searchString, ignoreCase))
        {
            return 1;
        }
    }

    // Favor a language whose tag matches the search string exactly. (equal tags are handled above)
    if (x.LanguageTag.Equals(_searchString, ignoreCase))
    {
        return -1;
    }
    if (y.LanguageTag.Equals(_searchString, ignoreCase))
    {
        return 1;
    }

    // written this way to avoid having to catch predictable exceptions as the user is typing
    // (only the language part is used; script, region and variant are required out parameters)
    string xLanguage;
    string yLanguage;
    string script;
    string region;
    string variant;
    var xTagParses = IetfLanguageTag.TryGetParts(x.LanguageTag, out xLanguage, out script, out region, out variant);
    var yTagParses = IetfLanguageTag.TryGetParts(y.LanguageTag, out yLanguage, out script, out region, out variant);
    var bothTagLanguagesMatchSearch = xTagParses && yTagParses && xLanguage == yLanguage &&
        _searchString.Equals(xLanguage, ignoreCase);
    if (!bothTagLanguagesMatchSearch)
    {
        // One of the tag language pieces may match the search string even though not both match. In that case,
        // sort the matching language earlier in the list.
        if (xTagParses && _searchString.Equals(xLanguage, ignoreCase))
        {
            return -1;
        }
        if (yTagParses && _searchString.Equals(yLanguage, ignoreCase))
        {
            return 1;
        }
    }

    // If only one language has a name that is an exact match prefer that language.
    // (An exact match on Names[0] alone has already been decided by the first test above.)
    var xMatchingNameLoc = x.Names.IndexOf(l => _searchString.Equals(l, StringComparison.InvariantCultureIgnoreCase));
    var yMatchingNameLoc = y.Names.IndexOf(l => _searchString.Equals(l, StringComparison.InvariantCultureIgnoreCase));
    if (xMatchingNameLoc > 0 && yMatchingNameLoc < 0)
    {
        return -1;
    }
    if (yMatchingNameLoc > 0 && xMatchingNameLoc < 0)
    {
        return 1;
    }

    // Order by the country information for exact matches before dropping to editing distance.
    // Each country is lower-cased and compared once rather than on every test.
    bool xCountryMatches = x.PrimaryCountry.ToLowerInvariant() == _lowerSearch;
    bool yCountryMatches = y.PrimaryCountry.ToLowerInvariant() == _lowerSearch;
    if (xCountryMatches || yCountryMatches)
    {
        if (!yCountryMatches)
        {
            return -1;
        }
        if (!xCountryMatches)
        {
            return 1;
        }
        // Both match the country exactly: sort by language name
        return string.Compare(x.Names[0], y.Names[0], ignoreCase);
    }

    // Editing distance to a language name is not useful when we've detected that the user appears to be
    // typing a language tag in that both language tags match what the user has typed. (For example,
    // it gives a strange and unwanted order to the variants of zh.) In such a case we just order the
    // matching codes by length (already done) and then alphabetically by code, skipping the sort by
    // editing distance to the language names.
    if (!bothTagLanguagesMatchSearch)
    {
        // Use the "editing distance" relative to the search string to sort by the primary name.
        // (But we don't really care once the editing distance gets very large.)
        // See https://silbloom.myjetbrains.com/youtrack/issue/BL-5847 for motivation.
        // Timing tests indicate that 1) calculating these distances doesn't slow down the sorting noticeably
        // and 2) caching these distances in a dictionary also doesn't speed up the sorting noticeably.
        var xDistance = ApproximateMatcher.EditDistance(_lowerSearch, x.Names[0].ToLowerInvariant(), 25, false);
        var yDistance = ApproximateMatcher.EditDistance(_lowerSearch, y.Names[0].ToLowerInvariant(), 25, false);
        var distanceDiff = xDistance - yDistance;
        if (distanceDiff != 0)
        {
            return distanceDiff;
        }

        // If the editing distances for the primary names are the same, sort by the primary name.
        int res = string.Compare(x.Names[0], y.Names[0], ignoreCase);
        if (res != 0)
        {
            return res;
        }
    }

    // Alphabetize by language tag if the search is 3 characters or less, or if there is a hyphen
    // directly after the first 2 or 3 chars. A hyphen that follows 2 or 3 characters sits at
    // zero-based index 2 or 3; the earlier test for index 3 or 4 missed two-letter codes ("zh-...").
    int hyphenIndex = _lowerSearch.IndexOf('-');
    if (_lowerSearch.Length <= 3 || hyphenIndex == 2 || hyphenIndex == 3)
    {
        return string.Compare(x.LanguageTag, y.LanguageTag, ignoreCase);
    }

    // Otherwise alphabetize by the language name
    // (tags often have a completely different alphabetical order from the name e.g. 'auc' -> "Waoroni")
    return string.Compare(x.Names[0], y.Names[0], ignoreCase);
}
/// <summary>
/// Get a list of languages that match the given string in some way (code, name, country)
/// </summary>
/// <param name="searchString">
/// The text to look for; leading and trailing whitespace is ignored. "*" returns every language,
/// and a few hard-coded words for "sign" return every language with "sign" in one of its names.
/// </param>
/// <returns>
/// A lazily produced sequence of matching languages; empty when <paramref name="searchString"/>
/// is null or blank. Results are sorted with ResultComparer except in the sign-language case,
/// which applies no sort.
/// </returns>
public IEnumerable<LanguageInfo> SuggestLanguages(string searchString)
{
    if (searchString != null)
    {
        searchString = searchString.Trim();
    }
    if (string.IsNullOrEmpty(searchString))
    {
        yield break;
    }

    if (searchString == "*")
    {
        // there will be duplicate LanguageInfo entries for 2 and 3 letter codes and equivalent tags
        var allLanguages = new HashSet<LanguageInfo>(_codeToLanguageIndex.Select(l => l.Value));
        foreach (LanguageInfo languageInfo in allLanguages.OrderBy(l => l, new ResultComparer(searchString)))
        {
            yield return languageInfo;
        }
    }
    // if the search string exactly matches a hard-coded way to say "sign", show all the sign languages
    // there will be duplicate LanguageInfo entries for equivalent tags
    else if (new[] { "sign", "sign language", "signes", "langage des signes", "señas", "lenguaje de señas" }.Contains(searchString.ToLowerInvariant()))
    {
        // Names is an in-memory collection, so plain Enumerable.Any is sufficient here; routing it
        // through AsQueryable() for every language only added expression-tree overhead.
        var signLanguages = new HashSet<LanguageInfo>(
            _codeToLanguageIndex.AsParallel()
                .Select(li => li.Value)
                .Where(l => l.Names.Any(n => n.ToLowerInvariant().Contains("sign"))));
        // NOTE: unlike the other branches, no ordering is applied to these results.
        foreach (LanguageInfo languageInfo in signLanguages)
        {
            yield return languageInfo;
        }
    }
    else
    {
        // This query is deferred: it is only evaluated when 'combined' is built below, so it sees
        // searchString as it stands at that point (i.e. after any apostrophe substitution).
        IEnumerable<LanguageInfo> matchOnCode = from x in _codeToLanguageIndex
            where x.Key.StartsWith(searchString, StringComparison.InvariantCultureIgnoreCase)
            select x.Value;
        List<LanguageInfo>[] matchOnName = (from x in _nameToLanguageIndex
            where x.Key.StartsWith(searchString, StringComparison.InvariantCultureIgnoreCase)
            select x.Value).ToArray();

        // Apostrophes can cause trouble in lookup. Unicode TR-29 inexplicably says to use
        // u2019 (RIGHT SINGLE QUOTATION MARK) for the English apostrophe when it also defines
        // u02BC (MODIFIER LETTER APOSTROPHE) as a Letter character. Users are quite likely to
        // type the ASCII apostrophe (u0027) which is defined as Punctuation. The current
        // data appears to use u2019 in several language names, which means that users might
        // end up thinking the language isn't in our database.
        // See https://silbloom.myjetbrains.com/youtrack/issue/BL-6339.
        if (matchOnName.Length == 0 && searchString.Contains('\''))
        {
            // From here on, every later use of searchString (country lookup, approximate
            // matching, ordering) sees the substituted form.
            searchString = searchString.Replace('\'', '\u2019');
            matchOnName = (from x in _nameToLanguageIndex
                where x.Key.StartsWith(searchString, StringComparison.InvariantCultureIgnoreCase)
                select x.Value).ToArray();
        }

        List<LanguageInfo>[] matchOnCountry = (from x in _countryToLanguageIndex
            where x.Key.StartsWith(searchString, StringComparison.InvariantCultureIgnoreCase)
            select x.Value).ToArray();

        if (matchOnName.Length == 0)
        {
            // look for approximate matches
            const int kMaxEditDistance = 3;
            var itemFormExtractor = new ApproximateMatcher.GetStringDelegate<KeyValuePair<string, List<LanguageInfo>>>(pair => pair.Key);
            IList<KeyValuePair<string, List<LanguageInfo>>> matches = ApproximateMatcher.FindClosestForms(
                _nameToLanguageIndex, itemFormExtractor, searchString, ApproximateMatcherOptions.None, kMaxEditDistance);
            matchOnName = (from m in matches select m.Value).ToArray();
        }

        // Merge the three kinds of match; the set removes languages found more than one way.
        var combined = new HashSet<LanguageInfo>(matchOnCode);
        foreach (List<LanguageInfo> l in matchOnName)
        {
            combined.UnionWith(l);
        }
        foreach (List<LanguageInfo> l in matchOnCountry)
        {
            combined.UnionWith(l);
        }

        var ordered = combined.OrderBy(l => l, new ResultComparer(searchString));
        foreach (LanguageInfo languageInfo in ordered)
        {
            yield return languageInfo;
        }
    }
}