public void EditDistance_SingleInsertionAtStartAndMiddle_2()
        {
            // "habho" is "abo" with one 'h' added in front and another 'h' added before the 'o'.
            const string source = "abo";
            const string target = "habho";

            int actual = ApproximateMatcher.EditDistance(source, target, 2, false);

            Assert.AreEqual(2, actual);
        }
        public void EditDistance_SingleInsertionAtStartAndMiddle_1Max_EditDistanceLargerThanMax()
        {
            // The third argument is 1 here, and the expected result is the
            // EditDistanceLargerThanMax sentinel rather than a numeric distance.
            var expected = ApproximateMatcher.EditDistanceLargerThanMax;

            int actual = ApproximateMatcher.EditDistance("abo", "habho", 1, false);

            Assert.AreEqual(expected, actual);
        }
        public void EditDistance_SingleInsertionAtEnd_1()
        {
            // "aboh" is "abo" with a single 'h' appended.
            const string source = "abo";
            const string target = "aboh";

            int actual = ApproximateMatcher.EditDistance(source, target, 1, false);

            Assert.AreEqual(1, actual);
        }
        public void EditDistance_Same_0()
        {
            // Both arguments are the same string, so no edits are expected.
            const string word = "abo";

            int actual = ApproximateMatcher.EditDistance(word, word, 0, false);

            Assert.AreEqual(0, actual);
        }
Exemple #5
0
            /// <summary>
            /// Orders two languages by how well they appear to match the current search string.
            /// Exact matches on a name, on the whole tag, or on the language part of the tag are
            /// favored (in that order); remaining ties are broken by the edit distance between the
            /// search string and the primary name, then by primary name, then by language tag.
            /// </summary>
            /// <param name="x">The first language to compare.</param>
            /// <param name="y">The second language to compare.</param>
            /// <returns>
            /// A negative value if <paramref name="x"/> should be listed first, a positive value if
            /// <paramref name="y"/> should be listed first, or 0 if neither is preferred.
            /// </returns>
            /// <remarks>
            /// Each "exact match" rule only decides the order when exactly one of the two languages
            /// matches.  If both match, the rule is skipped so that Compare(x, y) and Compare(y, x)
            /// never both claim that their first argument comes first.
            /// </remarks>
            public int Compare(LanguageInfo x, LanguageInfo y)
            {
                if (x.LanguageTag == y.LanguageTag)
                {
                    return 0;
                }

                if (!x.Names[0].Equals(y.Names[0], StringComparison.InvariantCultureIgnoreCase))
                {
                    // Favor ones where some language matches to solve BL-1141.
                    // The primary names differ here, so at most one of them can equal the search string.
                    if (x.Names[0].Equals(_searchString, StringComparison.InvariantCultureIgnoreCase))
                    {
                        return -1;
                    }
                    if (y.Names[0].Equals(_searchString, StringComparison.InvariantCultureIgnoreCase))
                    {
                        return 1;
                    }

                    // The second names may both equal the search string; only favor one language
                    // when the other one does not match as well.
                    bool xSecondNameMatches = x.Names.Count > 1 &&
                                              x.Names[1].Equals(_searchString, StringComparison.InvariantCultureIgnoreCase);
                    bool ySecondNameMatches = y.Names.Count > 1 &&
                                              y.Names[1].Equals(_searchString, StringComparison.InvariantCultureIgnoreCase);
                    if (xSecondNameMatches != ySecondNameMatches)
                    {
                        return xSecondNameMatches ? -1 : 1;
                    }
                }

                // Favor a language whose whole tag equals the search string.  Two tags that differ
                // only in case can both match, in which case this rule does not decide.
                bool xTagMatches = x.LanguageTag.Equals(_searchString, StringComparison.InvariantCultureIgnoreCase);
                bool yTagMatches = y.LanguageTag.Equals(_searchString, StringComparison.InvariantCultureIgnoreCase);
                if (xTagMatches != yTagMatches)
                {
                    return xTagMatches ? -1 : 1;
                }

                // Favor a language whose tag's language part equals the search string.  Several tags
                // can share one language part, so both sides matching is an ordinary case; it is
                // left to the rules below.
                bool xLanguagePartMatches = IetfLanguageTag.GetLanguagePart(x.LanguageTag)
                    .Equals(_searchString, StringComparison.InvariantCultureIgnoreCase);
                bool yLanguagePartMatches = IetfLanguageTag.GetLanguagePart(y.LanguageTag)
                    .Equals(_searchString, StringComparison.InvariantCultureIgnoreCase);
                if (xLanguagePartMatches != yLanguagePartMatches)
                {
                    return xLanguagePartMatches ? -1 : 1;
                }

                // Use the "editing distance" relative to the search string to sort by the primary name.
                // (But we don't really care once the editing distance gets very large.)
                // See https://silbloom.myjetbrains.com/youtrack/issue/BL-5847 for motivation.
                // Timing tests indicate that 1) calculating these distances doesn't slow down the sorting noticeably
                // and 2) caching these distances in a dictionary also doesn't speed up the sorting noticeably.
                var xDistance    = ApproximateMatcher.EditDistance(_lowerSearch, x.Names[0].ToLowerInvariant(), 25, false);
                var yDistance    = ApproximateMatcher.EditDistance(_lowerSearch, y.Names[0].ToLowerInvariant(), 25, false);
                var distanceDiff = xDistance - yDistance;

                if (distanceDiff != 0)
                {
                    return distanceDiff;
                }

                // If the editing distances for the primary names are the same, sort by the primary name.
                int res = string.Compare(x.Names[0], y.Names[0], StringComparison.InvariantCultureIgnoreCase);

                if (res != 0)
                {
                    return res;
                }

                // Last resort: order by language tag.
                return string.Compare(x.LanguageTag, y.LanguageTag, StringComparison.InvariantCultureIgnoreCase);
            }
			/// <summary>
			/// Sorting the languages for display is tricky: we want the most relevant languages at the
			/// top of the list, so we can't simply sort alphabetically by language name or by language tag,
			/// but need to take both items into account together with the current search string.  Ordering
			/// by relevance is clearly impossible since we'd have to read the user's mind and apply that
			/// knowledge to the data.  But the heuristics we use here may be better than nothing...
			/// </summary>
			/// <param name="x">The first language to compare.</param>
			/// <param name="y">The second language to compare.</param>
			/// <returns>
			/// A negative value if <paramref name="x"/> should be listed first, a positive value if
			/// <paramref name="y"/> should be listed first, or 0 if neither is preferred.
			/// </returns>
			public int Compare(LanguageInfo x, LanguageInfo y)
			{
				if (x.LanguageTag == y.LanguageTag)
					return 0;

				// Favor ones where some language name matches the search string to solve BL-1141
				// We restrict this to the top 2 names of each language, and to cases where the
				// corresponding names of the two languages are different.  (If both language names
				// match the search string, there's no good reason to favor one over the other!)
				if (!x.Names[0].Equals(y.Names[0], StringComparison.InvariantCultureIgnoreCase))
				{
					if (x.Names[0].Equals(_searchString, StringComparison.InvariantCultureIgnoreCase))
						return -1;
					if (y.Names[0].Equals(_searchString, StringComparison.InvariantCultureIgnoreCase))
						return 1;
				}
				else if (x.Names.Count == 1 || y.Names.Count == 1 || !x.Names[1].Equals(y.Names[1], StringComparison.InvariantCultureIgnoreCase))
				{
					// If we get here, x.Names[0] == y.Names[0].  If both equal the search string, then neither x.Names[1]
					// nor y.Names[1] should equal the search string since the code adding to Names checks for redundancy.
					// Also it's possible that neither x.Names[1] nor y.Names[1] exists at this point in the code, or that
					// only one of them exists, or that both of them exist (in which case they are not equal).
					if (x.Names.Count > 1 && x.Names[1].Equals(_searchString, StringComparison.InvariantCultureIgnoreCase))
						return -1;
					if (y.Names.Count > 1 && y.Names[1].Equals(_searchString, StringComparison.InvariantCultureIgnoreCase))
						return 1;
				}

				// Favor a language whose tag matches the search string exactly.  (equal tags are handled above)
				// Two tags that differ only in case can both match; in that case neither is favored here,
				// so that Compare(x, y) and Compare(y, x) cannot both put their first argument first.
				var xTagMatchesSearch = x.LanguageTag.Equals(_searchString, StringComparison.InvariantCultureIgnoreCase);
				var yTagMatchesSearch = y.LanguageTag.Equals(_searchString, StringComparison.InvariantCultureIgnoreCase);
				if (xTagMatchesSearch != yTagMatchesSearch)
					return xTagMatchesSearch ? -1 : 1;

				// written this way to avoid having to catch predictable exceptions as the user is typing
				string xlanguage;
				string ylanguage;
				string script;	// script, region and variant are required by TryGetParts but not used here
				string region;
				string variant;
				var xtagParses = IetfLanguageTag.TryGetParts(x.LanguageTag, out xlanguage, out script, out region, out variant);
				var ytagParses = IetfLanguageTag.TryGetParts(y.LanguageTag, out ylanguage, out script, out region, out variant);
				var bothTagLanguagesMatchSearch = xtagParses && ytagParses && xlanguage == ylanguage &&
					_searchString.Equals(xlanguage, StringComparison.InvariantCultureIgnoreCase);
				if (!bothTagLanguagesMatchSearch)
				{
					// One of the tag language pieces may match the search string even though not both match.  In that case,
					// sort the matching language earlier in the list.
					if (xtagParses && _searchString.Equals(xlanguage, StringComparison.InvariantCultureIgnoreCase))
						return -1;  // x.Tag's language part matches search string exactly, so sort it earlier in the list.
					if (ytagParses && _searchString.Equals(ylanguage, StringComparison.InvariantCultureIgnoreCase))
						return 1;   // y.Tag's language part matches search string exactly, so sort it earlier in the list.
				}

				// If only one language has a name that is an exact match prefer that language.
				// (A match at index 0 needs no handling here: the primary names were already compared
				// against the search string above.)
				var xMatchingNameLoc = x.Names.IndexOf(l => _searchString.Equals(l, StringComparison.InvariantCultureIgnoreCase));
				var yMatchingNameLoc = y.Names.IndexOf(l => _searchString.Equals(l, StringComparison.InvariantCultureIgnoreCase));
				if (xMatchingNameLoc > 0 && yMatchingNameLoc < 0)
				{
					return -1;
				}
				if (yMatchingNameLoc > 0 && xMatchingNameLoc < 0)
				{
					return 1;
				}

				// Order by country information for exact matches before dropping to editing distance.
				// Each country is lower-cased once; a missing (null) country simply does not match.
				var xCountryMatchesSearch = x.PrimaryCountry != null && x.PrimaryCountry.ToLowerInvariant() == _lowerSearch;
				var yCountryMatchesSearch = y.PrimaryCountry != null && y.PrimaryCountry.ToLowerInvariant() == _lowerSearch;
				if (xCountryMatchesSearch || yCountryMatchesSearch)
				{
					if (xCountryMatchesSearch != yCountryMatchesSearch)
						return xCountryMatchesSearch ? -1 : 1;
					// Both match the country exactly: sort by language name
					return string.Compare(x.Names[0], y.Names[0], StringComparison.InvariantCultureIgnoreCase);
				}

				// Editing distance to a language name is not useful when we've detected that the user appears to be
				// typing a language tag in that both language tags match what the user has typed.  (For example,
				// it gives a strange and unwanted order to the variants of zh.)  In such a case we just order the
				// matching codes by length (already done) and then alphabetically by code, skipping the sort by
				// editing distance to the language names.
				if (!bothTagLanguagesMatchSearch)
				{
					// Use the "editing distance" relative to the search string to sort by the primary name.
					// (But we don't really care once the editing distance gets very large.)
					// See https://silbloom.myjetbrains.com/youtrack/issue/BL-5847 for motivation.
					// Timing tests indicate that 1) calculating these distances doesn't slow down the sorting noticeably
					// and 2) caching these distances in a dictionary also doesn't speed up the sorting noticeably.
					var xDistance = ApproximateMatcher.EditDistance(_lowerSearch, x.Names[0].ToLowerInvariant(), 25, false);
					var yDistance = ApproximateMatcher.EditDistance(_lowerSearch, y.Names[0].ToLowerInvariant(), 25, false);
					var distanceDiff = xDistance - yDistance;
					if (distanceDiff != 0)
						return distanceDiff;

					// If the editing distances for the primary names are the same, sort by the primary name.
					int res = string.Compare(x.Names[0], y.Names[0], StringComparison.InvariantCultureIgnoreCase);
					if (res != 0)
						return res;
				}

				// Alphabetize by language tag if the search string is 3 characters or less, or if it has a
				// hyphen right after its first 2 or 3 characters (zero-based index 2 or 3, e.g. "zh-CN").
				// Index 4 is also accepted for backward compatibility with the earlier form of this check.
				var hyphenIndex = _lowerSearch.IndexOf('-');
				if (_lowerSearch.Length <= 3 || (hyphenIndex >= 2 && hyphenIndex <= 4))
				{
					return string.Compare(x.LanguageTag, y.LanguageTag, StringComparison.InvariantCultureIgnoreCase);
				}
				// Otherwise alphabetize by the language name
				// (tags often have a completely different alphabetical order from the name e.g. 'auc' -> "Waoroni")
				return string.Compare(x.Names[0], y.Names[0], StringComparison.InvariantCultureIgnoreCase);

			}
		/// <summary>
		/// Get a list of languages that match the given string in some way (code, name, country)
		/// </summary>
		/// <param name="searchString">
		/// The text typed by the user.  It is trimmed before use; "*" returns every language, and a
		/// few fixed phrases meaning "sign language" return every language with "sign" in a name.
		/// </param>
		/// <returns>
		/// A lazily produced sequence of matching languages.  Nothing is returned for a null, empty
		/// or whitespace-only search string.  Results are ordered with ResultComparer except in the
		/// sign-language case, which is returned without explicit ordering.
		/// </returns>
		public IEnumerable<LanguageInfo> SuggestLanguages(string searchString)
		{
			if (searchString != null)
				searchString = searchString.Trim();
			if (string.IsNullOrEmpty(searchString))
				yield break;

			if (searchString == "*")
			{
				// there will be duplicate LanguageInfo entries for 2 and 3 letter codes and equivalent tags
				var all_languages = new HashSet<LanguageInfo>(_codeToLanguageIndex.Select(l => l.Value));
				foreach (LanguageInfo languageInfo in all_languages.OrderBy(l => l, new ResultComparer(searchString)))
					yield return languageInfo;
			}
			// if the search string exactly matches a hard-coded way to say "sign", show all the sign languages
			// there will be duplicate LanguageInfo entries for equivalent tags
			else if (new []{"sign", "sign language","signes", "langage des signes", "señas","lenguaje de señas"}.Contains(searchString.ToLowerInvariant()))
			{
				// Names is an in-memory collection, so plain LINQ-to-objects Any() is used on it;
				// wrapping it in AsQueryable() would only add overhead for every language checked.
				var parallelSearch = new HashSet<LanguageInfo>(_codeToLanguageIndex.AsParallel().Select(li => li.Value).Where(l =>
					l.Names.Any(n => n.ToLowerInvariant().Contains("sign"))));
				foreach (LanguageInfo languageInfo in parallelSearch)
				{
					yield return languageInfo;
				}
			}
			else
			{
				// matchOnCode is a deferred query: it is not evaluated until the HashSet is built below,
				// so it sees whatever value searchString has at that point.
				IEnumerable<LanguageInfo> matchOnCode = from x in _codeToLanguageIndex where x.Key.StartsWith(searchString, StringComparison.InvariantCultureIgnoreCase) select x.Value;
				List<LanguageInfo>[] matchOnName = (from x in _nameToLanguageIndex where x.Key.StartsWith(searchString, StringComparison.InvariantCultureIgnoreCase) select x.Value).ToArray();
				// Apostrophes can cause trouble in lookup.  Unicode TR-29 inexplicably says to use
				// u2019 (RIGHT SINGLE QUOTATION MARK) for the English apostrophe when it also defines
				// u02BC (MODIFIER LETTER APOSTROPHE) as a Letter character.  Users are quite likely to
				// type the ASCII apostrophe (u0027) which is defined as Punctuation.  The current
				// data appears to use u2019 in several language names, which means that users might
				// end up thinking the language isn't in our database.
				// See https://silbloom.myjetbrains.com/youtrack/issue/BL-6339.
				if (!matchOnName.Any() && searchString.Contains('\''))
				{
					// NOTE(review): from here on searchString holds the u2019 form, which is also what the
					// country lookup, the approximate matching and the result ordering below will see,
					// even if this retry finds no names -- confirm that is intended.
					searchString = searchString.Replace('\'','\u2019');
					matchOnName = (from x in _nameToLanguageIndex where x.Key.StartsWith(searchString, StringComparison.InvariantCultureIgnoreCase) select x.Value).ToArray();
				}
				List<LanguageInfo>[] matchOnCountry = (from x in _countryToLanguageIndex where x.Key.StartsWith(searchString, StringComparison.InvariantCultureIgnoreCase) select x.Value).ToArray();

				if (!matchOnName.Any())
				{
					// No name starts with the search string: look for approximate matches instead
					const int kMaxEditDistance = 3;
					var itemFormExtractor = new ApproximateMatcher.GetStringDelegate<KeyValuePair<string, List<LanguageInfo>>>(pair => pair.Key);
					IList<KeyValuePair<string, List<LanguageInfo>>> matches = ApproximateMatcher.FindClosestForms(_nameToLanguageIndex, itemFormExtractor,
						searchString,
						ApproximateMatcherOptions.None,
						kMaxEditDistance);
					matchOnName = (from m in matches select m.Value).ToArray();
				}

				// Merge the three kinds of match into one set (removing duplicates) before ordering.
				var combined = new HashSet<LanguageInfo>(matchOnCode);
				foreach (List<LanguageInfo> l in matchOnName)
					combined.UnionWith(l);
				foreach (List<LanguageInfo> l in matchOnCountry)
					combined.UnionWith(l);
				var ordered = combined.OrderBy(l => l, new ResultComparer(searchString));
				foreach (LanguageInfo languageInfo in ordered)
					yield return languageInfo;
			}
		}