/// <summary>
/// This will generate all match variations for a specified value, taking into account the minLengthOfPartialMatches and maxLengthOfPartialMatches
/// constraints. Note: The original value will be included in the returned set as one of the variations only when its length lies within those
/// constraints (and it does not end on an unpaired lead surrogate). This will never return null nor a set that contains any nulls.
/// </summary>
/// <param name="token">The token to generate partial match variations for; must not be null.</param>
/// <returns>
/// The generated variations, each carrying the WeightMultiplier and SourceLocation of the specified token. The set is empty if the token is shorter
/// than the minimum partial match length.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown if token is null.</exception>
private IEnumerable<WeightAdjustingToken> GenerateAllMatchVariations(WeightAdjustingToken token)
{
    if (token == null)
    {
        throw new ArgumentNullException("token");
    }

    if (token.Token.Length < _minLengthOfPartialMatches)
    {
        return new WeightAdjustingToken[0];
    }

    var partialMatches = new List<WeightAdjustingToken>();
    for (var index = 0; index < token.Token.Length; index++)
    {
        // 2019-03-13 Dion: Because this partial matching may split a string in the middle of a sequence of UTF-16 code units that make up a single
        // Unicode code point (e.g. emoji require multiple UTF-16 code units to make up the code point), we discard any strings that form broken
        // Unicode code points here. Not doing this results in problems later on when string.Normalize() is called on these tokens and finds
        // corrupt Unicode text.
        // A substring is broken if it STARTS on a "trail surrogate" (aka "low surrogate") or ENDS on a "lead surrogate" (aka "high surrogate").
        // The start condition depends only on the index, so every substring from such a position is skipped in one go.
        if (!char.IsLowSurrogate(token.Token, index))
        {
            // The upper bound depends only on the current index, so work it out once per pass of the outer loop
            var longestAllowedLength = Math.Min(token.Token.Length - index, _maxLengthOfPartialMatches);
            for (var length = _minLengthOfPartialMatches; length <= longestAllowedLength; length++)
            {
                var partialString = token.Token.Substring(index, length);
                if (char.IsHighSurrogate(partialString, partialString.Length - 1))
                {
                    continue;
                }

                // The token's SourceLocation is being maintained for the same reason as they are in GetTokensForPartialMatchGeneration
                partialMatches.Add(new WeightAdjustingToken(
                    partialString,
                    token.WeightMultiplier,
                    token.SourceLocation
                ));
            }
        }

        // If we only want to extract sub tokens from the start of the string then we only need a single pass of the outer loop
        if (_fromStartOfTokenOnly)
        {
            break;
        }
    }
    return partialMatches;
}
/// <summary>
/// Break the specified token down further using the optionalPrePartialMatchTokenBreaker, if one was specified (if not then return a set
/// containing only the specified token)
/// </summary>
/// <param name="token">The token to break down; must not be null.</param>
/// <returns>
/// Either a single-element set containing the specified token (when there is no pre-partial-match token breaker) or one token per segment
/// returned by the breaker, each carrying the WeightMultiplier and SourceLocation of the specified token. In the latter case the set is a
/// lazily-evaluated LINQ projection.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown if token is null.</exception>
private IEnumerable<WeightAdjustingToken> GetTokensForPartialMatchGeneration(WeightAdjustingToken token)
{
    if (token == null)
    {
        throw new ArgumentNullException("token");
    }

    if (_optionalPrePartialMatchTokenBreaker == null)
    {
        return new[] { token };
    }

    // The SourceLocation values do not have to be altered; they are used to indicate which word (or segment) in the source content is being matched,
    // if matching that word partially then we still want that word to be indicated as being matched, even though only a section of that word will
    // actually be being matched.
    return _optionalPrePartialMatchTokenBreaker.Break(token.Token)
        .Select(t => new WeightAdjustingToken(
            t.Token,
            token.WeightMultiplier,
            token.SourceLocation
        ));
}