Example #1
0
        /// <summary>
        /// This will generate all match variations for a specified value, taking into account the _minLengthOfPartialMatches and _maxLengthOfPartialMatches
        /// constraints: every substring whose length lies within that range, taken from every start position (or from the start of the value only, if
        /// _fromStartOfTokenOnly is set). Note: The original value will be included in the returned set as one of the variations so long as it is no
        /// longer than _maxLengthOfPartialMatches and does not itself begin or end with half of a UTF-16 surrogate pair. Each variation is given the
        /// WeightMultiplier and SourceLocation of the source token. This will never return null nor a set that contains any nulls (an empty set is
        /// returned if the value is shorter than _minLengthOfPartialMatches).
        /// </summary>
        /// <param name="token">The token to generate variations of, must not be null</param>
        /// <returns>The set of partial match tokens, which may be empty</returns>
        /// <exception cref="ArgumentNullException">Thrown if token is null</exception>
        private IEnumerable<WeightAdjustingToken> GenerateAllMatchVariations(WeightAdjustingToken token)
        {
            if (token == null)
            {
                throw new ArgumentNullException("token");
            }

            var value = token.Token;
            if (value.Length < _minLengthOfPartialMatches)
            {
                return new WeightAdjustingToken[0];
            }

            var partialMatches = new List<WeightAdjustingToken>();
            for (var index = 0; index < value.Length; index++)
            {
                // 2019-03-13 Dion: Because this partial matching may split a string in the middle of a sequence of UTF-16 code units that make up a single
                // Unicode code point (e.g. emoji require multiple UTF-16 code units to make up the code point), we discard any strings that form broken
                // Unicode code points here. Not doing this results in problems later on when string.Normalize() is called on these tokens and finds
                // corrupt Unicode text. A string can be broken at either end, so there are two checks:
                //  1. If the start position is a "trail surrogate" (aka "low surrogate") then every string starting here has lost its lead surrogate
                //  2. If the end of the string is a "lead surrogate" (aka "high surrogate") then that string has lost its trail surrogate
                if (!char.IsLowSurrogate(value, index))
                {
                    // The upper bound does not change within the inner loop so it only needs to be calculated once per start position
                    var longestLengthFromIndex = Math.Min(value.Length - index, _maxLengthOfPartialMatches);
                    for (var length = _minLengthOfPartialMatches; length <= longestLengthFromIndex; length++)
                    {
                        var partialString = value.Substring(index, length);
                        if (char.IsHighSurrogate(partialString, partialString.Length - 1))
                        {
                            continue;
                        }

                        // The token's SourceLocation is being maintained for the same reason as they are in GetTokensForPartialMatchGeneration
                        partialMatches.Add(new WeightAdjustingToken(
                                               partialString,
                                               token.WeightMultiplier,
                                               token.SourceLocation
                                               ));
                    }
                }

                // If we only want to extract sub tokens from the start of the string then we only need a single pass of the outer loop
                if (_fromStartOfTokenOnly)
                {
                    break;
                }
            }
            return partialMatches;
        }
Example #2
0
        /// <summary>
        /// Further break the specified token using the optionalPrePartialMatchTokenBreaker, if specified (if not then return a set containing only the
        /// specified token). Every token produced by the breaker is given the WeightMultiplier and SourceLocation of the token that it was broken from.
        /// </summary>
        /// <param name="token">The token to break down, must not be null</param>
        /// <returns>Either a single-item set containing the specified token or the set of tokens that it was broken into</returns>
        /// <exception cref="ArgumentNullException">Thrown if token is null</exception>
        private IEnumerable<WeightAdjustingToken> GetTokensForPartialMatchGeneration(WeightAdjustingToken token)
        {
            if (token == null)
            {
                throw new ArgumentNullException("token");
            }

            // With no pre-partial-match token breaker configured there is nothing further to break, the token is passed through as-is
            if (_optionalPrePartialMatchTokenBreaker == null)
            {
                return new[] { token };
            }

            // The SourceLocation values do not have to be altered; they are used to indicate which word (or segment) in the source content is being matched,
            // if matching that word partially then we still want that word to be indicated as being matched, even though only a section of that word will
            // actually be being matched.
            return _optionalPrePartialMatchTokenBreaker.Break(token.Token)
                   .Select(brokenToken => new WeightAdjustingToken(
                               brokenToken.Token,
                               token.WeightMultiplier,
                               token.SourceLocation
                               ));
        }