Example #1
0
 /// <summary>
 /// A function for tokenizing a sequence of objects ready to be inserted into
 /// the search index.
 ///
 /// Each element is converted to a string by calling `ToString` and emitted as a
 /// single token; a null element (or a `ToString` that returns null) produces an
 /// empty-string token. Unlike the string overload, this overload performs no
 /// splitting on white space or punctuation.
 /// </summary>
 /// <param name="enumerable">The list of objects to tokenize.</param>
 /// <param name="metadata">Optional metadata can be passed to the tokenizer, this metadata will be cloned and
 /// added as metadata to every token that is created from the object to be tokenized.</param>
 /// <returns>The list of tokens extracted from the objects.</returns>
 public IEnumerable<Token> Tokenize(
     IEnumerable<object> enumerable,
     TokenMetadata metadata)
 {
     foreach (object single in enumerable)
     {
         // Clone the metadata per token so the caller's instance is never shared or mutated.
         yield return new Token(
             single?.ToString() ?? "",
             metadata is null ? new TokenMetadata() : new TokenMetadata(metadata));
     }
 }
Example #2
0
        /// <summary>
        /// A function for splitting a string into tokens ready to be inserted into
        /// the search index.
        ///
        /// This tokenizer will convert its parameter to a string by calling `ToString`
        /// (with `ToEcmaString` for <see cref="DateTime"/> values) and then will split
        /// this string wherever the separator predicate matches a character.
        /// </summary>
        /// <param name="obj">The object to tokenize; a null object yields no tokens.</param>
        /// <param name="metadata">Optional metadata can be passed to the tokenizer, this metadata will be cloned and
        /// added as metadata to every token that is created from the object to be tokenized.</param>
        /// <param name="culture">The culture used to lower-case each token; falls back to
        /// <see cref="CultureInfo.CurrentCulture"/> when null.</param>
        /// <param name="separator">Predicate identifying separator characters; falls back to
        /// <c>DefaultSeparator</c> when null.</param>
        /// <returns>The list of tokens extracted from the string.</returns>
        public IEnumerable<Token> Tokenize(
            object obj,
            TokenMetadata metadata,
            CultureInfo culture,
            Func<char, bool>? separator = null)
        {
            if (obj is null)
            {
                yield break;
            }

            // Guard against ToString implementations that return null so the scan
            // below never dereferences a null string.
            string str = obj switch
            {
                DateTime dt => dt.ToEcmaString(),
                _           => obj.ToString() ?? ""
            };

            int tokenCount = 0;

            separator ??= DefaultSeparator;

            // Scan one position past the end of the string; the final iteration uses a
            // sentinel character and the `sliceEnd == str.Length` test to flush the
            // trailing slice.
            for (int sliceEnd = 0, sliceStart = 0; sliceEnd <= str.Length; sliceEnd++)
            {
                char ch          = sliceEnd < str.Length ? str[sliceEnd] : char.MinValue;
                int  sliceLength = sliceEnd - sliceStart;

                if (separator(ch) || sliceEnd == str.Length)
                {
                    // Empty slices (consecutive separators) produce no token.
                    if (sliceLength > 0)
                    {
                        // Clone the metadata per token so the caller's instance is never mutated.
                        TokenMetadata tokenMetadata
                            = metadata is null ? new TokenMetadata() : new TokenMetadata(metadata);
                        tokenMetadata["position"] = new Slice(sliceStart, sliceLength);
                        tokenMetadata["index"]    = tokenCount++;

                        yield return new Token(
                            str.Substring(sliceStart, sliceLength).ToLower(culture ?? CultureInfo.CurrentCulture),
                            tokenMetadata);
                    }

                    sliceStart = sliceEnd + 1;
                }
            }
        }
Example #3
0
 /// <summary>
 /// A function for splitting a string into tokens ready to be inserted into
 /// the search index.
 ///
 /// This tokenizer will convert its parameter to a string by calling `ToString` and
 /// then will split this string on white space, punctuation and separators.
 /// </summary>
 /// <param name="str">The string to tokenize.</param>
 /// <param name="metadata">Optional metadata can be passed to the tokenizer, this metadata will be cloned and
 /// added as metadata to every token that is created from the object to be tokenized.</param>
 /// <returns>The list of tokens extracted from the string.</returns>
 public IEnumerable<Token> Tokenize(
     string str,
     TokenMetadata metadata)
 {
     // Delegate to the object-based overload; the cast selects that overload,
     // and the current culture is used for lower-casing.
     return Tokenize((object)str, metadata, CultureInfo.CurrentCulture);
 }