public void RoundtripsRegexFlags(PatternTokenizer expected)
        {
            using MemoryStream stream = new MemoryStream();
            using (Utf8JsonWriter writer = new Utf8JsonWriter(stream))
            {
                // Serialize the tokenizer to UTF-8 JSON; disposing the writer flushes it to the stream.
                ((IUtf8JsonSerializable)expected).Write(writer);
            }

            // Reparse the payload and rehydrate the tokenizer from the JSON model.
            using JsonDocument doc = JsonDocument.Parse(stream.ToArray());
            PatternTokenizer actual = LexicalTokenizer.DeserializeLexicalTokenizer(doc.RootElement) as PatternTokenizer;

            // Fail clearly if deserialization produced the wrong type, then compare the flags.
            Assert.IsNotNull(actual);
            CollectionAssert.AreEqual(expected.Flags, actual.Flags);
        }
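For reference, the write-then-reparse shape this test uses can be reduced to a small helper built only on System.Text.Json. This is a hedged sketch: the JsonRoundtrip class and Roundtrip method are illustrative names, not part of the SDK under test.

using System;
using System.IO;
using System.Text.Json;

static class JsonRoundtrip
{
    // Serializes via the supplied callback, then reparses the raw UTF-8 bytes.
    // The caller is responsible for disposing the returned JsonDocument.
    public static JsonDocument Roundtrip(Action<Utf8JsonWriter> write)
    {
        using var stream = new MemoryStream();
        using (var writer = new Utf8JsonWriter(stream))
        {
            write(writer); // caller emits the serialized object
        } // disposing the writer flushes the buffered JSON into the stream

        return JsonDocument.Parse(stream.ToArray());
    }
}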
Example No. 2
        /// <summary>
        /// Creates a token stream that tokenizes the given string into token terms
        /// (aka words).
        /// </summary>
        /// <param name="fieldName">
        ///            the name of the field to tokenize (currently ignored). </param>
        /// <param name="reader">
        ///            reader (e.g. charfilter) of the original text. can be null. </param>
        /// <param name="text">
        ///            the string to tokenize </param>
        /// <returns> a new token stream </returns>
        public TokenStreamComponents CreateComponents(string fieldName, TextReader reader, string text)
        {
            // Ideally the Analyzer superclass should have a method with the same signature, 
            // with a default impl that simply delegates to the StringReader flavour. 
            if (reader == null)
            {
                reader = new FastStringReader(text);
            }

            if (pattern == NON_WORD_PATTERN) // fast path
            {
                return new TokenStreamComponents(new FastStringTokenizer(reader, true, toLowerCase, stopWords));
            }
            else if (pattern == WHITESPACE_PATTERN) // fast path
            {
                return new TokenStreamComponents(new FastStringTokenizer(reader, false, toLowerCase, stopWords));
            }

            Tokenizer tokenizer = new PatternTokenizer(reader, pattern, toLowerCase);
            TokenStream result = (stopWords != null) ? (TokenStream)new StopFilter(matchVersion, tokenizer, stopWords) : tokenizer;
            return new TokenStreamComponents(tokenizer, result);
        }
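A hedged usage sketch of consuming the components returned above, assuming the Lucene.Net 4.x attribute API; here `analyzer` stands in for an instance of the surrounding analyzer class, and the sample text is illustrative.

using System;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.TokenAttributes;

TokenStreamComponents components =
    analyzer.CreateComponents("body", reader: null, text: "The Quick Brown Fox");
TokenStream stream = components.TokenStream;
ICharTermAttribute termAtt = stream.AddAttribute<ICharTermAttribute>();

stream.Reset();
while (stream.IncrementToken())
{
    // Prints one term per token; with toLowerCase set and "the" in stopWords,
    // the output would be "quick", "brown", "fox".
    Console.WriteLine(termAtt.ToString());
}
stream.End();
stream.Dispose();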
Example No. 3
        /// <summary>
        /// Creates the components.
        /// </summary>
        /// <param name="fieldName">Name of the field.</param>
        /// <returns>The token stream components for the field.</returns>
        protected override AnalyzerTokenStreamComponents CreateComponents(string fieldName)
        {
            if (String.IsNullOrWhiteSpace(fieldName))
                throw new ArgumentException($"{nameof(fieldName)} cannot be null or blank");

            var pattern = Pattern.compile(_separatorChars);
            var tokenizer = new PatternTokenizer(pattern, -1);
            TokenStream stream = _ignoreCase ? (TokenStream)new LowerCaseFilter(tokenizer) : tokenizer;

            if (_enableStemming)
                stream = new PorterStemFilter(stream);

            return new AnalyzerTokenStreamComponents(tokenizer, stream);
        }
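As a rough illustration of what the tokenizer does here: with group -1, a pattern tokenizer treats the regex as the separator and splits the text on it. A plain .NET sketch of the equivalent behavior, with an assumed example value for the separator pattern:

using System;
using System.Text.RegularExpressions;

string separatorChars = @"[,;\s]+";  // assumed example value for _separatorChars
string[] tokens = Regex.Split("Alpha, Beta;Gamma", separatorChars);

foreach (string token in tokens)
{
    // LowerCaseFilter applies when _ignoreCase is set; mirrored here with ToLowerInvariant.
    Console.WriteLine(token.ToLowerInvariant()); // alpha, beta, gamma
}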
Example No. 4
        /// <summary>
        /// Creates a token stream that tokenizes the given string into token terms
        /// (aka words).
        /// </summary>
        /// <param name="fieldName">
        ///            the name of the field to tokenize (currently ignored). </param>
        /// <param name="reader">
        ///            reader (e.g. charfilter) of the original text. can be null. </param>
        /// <param name="text">
        ///            the string to tokenize </param>
        /// <returns> a new token stream </returns>
        public TokenStreamComponents CreateComponents(string fieldName, TextReader reader, string text)
        {
            // Ideally the Analyzer superclass should have a method with the same signature, 
            // with a default impl that simply delegates to the StringReader flavour. 
            if (reader == null)
            {
                reader = new FastStringReader(text);
            }

            if (pattern == NON_WORD_PATTERN) // fast path
            {
                return new TokenStreamComponents(new FastStringTokenizer(reader, true, toLowerCase, stopWords));
            }
            else if (pattern == WHITESPACE_PATTERN) // fast path
            {
                return new TokenStreamComponents(new FastStringTokenizer(reader, false, toLowerCase, stopWords));
            }

            Tokenizer tokenizer = new PatternTokenizer(reader, pattern, toLowerCase);
            TokenStream result = (stopWords != null) ? (TokenStream)new StopFilter(matchVersion, tokenizer, stopWords) : tokenizer;
            return new TokenStreamComponents(tokenizer, result);
        }
Example No. 5
        private static bool NewPatternMatch(PatternTokenizer tokenizer, string str)
        {
            // If no more tokens and str is empty, we matched
            if (!tokenizer.HasNext())
                return (str.Length == 0);

            // Get the next token from the tokenizer
            string token = tokenizer.NextToken();

            // Is it a wild card token?
            if (tokenizer.LastWasWildcard) {
                // Yes, what are the minimum and maximum extent of characters to match
                // by this wildcard string?
                int strLen = str.Length;
                int min = 0;
                int max = 0;
                for (int i = 0; i < token.Length; ++i) {
                    if (token[i] == ONE_CHAR) {
                        ++min;
                        ++max;
                    } else if (token[i] == ZERO_OR_MORE_CHARS) {
                        max = strLen;
                    } else {
                        throw new InvalidOperationException("Tokenizer error");
                    }
                }
                // If it's not possible to match this size string,
                if (min > strLen) {
                    return false;
                }
                // If there are no more tokens to match,
                if (!tokenizer.HasNext()) {
                    // If strLen falls within the size of the pattern we can match
                    // then return true, otherwise false
                    return strLen >= min && strLen <= max;
                }

                // Search for the index of the next token. It's not possible for this to
                // be a wildcard.
                string nextTok = tokenizer.NextToken();
                int p = min;
                while (true) {
                    // Ordinal search: pattern matching must not be culture-sensitive.
                    p = str.IndexOf(nextTok, p, StringComparison.Ordinal);
                    if (p < 0 || p > max) {
                        // Not found, so fail this
                        return false;
                    }
                    // Recurse at the point we found
                    int state = tokenizer.Position;
                    if (NewPatternMatch(tokenizer, str.Substring(p + nextTok.Length))) {
                        return true;
                    }
                    // Reverse state if the search failed and try again
                    tokenizer.Position = state;
                    ++p;
                }
            }

            // Not a wild card, so match

            // If the string doesn't match the token, we return false
            if (!str.StartsWith(token, StringComparison.Ordinal))
                return false;

            // Otherwise recurse
            return NewPatternMatch(tokenizer, str.Substring(token.Length));
        }
Example No. 6
        public static bool Match(string pattern, string str, char escapeChar)
        {
            // Create the tokenizer and run the recursive matcher over the whole string.
            PatternTokenizer tokenizer = new PatternTokenizer(pattern, escapeChar);
            return NewPatternMatch(tokenizer, str);
        }
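A hedged usage sketch, assuming ONE_CHAR is '_' and ZERO_OR_MORE_CHARS is '%' (SQL LIKE semantics); the enclosing class name PatternMatcher is assumed, since it is not shown in the snippet.

// '%' may match any run of characters (including none); '_' matches exactly one.
bool a = PatternMatcher.Match("he%o", "hello", '\\');   // true:  '%' spans "ll"
bool b = PatternMatcher.Match("h_llo", "hello", '\\');  // true:  '_' consumes 'e'
bool c = PatternMatcher.Match("h_llo", "hllo", '\\');   // false: '_' must consume a char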