/// <summary>
/// Verifies that a <see cref="PatternTokenizer"/>'s regex flags survive a full
/// JSON serialize/deserialize round trip.
/// </summary>
/// <param name="expected">The tokenizer whose flags are round-tripped.</param>
public void RoundtripsRegexFlags(PatternTokenizer expected)
{
    using MemoryStream buffer = new MemoryStream();
    using (Utf8JsonWriter jsonWriter = new Utf8JsonWriter(buffer))
    {
        // Serialize through the explicit interface implementation.
        ((IUtf8JsonSerializable)expected).Write(jsonWriter);
    }

    using JsonDocument document = JsonDocument.Parse(buffer.ToArray());
    var roundtripped = LexicalTokenizer.DeserializeLexicalTokenizer(document.RootElement) as PatternTokenizer;

    // Null-conditional keeps the assertion meaningful if deserialization
    // produced a different tokenizer type (flags compare against null).
    CollectionAssert.AreEqual(expected.Flags, roundtripped?.Flags);
}
/// <summary>
/// Creates a token stream that tokenizes the given string into token terms
/// (aka words).
/// </summary>
/// <param name="fieldName"> the name of the field to tokenize (currently ignored). </param>
/// <param name="reader"> reader (e.g. charfilter) of the original text. can be null. </param>
/// <param name="text"> the string to tokenize </param>
/// <returns> a new token stream </returns>
public TokenStreamComponents CreateComponents(string fieldName, TextReader reader, string text)
{
    // Ideally the Analyzer superclass should have a method with the same signature,
    // with a default impl that simply delegates to the StringReader flavour.
    // When no reader is supplied, tokenize the raw string instead.
    TextReader input = reader ?? new FastStringReader(text);

    // Fast paths for the two well-known patterns: a dedicated string
    // tokenizer avoids the regex machinery entirely.
    if (pattern == NON_WORD_PATTERN)
    {
        return new TokenStreamComponents(new FastStringTokenizer(input, true, toLowerCase, stopWords));
    }
    if (pattern == WHITESPACE_PATTERN)
    {
        return new TokenStreamComponents(new FastStringTokenizer(input, false, toLowerCase, stopWords));
    }

    // General case: regex tokenizer, optionally wrapped in a stop filter.
    Tokenizer source = new PatternTokenizer(input, pattern, toLowerCase);
    TokenStream sink = stopWords == null
        ? (TokenStream)source
        : new StopFilter(matchVersion, source, stopWords);
    return new TokenStreamComponents(source, sink);
}
/// <summary>
/// Builds the tokenizer/filter chain for the given field: a regex-split
/// tokenizer, optionally lower-cased and optionally stemmed.
/// </summary>
/// <param name="fieldName">Name of the field.</param>
/// <returns>The assembled <c>AnalyzerTokenStreamComponents</c>.</returns>
/// <exception cref="ArgumentException">Thrown when <paramref name="fieldName"/> is null or blank.</exception>
protected override AnalyzerTokenStreamComponents CreateComponents(string fieldName)
{
    if (string.IsNullOrWhiteSpace(fieldName))
    {
        throw new ArgumentException($"{nameof(fieldName)} cannot be null or blank");
    }

    var separatorPattern = Pattern.compile(_separatorChars);
    var tokenizer = new PatternTokenizer(separatorPattern, -1);

    // Lower-case first (when requested), then stem — filter order matters.
    TokenStream stream = _ignoreCase
        ? new LowerCaseFilter(tokenizer)
        : (TokenStream)tokenizer;
    if (_enableStemming)
    {
        stream = new PorterStemFilter(stream);
    }

    return new AnalyzerTokenStreamComponents(tokenizer, stream);
}
/// <summary>
/// Creates a token stream that tokenizes the given string into token terms
/// (aka words).
/// </summary>
/// <param name="fieldName"> the name of the field to tokenize (currently ignored). </param>
/// <param name="reader"> reader (e.g. charfilter) of the original text. can be null. </param>
/// <param name="text"> the string to tokenize </param>
/// <returns> a new token stream </returns>
public TokenStreamComponents createComponents(string fieldName, TextReader reader, string text)
{
    // Ideally the Analyzer superclass should have a method with the same signature,
    // with a default impl that simply delegates to the StringReader flavour.
    if (reader == null)
    {
        reader = new FastStringReader(text);
    }

    if (pattern == NON_WORD_PATTERN) // fast path
    {
        return new TokenStreamComponents(new FastStringTokenizer(reader, true, toLowerCase, stopWords));
    }
    else if (pattern == WHITESPACE_PATTERN) // fast path
    {
        return new TokenStreamComponents(new FastStringTokenizer(reader, false, toLowerCase, stopWords));
    }

    Tokenizer tokenizer = new PatternTokenizer(reader, pattern, toLowerCase);
    // FIX: the conditional's branches (StopFilter vs Tokenizer) have no implicit
    // conversion to each other, so a common-type cast is required for the
    // expression to compile before C# 9 target-typed conditionals; this also
    // matches the sibling CreateComponents overload in this file.
    TokenStream result = (stopWords != null)
        ? (TokenStream)new StopFilter(matchVersion, tokenizer, stopWords)
        : tokenizer;
    return new TokenStreamComponents(tokenizer, result);
}
/// <summary>
/// Recursively tests whether <paramref name="str"/> matches the remaining
/// tokens produced by <paramref name="tokenizer"/>. Literal tokens must match
/// the head of the string; wildcard tokens consume between <c>min</c> and
/// <c>max</c> characters, where each ONE_CHAR raises both bounds by one and
/// ZERO_OR_MORE_CHARS lifts the upper bound to the full string length.
/// </summary>
/// <param name="tokenizer">Supplies pattern tokens; its <c>Position</c> is saved
/// and restored to backtrack after a failed wildcard placement.</param>
/// <param name="str">The remaining portion of the candidate string.</param>
/// <returns><c>true</c> when the string matches the remaining pattern.</returns>
private static bool NewPatternMatch(PatternTokenizer tokenizer, string str)
{
    // If no more tokens and str is empty, we matched
    if (!tokenizer.HasNext())
        return (str.Length == 0);

    // Get the next token from the tokenizer
    string token = tokenizer.NextToken();

    // Is it a wild card token?
    if (tokenizer.LastWasWildcard)
    {
        // Yes, what are the minimum and maximum extent of characters to match
        // by this wildcard string?
        int strLen = str.Length;
        int min = 0;
        int max = 0;
        for (int i = 0; i < token.Length; ++i)
        {
            if (token[i] == ONE_CHAR)
            {
                ++min;
                ++max;
            }
            else if (token[i] == ZERO_OR_MORE_CHARS)
            {
                max = strLen;
            }
            else
            {
                // Invariant violation: a wildcard token may only contain
                // wildcard characters. NOTE(review): ApplicationException is
                // non-idiomatic (InvalidOperationException preferred) but is
                // kept so existing catch clauses still see the same type.
                throw new ApplicationException("Tokenizer error");
            }
        }

        // If it's not possible to match this size string,
        if (min > strLen)
        {
            return false;
        }

        // If there are no more tokens to match,
        if (!tokenizer.HasNext())
        {
            // If str_len falls within the size of the pattern we can match
            // then return true, otherwise false
            return strLen >= min && strLen <= max;
        }

        // Search for the index of the next token. It's not possible for this to
        // be a wildcard.
        string nextToken = tokenizer.NextToken();
        int p = min;
        while (true)
        {
            // FIX: use ordinal search — the culture-sensitive IndexOf overload
            // can match/skip differently under some cultures, which is wrong
            // for literal pattern matching (CA1310).
            p = str.IndexOf(nextToken, p, StringComparison.Ordinal);
            if (p < 0 || p > max)
            {
                // Not found, so fail this
                return false;
            }

            // Recurse at the point we found
            int state = tokenizer.Position;
            if (NewPatternMatch(tokenizer, str.Substring(p + nextToken.Length)))
            {
                return true;
            }

            // Reverse state if the search failed and try again
            tokenizer.Position = state;
            ++p;
        }
    }

    // Not a wild card, so match
    // FIX: ordinal comparison for the same reason as IndexOf above.
    // If the string doesn't match the token, we return false
    if (!str.StartsWith(token, StringComparison.Ordinal))
        return false;

    // Otherwise recurse
    return NewPatternMatch(tokenizer, str.Substring(token.Length));
}
/// <summary>
/// Tests whether <paramref name="str"/> matches the given wildcard
/// <paramref name="pattern"/>, honoring <paramref name="escapeChar"/> as the
/// pattern's escape character.
/// </summary>
/// <param name="pattern">The wildcard pattern to match against.</param>
/// <param name="str">The candidate string.</param>
/// <param name="escapeChar">Character that escapes wildcards in the pattern.</param>
/// <returns><c>true</c> when the string matches the pattern.</returns>
public static bool Match(string pattern, string str, char escapeChar)
{
    // Tokenize the pattern, then delegate to the recursive matcher.
    var patternTokens = new PatternTokenizer(pattern, escapeChar);
    return NewPatternMatch(patternTokens, str);
}