// ================================================= Helper Methods ================================================

/// <summary>
/// Determines whether the transition from <paramref name="lastType"/> to <paramref name="type"/> indicates a break.
/// </summary>
/// <param name="lastType"> Last subword type </param>
/// <param name="type"> Current subword type </param>
/// <returns> <c>true</c> if the transition indicates a break, <c>false</c> otherwise </returns>
private bool isBreak(int lastType, int type)
{
    if ((type & lastType) != 0)
    {
        return false;
    }

    if (!splitOnCaseChange && WordDelimiterFilter.isAlpha(lastType) && WordDelimiterFilter.isAlpha(type))
    {
        // ALPHA->ALPHA: always ignore if case isn't considered.
        return false;
    }
    else if (WordDelimiterFilter.isUpper(lastType) && WordDelimiterFilter.isAlpha(type))
    {
        // UPPER->letter: don't split
        return false;
    }
    else if (!splitOnNumerics
        && ((WordDelimiterFilter.isAlpha(lastType) && WordDelimiterFilter.isDigit(type))
            || (WordDelimiterFilter.isDigit(lastType) && WordDelimiterFilter.isAlpha(type))))
    {
        // ALPHA->NUMERIC, NUMERIC->ALPHA: don't split
        return false;
    }

    return true;
}
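// A minimal, self-contained sketch (not part of the sources above) showing how these
// transitions surface through the public flags, assuming Lucene.Net 4.8 and its
// Analysis.Common package. With SPLIT_ON_CASE_CHANGE and SPLIT_ON_NUMERICS enabled,
// the ALPHA->ALPHA case change and ALPHA<->NUMERIC transitions above become break points.
using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.Miscellaneous;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;

Analyzer breakDemo = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
{
    Tokenizer source = new WhitespaceTokenizer(LuceneVersion.LUCENE_48, reader);
    TokenStream result = new WordDelimiterFilter(LuceneVersion.LUCENE_48, source,
        WordDelimiterFlags.GENERATE_WORD_PARTS
        | WordDelimiterFlags.GENERATE_NUMBER_PARTS
        | WordDelimiterFlags.SPLIT_ON_CASE_CHANGE
        | WordDelimiterFlags.SPLIT_ON_NUMERICS,
        CharArraySet.EMPTY_SET);
    return new TokenStreamComponents(source, result);
});

using (TokenStream ts = breakDemo.GetTokenStream("f", new StringReader("PowerShot A500")))
{
    var term = ts.GetAttribute<ICharTermAttribute>();
    ts.Reset();
    while (ts.IncrementToken())
    {
        Console.WriteLine(term.ToString()); // Power, Shot, A, 500
    }
    ts.End();
}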
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new WikipediaTokenizer(reader);
    TokenStream stream = new SopTokenFilter(tokenizer);
    // Cast needed: the flags parameter of the byte[] overload is the WordDelimiterFlags enum;
    // -50 deliberately sets arbitrary bits for this randomized test.
    stream = new WordDelimiterFilter(TEST_VERSION_CURRENT, stream, table, (WordDelimiterFlags)(-50), protWords);
    stream = new SopTokenFilter(stream);
    return new TokenStreamComponents(tokenizer, stream);
}
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    var source = new StandardTokenizer(LuceneVersion.LUCENE_48, reader);
    TokenStream result = new WordDelimiterFilter(LuceneVersion.LUCENE_48, source,
        // 255 enables the lowest eight flags (everything except STEM_ENGLISH_POSSESSIVE);
        // the cast is needed because the parameter is the WordDelimiterFlags enum.
        (WordDelimiterFlags)255,
        CharArraySet.EMPTY_SET);
    result = new ASCIIFoldingFilter(result);
    result = new LowerCaseFilter(LuceneVersion.LUCENE_48, result);
    return new TokenStreamComponents(source, result);
}
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    const LuceneVersion version = LuceneVersion.LUCENE_48;

    Tokenizer baseTokenizer = new MyTokenizer(version, reader);
    StandardFilter standardFilter = new StandardFilter(version, baseTokenizer);
    WordDelimiterFilter wordDelimiterFilter = new WordDelimiterFilter(version, standardFilter,
        WordDelimiterFlags.CATENATE_WORDS
        | WordDelimiterFlags.GENERATE_WORD_PARTS
        | WordDelimiterFlags.PRESERVE_ORIGINAL
        | WordDelimiterFlags.SPLIT_ON_CASE_CHANGE,
        CharArraySet.EMPTY_SET);
    LowerCaseFilter lcFilter = new LowerCaseFilter(version, wordDelimiterFilter);

    return new TokenStreamComponents(baseTokenizer, lcFilter);
}
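// A small helper sketch (hypothetical, not part of the sources above) for exercising any of
// these CreateComponents pipelines: it runs an Analyzer over a string and prints each token
// it emits. Assumes Lucene.Net 4.8.
using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.TokenAttributes;

public static class TokenDumper
{
    public static void Dump(Analyzer analyzer, string fieldName, string text)
    {
        using (TokenStream ts = analyzer.GetTokenStream(fieldName, new StringReader(text)))
        {
            var term = ts.GetAttribute<ICharTermAttribute>();
            ts.Reset();               // required before the first IncrementToken()
            while (ts.IncrementToken())
            {
                Console.WriteLine(term.ToString());
            }
            ts.End();                 // finalize offsets before the stream is disposed
        }
    }
}

// e.g. TokenDumper.Dump(myAnalyzer, "body", "Wi-Fi PowerShot"); // myAnalyzer is any Analyzer built above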
public GitHubIndex(Directory indexDirectory, string githubApiKey)
{
    github = new GitHubClient(new ProductHeaderValue("LuceneNetDemo"))
    {
        Credentials = new Credentials(githubApiKey)
    };

    analyzer = new PerFieldAnalyzerWrapper(
        // Example of a pre-built custom analyzer
        defaultAnalyzer: new HtmlStripAnalyzer(GitHubIndex.MatchVersion),
        // Example of inline anonymous analyzers
        fieldAnalyzers: new Dictionary<string, Analyzer>
        {
            // Field analyzer for owner
            {
                "owner",
                Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
                {
                    var source = new KeywordTokenizer(reader);
                    TokenStream result = new ASCIIFoldingFilter(source);
                    result = new LowerCaseFilter(GitHubIndex.MatchVersion, result);
                    return new TokenStreamComponents(source, result);
                })
            },
            // Field analyzer for name
            {
                "name",
                Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
                {
                    var source = new StandardTokenizer(GitHubIndex.MatchVersion, reader);
                    TokenStream result = new WordDelimiterFilter(GitHubIndex.MatchVersion, source,
                        ~WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE, CharArraySet.EMPTY_SET);
                    result = new ASCIIFoldingFilter(result);
                    result = new LowerCaseFilter(GitHubIndex.MatchVersion, result);
                    return new TokenStreamComponents(source, result);
                })
            }
        });

    queryParser = new MultiFieldQueryParser(GitHubIndex.MatchVersion,
        new[] { "name", "description", "readme" }, analyzer);

    indexWriter = new IndexWriter(indexDirectory, new IndexWriterConfig(GitHubIndex.MatchVersion, analyzer));
    searcherManager = new SearcherManager(indexWriter, true, null);
}
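// A minimal sketch (hypothetical, not taken from the demo) of how the components wired up
// above are typically used to search: parse with the MultiFieldQueryParser, then borrow a
// searcher via the standard SearcherManager Acquire/Release pattern. Assumes Lucene.Net 4.8
// and the System.Collections.Generic, Lucene.Net.Documents, and Lucene.Net.Search namespaces.
public IList<string> Search(string queryText)
{
    var names = new List<string>();
    Query query = queryParser.Parse(queryText);

    searcherManager.MaybeRefreshBlocking(); // pick up anything indexed since the last refresh
    IndexSearcher searcher = searcherManager.Acquire();
    try
    {
        TopDocs topDocs = searcher.Search(query, 10);
        foreach (ScoreDoc scoreDoc in topDocs.ScoreDocs)
        {
            Document doc = searcher.Doc(scoreDoc.Doc);
            names.Add(doc.Get("name"));
        }
    }
    finally
    {
        searcherManager.Release(searcher);
    }
    return names;
}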
/// <summary>
/// Set the internal word bounds (remove leading and trailing delimiters). Note that if a
/// possessive is found, it is not removed yet; it is simply noted.
/// </summary>
private void setBounds()
{
    while (startBounds < length && WordDelimiterFilter.isSubwordDelim(charType(text[startBounds])))
    {
        startBounds++;
    }

    while (endBounds > startBounds && WordDelimiterFilter.isSubwordDelim(charType(text[endBounds - 1])))
    {
        endBounds--;
    }

    if (endsWithPossessive(endBounds))
    {
        hasFinalPossessive = true;
    }

    current = startBounds;
}
public virtual void TestCuriousWikipediaString()
{
    CharArraySet protWords = new CharArraySet(TEST_VERSION_CURRENT,
        new JCG.HashSet<string> { "rrdpafa", "pupmmlu", "xlq", "dyy", "zqrxrrck", "o", "hsrlfvcha" },
        false);
    byte[] table = (byte[])(Array)new sbyte[]
    {
        -57, 26, 1, 48, 63, -23, 55, -84, 18, 120, -97, 103, 58, 13, 84, 89, 57, -13, -63,
        5, 28, 97, -54, -94, 102, -108, -5, 5, 46, 40, 43, 78, 43, -72, 36, 29, 124, -106,
        -22, -51, 65, 5, 31, -42, 6, -99, 97, 14, 81, -128, 74, 100, 54, -55, -25, 53, -71,
        -98, 44, 33, 86, 106, -42, 47, 115, -89, -18, -26, 22, -95, -43, 83, -125, 105, -104,
        -24, 106, -16, 126, 115, -105, 97, 65, -33, 57, 44, -1, 123, -68, 100, 13, -41, -64,
        -119, 0, 92, 94, -36, 53, -9, -102, -18, 90, 94, -26, 31, 71, -20
    };
    Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        Tokenizer tokenizer = new WikipediaTokenizer(reader);
        TokenStream stream = new SopTokenFilter(tokenizer);
        stream = new WordDelimiterFilter(TEST_VERSION_CURRENT, stream, table, (WordDelimiterFlags)(-50), protWords);
        stream = new SopTokenFilter(stream);
        return new TokenStreamComponents(tokenizer, stream);
    });
    CheckAnalysisConsistency(Random, a, false, "B\u28c3\ue0f8[ \ud800\udfc2 </p> jb");
}
/// <summary>
/// Advance to the next subword in the string.
/// </summary>
/// <returns> index of the next subword, or <see cref="DONE"/> if all subwords have been returned </returns>
internal int next()
{
    current = end;
    if (current == DONE)
    {
        return DONE;
    }

    if (skipPossessive)
    {
        current += 2;
        skipPossessive = false;
    }

    int lastType = 0;

    while (current < endBounds && WordDelimiterFilter.isSubwordDelim(lastType = charType(text[current])))
    {
        current++;
    }

    if (current >= endBounds)
    {
        return end = DONE;
    }

    for (end = current + 1; end < endBounds; end++)
    {
        int type = charType(text[end]);
        if (isBreak(lastType, type))
        {
            break;
        }
        lastType = type;
    }

    if (end < endBounds - 1 && endsWithPossessive(end + 2))
    {
        skipPossessive = true;
    }

    return end;
}
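// A minimal sketch of what this iteration produces end-to-end, using the hypothetical
// TokenDumper helper sketched earlier (same usings as the first sketch; assumes Lucene.Net 4.8).
// setBounds() trims the leading and trailing delimiters, then next() yields one subword per
// break found by isBreak().
Analyzer iterDemo = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
{
    Tokenizer source = new WhitespaceTokenizer(LuceneVersion.LUCENE_48, reader);
    TokenStream result = new WordDelimiterFilter(LuceneVersion.LUCENE_48, source,
        WordDelimiterFlags.GENERATE_WORD_PARTS | WordDelimiterFlags.GENERATE_NUMBER_PARTS,
        CharArraySet.EMPTY_SET);
    return new TokenStreamComponents(source, result);
});

TokenDumper.Dump(iterDemo, "f", "--Wi-Fi-2000--"); // prints: Wi, Fi, 2000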
public OffsetSorter(WordDelimiterFilter outerInstance)
{
    this.outerInstance = outerInstance;
}

public WordDelimiterConcatenation(WordDelimiterFilter outerInstance)
{
    this.outerInstance = outerInstance;
}
public static Lucene.Net.Analysis.TokenStream GetStandardFilterSet(System.IO.TextReader reader)
{
    // GetCharMapper appears to be a custom helper in this codebase that wraps the reader in a
    // char-mapping filter; it is not part of WordDelimiterFilter's public API.
    var mappingCharFilter = WordDelimiterFilter.GetCharMapper(reader);
    TokenStream ts = new WhitespaceTokenizer(mappingCharFilter);

    // This uses the legacy (pre-4.x) constructor, where the five int toggles are, in order,
    // generateWordParts, generateNumberParts, catenateWords, catenateNumbers, and catenateAll
    // (1 = enabled).
    WordDelimiterFilter filter = new WordDelimiterFilter(ts, 1, 1, 1, 1, 1);

    TokenStream result = new LowerCaseFilter(filter);
    return result;
}
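// For comparison, a sketch of roughly equivalent construction against the Lucene.Net 4.8 API,
// where the legacy int toggles become WordDelimiterFlags. This assumes a TextReader named
// `reader` is in scope; the char-mapping step is omitted because GetCharMapper is specific to
// the codebase above.
TokenStream ts48 = new WhitespaceTokenizer(LuceneVersion.LUCENE_48, reader);
ts48 = new WordDelimiterFilter(LuceneVersion.LUCENE_48, ts48,
    WordDelimiterFlags.GENERATE_WORD_PARTS
    | WordDelimiterFlags.GENERATE_NUMBER_PARTS
    | WordDelimiterFlags.CATENATE_WORDS
    | WordDelimiterFlags.CATENATE_NUMBERS
    | WordDelimiterFlags.CATENATE_ALL,
    CharArraySet.EMPTY_SET);
ts48 = new LowerCaseFilter(LuceneVersion.LUCENE_48, ts48);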
/// <summary>
/// Determines whether the text at the given position indicates an English possessive which should be removed.
/// </summary>
/// <param name="pos"> Position in the text to check if it indicates an English possessive </param>
/// <returns> <c>true</c> if the text at the position indicates an English possessive, <c>false</c> otherwise </returns>
private bool endsWithPossessive(int pos)
{
    return stemEnglishPossessive
        && pos > 2
        && text[pos - 2] == '\''
        && (text[pos - 1] == 's' || text[pos - 1] == 'S')
        && WordDelimiterFilter.isAlpha(charType(text[pos - 3]))
        && (pos == endBounds || WordDelimiterFilter.isSubwordDelim(charType(text[pos])));
}
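// A minimal sketch of the possessive handling, again via the hypothetical TokenDumper helper
// (same usings as the first sketch; assumes Lucene.Net 4.8): with STEM_ENGLISH_POSSESSIVE set,
// a trailing 's or 'S is dropped from the subword that precedes it.
Analyzer possessiveDemo = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
{
    Tokenizer source = new WhitespaceTokenizer(LuceneVersion.LUCENE_48, reader);
    TokenStream result = new WordDelimiterFilter(LuceneVersion.LUCENE_48, source,
        WordDelimiterFlags.GENERATE_WORD_PARTS | WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE,
        CharArraySet.EMPTY_SET);
    return new TokenStreamComponents(source, result);
});

TokenDumper.Dump(possessiveDemo, "f", "O'Neil's"); // prints: O, Neil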