// ================================================= Helper Methods ================================================

        /// <summary>
        /// Checks whether the transition from <paramref name="lastType"/> to
        /// <paramref name="type"/> constitutes a subword boundary.
        /// </summary>
        /// <param name="lastType"> Type of the previous subword character </param>
        /// <param name="type"> Type of the current subword character </param>
        /// <returns> <c>true</c> when the transition is a break, <c>false</c> otherwise </returns>
        private bool isBreak(int lastType, int type)
        {
            // Overlapping type bits never produce a break.
            if ((type & lastType) != 0)
            {
                return false;
            }

            bool lastIsAlpha = WordDelimiterFilter.isAlpha(lastType);
            bool currIsAlpha = WordDelimiterFilter.isAlpha(type);

            // ALPHA->ALPHA: always ignore if case isn't considered.
            if (!splitOnCaseChange && lastIsAlpha && currIsAlpha)
            {
                return false;
            }

            // UPPER->letter: don't split.
            if (WordDelimiterFilter.isUpper(lastType) && currIsAlpha)
            {
                return false;
            }

            // ALPHA->NUMERIC / NUMERIC->ALPHA: don't split unless splitting on numerics.
            if (!splitOnNumerics &&
                ((lastIsAlpha && WordDelimiterFilter.isDigit(type)) ||
                 (WordDelimiterFilter.isDigit(lastType) && currIsAlpha)))
            {
                return false;
            }

            return true;
        }
// Example #2
            /// <summary>
            /// Builds the analysis chain: WikipediaTokenizer -> SopTokenFilter ->
            /// WordDelimiterFilter (custom type table, forced flags value) -> SopTokenFilter.
            /// </summary>
            /// <param name="fieldName"> Field name (not used by this chain) </param>
            /// <param name="reader"> Source text reader </param>
            /// <returns> The assembled tokenizer/stream pair </returns>
            public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Tokenizer   tokenizer = new WikipediaTokenizer(reader);
                TokenStream stream    = new SopTokenFilter(tokenizer);

                // The flags parameter is the WordDelimiterFlags enum; a raw int -50 does not
                // implicitly convert, so box through object to force the out-of-range value
                // (same technique as the TestCuriousWikipediaString chain in this file).
                stream = new WordDelimiterFilter(TEST_VERSION_CURRENT, stream, table, (WordDelimiterFlags)(object)(-50), protWords);
                stream = new SopTokenFilter(stream);
                return(new TokenStreamComponents(tokenizer, stream));
            }
// Example #3
        /// <summary>
        /// Builds an analysis chain pinned to Lucene 4.8: StandardTokenizer ->
        /// WordDelimiterFilter -> ASCIIFoldingFilter -> LowerCaseFilter.
        /// </summary>
        /// <param name="fieldName"> Field name (not used by this chain) </param>
        /// <param name="reader"> Source text reader </param>
        /// <returns> The assembled tokenizer/stream pair </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            var         source = new StandardTokenizer(LuceneVersion.LUCENE_48, reader);

            // The configuration parameter is the WordDelimiterFlags enum, so the raw value
            // 255 (all low-byte flag bits set) must be cast explicitly — C# has no implicit
            // int->enum conversion for non-zero literals.
            TokenStream result = new WordDelimiterFilter(LuceneVersion.LUCENE_48, source, (WordDelimiterFlags)255, CharArraySet.EMPTY_SET);

            result = new ASCIIFoldingFilter(result);
            result = new LowerCaseFilter(LuceneVersion.LUCENE_48, result);
            return(new TokenStreamComponents(source, result));
        }
// Example #4
        /// <summary>
        /// Assembles the analyzer pipeline, pinned to Lucene 4.8: custom tokenizer,
        /// standard filter, word-delimiter splitting (catenate words, generate word parts,
        /// preserve original, split on case change), then lower-casing.
        /// </summary>
        /// <param name="fieldName"> Field name (not used by this chain) </param>
        /// <param name="reader"> Source text reader </param>
        /// <returns> The assembled tokenizer/stream pair </returns>
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            const LuceneVersion matchVersion = LuceneVersion.LUCENE_48;

            const WordDelimiterFlags delimiterFlags =
                WordDelimiterFlags.CATENATE_WORDS |
                WordDelimiterFlags.GENERATE_WORD_PARTS |
                WordDelimiterFlags.PRESERVE_ORIGINAL |
                WordDelimiterFlags.SPLIT_ON_CASE_CHANGE;

            Tokenizer tokenizer = new MyTokenizer(matchVersion, reader);
            TokenStream chain = new StandardFilter(matchVersion, tokenizer);
            chain = new WordDelimiterFilter(matchVersion, chain, delimiterFlags, CharArraySet.EMPTY_SET);
            chain = new LowerCaseFilter(matchVersion, chain);

            return(new TokenStreamComponents(tokenizer, chain));
        }
        /// <summary>
        /// Wires up the GitHub client, the per-field analyzer, the multi-field query parser,
        /// the index writer, and the searcher manager over the supplied index directory.
        /// </summary>
        /// <param name="indexDirectory"> Directory that will hold the Lucene index </param>
        /// <param name="githubApiKey"> Token used as credentials for the GitHub client </param>
        public GitHubIndex(Directory indexDirectory, string githubApiKey)
        {
            github = new GitHubClient(new ProductHeaderValue("LuceneNetDemo"))
            {
                Credentials = new Credentials(githubApiKey)
            };

            // "owner" is analyzed as one keyword token, ASCII-folded and lower-cased.
            var ownerAnalyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                var tokenizer = new KeywordTokenizer(reader);
                TokenStream chain = new ASCIIFoldingFilter(tokenizer);
                chain = new LowerCaseFilter(GitHubIndex.MatchVersion, chain);
                return(new TokenStreamComponents(tokenizer, chain));
            });

            // "name" is split on word delimiters (every flag except English possessive
            // stemming), ASCII-folded and lower-cased.
            var nameAnalyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                var tokenizer = new StandardTokenizer(GitHubIndex.MatchVersion, reader);
                TokenStream chain = new WordDelimiterFilter(GitHubIndex.MatchVersion, tokenizer, ~WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE, CharArraySet.EMPTY_SET);
                chain = new ASCIIFoldingFilter(chain);
                chain = new LowerCaseFilter(GitHubIndex.MatchVersion, chain);
                return(new TokenStreamComponents(tokenizer, chain));
            });

            // All other fields fall back to the pre-built HTML-stripping analyzer.
            analyzer = new PerFieldAnalyzerWrapper(
                defaultAnalyzer: new HtmlStripAnalyzer(GitHubIndex.MatchVersion),
                fieldAnalyzers: new Dictionary <string, Analyzer>
                {
                    { "owner", ownerAnalyzer },
                    { "name", nameAnalyzer }
                });

            queryParser = new MultiFieldQueryParser(GitHubIndex.MatchVersion,
                                                    new[] { "name", "description", "readme" }, analyzer);

            indexWriter     = new IndexWriter(indexDirectory, new IndexWriterConfig(GitHubIndex.MatchVersion, analyzer));
            searcherManager = new SearcherManager(indexWriter, true, null);
        }
        /// <summary>
        /// Set the internal word bounds (remove leading and trailing delimiters). Note, if a
        /// possessive is found, don't remove it yet, simply note it.
        /// </summary>
        private void setBounds()
        {
            // Advance past any delimiter characters at the front.
            for (; startBounds < length; startBounds++)
            {
                if (!WordDelimiterFilter.isSubwordDelim(charType(text[startBounds])))
                {
                    break;
                }
            }

            // Back off over delimiter characters at the end, never crossing the start.
            for (; endBounds > startBounds; endBounds--)
            {
                if (!WordDelimiterFilter.isSubwordDelim(charType(text[endBounds - 1])))
                {
                    break;
                }
            }

            // Record (but do not strip) a trailing possessive for later handling.
            if (endsWithPossessive(endBounds))
            {
                hasFinalPossessive = true;
            }
            current = startBounds;
        }
// Example #7
        /// <summary>
        /// Regression test: runs analysis-consistency checks over a WikipediaTokenizer +
        /// WordDelimiterFilter chain configured with a custom character type table, a set of
        /// protected words, and an out-of-range flags value, against a string containing
        /// braille, a surrogate pair, and markup.
        /// </summary>
        public virtual void TestCuriousWikipediaString()
        {
            // Words the delimiter filter must leave untouched.
            CharArraySet protWords = new CharArraySet(TEST_VERSION_CURRENT, new JCG.HashSet <string> {
                "rrdpafa", "pupmmlu", "xlq", "dyy", "zqrxrrck", "o", "hsrlfvcha"
            }, false);

            // Custom character type table; authored as sbyte values and reinterpreted as
            // byte[] via the (Array) cast to keep the literal readable.
            byte[]   table = (byte[])(Array) new sbyte[] { -57, 26, 1, 48, 63, -23, 55, -84, 18, 120, -97, 103, 58, 13, 84, 89, 57, -13, -63, 5, 28, 97, -54, -94, 102, -108, -5, 5, 46, 40, 43, 78, 43, -72, 36, 29, 124, -106, -22, -51, 65, 5, 31, -42, 6, -99, 97, 14, 81, -128, 74, 100, 54, -55, -25, 53, -71, -98, 44, 33, 86, 106, -42, 47, 115, -89, -18, -26, 22, -95, -43, 83, -125, 105, -104, -24, 106, -16, 126, 115, -105, 97, 65, -33, 57, 44, -1, 123, -68, 100, 13, -41, -64, -119, 0, 92, 94, -36, 53, -9, -102, -18, 90, 94, -26, 31, 71, -20 };
            Analyzer a     = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                Tokenizer tokenizer = new WikipediaTokenizer(reader);
                TokenStream stream  = new SopTokenFilter(tokenizer);
                // -50 is not a defined WordDelimiterFlags value; boxing through object
                // forces the cast so the filter sees the raw bit pattern.
                stream = new WordDelimiterFilter(TEST_VERSION_CURRENT, stream, table, (WordDelimiterFlags)(object)-50, protWords);
                stream = new SopTokenFilter(stream);
                return(new TokenStreamComponents(tokenizer, stream));
            });

            // Input mixes a braille char, a private-use char, a surrogate pair, and a tag.
            CheckAnalysisConsistency(Random, a, false, "B\u28c3\ue0f8[ \ud800\udfc2 </p> jb");
        }
        /// <summary>
        /// Advance to the next subword in the string.
        /// </summary>
        /// <returns> index of the next subword, or <seealso cref="#DONE"/> if all subwords have been returned </returns>
        internal int next()
        {
            // Resume scanning where the previous subword ended.
            current = end;
            if (current == DONE)
            {
                return(DONE);
            }

            // A possessive ("'s") flagged by the previous call is skipped now (2 chars).
            if (skipPossessive)
            {
                current       += 2;
                skipPossessive = false;
            }

            int lastType = 0;

            // Skip a run of delimiter characters; on exit lastType holds the type of the
            // character at 'current' (the first non-delimiter, if one was found).
            while (current < endBounds && (WordDelimiterFilter.isSubwordDelim(lastType = charType(text[current]))))
            {
                current++;
            }

            // Only delimiters remained: mark iteration finished.
            if (current >= endBounds)
            {
                return(end = DONE);
            }

            // Grow the subword until a type transition that isBreak() treats as a boundary.
            for (end = current + 1; end < endBounds; end++)
            {
                int type_Renamed = charType(text[end]);
                if (isBreak(lastType, type_Renamed))
                {
                    break;
                }
                lastType = type_Renamed;
            }

            // If the subword is immediately followed by "'s", note it so the next call
            // skips those two characters.
            if (end < endBounds - 1 && endsWithPossessive(end + 2))
            {
                skipPossessive = true;
            }

            return(end);
        }
 /// <summary>
 /// Creates an <c>OffsetSorter</c> bound to the enclosing <see cref="WordDelimiterFilter"/>.
 /// </summary>
 /// <param name="outerInstance"> The owning filter instance, stored for later access </param>
 public OffsetSorter(WordDelimiterFilter outerInstance)
 {
     this.outerInstance = outerInstance;
 }
 /// <summary>
 /// Creates a <c>WordDelimiterConcatenation</c> bound to the enclosing <see cref="WordDelimiterFilter"/>.
 /// </summary>
 /// <param name="outerInstance"> The owning filter instance, stored for later access </param>
 public WordDelimiterConcatenation(WordDelimiterFilter outerInstance)
 {
     this.outerInstance = outerInstance;
 }
 /// <summary>
 /// Builds the standard filter chain: char mapping -> whitespace tokenization ->
 /// word-delimiter splitting (the five numeric option arguments all set to 1) -> lower-casing.
 /// </summary>
 /// <param name="reader"> Source text reader </param>
 /// <returns> The fully wrapped token stream </returns>
 public static Lucene.Net.Analysis.TokenStream GetStandardFilterSet(System.IO.TextReader reader)
 {
     var charMapper = WordDelimiterFilter.GetCharMapper(reader);
     TokenStream tokens = new WhitespaceTokenizer(charMapper);
     tokens = new WordDelimiterFilter(tokens, 1, 1, 1, 1, 1);
     return new LowerCaseFilter(tokens);
 }
 /// <summary>
 /// Determines if the text at the given position indicates an English possessive which should be removed
 /// </summary>
 /// <param name="pos"> Position in the text to check if it indicates an English possessive </param>
 /// <returns> <c>true</c> if the text at the position indicates an English possessive, <c>false</c> otherwise </returns>
 private bool endsWithPossessive(int pos)
 {
     // Possessive stripping must be enabled and there must be room for letter + '\'' + s.
     if (!stemEnglishPossessive || pos <= 2)
     {
         return false;
     }

     // Require an apostrophe followed by 's' or 'S'.
     if (text[pos - 2] != '\'' || (text[pos - 1] != 's' && text[pos - 1] != 'S'))
     {
         return false;
     }

     // The apostrophe must follow a letter, and the possessive must sit at the end of
     // the word bounds or be followed by a delimiter character.
     return WordDelimiterFilter.isAlpha(charType(text[pos - 3])) &&
            (pos == endBounds || WordDelimiterFilter.isSubwordDelim(charType(text[pos])));
 }