public void TestReset()
        {
            EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, Side.FRONT, 1, 3);

            AssertTokenStreamContents(tokenizer, new String[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 1, 2, 3 }, 5 /* abcde */);
            tokenizer.Reset(new StringReader("abcde"));
            AssertTokenStreamContents(tokenizer, new String[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 1, 2, 3 }, 5 /* abcde */);
        }
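The /* abcde */ comments indicate that the fixture field `input` is a reader over the string "abcde". A minimal standalone sketch of the same front-edge n-gram behavior, assuming the Lucene.Net 4.8 EdgeNGramTokenizer and attribute APIs (everything outside the snippet above is illustrative):

using System;
using System.IO;
using Lucene.Net.Analysis.NGram;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Util;

class EdgeNGramSketch
{
    static void Main()
    {
        // Front-edge n-grams of lengths 1..3 over "abcde".
        var tokenizer = new EdgeNGramTokenizer(
            LuceneVersion.LUCENE_48, new StringReader("abcde"), /* minGram */ 1, /* maxGram */ 3);
        var term = tokenizer.AddAttribute<ICharTermAttribute>();

        tokenizer.Reset();
        while (tokenizer.IncrementToken())
        {
            Console.WriteLine(term.ToString()); // prints: a, ab, abc
        }
        tokenizer.End();
        tokenizer.Dispose();
    }
}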
Example #2
            public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Tokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 94);
                //TokenStream stream = new SopTokenFilter(tokenizer);
                TokenStream stream = new ShingleFilter(tokenizer, 5);

                //stream = new SopTokenFilter(stream);
                stream = new NGramTokenFilter(TEST_VERSION_CURRENT, stream, 55, 83);
                //stream = new SopTokenFilter(stream);
                return new TokenStreamComponents(tokenizer, stream);
            }
Example #3
        public virtual void TestUnicodeShinglesAndNgrams()
        {
            Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                Tokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 94);
                //TokenStream stream = new SopTokenFilter(tokenizer);
                TokenStream stream = new ShingleFilter(tokenizer, 5);
                //stream = new SopTokenFilter(stream);
                stream = new NGramTokenFilter(TEST_VERSION_CURRENT, stream, 55, 83);
                //stream = new SopTokenFilter(stream);
                return new TokenStreamComponents(tokenizer, stream);
            });

            CheckRandomData(Random, analyzer, 2000);
        }
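A sketch of driving an analyzer like the one above by hand, assuming Lucene.Net 4.8's Analyzer.GetTokenStream; the field name and sample text are illustrative. Note that the 55..83 n-gram filter only emits output for shingles at least 55 characters long:

using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.TokenAttributes;

static class AnalyzerSketch
{
    // Prints every token the analyzer produces for the given text.
    public static void PrintTokens(Analyzer analyzer, string text)
    {
        using (TokenStream stream = analyzer.GetTokenStream("field", new StringReader(text)))
        {
            var term = stream.AddAttribute<ICharTermAttribute>();
            stream.Reset();
            while (stream.IncrementToken())
            {
                Console.WriteLine(term.ToString());
            }
            stream.End();
        }
    }
}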
        public void CanUseAllAnalysisComponentOptions()
        {
            Run(() =>
            {
                var tokenizerWithAllTokenCharacterKinds =
                    new EdgeNGramTokenizer(
                        SearchTestUtilities.GenerateName(),
                        minGram: 1,
                        maxGram: 2,
                        tokenChars: GetAllEnumValues<TokenCharacterKind>());

                Tokenizer CreateMicrosoftLanguageTokenizer(MicrosoftTokenizerLanguage mtl) =>
                    new MicrosoftLanguageTokenizer(
                        SearchTestUtilities.GenerateName(),
                        maxTokenLength: 200,
                        isSearchTokenizer: false,
                        language: mtl);

                IEnumerable<Tokenizer> tokenizersWithAllMicrosoftLanguages =
                    GetAllEnumValues<MicrosoftTokenizerLanguage>().Select(CreateMicrosoftLanguageTokenizer);

                Tokenizer CreateMicrosoftStemmingLanguageTokenizer(MicrosoftStemmingTokenizerLanguage mtl) =>
                    new MicrosoftLanguageStemmingTokenizer(
                        SearchTestUtilities.GenerateName(),
                        maxTokenLength: 200,
                        isSearchTokenizer: false,
                        language: mtl);

                IEnumerable<Tokenizer> tokenizersWithAllMicrosoftStemmingLanguages =
                    GetAllEnumValues<MicrosoftStemmingTokenizerLanguage>().Select(CreateMicrosoftStemmingLanguageTokenizer);

                var tokenFilterWithAllCjkScripts =
                    new CjkBigramTokenFilter(
                        SearchTestUtilities.GenerateName(),
                        ignoreScripts: GetAllEnumValues<CjkBigramTokenFilterScripts>(),
                        outputUnigrams: true);

                TokenFilter CreateEdgeNGramTokenFilter(EdgeNGramTokenFilterSide s) =>
                    new EdgeNGramTokenFilterV2(SearchTestUtilities.GenerateName(), minGram: 1, maxGram: 2, side: s);

                IEnumerable<TokenFilter> tokenFiltersWithAllEdgeNGramSides =
                    GetAllEnumValues<EdgeNGramTokenFilterSide>().Select(CreateEdgeNGramTokenFilter);

                TokenFilter CreatePhoneticTokenFilter(PhoneticEncoder pe) =>
                    new PhoneticTokenFilter(SearchTestUtilities.GenerateName(), encoder: pe, replaceOriginalTokens: false);

                IEnumerable<TokenFilter> tokenFiltersWithAllPhoneticEncoders =
                    GetAllEnumValues<PhoneticEncoder>().Select(CreatePhoneticTokenFilter);

                IEnumerable<TokenFilter> tokenFiltersWithAllSnowballLanguages =
                    GetAllEnumValues<SnowballTokenFilterLanguage>().Select(l => new SnowballTokenFilter(SearchTestUtilities.GenerateName(), l));

                IEnumerable<TokenFilter> tokenFiltersWithAllStemmerLanguages =
                    GetAllEnumValues<StemmerTokenFilterLanguage>().Select(l => new StemmerTokenFilter(SearchTestUtilities.GenerateName(), l));

                TokenFilter CreateStopTokenFilter(StopwordsList l) =>
                    new StopwordsTokenFilter(
                        SearchTestUtilities.GenerateName(),
                        stopwordsList: l,
                        ignoreCase: false,
                        removeTrailingStopWords: true);

                IEnumerable<TokenFilter> tokenFiltersWithAllStopwordLists =
                    GetAllEnumValues<StopwordsList>().Select(CreateStopTokenFilter);

                // Split the tokenizers and token filters into different indexes to get around the 50-item limit.
                Index index = CreateTestIndex();

                index.Tokenizers =
                    new[] { tokenizerWithAllTokenCharacterKinds }
                        .Concat(tokenizersWithAllMicrosoftLanguages)
                        .Concat(tokenizersWithAllMicrosoftStemmingLanguages)
                        .ToArray();

                index.TokenFilters =
                    new[] { tokenFilterWithAllCjkScripts }
                        .Concat(tokenFiltersWithAllEdgeNGramSides)
                        .Concat(tokenFiltersWithAllPhoneticEncoders)
                        .Concat(tokenFiltersWithAllSnowballLanguages)
                        .Concat(tokenFiltersWithAllStemmerLanguages)
                        .Concat(tokenFiltersWithAllStopwordLists)
                        .ToArray();

                TestAnalysisComponents(index);
            });
        }
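The test leans on a GetAllEnumValues<T>() helper from the surrounding test suite. A plausible sketch of such a helper (assumed; the suite's actual implementation may differ):

using System;
using System.Collections.Generic;
using System.Linq;

static class EnumHelper
{
    // Enumerates every value of an enum type; requires C# 7.3+ for the Enum constraint.
    public static IEnumerable<T> GetAllEnumValues<T>() where T : struct, Enum =>
        Enum.GetValues(typeof(T)).Cast<T>();
}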
        public void TestBackUnigram()
        {
            EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, Side.BACK, 1, 1);

            AssertTokenStreamContents(tokenizer, new String[] { "e" }, new int[] { 4 }, new int[] { 5 }, 5 /* abcde */);
        }
        public void TestFrontUnigram()
        {
            EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, Side.FRONT, 1, 1);

            AssertTokenStreamContents(tokenizer, new String[] { "a" }, new int[] { 0 }, new int[] { 1 }, 5 /* abcde */);
        }
        public void TestBackRangeOfNgrams()
        {
            EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, Side.BACK, 1, 3);

            AssertTokenStreamContents(tokenizer, new String[] { "e", "de", "cde" }, new int[] { 4, 3, 2 }, new int[] { 5, 5, 5 }, 5 /* abcde */);
        }
        public void TestFrontRangeOfNgrams()
        {
            EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, Side.FRONT, 1, 3);

            AssertTokenStreamContents(tokenizer, new String[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 1, 2, 3 }, 5 /* abcde */);
        }
        public void TestOversizedNgrams()
        {
            EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, Side.FRONT, 6, 6);

            AssertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */);
        }
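Each of the Side-based tests above reads from a shared `input` field; judging by the /* abcde */ comments, the fixture presumably initializes it along these lines (a sketch, not the verbatim setup):

        private StringReader input;

        public override void SetUp()
        {
            base.SetUp();
            input = new StringReader("abcde"); // the five-character source all assertions refer to
        }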