Exemplo n.º 1
0
        /// <summary>
        /// Runs a shingle filter (sizes 2 to 3) over the tokens "purple" and "wizard"
        /// and checks the emitted terms, offsets and increments under four
        /// configurations: filler "--", empty filler, null filler, and null filler
        /// combined with a null token separator.
        /// </summary>
        public virtual void TestTwoTrailingHolesTriShingleWithTokenFiller()
        {
            // Models the analysis of "purple wizard of the", where "of" and "the" were
            // removed as stopwords, leaving two trailing holes.
            // NOTE(review): the leading 2 and 20 passed to CannedTokenStream presumably
            // describe those trailing holes and the final offset - confirm against
            // CannedTokenStream.
            Token[] tokens = { CreateToken("purple", 0, 6), CreateToken("wizard", 7, 13) };

            // These expectations are the same for every configuration below;
            // only the expected term text changes.
            int[] expectedStarts     = { 0, 0, 0, 7, 7, 7 };
            int[] expectedEnds       = { 6, 13, 20, 13, 20, 20 };
            int[] expectedIncrements = { 1, 0, 0, 1, 0, 0 };

            // 1) Explicit filler token: each hole shows up as "--".
            ShingleFilter dashFiller = new ShingleFilter(new CannedTokenStream(2, 20, tokens), 2, 3);
            dashFiller.SetFillerToken("--");
            AssertTokenStreamContents(
                dashFiller,
                new string[] { "purple", "purple wizard", "purple wizard --", "wizard", "wizard --", "wizard -- --" },
                expectedStarts,
                expectedEnds,
                expectedIncrements,
                20);

            // 2) Empty filler token: only the separators remain where the holes were.
            ShingleFilter emptyFiller = new ShingleFilter(new CannedTokenStream(2, 20, tokens), 2, 3);
            emptyFiller.SetFillerToken("");
            AssertTokenStreamContents(
                emptyFiller,
                new string[] { "purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard  " },
                expectedStarts,
                expectedEnds,
                expectedIncrements,
                20);

            // 3) Null filler token: expected output is the same as for the empty filler.
            ShingleFilter nullFiller = new ShingleFilter(new CannedTokenStream(2, 20, tokens), 2, 3);
            nullFiller.SetFillerToken(null);
            AssertTokenStreamContents(
                nullFiller,
                new string[] { "purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard  " },
                expectedStarts,
                expectedEnds,
                expectedIncrements,
                20);

            // 4) Null filler and null separator: terms are joined with nothing in between.
            ShingleFilter nullFillerNoSeparator = new ShingleFilter(new CannedTokenStream(2, 20, tokens), 2, 3);
            nullFillerNoSeparator.SetFillerToken(null);
            nullFillerNoSeparator.SetTokenSeparator(null);
            AssertTokenStreamContents(
                nullFillerNoSeparator,
                new string[] { "purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard" },
                expectedStarts,
                expectedEnds,
                expectedIncrements,
                20);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Builds a <see cref="ShingleFilter"/> over a canned stream of
        /// <paramref name="tokensToShingle"/>, applies the separator and unigram
        /// settings, and passes it to <c>shingleFilterTestCommon</c> together with
        /// the expected tokens, position increments and types.
        /// </summary>
        /// <param name="tokenSeparator">Value passed to <c>SetTokenSeparator</c>.</param>
        /// <param name="minSize">Minimum shingle size given to the filter constructor.</param>
        /// <param name="maxSize">Maximum shingle size given to the filter constructor.</param>
        /// <param name="tokensToShingle">Input tokens fed to the filter.</param>
        /// <param name="tokensToCompare">Expected tokens, forwarded to the common check.</param>
        /// <param name="positionIncrements">Expected position increments, forwarded to the common check.</param>
        /// <param name="types">Expected token types, forwarded to the common check.</param>
        /// <param name="outputUnigrams">Value passed to <c>SetOutputUnigrams</c>.</param>
        protected internal virtual void shingleFilterTest(string tokenSeparator, int minSize, int maxSize, Token[] tokensToShingle, Token[] tokensToCompare, int[] positionIncrements, string[] types, bool outputUnigrams)
        {
            var cannedInput = new CannedTokenStream(tokensToShingle);
            var shingles    = new ShingleFilter(cannedInput, minSize, maxSize);

            // Configure the filter before handing it to the shared verification routine.
            shingles.SetTokenSeparator(tokenSeparator);
            shingles.SetOutputUnigrams(outputUnigrams);

            shingleFilterTestCommon(shingles, tokensToCompare, positionIncrements, types);
        }
Exemplo n.º 3
0
        /// <summary>
        /// Wraps <paramref name="input"/> in a <see cref="ShingleFilter"/> configured
        /// from this instance's shingle-size, unigram, separator and filler settings.
        /// </summary>
        /// <param name="input">The token stream to wrap.</param>
        /// <returns>The configured shingle filter.</returns>
        public override TokenStream Create(TokenStream input)
        {
            var shingles = new ShingleFilter(input, minShingleSize, maxShingleSize);

            // Apply the remaining settings in the same order as before.
            shingles.SetOutputUnigrams(outputUnigrams);
            shingles.SetOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
            shingles.SetTokenSeparator(tokenSeparator);
            shingles.SetFillerToken(fillerToken);

            return shingles;
        }
Exemplo n.º 4
0
        /// <summary>
        /// Wraps the token stream of <paramref name="components"/> in a
        /// <see cref="ShingleFilter"/> configured from this instance's settings and
        /// returns new components that pair the original tokenizer with that filter.
        /// </summary>
        /// <param name="fieldName">Field name; not used by this implementation.</param>
        /// <param name="components">The components whose token stream is wrapped.</param>
        /// <returns>Components made of the original tokenizer and the shingle filter.</returns>
        protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
        {
            var innerStream = components.TokenStream;
            var shingles    = new ShingleFilter(innerStream, minShingleSize, maxShingleSize);

            // NOTE(review): both sizes were already passed to the constructor above, so
            // these two calls look redundant - confirm against ShingleFilter before removing.
            shingles.SetMinShingleSize(minShingleSize);
            shingles.SetMaxShingleSize(maxShingleSize);

            shingles.SetTokenSeparator(tokenSeparator);
            shingles.SetOutputUnigrams(outputUnigrams);
            shingles.SetOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
            shingles.SetFillerToken(fillerToken);

            return new TokenStreamComponents(components.Tokenizer, shingles);
        }