Exemplo n.º 1
0
 /// <summary>
 /// Creates a window token tied to the enclosing <see cref="ShingleFilter"/> and to
 /// the given attribute source, caching the term and offset attributes that the
 /// source hands back.
 /// </summary>
 /// <param name="outerInstance">The enclosing filter instance.</param>
 /// <param name="attSource">The attribute source the term and offset attributes are read from.</param>
 public InputWindowToken(ShingleFilter outerInstance, AttributeSource attSource)
 {
     // "this." is only needed where a parameter shadows the field.
     this.outerInstance = outerInstance;
     this.attSource = attSource;

     // Look the attributes up once so later accesses go through the cached fields.
     termAtt = attSource.GetAttribute(typeof(CharTermAttribute));
     offsetAtt = attSource.GetAttribute(typeof(OffsetAttribute));
 }
Exemplo n.º 2
0
 /// <summary>
 /// Builds a <see cref="ShingleFilter"/> over <paramref name="input"/> using this
 /// instance's minimum and maximum shingle sizes, then applies its unigram,
 /// separator and filler settings to the new filter.
 /// </summary>
 /// <param name="input">The token stream to wrap.</param>
 /// <returns>The configured shingle filter.</returns>
 public override ShingleFilter create(TokenStream input)
 {
     // Initializer members are assigned in the order written, matching the
     // original statement order.
     return new ShingleFilter(input, minShingleSize, maxShingleSize)
     {
         OutputUnigrams = outputUnigrams,
         OutputUnigramsIfNoShingles = outputUnigramsIfNoShingles,
         TokenSeparator = tokenSeparator,
         FillerToken = fillerToken
     };
 }
Exemplo n.º 3
0
        /// <summary>
        /// Wraps <paramref name="input"/> in a <see cref="ShingleFilter"/> sized by this
        /// instance's minimum/maximum shingle settings and copies the unigram,
        /// separator and filler options onto it.
        /// </summary>
        /// <param name="input">The token stream to wrap.</param>
        /// <returns>The configured shingle filter.</returns>
        public override ShingleFilter create(TokenStream input)
        {
            ShingleFilter shingles = new ShingleFilter(input, minShingleSize, maxShingleSize)
            {
                // Assigned in the same order as the original sequential statements.
                OutputUnigrams = outputUnigrams,
                OutputUnigramsIfNoShingles = outputUnigramsIfNoShingles,
                TokenSeparator = tokenSeparator,
                FillerToken = fillerToken
            };

            return shingles;
        }
        /// <summary>
        /// Wraps the token stream of <paramref name="components"/> in a
        /// <see cref="ShingleFilter"/> configured from this instance's settings and
        /// returns new components that pair the original tokenizer with that filter.
        /// </summary>
        /// <param name="fieldName">Field name; not used by this implementation.</param>
        /// <param name="components">The components whose token stream is wrapped.</param>
        /// <returns>Components made of the original tokenizer and the shingle filter.</returns>
        protected internal override TokenStreamComponents wrapComponents(string fieldName, TokenStreamComponents components)
        {
            ShingleFilter shingles = new ShingleFilter(components.TokenStream, minShingleSize, maxShingleSize)
            {
                // The sizes are passed to the constructor and then assigned again,
                // exactly as the original code does.
                MinShingleSize = minShingleSize,
                MaxShingleSize = maxShingleSize,
                TokenSeparator = tokenSeparator,
                OutputUnigrams = outputUnigrams,
                OutputUnigramsIfNoShingles = outputUnigramsIfNoShingles,
                FillerToken = fillerToken
            };

            return new TokenStreamComponents(components.Tokenizer, shingles);
        }
Exemplo n.º 5
0
 /// <summary>
 /// Creates a sequence bound to the enclosing <see cref="ShingleFilter"/>. The
 /// minimum value is 1 when the filter outputs unigrams, otherwise the filter's
 /// minimum shingle size; the sequence is then reset.
 /// </summary>
 /// <param name="outerInstance">The enclosing filter instance.</param>
 public CircularSequence(ShingleFilter outerInstance)
 {
     this.outerInstance = outerInstance;

     if (outerInstance.outputUnigrams)
     {
         minValue = 1;
     }
     else
     {
         minValue = outerInstance.minShingleSize;
     }

     reset();
 }
Exemplo n.º 6
0
        /// <summary>
        /// Extracts the term text and start/end offsets from each expected token and
        /// asserts that <paramref name="filter"/> produces them, together with the given
        /// types and position increments.
        /// </summary>
        /// <remarks>
        /// Converted from Java; the original method declared
        /// <c>throws java.io.IOException</c>, which has no .NET equivalent.
        /// </remarks>
        /// <param name="filter">The filter under test.</param>
        /// <param name="tokensToCompare">The tokens the filter is expected to emit.</param>
        /// <param name="positionIncrements">Position increments passed on to the assertion.</param>
        /// <param name="types">Token types passed on to the assertion.</param>
        protected internal virtual void shingleFilterTestCommon(ShingleFilter filter, Token[] tokensToCompare, int[] positionIncrements, string[] types)
        {
            int expectedCount = tokensToCompare.Length;
            string[] expectedTerms = new string[expectedCount];
            int[] expectedStarts = new int[expectedCount];
            int[] expectedEnds = new int[expectedCount];

            int slot = 0;
            foreach (Token expected in tokensToCompare)
            {
                // Only the first length() chars of the buffer belong to the term.
                expectedTerms[slot] = new string(expected.buffer(), 0, expected.length());
                expectedStarts[slot] = expected.startOffset();
                expectedEnds[slot] = expected.endOffset();
                slot++;
            }

            assertTokenStreamContents(filter, expectedTerms, expectedStarts, expectedEnds, types, positionIncrements);
        }
Exemplo n.º 7
0
 /// <summary>
 /// Builds a <see cref="ShingleFilter"/> over a canned stream of
 /// <paramref name="tokensToShingle"/>, applies the separator and unigram settings,
 /// and delegates the comparison to <c>shingleFilterTestCommon</c>.
 /// </summary>
 /// <remarks>
 /// Converted from Java; the original method declared
 /// <c>throws java.io.IOException</c>, which has no .NET equivalent.
 /// </remarks>
 /// <param name="tokenSeparator">Separator assigned to the filter.</param>
 /// <param name="minSize">Minimum shingle size passed to the filter constructor.</param>
 /// <param name="maxSize">Maximum shingle size passed to the filter constructor.</param>
 /// <param name="tokensToShingle">Input tokens fed to the filter.</param>
 /// <param name="tokensToCompare">Tokens the filter is expected to emit.</param>
 /// <param name="positionIncrements">Expected position increments.</param>
 /// <param name="types">Expected token types.</param>
 /// <param name="outputUnigrams">Value assigned to the filter's <c>OutputUnigrams</c>.</param>
 protected internal virtual void shingleFilterTest(string tokenSeparator, int minSize, int maxSize, Token[] tokensToShingle, Token[] tokensToCompare, int[] positionIncrements, string[] types, bool outputUnigrams)
 {
     ShingleFilter shingles = new ShingleFilter(new CannedTokenStream(tokensToShingle), minSize, maxSize)
     {
         TokenSeparator = tokenSeparator,
         OutputUnigrams = outputUnigrams
     };

     shingleFilterTestCommon(shingles, tokensToCompare, positionIncrements, types);
 }
Exemplo n.º 8
0
        /// <summary>
        /// Runs a 2..3 shingle filter over a two-token stream with two trailing holes
        /// under four configurations: filler "--", empty filler, null filler, and
        /// null filler together with a null token separator.
        /// </summary>
        /// <remarks>
        /// Converted from Java; the original method declared
        /// <c>throws java.io.IOException</c>, which has no .NET equivalent.
        /// </remarks>
        public virtual void testTwoTrailingHolesTriShingleWithTokenFiller()
        {
            // Analyzing "purple wizard of the", where "of" and "the" are removed as
            // stopwords, leaving two trailing holes.
            Token[] inputTokens = { createToken("purple", 0, 6), createToken("wizard", 7, 13) };

            // The numeric expectations are identical in all four scenarios; only the
            // expected terms differ.
            // NOTE(review): names reflect the presumed argument roles of
            // assertTokenStreamContents (start offsets, end offsets, position
            // increments) - confirm against its declaration.
            int[] startOffsets = { 0, 0, 0, 7, 7, 7 };
            int[] endOffsets = { 6, 13, 20, 13, 20, 20 };
            int[] posIncrements = { 1, 0, 0, 1, 0, 0 };

            // Scenario 1: an explicit filler token.
            ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3)
            {
                FillerToken = "--"
            };
            string[] withDashFiller = { "purple", "purple wizard", "purple wizard --", "wizard", "wizard --", "wizard -- --" };
            assertTokenStreamContents(filter, withDashFiller, startOffsets, endOffsets, posIncrements, 20);

            // Scenario 2: an empty filler token.
            filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3)
            {
                FillerToken = ""
            };
            string[] withEmptyFiller = { "purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard  " };
            assertTokenStreamContents(filter, withEmptyFiller, startOffsets, endOffsets, posIncrements, 20);

            // Scenario 3: a null filler token; the expected terms match the empty-filler case.
            filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3)
            {
                FillerToken = null
            };
            string[] withNullFiller = { "purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard  " };
            assertTokenStreamContents(filter, withNullFiller, startOffsets, endOffsets, posIncrements, 20);

            // Scenario 4: null filler and null separator; terms are concatenated directly.
            filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3)
            {
                FillerToken = null,
                TokenSeparator = null
            };
            string[] withNothingBetween = { "purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard" };
            assertTokenStreamContents(filter, withNothingBetween, startOffsets, endOffsets, posIncrements, 20);
        }
Exemplo n.º 9
0
        /// <summary>
        /// Runs a bigram-only (2..2) shingle filter over a two-token stream with two
        /// trailing holes and checks the emitted terms and numeric expectations.
        /// </summary>
        /// <remarks>
        /// Converted from Java; the original method declared
        /// <c>throws java.io.IOException</c>, which has no .NET equivalent.
        /// </remarks>
        public virtual void testTwoTrailingHoles()
        {
            // Analyzing "purple wizard of the", where "of" and "the" are removed as
            // stopwords, leaving two trailing holes.
            Token[] inputTokens = { createToken("purple", 0, 6), createToken("wizard", 7, 13) };
            var holedStream = new CannedTokenStream(2, 20, inputTokens);
            ShingleFilter filter = new ShingleFilter(holedStream, 2, 2);

            // NOTE(review): names reflect the presumed argument roles of
            // assertTokenStreamContents - confirm against its declaration.
            string[] expectedTerms = { "purple", "purple wizard", "wizard", "wizard _" };
            int[] startOffsets = { 0, 0, 7, 7 };
            int[] endOffsets = { 6, 13, 13, 20 };
            int[] posIncrements = { 1, 0, 1, 0 };

            assertTokenStreamContents(filter, expectedTerms, startOffsets, endOffsets, posIncrements, 20);
        }
Exemplo n.º 10
0
        /// <summary>
        /// Runs a bigram-only (2..2) shingle filter over a two-token stream with a
        /// single trailing hole and checks the emitted terms and numeric expectations.
        /// </summary>
        /// <remarks>
        /// Converted from Java; the original method declared
        /// <c>throws java.io.IOException</c>, which has no .NET equivalent.
        /// </remarks>
        public virtual void testTrailingHole2()
        {
            // Analyzing "purple wizard of", where "of" is removed as a stopword,
            // leaving one trailing hole.
            Token[] inputTokens = { createToken("purple", 0, 6), createToken("wizard", 7, 13) };
            var holedStream = new CannedTokenStream(1, 16, inputTokens);
            ShingleFilter filter = new ShingleFilter(holedStream, 2, 2);

            // NOTE(review): names reflect the presumed argument roles of
            // assertTokenStreamContents - confirm against its declaration.
            string[] expectedTerms = { "purple", "purple wizard", "wizard", "wizard _" };
            int[] startOffsets = { 0, 0, 7, 7 };
            int[] endOffsets = { 6, 13, 13, 16 };
            int[] posIncrements = { 1, 0, 1, 0 };

            assertTokenStreamContents(filter, expectedTerms, startOffsets, endOffsets, posIncrements, 16);
        }
Exemplo n.º 11
0
 /// <summary>
 /// Checks that a <see cref="ShingleFilter"/> yields the same output a second time
 /// after its underlying tokenizer is given a fresh reader over the same text.
 /// </summary>
 /// <remarks>
 /// Converted from Java; the original method declared <c>throws Exception</c>,
 /// which has no .NET equivalent.
 /// </remarks>
 public virtual void testReset()
 {
     const string sentence = "please divide this sentence";

     // The expectations are the same for both passes over the sentence.
     string[] expectedTerms = { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" };
     int[] expectedStarts = { 0, 0, 7, 7, 14, 14, 19 };
     int[] expectedEnds = { 6, 13, 13, 18, 18, 27, 27 };
     string[] expectedTypes =
     {
         TypeAttribute.DEFAULT_TYPE, "shingle",
         TypeAttribute.DEFAULT_TYPE, "shingle",
         TypeAttribute.DEFAULT_TYPE, "shingle",
         TypeAttribute.DEFAULT_TYPE
     };
     int[] expectedIncrements = { 1, 0, 1, 0, 1, 0, 1 };

     Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(sentence));
     TokenStream filter = new ShingleFilter(wsTokenizer, 2);

     // First pass.
     assertTokenStreamContents(filter, expectedTerms, expectedStarts, expectedEnds, expectedTypes, expectedIncrements);

     // Second pass: swap in a new reader and expect identical output.
     wsTokenizer.Reader = new StringReader(sentence);
     assertTokenStreamContents(filter, expectedTerms, expectedStarts, expectedEnds, expectedTypes, expectedIncrements);
 }
Exemplo n.º 12
0
		/// <summary>
		/// Creates a window token tied to the enclosing <see cref="ShingleFilter"/> and to
		/// the given attribute source, caching the term and offset attributes that the
		/// source hands back.
		/// </summary>
		/// <param name="outerInstance">The enclosing filter instance.</param>
		/// <param name="attSource">The attribute source the term and offset attributes are read from.</param>
		public InputWindowToken(ShingleFilter outerInstance, AttributeSource attSource)
		{
			// "this." is only needed where a parameter shadows the field.
			this.outerInstance = outerInstance;
			this.attSource = attSource;

			// Look the attributes up once so later accesses go through the cached fields.
			termAtt = attSource.getAttribute(typeof(CharTermAttribute));
			offsetAtt = attSource.getAttribute(typeof(OffsetAttribute));
		}
Exemplo n.º 13
0
		/// <summary>
		/// Creates a sequence bound to the enclosing <see cref="ShingleFilter"/>. The
		/// minimum value is 1 when the filter outputs unigrams, otherwise the filter's
		/// minimum shingle size; the sequence is then reset.
		/// </summary>
		/// <param name="outerInstance">The enclosing filter instance.</param>
		public CircularSequence(ShingleFilter outerInstance)
		{
			this.outerInstance = outerInstance;

			if (outerInstance.outputUnigrams)
			{
				minValue = 1;
			}
			else
			{
				minValue = outerInstance.minShingleSize;
			}

			reset();
		}
Exemplo n.º 14
0
        /// <summary>
        /// Wraps <paramref name="input"/> in a <see cref="ShingleFilter"/> whose size
        /// argument is <c>_maxShingleSize</c>, falling back to
        /// <c>ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE</c> when that value is null.
        /// </summary>
        /// <param name="input">The token stream to wrap.</param>
        /// <returns>The new shingle filter.</returns>
        public override TokenStream create(TokenStream input)
        {
            // Resolve the optional setting before constructing the filter.
            var effectiveMaxSize = _maxShingleSize ?? ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE;

            return new ShingleFilter(input, effectiveMaxSize);
        }