Example #1
        public virtual void testStopListPositions()
        {
            CharArraySet stopWordsSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("good", "test", "analyzer"), false);
            StopAnalyzer newStop      = new StopAnalyzer(TEST_VERSION_CURRENT, stopWordsSet);
            string       s            = "This is a good test of the english stop analyzer with positions";

            int[]       expectedIncr = new int[] { 1, 1, 1, 3, 1, 1, 1, 2, 1 };
            TokenStream stream       = newStop.tokenStream("test", s);

            try
            {
                assertNotNull(stream);
                int i = 0;
                CharTermAttribute          termAtt    = stream.getAttribute(typeof(CharTermAttribute));
                PositionIncrementAttribute posIncrAtt = stream.addAttribute(typeof(PositionIncrementAttribute));

                stream.reset();
                while (stream.incrementToken())
                {
                    string text = termAtt.ToString();
                    assertFalse(stopWordsSet.contains(text));
                    assertEquals(expectedIncr[i++], posIncrAtt.PositionIncrement);
                }
                stream.end();
            }
            finally
            {
                IOUtils.closeWhileHandlingException(stream);
            }
        }
Example #2
 public SingleCharTokenizer(TokenStream input) : base(input)
 {
     _input                      = input;
     _termAttribute              = (TermAttribute)AddAttribute(typeof(TermAttribute));
     _offsetAttribute            = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
     _positionIncrementAttribute = (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute));
 }
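A constructor like this caches attribute references once so IncrementToken can reuse them per token. For context, a hedged consumer-loop sketch using only calls that appear elsewhere on this page ('input' is a stand-in for any upstream TokenStream):

 // Sketch: drain the tokenizer and read back the cached attribute values.
 TokenStream tokenizer = new SingleCharTokenizer(input);
 var term    = (TermAttribute)tokenizer.GetAttribute(typeof(TermAttribute));
 var posIncr = (PositionIncrementAttribute)tokenizer.GetAttribute(typeof(PositionIncrementAttribute));
 while (tokenizer.IncrementToken())
 {
     Console.WriteLine(term.Term() + " +" + posIncr.GetPositionIncrement());
 }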
Example #3
 /// <summary>
 /// Creates NGramTokenFilter with given min and max n-grams. </summary>
 /// <param name="version"> Lucene version to enable correct position increments.
 ///                See <a href="#version">above</a> for details. </param>
 /// <param name="input"> <seealso cref="TokenStream"/> holding the input to be tokenized </param>
 /// <param name="minGram"> the smallest n-gram to generate </param>
 /// <param name="maxGram"> the largest n-gram to generate </param>
 public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) : base(new CodepointCountFilter(version, input, minGram, int.MaxValue))
 {
     this.version   = version;
     this.charUtils = version.onOrAfter(Version.LUCENE_44) ? CharacterUtils.getInstance(version) : CharacterUtils.Java4Instance;
     if (minGram < 1)
     {
         throw new System.ArgumentException("minGram must be greater than zero");
     }
     if (minGram > maxGram)
     {
         throw new System.ArgumentException("minGram must not be greater than maxGram");
     }
     this.minGram = minGram;
     this.maxGram = maxGram;
     if (version.onOrAfter(Version.LUCENE_44))
     {
         posIncAtt = addAttribute(typeof(PositionIncrementAttribute));
         posLenAtt = addAttribute(typeof(PositionLengthAttribute));
     }
     else
     {
         posIncAtt = new PositionIncrementAttributeAnonymousInnerClassHelper(this);
         posLenAtt = new PositionLengthAttributeAnonymousInnerClassHelper(this);
     }
 }
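A hedged construction sketch for this filter; WhitespaceTokenizer and the version constant are stand-ins, and any upstream Tokenizer works:

 // Sketch: emit the 2- and 3-grams of each incoming token
 // (emission order differs before/after Lucene 4.4).
 TokenStream source = new WhitespaceTokenizer(Version.LUCENE_44, new StringReader("lucene"));
 TokenStream ngrams = new NGramTokenFilter(Version.LUCENE_44, source, 2, 3);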
Example #4
        // we only check a few core attributes here.
        // TODO: test other things
        public virtual void assertEquals(string s, TokenStream left, TokenStream right)
        {
            left.reset();
            right.reset();
            CharTermAttribute          leftTerm    = left.addAttribute(typeof(CharTermAttribute));
            CharTermAttribute          rightTerm   = right.addAttribute(typeof(CharTermAttribute));
            OffsetAttribute            leftOffset  = left.addAttribute(typeof(OffsetAttribute));
            OffsetAttribute            rightOffset = right.addAttribute(typeof(OffsetAttribute));
            PositionIncrementAttribute leftPos     = left.addAttribute(typeof(PositionIncrementAttribute));
            PositionIncrementAttribute rightPos    = right.addAttribute(typeof(PositionIncrementAttribute));

            while (left.incrementToken())
            {
                assertTrue("wrong number of tokens for input: " + s, right.incrementToken());
                assertEquals("wrong term text for input: " + s, leftTerm.ToString(), rightTerm.ToString());
                assertEquals("wrong position for input: " + s, leftPos.PositionIncrement, rightPos.PositionIncrement);
                assertEquals("wrong start offset for input: " + s, leftOffset.startOffset(), rightOffset.startOffset());
                assertEquals("wrong end offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
            }
            assertFalse("wrong number of tokens for input: " + s, right.incrementToken());
            left.end();
            right.end();
            assertEquals("wrong final offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
            left.close();
            right.close();
        }
Example #5
        /// <summary>
        ///   Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using
        ///   affix rules in the provided HunspellDictionary.
        /// </summary>
        /// <param name="input">TokenStream whose tokens will be stemmed.</param>
        /// <param name="dictionary">HunspellDictionary containing the affix rules and words that will be used to stem the tokens.</param>
        /// <param name="dedup">true if only unique terms should be output.</param>
        public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, bool dedup = true)
            : base(input)
        {
            _posIncAtt = (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute));
            _termAtt   = (TermAttribute)AddAttribute(typeof(TermAttribute));

            _dedup   = dedup;
            _stemmer = new HunspellStemmer(dictionary);
        }
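A minimal usage sketch under stated assumptions: affixStream and dictionaryStream are opened .aff/.dic streams, and tokenizer is any upstream TokenStream:

        // Hedged sketch: stem tokens against a Hunspell dictionary.
        var dictionary = new HunspellDictionary(affixStream, dictionaryStream);
        TokenStream stemmed = new HunspellStemFilter(tokenizer, dictionary, dedup: true);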
Example #6
        public SynonymFilter(TokenStream input, SynonymEngine engine) : base(input)
        {
            if (engine == null)
            {
                throw new ArgumentNullException("engine");
            }
            synonymStack = new Stack<string>();
            this.engine  = engine;

            this.termAtt    = (TermAttribute)AddAttribute<ITermAttribute>();
            this.posIncrAtt = (PositionIncrementAttribute)AddAttribute<IPositionIncrementAttribute>();
        }
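The stack cached here is typically drained before the input advances, so each synonym is emitted at the same position as the token that triggered it. A hedged sketch of the matching IncrementToken (not this class's actual implementation; 'current' is a hypothetical captured-state field):

        // Sketch of the stack-draining pattern from "Lucene in Action".
        public override bool IncrementToken()
        {
            if (synonymStack.Count > 0)
            {
                string syn = synonymStack.Pop();
                RestoreState(current);              // reposition over the original token
                termAtt.SetTermBuffer(syn);
                posIncrAtt.PositionIncrement = 0;   // same position as the original
                return true;
            }
            if (!input.IncrementToken())
            {
                return false;
            }
            // push engine.GetSynonyms(termAtt.Term) onto synonymStack here,
            // then capture state: current = CaptureState();
            return true;
        }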
Example #7
            /// <summary>
            /// Sugar: analyzes the text with the analyzer and joins the resulting
            /// terms with <seealso cref="SynonymMap#WORD_SEPARATOR"/>.
            /// <paramref name="reuse"/> and its chars must not be null.
            /// </summary>
            public virtual CharsRef analyze(string text, CharsRef reuse)
            {
                IOException priorException = null;
                TokenStream ts             = analyzer.tokenStream("", text);

                try
                {
                    CharTermAttribute          termAtt   = ts.addAttribute(typeof(CharTermAttribute));
                    PositionIncrementAttribute posIncAtt = ts.addAttribute(typeof(PositionIncrementAttribute));
                    ts.reset();
                    reuse.length = 0;
                    while (ts.incrementToken())
                    {
                        int length = termAtt.length();
                        if (length == 0)
                        {
                            throw new System.ArgumentException("term: " + text + " analyzed to a zero-length token");
                        }
                        if (posIncAtt.PositionIncrement != 1)
                        {
                            throw new System.ArgumentException("term: " + text + " analyzed to a token with posinc != 1");
                        }
                        reuse.grow(reuse.length + length + 1);   // current + word + separator
                        int end = reuse.offset + reuse.length;
                        if (reuse.length > 0)
                        {
                            reuse.chars[end++] = SynonymMap.WORD_SEPARATOR;
                            reuse.length++;
                        }
                        Array.Copy(termAtt.buffer(), 0, reuse.chars, end, length);
                        reuse.length += length;
                    }
                    ts.end();
                }
                catch (IOException e)
                {
                    priorException = e;
                }
                finally
                {
                    IOUtils.closeWhileHandlingException(priorException, ts);
                }
                if (reuse.length == 0)
                {
                    throw new System.ArgumentException("term: " + text + " was completely eliminated by analyzer");
                }
                return reuse;
            }
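The net effect: the returned CharsRef holds the analyzed terms joined by single separator chars (SynonymMap.WORD_SEPARATOR, which is the NUL character in Lucene). A hedged illustration:

                // Sketch: with a whitespace analyzer, "wi fi" comes back as
                //   ['w', 'i', SynonymMap.WORD_SEPARATOR, 'f', 'i']
                // i.e. terms joined by one separator and no trailing separator.
                CharsRef joined = analyze("wi fi", new CharsRef());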
Example #8
        private void testPositions(TypeTokenFilter stpf)
        {
            TypeAttribute              typeAtt       = stpf.getAttribute(typeof(TypeAttribute));
            CharTermAttribute          termAttribute = stpf.getAttribute(typeof(CharTermAttribute));
            PositionIncrementAttribute posIncrAtt    = stpf.getAttribute(typeof(PositionIncrementAttribute));

            stpf.reset();
            bool enablePositionIncrements = stpf.EnablePositionIncrements;

            while (stpf.incrementToken())
            {
                log("Token: " + termAttribute.ToString() + ": " + typeAtt.type() + " - " + posIncrAtt.PositionIncrement);
                assertEquals("if position increment is enabled the positionIncrementAttribute value should be 3, otherwise 1", posIncrAtt.PositionIncrement, enablePositionIncrements ? 3 : 1);
            }
            stpf.end();
            stpf.close();
        }
Example #9
        private void doTestStopPositions(StopFilter stpf, bool enableIncrements)
        {
            log("---> test with enable-increments-" + (enableIcrements?"enabled":"disabled"));
            stpf.EnablePositionIncrements = enableIcrements;
            CharTermAttribute          termAtt    = stpf.getAttribute(typeof(CharTermAttribute));
            PositionIncrementAttribute posIncrAtt = stpf.getAttribute(typeof(PositionIncrementAttribute));

            stpf.reset();
            for (int i = 0; i < 20; i += 3)
            {
                assertTrue(stpf.incrementToken());
                log("Token " + i + ": " + stpf);
                string w = English.intToEnglish(i).trim();
                assertEquals("expecting token " + i + " to be " + w, w, termAtt.ToString());
                assertEquals("all but first token must have position increment of 3", enableIcrements?(i == 0?1:3):1, posIncrAtt.PositionIncrement);
            }
            assertFalse(stpf.incrementToken());
            stpf.end();
            stpf.close();
        }
Example #10
        public virtual void testFilterTokens()
        {
            SnowballFilter             filter     = new SnowballFilter(new TestTokenStream(this), "English");
            CharTermAttribute          termAtt    = filter.getAttribute(typeof(CharTermAttribute));
            OffsetAttribute            offsetAtt  = filter.getAttribute(typeof(OffsetAttribute));
            TypeAttribute              typeAtt    = filter.getAttribute(typeof(TypeAttribute));
            PayloadAttribute           payloadAtt = filter.getAttribute(typeof(PayloadAttribute));
            PositionIncrementAttribute posIncAtt  = filter.getAttribute(typeof(PositionIncrementAttribute));
            FlagsAttribute             flagsAtt   = filter.getAttribute(typeof(FlagsAttribute));

            filter.incrementToken();

            assertEquals("accent", termAtt.ToString());
            assertEquals(2, offsetAtt.startOffset());
            assertEquals(7, offsetAtt.endOffset());
            assertEquals("wrd", typeAtt.type());
            assertEquals(3, posIncAtt.PositionIncrement);
            assertEquals(77, flagsAtt.Flags);
            assertEquals(new BytesRef(new sbyte[] { 0, 1, 2, 3 }), payloadAtt.Payload);
        }
Example #11
        public static void AssertTokenStreamContents(TokenStream ts, System.String[] output, int[] startOffsets, int[] endOffsets, System.String[] types, int[] posIncrements, int? finalOffset)
        {
            Assert.IsNotNull(output);
            CheckClearAttributesAttribute checkClearAtt = (CheckClearAttributesAttribute)ts.AddAttribute(typeof(CheckClearAttributesAttribute));

            Assert.IsTrue(ts.HasAttribute(typeof(TermAttribute)), "has no TermAttribute");
            TermAttribute termAtt = (TermAttribute)ts.GetAttribute(typeof(TermAttribute));

            OffsetAttribute offsetAtt = null;

            if (startOffsets != null || endOffsets != null || finalOffset != null)
            {
                Assert.IsTrue(ts.HasAttribute(typeof(OffsetAttribute)), "has no OffsetAttribute");
                offsetAtt = (OffsetAttribute)ts.GetAttribute(typeof(OffsetAttribute));
            }

            TypeAttribute typeAtt = null;

            if (types != null)
            {
                Assert.IsTrue(ts.HasAttribute(typeof(TypeAttribute)), "has no TypeAttribute");
                typeAtt = (TypeAttribute)ts.GetAttribute(typeof(TypeAttribute));
            }

            PositionIncrementAttribute posIncrAtt = null;

            if (posIncrements != null)
            {
                Assert.IsTrue(ts.HasAttribute(typeof(PositionIncrementAttribute)), "has no PositionIncrementAttribute");
                posIncrAtt = (PositionIncrementAttribute)ts.GetAttribute(typeof(PositionIncrementAttribute));
            }

            ts.Reset();
            for (int i = 0; i < output.Length; i++)
            {
                // extra safety to enforce, that the state is not preserved and also assign bogus values
                ts.ClearAttributes();
                termAtt.SetTermBuffer("bogusTerm");
                if (offsetAtt != null)
                {
                    offsetAtt.SetOffset(14584724, 24683243);
                }
                if (typeAtt != null)
                {
                    typeAtt.SetType("bogusType");
                }
                if (posIncrAtt != null)
                {
                    posIncrAtt.SetPositionIncrement(45987657);
                }

                checkClearAtt.GetAndResetClearCalled(); // reset it, because we called clearAttribute() before
                Assert.IsTrue(ts.IncrementToken(), "token " + i + " does not exist");
                Assert.IsTrue(checkClearAtt.GetAndResetClearCalled(), "clearAttributes() was not called correctly in TokenStream chain");

                Assert.AreEqual(output[i], termAtt.Term(), "term " + i);
                if (startOffsets != null)
                {
                    Assert.AreEqual(startOffsets[i], offsetAtt.StartOffset(), "startOffset " + i);
                }
                if (endOffsets != null)
                {
                    Assert.AreEqual(endOffsets[i], offsetAtt.EndOffset(), "endOffset " + i);
                }
                if (types != null)
                {
                    Assert.AreEqual(types[i], typeAtt.Type(), "type " + i);
                }
                if (posIncrements != null)
                {
                    Assert.AreEqual(posIncrements[i], posIncrAtt.GetPositionIncrement(), "posIncrement " + i);
                }
            }
            Assert.IsFalse(ts.IncrementToken(), "end of stream");
            ts.End();
            if (finalOffset.HasValue)
            {
                Assert.AreEqual(finalOffset, offsetAtt.EndOffset(), "finalOffset ");
            }
            ts.Close();
        }
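A hedged call sketch matching the parameter order above (token values are made up; pass null for any check you want to skip):

        // Sketch: assert a two-token stream, including the final-offset check.
        AssertTokenStreamContents(ts,
            new System.String[] { "quick", "fox" },   // output
            new int[] { 0, 10 },                      // startOffsets
            new int[] { 5, 13 },                      // endOffsets
            new System.String[] { "word", "word" },   // types
            new int[] { 1, 2 },                       // posIncrements
            13);                                      // finalOffset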
Example #12
        public virtual void testRandom()
        {
            int alphabetSize = TestUtil.Next(random(), 2, 7);

            int docLen = atLeast(3000);
            //final int docLen = 50;

            string document = getRandomString('a', alphabetSize, docLen);

            if (VERBOSE)
            {
                Console.WriteLine("TEST: doc=" + document);
            }

            int numSyn = atLeast(5);
            //final int numSyn = 2;

            IDictionary<string, OneSyn> synMap = new Dictionary<string, OneSyn>();
            IList<OneSyn> syns = new List<OneSyn>();
            bool dedup = random().nextBoolean();
            if (VERBOSE)
            {
                Console.WriteLine("  dedup=" + dedup);
            }
            b = new SynonymMap.Builder(dedup);
            for (int synIDX = 0; synIDX < numSyn; synIDX++)
            {
                string synIn = getRandomString('a', alphabetSize, TestUtil.Next(random(), 1, 5)).Trim();
                OneSyn s;
                if (!synMap.TryGetValue(synIn, out s))
                {
                    s = new OneSyn();
                    s.@in = synIn;
                    syns.Add(s);
                    s.@out = new List<string>();
                    synMap[synIn] = s;
                    s.keepOrig = random().nextBoolean();
                }
                string synOut = getRandomString('0', 10, TestUtil.Next(random(), 1, 5)).Trim();
                s.@out.Add(synOut);
                add(synIn, synOut, s.keepOrig);
                if (VERBOSE)
                {
                    Console.WriteLine("  syns[" + synIDX + "] = " + s.@in + " -> " + s.@out + " keepOrig=" + s.keepOrig);
                }
            }

            tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
            tokensIn.reset();
            assertTrue(tokensIn.incrementToken());
            assertFalse(tokensIn.incrementToken());
            tokensIn.end();
            tokensIn.close();

            tokensOut = new SynonymFilter(tokensIn, b.build(), true);
            termAtt = tokensOut.addAttribute(typeof(CharTermAttribute));
            posIncrAtt = tokensOut.addAttribute(typeof(PositionIncrementAttribute));
            posLenAtt = tokensOut.addAttribute(typeof(PositionLengthAttribute));
            offsetAtt = tokensOut.addAttribute(typeof(OffsetAttribute));

            if (dedup)
            {
              pruneDups(syns);
            }

            string expected = slowSynMatcher(document, syns, 5);

            if (VERBOSE)
            {
              Console.WriteLine("TEST: expected=" + expected);
            }

            verify(document, expected);
        }
Example #13
        /*
         * Need to worry about multiple scenarios:
         *  - need to go for the longest match
         *    a b => foo      #shouldn't match if "a b" is followed by "c d"
         *    a b c d => bar
         *  - need to backtrack - retry matches for tokens already read
         *     a b c d => foo
         *       b c => bar
         *     If the input stream is "a b c x", one will consume "a b c d"
         *     trying to match the first rule... all but "a" should be
         *     pushed back so a match may be made on "b c".
         *  - don't try and match generated tokens (thus need separate queue)
         *    matching is not recursive.
         *  - handle optional generation of original tokens in all these cases,
         *    merging token streams to preserve token positions.
         *  - preserve original positionIncrement of first matched token
         */
        public override bool IncrementToken()
        {
            while (true)
            {
                // if there are any generated tokens, return them... don't try any
                // matches against them, as we specifically don't want recursion.
                if (replacement != null && replacement.MoveNext())
                {
                    copy(this, replacement.Current);
                    return true;
                }

                // common case fast-path of first token not matching anything
                AttributeSource firstTok = nextTok();
                if (firstTok == null)
                {
                    return false;
                }
                var            termAtt = firstTok.AddAttribute<ICharTermAttribute>();
                SlowSynonymMap result  = map.submap != null ? map.submap.Get(termAtt.Buffer(), 0, termAtt.Length) : null;

                if (result == null)
                {
                    copy(this, firstTok);
                    return true;
                }

                // fast-path failed, clone ourselves if needed
                if (firstTok == this)
                {
                    firstTok = CloneAttributes();
                }
                // OK, we matched a token, so find the longest match.

                matched = new LinkedList<AttributeSource>();

                result = match(result);

                if (result == null)
                {
                    // no match, simply return the first token read.
                    copy(this, firstTok);
                    return true;
                }

                // reuse, or create new one each time?
                List<AttributeSource> generated = new List<AttributeSource>(result.synonyms.Length + matched.Count + 1);

                //
                // there was a match... let's generate the new tokens, merging
                // in the matched tokens (position increments need adjusting)
                //
                AttributeSource lastTok     = matched.Count == 0 ? firstTok : matched.Last.Value;
                bool            includeOrig = result.IncludeOrig;

                AttributeSource            origTok        = includeOrig ? firstTok : null;
                PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(typeof(PositionIncrementAttribute));
                int origPos = firstPosIncAtt.PositionIncrement; // position of origTok in the original stream
                int repPos  = 0;                                // curr position in replacement token stream
                int pos     = 0;                                // current position in merged token stream

                for (int i = 0; i < result.synonyms.Length; i++)
                {
                    Token                      repTok       = result.synonyms[i];
                    AttributeSource            newTok       = firstTok.CloneAttributes();
                    CharTermAttribute          newTermAtt   = newTok.addAttribute(typeof(CharTermAttribute));
                    OffsetAttribute            newOffsetAtt = newTok.addAttribute(typeof(OffsetAttribute));
                    PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(typeof(PositionIncrementAttribute));

                    OffsetAttribute lastOffsetAtt = lastTok.addAttribute(typeof(OffsetAttribute));

                    newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
                    newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length());
                    repPos += repTok.PositionIncrement;
                    if (i == 0)     // make position of first token equal to original
                    {
                        repPos = origPos;
                    }

                    // if necessary, insert original tokens and adjust position increment
                    while (origTok != null && origPos <= repPos)
                    {
                        PositionIncrementAttribute origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                        origPosInc.PositionIncrement = origPos - pos;
                        generated.Add(origTok);
                        pos    += origPosInc.PositionIncrement;
                        origTok = matched.Count == 0 ? null : matched.RemoveFirst();
                        if (origTok != null)
                        {
                            origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                            origPos   += origPosInc.PositionIncrement;
                        }
                    }

                    newPosIncAtt.PositionIncrement = repPos - pos;
                    generated.Add(newTok);
                    pos += newPosIncAtt.PositionIncrement;
                }

                // finish up any leftover original tokens
                while (origTok != null)
                {
                    PositionIncrementAttribute origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                    origPosInc.PositionIncrement = origPos - pos;
                    generated.Add(origTok);
                    pos    += origPosInc.PositionIncrement;
                    origTok = matched.Count == 0 ? null : matched.RemoveFirst();
                    if (origTok != null)
                    {
                        origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                        origPos   += origPosInc.PositionIncrement;
                    }
                }

                // what if we replaced a longer sequence with a shorter one?
                // a/0 b/5 =>  foo/0
                // should I re-create the gap on the next buffered token?

                replacement = generated.GetEnumerator();
                // Now return to the top of the loop to read and return the first
                // generated token.. The reason this is done is that we may have generated
                // nothing at all, and may need to continue with more matching logic.
            }
        }
Example #14
        /// <summary>
        /// Retrieve suggestions.
        /// </summary>
        public virtual IList<LookupResult> Lookup(string key, HashSet<BytesRef> contexts, int num)
        {
            if (contexts != null)
            {
                throw new System.ArgumentException("this suggester doesn't support contexts");
            }

            TokenStream ts = queryAnalyzer.TokenStream("", key.ToString());

            try
            {
                TermToBytesRefAttribute    termBytesAtt = ts.AddAttribute<TermToBytesRefAttribute>();
                OffsetAttribute            offsetAtt    = ts.AddAttribute<OffsetAttribute>();
                PositionLengthAttribute    posLenAtt    = ts.AddAttribute<PositionLengthAttribute>();
                PositionIncrementAttribute posIncAtt    = ts.AddAttribute<PositionIncrementAttribute>();
                ts.Reset();

                var lastTokens = new BytesRef[grams];
                //System.out.println("lookup: key='" + key + "'");

                // Run full analysis, but save only the
                // last 1gram, last 2gram, etc.:
                BytesRef tokenBytes   = termBytesAtt.BytesRef;
                int      maxEndOffset = -1;
                bool     sawRealToken = false;
                while (ts.IncrementToken())
                {
                    termBytesAtt.FillBytesRef();
                    sawRealToken |= tokenBytes.Length > 0;
                    // TODO: this is somewhat iffy; today, ShingleFilter
                    // sets posLen to the gram count; maybe we should make
                    // a separate dedicated att for this?
                    int gramCount = posLenAtt.PositionLength;

                    Debug.Assert(gramCount <= grams);

                    // Safety: make sure the recalculated count "agrees":
                    if (CountGrams(tokenBytes) != gramCount)
                    {
                        throw new System.ArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + countGrams(tokenBytes));
                    }
                    maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset());
                    lastTokens[gramCount - 1] = BytesRef.DeepCopyOf(tokenBytes);
                }
                ts.End();

                if (!sawRealToken)
                {
                    throw new System.ArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
                }

                // Carefully fill last tokens with _ tokens;
                // ShingleFilter apparently won't emit "only hole"
                // tokens:
                int endPosInc = posIncAtt.PositionIncrement;

                // Note this will also be true if input is the empty
                // string (in which case we saw no tokens and
                // maxEndOffset is still -1), which in fact works out OK
                // because we fill the unigram with an empty BytesRef
                // below:
                bool lastTokenEnded = offsetAtt.EndOffset() > maxEndOffset || endPosInc > 0;
                //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.endOffset());

                if (lastTokenEnded)
                {
                    //System.out.println("  lastTokenEnded");
                    // If user hit space after the last token, then
                    // "upgrade" all tokens.  This way "foo " will suggest
                    // all bigrams starting w/ foo, and not any unigrams
                    // starting with "foo":
                    for (int i = grams - 1; i > 0; i--)
                    {
                        BytesRef token = lastTokens[i - 1];
                        if (token == null)
                        {
                            continue;
                        }
                        token.Grow(token.Length + 1);
                        token.Bytes[token.Length] = separator;
                        token.Length++;
                        lastTokens[i] = token;
                    }
                    lastTokens[0] = new BytesRef();
                }

                var arc = new FST.Arc<long?>();

                var bytesReader = fst.BytesReader;

                // Try highest order models first, and if they return
                // results, return that; else, fallback:
                double backoff = 1.0;

                List<LookupResult> results = new List<LookupResult>(num);

                // We only add a given suffix once, from the highest
                // order model that saw it; for subsequent lower order
                // models we skip it:
                var seen = new HashSet<BytesRef>();

                for (int gram = grams - 1; gram >= 0; gram--)
                {
                    BytesRef token = lastTokens[gram];
                    // Don't make unigram predictions from empty string:
                    if (token == null || (token.Length == 0 && key.Length > 0))
                    {
                        // Input didn't have enough tokens:
                        //System.out.println("  gram=" + gram + ": skip: not enough input");
                        continue;
                    }

                    if (endPosInc > 0 && gram <= endPosInc)
                    {
                        // Skip hole-only predictions; in theory we
                        // shouldn't have to do this, but we'd need to fix
                        // ShingleFilter to produce only-hole tokens:
                        //System.out.println("  break: only holes now");
                        break;
                    }

                    //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());

                    // TODO: we could add fuzziness here
                    // match the prefix portion exactly
                    //Pair<Long,BytesRef> prefixOutput = null;
                    long? prefixOutput = LookupPrefix(fst, bytesReader, token, arc);
                    //System.out.println("  prefixOutput=" + prefixOutput);

                    if (prefixOutput == null)
                    {
                        // This model never saw this prefix, e.g. the
                        // trigram model never saw context "purple mushroom"
                        backoff *= ALPHA;
                        continue;
                    }

                    // TODO: we could do this division at build time, and
                    // bake it into the FST?

                    // Denominator for computing scores from current
                    // model's predictions:
                    long contextCount = totTokens;

                    BytesRef lastTokenFragment = null;

                    for (int i = token.Length - 1; i >= 0; i--)
                    {
                        if (token.Bytes[token.Offset + i] == separator)
                        {
                            BytesRef context = new BytesRef(token.Bytes, token.Offset, i);
                            long?    output  = Util.Get(fst, Lucene.Net.Util.Fst.Util.ToIntsRef(context, new IntsRef()));
                            Debug.Assert(output != null);
                            contextCount      = DecodeWeight(output);
                            lastTokenFragment = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                            break;
                        }
                    }

                    BytesRef finalLastToken;

                    if (lastTokenFragment == null)
                    {
                        finalLastToken = BytesRef.DeepCopyOf(token);
                    }
                    else
                    {
                        finalLastToken = BytesRef.DeepCopyOf(lastTokenFragment);
                    }
                    Debug.Assert(finalLastToken.Offset == 0);

                    CharsRef spare = new CharsRef();

                    // complete top-N
                    Util.Fst.Util.TopResults<long?> completions = null;
                    try
                    {
                        // Because we store multiple models in one FST
                        // (1gram, 2gram, 3gram), we must restrict the
                        // search so that it only considers the current
                        // model.  For highest order model, this is not
                        // necessary since all completions in the FST
                        // must be from this model, but for lower order
                        // models we have to filter out the higher order
                        // ones:

                        // Must do num+seen.size() for queue depth because we may
                        // reject up to seen.size() paths in acceptResult():
                        Util.Fst.Util.TopNSearcher<long?> searcher = new TopNSearcherAnonymousInnerClassHelper(this, fst, num, num + seen.Count, weightComparator, seen, finalLastToken);

                        // since this search is initialized with a single start node
                        // it is okay to start with an empty input path here
                        searcher.AddStartPaths(arc, prefixOutput, true, new IntsRef());

                        completions = searcher.Search();
                        Debug.Assert(completions.IsComplete);
                    }
                    catch (IOException bogus)
                    {
                        throw new Exception(bogus.Message, bogus);
                    }

                    int prefixLength = token.Length;

                    BytesRef suffix = new BytesRef(8);
                    //System.out.println("    " + completions.length + " completions");

                    foreach (Util.Fst.Util.Result<long?> completion in completions)
                    {
                        token.Length = prefixLength;
                        // append suffix
                        Util.Fst.Util.ToBytesRef(completion.Input, suffix);
                        token.Append(suffix);

                        //System.out.println("    completion " + token.utf8ToString());

                        // Skip this path if a higher-order model already
                        // saw/predicted its last token:
                        BytesRef lastToken = token;
                        for (int i = token.Length - 1; i >= 0; i--)
                        {
                            if (token.Bytes[token.Offset + i] == separator)
                            {
                                Debug.Assert(token.Length - i - 1 > 0);
                                lastToken = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                                break;
                            }
                        }
                        if (seen.Contains(lastToken))
                        {
                            //System.out.println("      skip dup " + lastToken.utf8ToString());
                            goto nextCompletionContinue;
                        }
                        seen.Add(BytesRef.DeepCopyOf(lastToken));
                        spare.Grow(token.Length);
                        UnicodeUtil.UTF8toUTF16(token, spare);
                        LookupResult result = new LookupResult(spare.ToString(), (long)(long.MaxValue * backoff * ((double)DecodeWeight(completion.Output)) / contextCount));
                        results.Add(result);
                        Debug.Assert(results.Count == seen.Count);
                        //System.out.println("  add result=" + result);
                        nextCompletionContinue:;
                    }
                    nextCompletionBreak:
                    backoff *= ALPHA;
                }

                results.Sort(new ComparatorAnonymousInnerClassHelper(this));

                if (results.Count > num)
                {
                    results.RemoveRange(num, results.Count - num);
                }

                return results;
            }
            finally
            {
                IOUtils.CloseWhileHandlingException(ts);
            }
        }
Example #15
        public override void CopyTo(AttributeImpl target)
        {
            PositionIncrementAttribute t = (PositionIncrementAttribute)target;

            t.SetPositionIncrement(positionIncrement);
        }
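CopyTo is the hook that AttributeSource's state handling relies on: CaptureState clones each attribute implementation via CopyTo, and RestoreState copies the saved values back. A hedged sketch:

        // Sketch: capture/restore round-trips the position increment through CopyTo.
        AttributeSource.State saved = stream.CaptureState();
        // ... look ahead, mutating attributes as tokens are consumed ...
        stream.RestoreState(saved);   // the saved positionIncrement is copied back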
Example #16
        public virtual void testBasic()
        {
            b = new SynonymMap.Builder(true);
            add("a", "foo", true);
            add("a b", "bar fee", true);
            add("b c", "dog collar", true);
            add("c d", "dog harness holder extras", true);
            add("m c e", "dog barks loudly", false);
            add("i j k", "feep", true);

            add("e f", "foo bar", false);
            add("e f", "baz bee", false);

            add("z", "boo", false);
            add("y", "bee", true);

            tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
            tokensIn.reset();
            assertTrue(tokensIn.incrementToken());
            assertFalse(tokensIn.incrementToken());
            tokensIn.end();
            tokensIn.close();

            tokensOut = new SynonymFilter(tokensIn, b.build(), true);
            termAtt = tokensOut.addAttribute(typeof(CharTermAttribute));
            posIncrAtt = tokensOut.addAttribute(typeof(PositionIncrementAttribute));
            posLenAtt = tokensOut.addAttribute(typeof(PositionLengthAttribute));
            offsetAtt = tokensOut.addAttribute(typeof(OffsetAttribute));

            verify("a b c", "a/bar b/fee c");

            // syn output extends beyond input tokens
            verify("x a b c d", "x a/bar b/fee c/dog d/harness holder extras");

            verify("a b a", "a/bar b/fee a/foo");

            // outputs that add to one another:
            verify("c d c d", "c/dog d/harness c/holder/dog d/extras/harness holder extras");

            // two outputs for same input
            verify("e f", "foo/baz bar/bee");

            // verify multi-word / single-output offsets:
            verify("g i j k g", "g i/feep:7_3 j k g");

            // mixed keepOrig true/false:
            verify("a m c e x", "a/foo dog barks loudly x");
            verify("c d m c e x", "c/dog d/harness holder/dog extras/barks loudly x");
            assertTrue(tokensOut.CaptureCount > 0);

            // no captureStates when no syns matched
            verify("p q r s t", "p q r s t");
            assertEquals(0, tokensOut.CaptureCount);

            // no captureStates when only single-input syns, w/ no
            // lookahead needed, matched
            verify("p q z y t", "p q boo y/bee t");
            assertEquals(0, tokensOut.CaptureCount);
        }
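For readability: in these verify calls, '/' separates tokens expected at the same position, and a ':N' or ':N_M' suffix pins the expected end offset and position length (this matches the notation used by Lucene's TestSynonymMapFilter; treat the gloss below as informal):

        // "g i/feep:7_3 j k g" expects both "i" and its synonym "feep" at the
        // same position, with "feep" ending at offset 7 and spanning 3 positions.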
Example #17
        public virtual void testBasic2()
        {
            b = new SynonymMap.Builder(true);
            const bool keepOrig = false;
            add("aaa", "aaaa1 aaaa2 aaaa3", keepOrig);
            add("bbb", "bbbb1 bbbb2", keepOrig);
            tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
            tokensIn.reset();
            assertTrue(tokensIn.incrementToken());
            assertFalse(tokensIn.incrementToken());
            tokensIn.end();
            tokensIn.close();

            tokensOut = new SynonymFilter(tokensIn, b.build(), true);
            termAtt = tokensOut.addAttribute(typeof(CharTermAttribute));
            posIncrAtt = tokensOut.addAttribute(typeof(PositionIncrementAttribute));
            posLenAtt = tokensOut.addAttribute(typeof(PositionLengthAttribute));
            offsetAtt = tokensOut.addAttribute(typeof(OffsetAttribute));

            if (keepOrig)
            {
              verify("xyzzy bbb pot of gold", "xyzzy bbb/bbbb1 pot/bbbb2 of gold");
              verify("xyzzy aaa pot of gold", "xyzzy aaa/aaaa1 pot/aaaa2 of/aaaa3 gold");
            }
            else
            {
              verify("xyzzy bbb pot of gold", "xyzzy bbbb1 pot/bbbb2 of gold");
              verify("xyzzy aaa pot of gold", "xyzzy aaaa1 pot/aaaa2 of/aaaa3 gold");
            }
        }
Example #18
        public virtual void testOutputHangsOffEnd()
        {
            b = new SynonymMap.Builder(true);
            const bool keepOrig = false;
            // b hangs off the end (no input token under it):
            add("a", "a b", keepOrig);
            tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
            tokensIn.reset();
            assertTrue(tokensIn.incrementToken());
            assertFalse(tokensIn.incrementToken());
            tokensIn.end();
            tokensIn.close();

            tokensOut = new SynonymFilter(tokensIn, b.build(), true);
            termAtt = tokensOut.addAttribute(typeof(CharTermAttribute));
            posIncrAtt = tokensOut.addAttribute(typeof(PositionIncrementAttribute));
            offsetAtt = tokensOut.addAttribute(typeof(OffsetAttribute));
            posLenAtt = tokensOut.addAttribute(typeof(PositionLengthAttribute));

            // Make sure endOffset inherits from previous input token:
            verify("a", "a b:1");
        }
Example #19
 public SynonymFilter(TokenStream input) : base(input)
 {
     _termAtt    = (TermAttribute)AddAttribute<ITermAttribute>();
     _posIncrAtt = (PositionIncrementAttribute)AddAttribute<IPositionIncrementAttribute>();
 }
Example #20
        /// <summary>
        /// Not an explicit test, just useful to print out some info on performance
        /// </summary>
        public virtual void performance()
        {
            int[] tokCount  = new int[] { 100, 500, 1000, 2000, 5000, 10000 };
            int[] modCounts = new int[] { 1, 2, 5, 10, 20, 50, 100, 200, 500 };
            for (int k = 0; k < tokCount.Length; k++)
            {
                StringBuilder buffer = new StringBuilder();
                Console.WriteLine("-----Tokens: " + tokCount[k] + "-----");
                for (int i = 0; i < tokCount[k]; i++)
                {
                    buffer.Append(English.intToEnglish(i).toUpperCase(Locale.ROOT)).Append(' ');
                }
                //make sure we produce the same tokens
                TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))));
                TokenStream        sink      = teeStream.newSinkTokenStream(new ModuloSinkFilter(this, 100));
                teeStream.consumeAllTokens();
                TokenStream       stream  = new ModuloTokenFilter(this, new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))), 100);
                CharTermAttribute tfTok   = stream.addAttribute(typeof(CharTermAttribute));
                CharTermAttribute sinkTok = sink.addAttribute(typeof(CharTermAttribute));
                for (int i = 0; stream.incrementToken(); i++)
                {
                    assertTrue(sink.incrementToken());
                    assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + i, tfTok.Equals(sinkTok));
                }

                //simulate two fields, each being analyzed once, for 20 documents
                for (int j = 0; j < modCounts.Length; j++)
                {
                    int  tfPos = 0;
                    long start = DateTimeHelperClass.CurrentUnixTimeMillis();
                    for (int i = 0; i < 20; i++)
                    {
                        stream = new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString())));
                        PositionIncrementAttribute posIncrAtt = stream.getAttribute(typeof(PositionIncrementAttribute));
                        while (stream.incrementToken())
                        {
                            tfPos += posIncrAtt.PositionIncrement;
                        }
                        stream     = new ModuloTokenFilter(this, new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))), modCounts[j]);
                        posIncrAtt = stream.getAttribute(typeof(PositionIncrementAttribute));
                        while (stream.incrementToken())
                        {
                            tfPos += posIncrAtt.PositionIncrement;
                        }
                    }
                    long finish = DateTimeHelperClass.CurrentUnixTimeMillis();
                    Console.WriteLine("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");
                    int sinkPos = 0;
                    //simulate one field with one sink
                    start = DateTimeHelperClass.CurrentUnixTimeMillis();
                    for (int i = 0; i < 20; i++)
                    {
                        teeStream = new TeeSinkTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))));
                        sink      = teeStream.newSinkTokenStream(new ModuloSinkFilter(this, modCounts[j]));
                        PositionIncrementAttribute posIncrAtt = teeStream.getAttribute(typeof(PositionIncrementAttribute));
                        while (teeStream.incrementToken())
                        {
                            sinkPos += posIncrAtt.PositionIncrement;
                        }
                        //System.out.println("Modulo--------");
                        posIncrAtt = sink.getAttribute(typeof(PositionIncrementAttribute));
                        while (sink.incrementToken())
                        {
                            sinkPos += posIncrAtt.PositionIncrement;
                        }
                    }
                    finish = DateTimeHelperClass.CurrentUnixTimeMillis();
                    Console.WriteLine("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms");
                    assertTrue(sinkPos + " does not equal: " + tfPos, sinkPos == tfPos);
                }
                Console.WriteLine("- End Tokens: " + tokCount[k] + "-----");
            }
        }