public MockHoleInjectingTokenFilter(Random random, TokenStream @in)
     : base(@in)
 {
     RandomSeed = random.Next();
     PosIncAtt = AddAttribute<IPositionIncrementAttribute>();
     PosLenAtt = AddAttribute<IPositionLengthAttribute>();
 }
Example #2
 public GraphTokenizer(TextReader input)
     : base(input)
 {
     TermAtt = AddAttribute<ICharTermAttribute>();
     OffsetAtt = AddAttribute<IOffsetAttribute>();
     PosIncrAtt = AddAttribute<IPositionIncrementAttribute>();
     PosLengthAtt = AddAttribute<IPositionLengthAttribute>();
 }
 public CannedBinaryTokenStream(params BinaryToken[] tokens)
     : base()
 {
     this.Tokens = tokens;
     TermAtt = AddAttribute<IBinaryTermAttribute>();
     PosIncrAtt = AddAttribute<IPositionIncrementAttribute>();
     PosLengthAtt = AddAttribute<IPositionLengthAttribute>();
     OffsetAtt = AddAttribute<IOffsetAttribute>();
 }
 /// <summary>
 /// Construct a token stream filtering the given input using a set of common
 /// words to create bigrams. Outputs both unigrams (with their original position
 /// increment) and bigrams (with position increment 0 and type "gram") whenever
 /// one or both of the words in a potential bigram are in the set of common words.
 /// </summary>
 /// <param name="matchVersion"> Lucene version to enable correct behavior </param>
 /// <param name="input"> TokenStream input in filter chain </param>
 /// <param name="commonWords"> The set of common words. </param>
 public CommonGramsFilter(LuceneVersion matchVersion, TokenStream input, CharArraySet commonWords)
     : base(input)
 {
     termAttribute = AddAttribute<ICharTermAttribute>();
     offsetAttribute = AddAttribute<IOffsetAttribute>();
     typeAttribute = AddAttribute<ITypeAttribute>();
     posIncAttribute = AddAttribute<IPositionIncrementAttribute>();
     posLenAttribute = AddAttribute<IPositionLengthAttribute>();
     this.commonWords = commonWords;
 }
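A minimal usage sketch, not part of the original snippet, assuming Lucene.NET 4.8-style APIs (WhitespaceTokenizer, CharArraySet, LuceneVersion):

using System.IO;
using Lucene.Net.Analysis.CommonGrams;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;

// Chain the filter behind a tokenizer with a small common-words set.
var commonWords = new CharArraySet(LuceneVersion.LUCENE_48, new[] { "the", "of" }, true);
var tokenizer = new WhitespaceTokenizer(LuceneVersion.LUCENE_48, new StringReader("the quick fox of the forest"));
var stream = new CommonGramsFilter(LuceneVersion.LUCENE_48, tokenizer, commonWords);
// The stream emits the unigrams plus bigrams such as "the_quick" and "fox_of"
// at position increment 0 with type "gram".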
Example #5
 /// <summary>
 /// If inputText is non-null, and the TokenStream has
 ///  offsets, we include the surface form in each arc's
 ///  label.
 /// </summary>
 public TokenStreamToDot(string inputText, TokenStream @in, TextWriter @out)
 {
     this.@in = @in;
     this.@out = @out;
     this.InputText = inputText;
     TermAtt = @in.AddAttribute<ICharTermAttribute>();
     PosIncAtt = @in.AddAttribute<IPositionIncrementAttribute>();
     PosLengthAtt = @in.AddAttribute<IPositionLengthAttribute>();
     if (@in.HasAttribute<IOffsetAttribute>())
     {
         OffsetAtt = @in.AddAttribute<IOffsetAttribute>();
     }
     else
     {
         OffsetAtt = null;
     }
 }
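A hedged sketch of driving this utility; the analyzer instance and field name are placeholders, and it assumes the class exposes a ToDot() method like the Java original:

string text = "fast wifi network";
TokenStream ts = analyzer.GetTokenStream("field", new StringReader(text)); // 'analyzer' is assumed to exist
var dotWriter = new StringWriter();
new TokenStreamToDot(text, ts, dotWriter).ToDot();
Console.WriteLine(dotWriter); // feed the DOT text to GraphViz to visualize the token graph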
Example #6
        public virtual void TestOutputHangsOffEnd()
        {
            b = new SynonymMap.Builder(true);
            const bool keepOrig = false;

            // b hangs off the end (no input token under it):
            Add("a", "a b", keepOrig);
            tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
            tokensIn.Reset();
            assertTrue(tokensIn.IncrementToken());
            assertFalse(tokensIn.IncrementToken());
            tokensIn.End();
            tokensIn.Dispose();

            tokensOut  = new SynonymFilter(tokensIn, b.Build(), true);
            termAtt    = tokensOut.AddAttribute <ICharTermAttribute>();
            posIncrAtt = tokensOut.AddAttribute <IPositionIncrementAttribute>();
            offsetAtt  = tokensOut.AddAttribute <IOffsetAttribute>();
            posLenAtt  = tokensOut.AddAttribute <IPositionLengthAttribute>();

            // Make sure endOffset inherits from previous input token:
            Verify("a", "a b:1");
        }
Example #7
        public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, Side side, int minGram, int maxGram)
            : base(input)
        {
            // LUCENENET specific - version cannot be null because it is a value type.

            if (version.OnOrAfter(LuceneVersion.LUCENE_44) && side == Side.BACK)
            {
                throw new ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward");
            }

            if (!side.IsDefined())
            {
                throw new ArgumentOutOfRangeException(nameof(side), "sideLabel must be either front or back"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
            }

            if (minGram < 1)
            {
                throw new ArgumentOutOfRangeException(nameof(minGram), "minGram must be greater than zero"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
            }

            if (minGram > maxGram)
            {
                throw new ArgumentException("minGram must not be greater than maxGram");
            }

            this.version   = version;
            this.charUtils = version.OnOrAfter(LuceneVersion.LUCENE_44) ? CharacterUtils.GetInstance(version) : CharacterUtils.GetJava4Instance(version);
            this.minGram   = minGram;
            this.maxGram   = maxGram;
            this.side      = side;

            this.termAtt    = AddAttribute <ICharTermAttribute>();
            this.offsetAtt  = AddAttribute <IOffsetAttribute>();
            this.posIncrAtt = AddAttribute <IPositionIncrementAttribute>();
            this.posLenAtt  = AddAttribute <IPositionLengthAttribute>();
        }
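An illustrative use of this constructor (a sketch, assuming Lucene.NET 4.8 names; Side.FRONT keeps leading edges):

var tok = new WhitespaceTokenizer(LuceneVersion.LUCENE_48, new StringReader("lucene"));
var edges = new EdgeNGramTokenFilter(LuceneVersion.LUCENE_48, tok, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
// For the single input token "lucene" this emits "l", "lu", "luc".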
Example #8
        /// <param name="input"> input tokenstream </param>
        /// <param name="synonyms"> synonym map </param>
        /// <param name="ignoreCase"> case-folds input for matching with <seealso cref="Character#toLowerCase(int)"/>.
        ///                   Note: if you set this to true, it's your responsibility to lowercase
        ///                   the input entries when you create the <seealso cref="SynonymMap"/> </param>
        public SynonymFilter(TokenStream input, SynonymMap synonyms, bool ignoreCase)
            : base(input)
        {
            termAtt = AddAttribute<ICharTermAttribute>();
            posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            posLenAtt = AddAttribute<IPositionLengthAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();

            this.synonyms = synonyms;
            this.ignoreCase = ignoreCase;
            this.fst = synonyms.fst;
            if (fst == null)
            {
                throw new System.ArgumentException("fst must be non-null");
            }
            this.fstReader = fst.BytesReader;

            // Must be 1+ so that when roll buffer is at full
            // lookahead we can distinguish this full buffer from
            // the empty buffer:
            rollBufferSize = 1 + synonyms.maxHorizontalContext;

            futureInputs = new PendingInput[rollBufferSize];
            futureOutputs = new PendingOutputs[rollBufferSize];
            for (int pos = 0; pos < rollBufferSize; pos++)
            {
                futureInputs[pos] = new PendingInput();
                futureOutputs[pos] = new PendingOutputs();
            }

            //System.out.println("FSTFilt maxH=" + synonyms.maxHorizontalContext);

            scratchArc = new FST.Arc<BytesRef>();
        }
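A minimal sketch of building a SynonymMap and wiring this filter, assuming Lucene.Net.Analysis.Synonym APIs; single-word synonyms are used here (multi-word entries are normally joined with SynonymMap.Builder.Join):

var builder = new SynonymMap.Builder(true); // dedup=true
builder.Add(new CharsRef("usa"), new CharsRef("america"), true); // includeOrig=true keeps "usa"
SynonymMap map = builder.Build();

var tok = new WhitespaceTokenizer(LuceneVersion.LUCENE_48, new StringReader("usa visa"));
var syn = new SynonymFilter(tok, map, true); // ignoreCase=true
// "usa" now also yields "america" at the same position (posInc=0).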
        public virtual void TestBasic2()
        {
            b = new SynonymMap.Builder(true);
            const bool keepOrig = false;
            Add("aaa", "aaaa1 aaaa2 aaaa3", keepOrig);
            Add("bbb", "bbbb1 bbbb2", keepOrig);
            tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
            tokensIn.Reset();
            assertTrue(tokensIn.IncrementToken());
            assertFalse(tokensIn.IncrementToken());
            tokensIn.End();
            tokensIn.Dispose();

            tokensOut = new SynonymFilter(tokensIn, b.Build(), true);
            termAtt = tokensOut.AddAttribute<ICharTermAttribute>();
            posIncrAtt = tokensOut.AddAttribute<IPositionIncrementAttribute>();
            posLenAtt = tokensOut.AddAttribute<IPositionLengthAttribute>();
            offsetAtt = tokensOut.AddAttribute<IOffsetAttribute>();

#pragma warning disable 162
            if (keepOrig)
            {
                Verify("xyzzy bbb pot of gold", "xyzzy bbb/bbbb1 pot/bbbb2 of gold");
                Verify("xyzzy aaa pot of gold", "xyzzy aaa/aaaa1 pot/aaaa2 of/aaaa3 gold");
            }
            else
            {
                Verify("xyzzy bbb pot of gold", "xyzzy bbbb1 pot/bbbb2 of gold");
                Verify("xyzzy aaa pot of gold", "xyzzy aaaa1 pot/aaaa2 of/aaaa3 gold");
            }
#pragma warning restore 162
        }
Example #10
 private void InitParams()
 {
     TermAtt = AddAttribute<ICharTermAttribute>();
     PosIncrAtt = AddAttribute<IPositionIncrementAttribute>();
     PosLengthAtt = AddAttribute<IPositionLengthAttribute>();
     OffsetAtt = AddAttribute<IOffsetAttribute>();
     PayloadAtt = AddAttribute<IPayloadAttribute>();
 }
        public virtual void TestBasic()
        {
            b = new SynonymMap.Builder(true);
            Add("a", "foo", true);
            Add("a b", "bar fee", true);
            Add("b c", "dog collar", true);
            Add("c d", "dog harness holder extras", true);
            Add("m c e", "dog barks loudly", false);
            Add("i j k", "feep", true);

            Add("e f", "foo bar", false);
            Add("e f", "baz bee", false);

            Add("z", "boo", false);
            Add("y", "bee", true);

            tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
            tokensIn.Reset();
            assertTrue(tokensIn.IncrementToken());
            assertFalse(tokensIn.IncrementToken());
            tokensIn.End();
            tokensIn.Dispose();

            tokensOut = new SynonymFilter(tokensIn, b.Build(), true);
            termAtt = tokensOut.AddAttribute<ICharTermAttribute>();
            posIncrAtt = tokensOut.AddAttribute<IPositionIncrementAttribute>();
            posLenAtt = tokensOut.AddAttribute<IPositionLengthAttribute>();
            offsetAtt = tokensOut.AddAttribute<IOffsetAttribute>();

            Verify("a b c", "a/bar b/fee c");

            // syn output extends beyond input tokens
            Verify("x a b c d", "x a/bar b/fee c/dog d/harness holder extras");

            Verify("a b a", "a/bar b/fee a/foo");

            // outputs that add to one another:
            Verify("c d c d", "c/dog d/harness c/holder/dog d/extras/harness holder extras");

            // two outputs for same input
            Verify("e f", "foo/baz bar/bee");

            // verify multi-word / single-output offsets:
            Verify("g i j k g", "g i/feep:7_3 j k g");

            // mixed keepOrig true/false:
            Verify("a m c e x", "a/foo dog barks loudly x");
            Verify("c d m c e x", "c/dog d/harness holder/dog extras/barks loudly x");
            assertTrue(tokensOut.CaptureCount > 0);

            // no captureStates when no syns matched
            Verify("p q r s t", "p q r s t");
            assertEquals(0, tokensOut.CaptureCount);

            // no captureStates when only single-input syns, w/ no
            // lookahead needed, matched
            Verify("p q z y t", "p q boo y/bee t");
            assertEquals(0, tokensOut.CaptureCount);
        }
        public virtual void TestRandom()
        {

            int alphabetSize = TestUtil.NextInt(Random(), 2, 7);

            int docLen = AtLeast(3000);
            //final int docLen = 50;

            string document = GetRandomString('a', alphabetSize, docLen);

            if (VERBOSE)
            {
                Console.WriteLine("TEST: doc=" + document);
            }

            int numSyn = AtLeast(5);
            //final int numSyn = 2;

            IDictionary<string, OneSyn> synMap = new Dictionary<string, OneSyn>();
            IList<OneSyn> syns = new List<OneSyn>();
            bool dedup = Random().nextBoolean();
            if (VERBOSE)
            {
                Console.WriteLine("  dedup=" + dedup);
            }
            b = new SynonymMap.Builder(dedup);
            for (int synIDX = 0; synIDX < numSyn; synIDX++)
            {
                string synIn = GetRandomString('a', alphabetSize, TestUtil.NextInt(Random(), 1, 5)).Trim();
                OneSyn s = synMap.ContainsKey(synIn) ? synMap[synIn] : null;
                if (s == null)
                {
                    s = new OneSyn();
                    s.@in = synIn;
                    syns.Add(s);
                    s.@out = new List<string>();
                    synMap[synIn] = s;
                    s.keepOrig = Random().nextBoolean();
                }
                string synOut = GetRandomString('0', 10, TestUtil.NextInt(Random(), 1, 5)).Trim();
                s.@out.Add(synOut);
                Add(synIn, synOut, s.keepOrig);
                if (VERBOSE)
                {
                    Console.WriteLine("  syns[" + synIDX + "] = " + s.@in + " -> " + s.@out + " keepOrig=" + s.keepOrig);
                }
            }

            tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
            tokensIn.Reset();
            assertTrue(tokensIn.IncrementToken());
            assertFalse(tokensIn.IncrementToken());
            tokensIn.End();
            tokensIn.Dispose();

            tokensOut = new SynonymFilter(tokensIn, b.Build(), true);
            termAtt = tokensOut.AddAttribute<ICharTermAttribute>();
            posIncrAtt = tokensOut.AddAttribute<IPositionIncrementAttribute>();
            posLenAtt = tokensOut.AddAttribute<IPositionLengthAttribute>();
            offsetAtt = tokensOut.AddAttribute<IOffsetAttribute>();

            if (dedup)
            {
                PruneDups(syns);
            }

            string expected = SlowSynMatcher(document, syns, 5);

            if (VERBOSE)
            {
                Console.WriteLine("TEST: expected=" + expected);
            }

            Verify(document, expected);
        }
Example #13
        // offsetsAreCorrect also validates:
        //   - graph offsets are correct (all tokens leaving from
        //     pos X have the same startOffset; all tokens
        //     arriving to pos Y have the same endOffset)
        //   - offsets only move forwards (startOffset >=
        //     lastStartOffset)
        public static void AssertTokenStreamContents(TokenStream ts, string[] output, int[] startOffsets, int[] endOffsets, string[] types, int[] posIncrements, int[] posLengths, int? finalOffset, int? finalPosInc, bool[] keywordAtts, bool offsetsAreCorrect)
        {
            Assert.IsNotNull(output);
            var checkClearAtt = ts.AddAttribute <ICheckClearAttributesAttribute>();

            ICharTermAttribute termAtt = null;

            if (output.Length > 0)
            {
                Assert.IsTrue(ts.HasAttribute <ICharTermAttribute>(), "has no CharTermAttribute");
                termAtt = ts.GetAttribute <ICharTermAttribute>();
            }

            IOffsetAttribute offsetAtt = null;

            if (startOffsets != null || endOffsets != null || finalOffset != null)
            {
                Assert.IsTrue(ts.HasAttribute <IOffsetAttribute>(), "has no OffsetAttribute");
                offsetAtt = ts.GetAttribute <IOffsetAttribute>();
            }

            ITypeAttribute typeAtt = null;

            if (types != null)
            {
                Assert.IsTrue(ts.HasAttribute <ITypeAttribute>(), "has no TypeAttribute");
                typeAtt = ts.GetAttribute <ITypeAttribute>();
            }

            IPositionIncrementAttribute posIncrAtt = null;

            if (posIncrements != null || finalPosInc != null)
            {
                Assert.IsTrue(ts.HasAttribute <IPositionIncrementAttribute>(), "has no PositionIncrementAttribute");
                posIncrAtt = ts.GetAttribute <IPositionIncrementAttribute>();
            }

            IPositionLengthAttribute posLengthAtt = null;

            if (posLengths != null)
            {
                Assert.IsTrue(ts.HasAttribute <IPositionLengthAttribute>(), "has no PositionLengthAttribute");
                posLengthAtt = ts.GetAttribute <IPositionLengthAttribute>();
            }

            IKeywordAttribute keywordAtt = null;

            if (keywordAtts != null)
            {
                Assert.IsTrue(ts.HasAttribute <IKeywordAttribute>(), "has no KeywordAttribute");
                keywordAtt = ts.GetAttribute <IKeywordAttribute>();
            }

            // Maps position to the start/end offset:
            IDictionary <int?, int?> posToStartOffset = new Dictionary <int?, int?>();
            IDictionary <int?, int?> posToEndOffset   = new Dictionary <int?, int?>();

            ts.Reset();
            int pos             = -1;
            int lastStartOffset = 0;

            for (int i = 0; i < output.Length; i++)
            {
                // extra safety to enforce that the state is not preserved, and also assign bogus values
                ts.ClearAttributes();
                termAtt.SetEmpty().Append("bogusTerm");
                if (offsetAtt != null)
                {
                    offsetAtt.SetOffset(14584724, 24683243);
                }
                if (typeAtt != null)
                {
                    typeAtt.Type = "bogusType";
                }
                if (posIncrAtt != null)
                {
                    posIncrAtt.PositionIncrement = 45987657;
                }
                if (posLengthAtt != null)
                {
                    posLengthAtt.PositionLength = 45987653;
                }
                if (keywordAtt != null)
                {
                    keywordAtt.Keyword = (i & 1) == 0;
                }

                bool reset = checkClearAtt.AndResetClearCalled; // reset it, because we called clearAttribute() before
                Assert.IsTrue(ts.IncrementToken(), "token " + i + " does not exist");
                Assert.IsTrue(reset, "ClearAttributes() was not called correctly in TokenStream chain");

                Assert.AreEqual(output[i], termAtt.ToString(), "term " + i + ", output[i] = " + output[i] + ", termAtt = " + termAtt.ToString());
                if (startOffsets != null)
                {
                    Assert.AreEqual(startOffsets[i], offsetAtt.StartOffset(), "startOffset " + i);
                }
                if (endOffsets != null)
                {
                    Assert.AreEqual(endOffsets[i], offsetAtt.EndOffset(), "endOffset " + i);
                }
                if (types != null)
                {
                    Assert.AreEqual(types[i], typeAtt.Type, "type " + i);
                }
                if (posIncrements != null)
                {
                    Assert.AreEqual(posIncrements[i], posIncrAtt.PositionIncrement, "posIncrement " + i);
                }
                if (posLengths != null)
                {
                    Assert.AreEqual(posLengths[i], posLengthAtt.PositionLength, "posLength " + i);
                }
                if (keywordAtts != null)
                {
                    Assert.AreEqual(keywordAtts[i], keywordAtt.Keyword, "keywordAtt " + i);
                }

                // we can enforce some basic things about a few attributes even if the caller doesn't check:
                if (offsetAtt != null)
                {
                    int startOffset = offsetAtt.StartOffset();
                    int endOffset   = offsetAtt.EndOffset();
                    if (finalOffset != null)
                    {
                        Assert.IsTrue(startOffset <= (int)finalOffset, "startOffset must be <= finalOffset");
                        Assert.IsTrue(endOffset <= (int)finalOffset, "endOffset must be <= finalOffset: got endOffset=" + endOffset + " vs finalOffset=" + (int)finalOffset);
                    }

                    if (offsetsAreCorrect)
                    {
                        Assert.IsTrue(offsetAtt.StartOffset() >= lastStartOffset, "offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset);
                        lastStartOffset = offsetAtt.StartOffset();
                    }

                    if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null)
                    {
                        // Validate offset consistency in the graph, ie
                        // all tokens leaving from a certain pos have the
                        // same startOffset, and all tokens arriving to a
                        // certain pos have the same endOffset:
                        int posInc = posIncrAtt.PositionIncrement;
                        pos += posInc;

                        int posLength = posLengthAtt.PositionLength;

                        if (!posToStartOffset.ContainsKey(pos))
                        {
                            // First time we've seen a token leaving from this position:
                            posToStartOffset[pos] = startOffset;
                            //System.out.println("  + s " + pos + " -> " + startOffset);
                        }
                        else
                        {
                            // We've seen a token leaving from this position
                            // before; verify the startOffset is the same:
                            //System.out.println("  + vs " + pos + " -> " + startOffset);
                            Assert.AreEqual((int)posToStartOffset[pos], startOffset, "pos=" + pos + " posLen=" + posLength + " token=" + termAtt);
                        }

                        int endPos = pos + posLength;

                        if (!posToEndOffset.ContainsKey(endPos))
                        {
                            // First time we've seen a token arriving to this position:
                            posToEndOffset[endPos] = endOffset;
                            //System.out.println("  + e " + endPos + " -> " + endOffset);
                        }
                        else
                        {
                            // We've seen a token arriving to this position
                            // before; verify the endOffset is the same:
                            //System.out.println("  + ve " + endPos + " -> " + endOffset);
                            Assert.AreEqual((int)posToEndOffset[endPos], endOffset, "pos=" + pos + " posLen=" + posLength + " token=" + termAtt);
                        }
                    }
                }
                if (posIncrAtt != null)
                {
                    if (i == 0)
                    {
                        Assert.IsTrue(posIncrAtt.PositionIncrement >= 1, "first posIncrement must be >= 1");
                    }
                    else
                    {
                        Assert.IsTrue(posIncrAtt.PositionIncrement >= 0, "posIncrement must be >= 0");
                    }
                }
                if (posLengthAtt != null)
                {
                    Assert.IsTrue(posLengthAtt.PositionLength >= 1, "posLength must be >= 1");
                }
            }

            if (ts.IncrementToken())
            {
                Assert.Fail("TokenStream has more tokens than expected (expected count=" + output.Length + "); extra token=" + termAtt);
            }

            // repeat our extra safety checks for End()
            ts.ClearAttributes();
            if (termAtt != null)
            {
                termAtt.SetEmpty().Append("bogusTerm");
            }
            if (offsetAtt != null)
            {
                offsetAtt.SetOffset(14584724, 24683243);
            }
            if (typeAtt != null)
            {
                typeAtt.Type = "bogusType";
            }
            if (posIncrAtt != null)
            {
                posIncrAtt.PositionIncrement = 45987657;
            }
            if (posLengthAtt != null)
            {
                posLengthAtt.PositionLength = 45987653;
            }

            var reset_ = checkClearAtt.AndResetClearCalled; // reset it, because we called clearAttribute() before

            ts.End();
            Assert.IsTrue(checkClearAtt.AndResetClearCalled, "super.End()/ClearAttributes() was not called correctly in End()");

            if (finalOffset != null)
            {
                Assert.AreEqual((int)finalOffset, offsetAtt.EndOffset(), "finalOffset");
            }
            if (offsetAtt != null)
            {
                Assert.IsTrue(offsetAtt.EndOffset() >= 0, "finalOffset must be >= 0");
            }
            if (finalPosInc != null)
            {
                Assert.AreEqual((int)finalPosInc, posIncrAtt.PositionIncrement, "finalPosInc");
            }

            ts.Dispose();
        }
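A hedged usage sketch: verify terms, offsets, and position increments of a simple whitespace stream in one call (a null array skips the corresponding check):

var ts = new WhitespaceTokenizer(LuceneVersion.LUCENE_48, new StringReader("a b"));
AssertTokenStreamContents(ts,
    output: new[] { "a", "b" },
    startOffsets: new[] { 0, 2 },
    endOffsets: new[] { 1, 3 },
    types: null,
    posIncrements: new[] { 1, 1 },
    posLengths: null,
    finalOffset: 3,
    finalPosInc: null,
    keywordAtts: null,
    offsetsAreCorrect: true);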
        public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, Side side, int minGram, int maxGram)
              : base(input)
        {

            // LUCENENET specific - version cannot be null because it is a value type.

            if (version.OnOrAfter(LuceneVersion.LUCENE_44) && side == Side.BACK)
            {
                throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward");
            }

            if (!Enum.IsDefined(typeof(Side), side))
            {
                throw new System.ArgumentException("sideLabel must be either front or back");
            }

            if (minGram < 1)
            {
                throw new System.ArgumentException("minGram must be greater than zero");
            }

            if (minGram > maxGram)
            {
                throw new System.ArgumentException("minGram must not be greater than maxGram");
            }

            this.version = version;
            this.charUtils = version.OnOrAfter(LuceneVersion.LUCENE_44) ? CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance;
            this.minGram = minGram;
            this.maxGram = maxGram;
            this.side = side;

            this.termAtt = AddAttribute<ICharTermAttribute>();
            this.offsetAtt = AddAttribute<IOffsetAttribute>();
            this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            this.posLenAtt = AddAttribute<IPositionLengthAttribute>();
        }
Example #16
 /// <summary>
 /// Create a new CJKBigramFilter, specifying which writing systems should be bigrammed,
 /// and whether or not unigrams should also be output. </summary>
 /// <param name="flags"> OR'ed set from <seealso cref="CJKBigramFilter#HAN"/>, <seealso cref="CJKBigramFilter#HIRAGANA"/>, 
 ///        <seealso cref="CJKBigramFilter#KATAKANA"/>, <seealso cref="CJKBigramFilter#HANGUL"/> </param>
 /// <param name="outputUnigrams"> true if unigrams for the selected writing systems should also be output.
 ///        when this is false, this is only done when there are no adjacent characters to form
 ///        a bigram. </param>
 public CJKBigramFilter(TokenStream @in, int flags, bool outputUnigrams)
       : base(@in)
 {
     doHan = (flags & HAN) == 0 ? NO : HAN_TYPE;
     doHiragana = (flags & HIRAGANA) == 0 ? NO : HIRAGANA_TYPE;
     doKatakana = (flags & KATAKANA) == 0 ? NO : KATAKANA_TYPE;
     doHangul = (flags & HANGUL) == 0 ? NO : HANGUL_TYPE;
     this.outputUnigrams = outputUnigrams;
     this.termAtt = AddAttribute<ICharTermAttribute>();
     this.typeAtt = AddAttribute<ITypeAttribute>();
     this.offsetAtt = AddAttribute<IOffsetAttribute>();
     this.posIncAtt = AddAttribute<IPositionIncrementAttribute>();
     this.posLengthAtt = AddAttribute<IPositionLengthAttribute>();
 }
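An illustrative sketch: StandardTokenizer supplies the script-typed tokens this filter inspects, and the flag constants are OR'ed together as described above:

var tok = new StandardTokenizer(LuceneVersion.LUCENE_48, new StringReader("学生です"));
var cjk = new CJKBigramFilter(tok, CJKBigramFilter.HAN | CJKBigramFilter.HIRAGANA, true);
// Emits Han/Hiragana bigrams plus the unigrams, since outputUnigrams=true.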
Example #17
        /// <summary>
        /// Retrieve suggestions.
        /// </summary>
        public virtual IList <LookupResult> DoLookup(string key, IEnumerable <BytesRef> contexts, int num)
        {
            if (contexts != null)
            {
                throw new System.ArgumentException("this suggester doesn't support contexts");
            }

            TokenStream ts = queryAnalyzer.GetTokenStream("", key.ToString());

            try
            {
                ITermToBytesRefAttribute    termBytesAtt = ts.AddAttribute <ITermToBytesRefAttribute>();
                IOffsetAttribute            offsetAtt    = ts.AddAttribute <IOffsetAttribute>();
                IPositionLengthAttribute    posLenAtt    = ts.AddAttribute <IPositionLengthAttribute>();
                IPositionIncrementAttribute posIncAtt    = ts.AddAttribute <IPositionIncrementAttribute>();
                ts.Reset();

                var lastTokens = new BytesRef[grams];
                //System.out.println("lookup: key='" + key + "'");

                // Run full analysis, but save only the
                // last 1gram, last 2gram, etc.:
                BytesRef tokenBytes   = termBytesAtt.BytesRef;
                int      maxEndOffset = -1;
                bool     sawRealToken = false;
                while (ts.IncrementToken())
                {
                    termBytesAtt.FillBytesRef();
                    sawRealToken |= tokenBytes.Length > 0;
                    // TODO: this is somewhat iffy; today, ShingleFilter
                    // sets posLen to the gram count; maybe we should make
                    // a separate dedicated att for this?
                    int gramCount = posLenAtt.PositionLength;

                    Debug.Assert(gramCount <= grams);

                    // Safety: make sure the recalculated count "agrees":
                    if (CountGrams(tokenBytes) != gramCount)
                    {
                        throw new System.ArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + CountGrams(tokenBytes));
                    }
                    maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset);
                    lastTokens[gramCount - 1] = BytesRef.DeepCopyOf(tokenBytes);
                }
                ts.End();

                if (!sawRealToken)
                {
                    throw new System.ArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
                }

                // Carefully fill last tokens with _ tokens;
                // ShingleFilter apparently won't emit "only hole"
                // tokens:
                int endPosInc = posIncAtt.PositionIncrement;

                // Note this will also be true if input is the empty
                // string (in which case we saw no tokens and
                // maxEndOffset is still -1), which in fact works out OK
                // because we fill the unigram with an empty BytesRef
                // below:
                bool lastTokenEnded = offsetAtt.EndOffset > maxEndOffset || endPosInc > 0;
                //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.EndOffset);

                if (lastTokenEnded)
                {
                    //System.out.println("  lastTokenEnded");
                    // If user hit space after the last token, then
                    // "upgrade" all tokens.  This way "foo " will suggest
                    // all bigrams starting w/ foo, and not any unigrams
                    // starting with "foo":
                    for (int i = grams - 1; i > 0; i--)
                    {
                        BytesRef token = lastTokens[i - 1];
                        if (token == null)
                        {
                            continue;
                        }
                        token.Grow(token.Length + 1);
                        token.Bytes[token.Length] = separator;
                        token.Length++;
                        lastTokens[i] = token;
                    }
                    lastTokens[0] = new BytesRef();
                }

                var arc = new FST.Arc <long?>();

                var bytesReader = fst.GetBytesReader();

                // Try highest order models first, and if they return
                // results, return that; else, fallback:
                double backoff = 1.0;

                List <LookupResult> results = new List <LookupResult>(num);

                // We only add a given suffix once, from the highest
                // order model that saw it; for subsequent lower order
                // models we skip it:
                var seen = new HashSet <BytesRef>();

                for (int gram = grams - 1; gram >= 0; gram--)
                {
                    BytesRef token = lastTokens[gram];
                    // Don't make unigram predictions from empty string:
                    if (token == null || (token.Length == 0 && key.Length > 0))
                    {
                        // Input didn't have enough tokens:
                        //System.out.println("  gram=" + gram + ": skip: not enough input");
                        continue;
                    }

                    if (endPosInc > 0 && gram <= endPosInc)
                    {
                        // Skip hole-only predictions; in theory we
                        // shouldn't have to do this, but we'd need to fix
                        // ShingleFilter to produce only-hole tokens:
                        //System.out.println("  break: only holes now");
                        break;
                    }

                    //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());

                    // TODO: we could add fuzziness here
                    // match the prefix portion exactly
                    //Pair<Long,BytesRef> prefixOutput = null;
                    long? prefixOutput = null;
                    try
                    {
                        prefixOutput = LookupPrefix(fst, bytesReader, token, arc);
                    }
                    catch (IOException bogus)
                    {
                        throw new Exception(bogus.ToString(), bogus);
                    }
                    //System.out.println("  prefixOutput=" + prefixOutput);

                    if (prefixOutput == null)
                    {
                        // This model never saw this prefix, e.g. the
                        // trigram model never saw context "purple mushroom"
                        backoff *= ALPHA;
                        continue;
                    }

                    // TODO: we could do this division at build time, and
                    // bake it into the FST?

                    // Denominator for computing scores from current
                    // model's predictions:
                    long contextCount = totTokens;

                    BytesRef lastTokenFragment = null;

                    for (int i = token.Length - 1; i >= 0; i--)
                    {
                        if (token.Bytes[token.Offset + i] == separator)
                        {
                            BytesRef context = new BytesRef(token.Bytes, token.Offset, i);
                            long?    output  = Lucene.Net.Util.Fst.Util.Get(fst, Lucene.Net.Util.Fst.Util.ToInt32sRef(context, new Int32sRef()));
                            Debug.Assert(output != null);
                            contextCount      = DecodeWeight(output);
                            lastTokenFragment = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                            break;
                        }
                    }

                    BytesRef finalLastToken;

                    if (lastTokenFragment == null)
                    {
                        finalLastToken = BytesRef.DeepCopyOf(token);
                    }
                    else
                    {
                        finalLastToken = BytesRef.DeepCopyOf(lastTokenFragment);
                    }
                    Debug.Assert(finalLastToken.Offset == 0);

                    CharsRef spare = new CharsRef();

                    // complete top-N
                    Util.Fst.Util.TopResults <long?> completions = null;
                    try
                    {
                        // Because we store multiple models in one FST
                        // (1gram, 2gram, 3gram), we must restrict the
                        // search so that it only considers the current
                        // model.  For highest order model, this is not
                        // necessary since all completions in the FST
                        // must be from this model, but for lower order
                        // models we have to filter out the higher order
                        // ones:

                        // Must do num+seen.size() for queue depth because we may
                        // reject up to seen.size() paths in acceptResult():
                        Util.Fst.Util.TopNSearcher <long?> searcher = new TopNSearcherAnonymousInnerClassHelper(this, fst, num, num + seen.Count, weightComparer, seen, finalLastToken);

                        // since this search is initialized with a single start node
                        // it is okay to start with an empty input path here
                        searcher.AddStartPaths(arc, prefixOutput, true, new Int32sRef());

                        completions = searcher.Search();
                        Debug.Assert(completions.IsComplete);
                    }
                    catch (IOException bogus)
                    {
                        throw new Exception(bogus.ToString(), bogus);
                    }

                    int prefixLength = token.Length;

                    BytesRef suffix = new BytesRef(8);
                    //System.out.println("    " + completions.length + " completions");

                    foreach (Util.Fst.Util.Result <long?> completion in completions)
                    {
                        token.Length = prefixLength;
                        // append suffix
                        Util.Fst.Util.ToBytesRef(completion.Input, suffix);
                        token.Append(suffix);

                        //System.out.println("    completion " + token.utf8ToString());

                        // Skip this path if a higher-order model already
                        // saw/predicted its last token:
                        BytesRef lastToken = token;
                        for (int i = token.Length - 1; i >= 0; i--)
                        {
                            if (token.Bytes[token.Offset + i] == separator)
                            {
                                Debug.Assert(token.Length - i - 1 > 0);
                                lastToken = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                                break;
                            }
                        }
                        if (seen.Contains(lastToken))
                        {
                            //System.out.println("      skip dup " + lastToken.utf8ToString());
                            goto nextCompletionContinue;
                        }
                        seen.Add(BytesRef.DeepCopyOf(lastToken));
                        spare.Grow(token.Length);
                        UnicodeUtil.UTF8toUTF16(token, spare);
                        LookupResult result = new LookupResult(spare.ToString(),
                                                               // LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes
                                                               // return numbers that are greater than long.MaxValue, which results in a negative long number.
                                                               (long)(long.MaxValue * (decimal)backoff * ((decimal)DecodeWeight(completion.Output)) / contextCount));
                        results.Add(result);
                        Debug.Assert(results.Count == seen.Count);
                        //System.out.println("  add result=" + result);
                        nextCompletionContinue:;
                    }
                    backoff *= ALPHA;
                }

                results.Sort(new ComparerAnonymousInnerClassHelper(this));

                if (results.Count > num)
                {
                    results.SubList(num, results.Count).Clear();
                }

                return results;
            }
            finally
            {
                IOUtils.DisposeWhileHandlingException(ts);
            }
        }
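A hedged calling sketch, assuming this method lives on an already-built suggester instance (a FreeTextSuggester-style class trained on a corpus beforehand):

IList<LookupResult> hits = suggester.DoLookup("foo b", null, 5); // contexts must be null here
foreach (var hit in hits)
{
    Console.WriteLine(hit.Key + " weight=" + hit.Value);
}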
Example #18
 /// <summary>
 /// Constructs a ShingleFilter with the specified shingle size from the
 /// <seealso cref="TokenStream"/> <code>input</code>
 /// </summary>
 /// <param name="input"> input stream </param>
 /// <param name="minShingleSize"> minimum shingle size produced by the filter. </param>
 /// <param name="maxShingleSize"> maximum shingle size produced by the filter. </param>
 public ShingleFilter(TokenStream input, int minShingleSize, int maxShingleSize)
       : base(input)
 {
     MaxShingleSize = maxShingleSize;
     MinShingleSize = minShingleSize;
     termAtt = AddAttribute<ICharTermAttribute>();
     offsetAtt = AddAttribute<IOffsetAttribute>();
     posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
     posLenAtt = AddAttribute<IPositionLengthAttribute>();
     typeAtt = AddAttribute<ITypeAttribute>();
 }
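An illustrative use of this constructor (a sketch, assuming Lucene.NET 4.8 names):

var tok = new WhitespaceTokenizer(LuceneVersion.LUCENE_48, new StringReader("please divide this sentence"));
var shingles = new ShingleFilter(tok, 2, 3);
// Emits the unigrams plus shingles such as "please divide" and "please divide this".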
Example #19
        /// <summary>
        /// Creates NGramTokenFilter with given min and max n-grams. </summary>
        /// <param name="version"> Lucene version to enable correct position increments.
        ///                See <a href="#version">above</a> for details. </param>
        /// <param name="input"> <seealso cref="TokenStream"/> holding the input to be tokenized </param>
        /// <param name="minGram"> the smallest n-gram to generate </param>
        /// <param name="maxGram"> the largest n-gram to generate </param>
        public NGramTokenFilter(LuceneVersion version, TokenStream input, int minGram, int maxGram)
            : base(new CodepointCountFilter(version, input, minGram, int.MaxValue))
        {
            this.version = version;
            this.charUtils = version.OnOrAfter(
#pragma warning disable 612, 618
                LuceneVersion.LUCENE_44) ?
#pragma warning restore 612, 618
                CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance;
            if (minGram < 1)
            {
                throw new System.ArgumentException("minGram must be greater than zero");
            }
            if (minGram > maxGram)
            {
                throw new System.ArgumentException("minGram must not be greater than maxGram");
            }
            this.minGram = minGram;
            this.maxGram = maxGram;
#pragma warning disable 612, 618
            if (version.OnOrAfter(LuceneVersion.LUCENE_44))
#pragma warning restore 612, 618
            {
                posIncAtt = AddAttribute<IPositionIncrementAttribute>();
                posLenAtt = AddAttribute<IPositionLengthAttribute>();
            }
            else
            {
                posIncAtt = new PositionIncrementAttributeAnonymousInnerClassHelper(this);
                posLenAtt = new PositionLengthAttributeAnonymousInnerClassHelper(this);
            }
            termAtt = AddAttribute<ICharTermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
        }
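An illustrative sketch of the constructor in use (assuming Lucene.NET 4.8 names, with KeywordTokenizer passing the input through as one token):

var tok = new KeywordTokenizer(new StringReader("abc"));
var grams = new NGramTokenFilter(LuceneVersion.LUCENE_48, tok, 1, 2);
// With 4.4+ semantics, "abc" yields "a", "ab", "b", "bc", "c".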
Example #20
        public virtual void TestRandom()
        {
            int alphabetSize = TestUtil.NextInt32(Random, 2, 7);

            int docLen = AtLeast(3000);
            //final int docLen = 50;

            string document = GetRandomString('a', alphabetSize, docLen);

            if (Verbose)
            {
                Console.WriteLine("TEST: doc=" + document);
            }

            int numSyn = AtLeast(5);
            //final int numSyn = 2;

            IDictionary <string, OneSyn> synMap = new Dictionary <string, OneSyn>();
            IList <OneSyn> syns  = new JCG.List <OneSyn>();
            bool           dedup = Random.nextBoolean();

            if (Verbose)
            {
                Console.WriteLine("  dedup=" + dedup);
            }
            b = new SynonymMap.Builder(dedup);
            for (int synIDX = 0; synIDX < numSyn; synIDX++)
            {
                string synIn = GetRandomString('a', alphabetSize, TestUtil.NextInt32(Random, 1, 5)).Trim();
                if (!synMap.TryGetValue(synIn, out OneSyn s) || s is null)
                {
                    s     = new OneSyn();
                    s.@in = synIn;
                    syns.Add(s);
                    s.@out        = new JCG.List <string>();
                    synMap[synIn] = s;
                    s.keepOrig    = Random.nextBoolean();
                }
                string synOut = GetRandomString('0', 10, TestUtil.NextInt32(Random, 1, 5)).Trim();
                s.@out.Add(synOut);
                Add(synIn, synOut, s.keepOrig);
                if (Verbose)
                {
                    Console.WriteLine("  syns[" + synIDX + "] = " + s.@in + " -> " + s.@out + " keepOrig=" + s.keepOrig);
                }
            }

            tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
            tokensIn.Reset();
            assertTrue(tokensIn.IncrementToken());
            assertFalse(tokensIn.IncrementToken());
            tokensIn.End();
            tokensIn.Dispose();

            tokensOut  = new SynonymFilter(tokensIn, b.Build(), true);
            termAtt    = tokensOut.AddAttribute <ICharTermAttribute>();
            posIncrAtt = tokensOut.AddAttribute <IPositionIncrementAttribute>();
            posLenAtt  = tokensOut.AddAttribute <IPositionLengthAttribute>();
            offsetAtt  = tokensOut.AddAttribute <IOffsetAttribute>();

            if (dedup)
            {
                PruneDups(syns);
            }

            string expected = SlowSynMatcher(document, syns, 5);

            if (Verbose)
            {
                Console.WriteLine("TEST: expected=" + expected);
            }

            Verify(document, expected);
        }
Example #21
        private void Init(LuceneVersion version, int minGram, int maxGram, bool edgesOnly)
        {
#pragma warning disable 612, 618
            if (!version.OnOrAfter(LuceneVersion.LUCENE_44))
#pragma warning restore 612, 618
            {
                throw new System.ArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer");
            }
#pragma warning disable 612, 618
            charUtils = version.OnOrAfter(LuceneVersion.LUCENE_44) ?
#pragma warning restore 612, 618
                CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance;
            if (minGram < 1)
            {
                throw new System.ArgumentException("minGram must be greater than zero");
            }
            if (minGram > maxGram)
            {
                throw new System.ArgumentException("minGram must not be greater than maxGram");
            }
            termAtt = AddAttribute<ICharTermAttribute>();
            posIncAtt = AddAttribute<IPositionIncrementAttribute>();
            posLenAtt = AddAttribute<IPositionLengthAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            this.minGram = minGram;
            this.maxGram = maxGram;
            this.edgesOnly = edgesOnly;
            charBuffer = CharacterUtils.NewCharacterBuffer(2 * maxGram + 1024); // 2 * maxGram in case all code points require 2 chars, plus 1024 of buffering so we don't keep polling the Reader
            buffer = new int[charBuffer.Buffer.Length];

            // Make the term att large enough
            termAtt.ResizeBuffer(2 * maxGram);
        }
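A hedged sketch of the tokenizer this Init backs (assuming it is NGramTokenizer in Lucene.NET 4.8):

var tok = new NGramTokenizer(LuceneVersion.LUCENE_48, new StringReader("abcd"), 1, 2);
// Scans the reader through the CharacterUtils buffer above,
// emitting "a", "ab", "b", "bc", "c", "cd", "d".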