예제 #1
        /// <summary>
        /// Creates <see cref="NGramTokenFilter"/> with given min and max n-grams. </summary>
        /// <param name="version"> Lucene version to enable correct position increments.
        ///                See <see cref="NGramTokenFilter"/> for details. </param>
        /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
        /// <param name="minGram"> the smallest n-gram to generate </param>
        /// <param name="maxGram"> the largest n-gram to generate </param>
        /// <exception cref="ArgumentException"> if <paramref name="minGram"/> is less than 1
        ///                or greater than <paramref name="maxGram"/> </exception>
        public NGramTokenFilter(LuceneVersion version, TokenStream input, int minGram, int maxGram)
            : base(new CodepointCountFilter(version, input, minGram, int.MaxValue))
        {
            this.version = version;

            // The version check is evaluated once, with the obsolete-member warnings
            // (CS0612/CS0618) suppressed for the LUCENE_44 reference, and reused below.
#pragma warning disable 612, 618
            bool onOrAfter44 = version.OnOrAfter(LuceneVersion.LUCENE_44);
#pragma warning restore 612, 618

            this.charUtils = onOrAfter44
                ? CharacterUtils.GetInstance(version)
                : CharacterUtils.GetJava4Instance(version);

            if (minGram < 1)
            {
                throw new ArgumentException("minGram must be greater than zero");
            }
            if (minGram > maxGram)
            {
                throw new ArgumentException("minGram must not be greater than maxGram");
            }
            this.minGram = minGram;
            this.maxGram = maxGram;

            if (onOrAfter44)
            {
                // 4.4+ : use the position attributes obtained through AddAttribute.
                posIncAtt = AddAttribute<IPositionIncrementAttribute>();
                posLenAtt = AddAttribute<IPositionLengthAttribute>();
            }
            else
            {
                // Older versions: use private helper instances instead of attributes
                // obtained through AddAttribute.
                posIncAtt = new PositionIncrementAttributeAnonymousInnerClassHelper();
                posLenAtt = new PositionLengthAttributeAnonymousInnerClassHelper();
            }

            termAtt   = AddAttribute<ICharTermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
        }
예제 #2
        /// <summary>
        /// Validates the arguments and initializes the character utilities, attributes
        /// and working buffers of this instance.
        /// </summary>
        /// <param name="version"> Lucene version; must be 4.4 or later </param>
        /// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
        /// <param name="maxGram"> the largest n-gram to generate; must not be smaller than <paramref name="minGram"/> </param>
        /// <param name="edgesOnly"> stored in the <c>edgesOnly</c> field </param>
        /// <exception cref="ArgumentException"> if <paramref name="version"/> is older than 4.4,
        /// <paramref name="minGram"/> is less than 1, or <paramref name="minGram"/> is greater
        /// than <paramref name="maxGram"/> </exception>
        private void Init(LuceneVersion version, int minGram, int maxGram, bool edgesOnly)
        {
            // Evaluated once with the obsolete-member warnings (CS0612/CS0618) suppressed
            // for the LUCENE_44 reference.
#pragma warning disable 612, 618
            bool onOrAfter44 = version.OnOrAfter(LuceneVersion.LUCENE_44);
#pragma warning restore 612, 618

            if (!onOrAfter44)
            {
                throw new ArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer");
            }

            // The Java4 branch cannot be taken after the guard above; the conditional is
            // kept so the selection logic reads the same as in the sibling filter classes.
            charUtils = onOrAfter44
                ? CharacterUtils.GetInstance(version)
                : CharacterUtils.GetJava4Instance(version);

            if (minGram < 1)
            {
                throw new ArgumentException("minGram must be greater than zero");
            }
            if (minGram > maxGram)
            {
                throw new ArgumentException("minGram must not be greater than maxGram");
            }

            termAtt   = AddAttribute<ICharTermAttribute>();
            posIncAtt = AddAttribute<IPositionIncrementAttribute>();
            posLenAtt = AddAttribute<IPositionLengthAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();

            this.minGram   = minGram;
            this.maxGram   = maxGram;
            this.edgesOnly = edgesOnly;

            // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering
            // to not keep polling the Reader
            charBuffer = CharacterUtils.NewCharacterBuffer(2 * maxGram + 1024);
            buffer     = new int[charBuffer.Buffer.Length];

            // Make the term att large enough
            termAtt.ResizeBuffer(2 * maxGram);
        }
        /// <summary>
        /// Repeatedly fills <c>tmpBuffer</c> from <c>m_input</c> and appends what was read to
        /// <c>inputBuffer</c>, setting <c>inputFinished</c> once the fill reports that no
        /// characters remain.
        /// </summary>
        // NOTE(review): this snippet is shown without its block delimiters and appears to be
        // truncated after the final comment; the loop exit is not visible here - confirm
        // against the full source before relying on this description.
        private void ReadInputToBuffer()
            while (true)
                // CharacterUtils.fill is supplementary char aware
#pragma warning disable 612, 618
                bool hasRemainingChars = CharacterUtils.GetInstance(LuceneVersion.LUCENE_CURRENT).Fill(tmpBuffer, m_input);
#pragma warning restore 612, 618

                // The append below copies from index 0, so the fill must have started at offset 0.
                if (Debugging.AssertsEnabled)
                    Debugging.Assert(tmpBuffer.Offset == 0);
                inputBuffer.Append(tmpBuffer.Buffer, 0, tmpBuffer.Length);

                // A false result from Fill is recorded as end of input.
                if (hasRemainingChars == false)
                    inputFinished = true;

                // Inspect the last code point that was just read.
                int lastCodePoint = Character.CodePointBefore(tmpBuffer.Buffer, tmpBuffer.Length, 0);
                if (normalizer.IsInert(lastCodePoint))
                    // we require an inert char so that we can normalize content before and
                    // after this character independently
예제 #4
 /// <summary>
 /// Create a new UpperCaseFilter, that normalizes token text to upper case.
 /// </summary>
 /// <param name="matchVersion"> Lucene version used to select the <c>CharacterUtils</c> implementation </param>
 /// <param name="in"> TokenStream to filter </param>
 public UpperCaseFilter(LuceneVersion matchVersion, TokenStream @in)
     : base(@in)
 {
     // A single AddAttribute call is enough; the original assigned the same
     // attribute to termAtt twice in a row.
     termAtt   = AddAttribute<ICharTermAttribute>();
     charUtils = CharacterUtils.GetInstance(matchVersion);
 }
예제 #5
        /// <summary>
        /// Checks <c>CodePointAt(string, int)</c> for the LUCENE_30 instance and for the
        /// current-version instance: plain characters, a surrogate pair, a lone high
        /// surrogate, and an index past the end of the string.
        /// </summary>
        public virtual void TestCodePointAtCharSequenceInt()
        {
            var java4            = CharacterUtils.GetInstance(LuceneVersion.LUCENE_30);
            var cpAt3            = "Abc\ud801\udc1c";
            var highSurrogateAt3 = "Abc\ud801";

            assertEquals((int)'A', java4.CodePointAt(cpAt3, 0));
            // The LUCENE_30 instance is expected to return the high surrogate on its own.
            assertEquals((int)'\ud801', java4.CodePointAt(cpAt3, 3));
            assertEquals((int)'\ud801', java4.CodePointAt(highSurrogateAt3, 3));
            try
            {
                java4.CodePointAt(highSurrogateAt3, 4);
                fail("string index out of bounds");
            }
            catch (IndexOutOfRangeException)
            {
                // expected
            }

            var java5 = CharacterUtils.GetInstance(TEST_VERSION_CURRENT);

            assertEquals((int)'A', java5.CodePointAt(cpAt3, 0));
            // The current-version instance is expected to combine the surrogate pair.
            assertEquals(Character.ToCodePoint('\ud801', '\udc1c'), java5.CodePointAt(cpAt3, 3));
            assertEquals((int)'\ud801', java5.CodePointAt(highSurrogateAt3, 3));
            try
            {
                java5.CodePointAt(highSurrogateAt3, 4);
                fail("string index out of bounds");
            }
            catch (IndexOutOfRangeException)
            {
                // expected
            }
        }
예제 #6
 /// <summary>
 /// Creates NGramTokenFilter with given min and max n-grams. </summary>
 /// <param name="version"> Lucene version to enable correct position increments.
 ///                See <a href="#version">above</a> for details. </param>
 /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
 /// <param name="minGram"> the smallest n-gram to generate </param>
 /// <param name="maxGram"> the largest n-gram to generate </param>
 /// <exception cref="System.ArgumentException"> if <paramref name="minGram"/> is less than 1
 ///                or greater than <paramref name="maxGram"/> </exception>
 public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram)
     : base(new CodepointCountFilter(version, input, minGram, int.MaxValue))
 {
     this.version   = version;
     this.charUtils = version.OnOrAfter(Version.LUCENE_44) ? CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance;
     if (minGram < 1)
     {
         throw new System.ArgumentException("minGram must be greater than zero");
     }
     if (minGram > maxGram)
     {
         throw new System.ArgumentException("minGram must not be greater than maxGram");
     }
     this.minGram = minGram;
     this.maxGram = maxGram;
     if (version.OnOrAfter(Version.LUCENE_44))
     {
         // 4.4+ : use the position attributes obtained through AddAttribute.
         posIncAtt = AddAttribute(typeof(PositionIncrementAttribute));
         posLenAtt = AddAttribute(typeof(PositionLengthAttribute));
     }
     else
     {
         // Older versions: use private helper instances instead.
         posIncAtt = new PositionIncrementAttributeAnonymousInnerClassHelper(this);
         posLenAtt = new PositionLengthAttributeAnonymousInnerClassHelper(this);
     }
 }
예제 #7
        /// <summary>
        /// Test that starts by obtaining the Java4 <c>CharacterUtils</c> instance and the
        /// instance for <c>TEST_VERSION_CURRENT</c>.
        /// </summary>
        // NOTE(review): only the two setup lines are visible in this snippet and the block
        // delimiters are missing; the assertions presumably follow - confirm against the
        // full source.
        public virtual void TestConversions()
            var java4 = CharacterUtils.Java4Instance;
            var java5 = CharacterUtils.GetInstance(TEST_VERSION_CURRENT);

예제 #8
        /// <summary>
        /// Checks <c>CodePointCount</c> on a random Unicode string: the Java4 instance is
        /// expected to report the string's length in chars, while the current-version
        /// instance is expected to agree with <c>Character.CodePointCount</c>.
        /// </summary>
        public virtual void TestCodePointCount()
        {
            var java4 = CharacterUtils.Java4Instance;
            var java5 = CharacterUtils.GetInstance(TEST_VERSION_CURRENT);

            var s = TestUtil.RandomUnicodeString(Random());

            assertEquals(s.Length, java4.CodePointCount(s));
            assertEquals(Character.CodePointCount(s, 0, s.Length), java5.CodePointCount(s));
        }
예제 #9
        /// <summary>
        /// Checks <c>CodePointAt(char[], int, int)</c> for the LUCENE_30 instance and for the
        /// current-version instance: a plain character, a surrogate pair, and a lone high
        /// surrogate at the end of the array.
        /// </summary>
        public virtual void TestCodePointAtCharArrayIntInt()
        {
            var java4            = CharacterUtils.GetInstance(LuceneVersion.LUCENE_30);
            var cpAt3            = "Abc\ud801\udc1c".ToCharArray();
            var highSurrogateAt3 = "Abc\ud801".ToCharArray();

            assertEquals((int)'A', java4.CodePointAt(cpAt3, 0, 2));
            // The LUCENE_30 instance is expected to return the high surrogate on its own.
            assertEquals((int)'\ud801', java4.CodePointAt(cpAt3, 3, 5));
            assertEquals((int)'\ud801', java4.CodePointAt(highSurrogateAt3, 3, 4));

            var java5 = CharacterUtils.GetInstance(TEST_VERSION_CURRENT);

            assertEquals((int)'A', java5.CodePointAt(cpAt3, 0, 2));
            // The current-version instance is expected to combine the surrogate pair.
            assertEquals(Character.ToCodePoint('\ud801', '\udc1c'), java5.CodePointAt(cpAt3, 3, 5));
            assertEquals((int)'\ud801', java5.CodePointAt(highSurrogateAt3, 3, 4));
        }
예제 #10
        /// <summary>
        /// Runs ten randomized rounds of <c>OffsetByCodePoints</c>. The Java4 instance is
        /// expected to move by chars (<c>index + offset</c>) or throw when that leaves the
        /// array; the current-version instance is expected to agree with
        /// <c>Character.OffsetByCodePoints</c>.
        /// </summary>
        public virtual void TestOffsetByCodePoint()
        {
            var java4 = CharacterUtils.Java4Instance;
            var java5 = CharacterUtils.GetInstance(TEST_VERSION_CURRENT);

            for (int i = 0; i < 10; ++i)
            {
                var s      = TestUtil.RandomUnicodeString(Random()).ToCharArray();
                var index  = TestUtil.NextInt(Random(), 0, s.Length);
                var offset = Random().Next(7) - 3;

                try
                {
                    var to = java4.OffsetByCodePoints(s, 0, s.Length, index, offset);
                    assertEquals(to, index + offset);
                }
                catch (System.IndexOutOfRangeException)
                {
                    // Throwing is only acceptable when the target lies outside the array.
                    assertTrue((index + offset) < 0 || (index + offset) > s.Length);
                }

                int o;
                try
                {
                    o = java5.OffsetByCodePoints(s, 0, s.Length, index, offset);
                }
                catch (System.IndexOutOfRangeException)
                {
                    try
                    {
                        Character.OffsetByCodePoints(s, 0, s.Length, index, offset);
                    }
                    catch (System.IndexOutOfRangeException)
                    {
                        // OK
                    }
                    // -1 marks "no result" so the comparison below is skipped.
                    o = -1;
                }

                if (o >= 0)
                {
                    assertEquals(Character.OffsetByCodePoints(s, 0, s.Length, index, offset), o);
                }
            }
        }
예제 #11
        /// <summary>
        /// Checks <c>Fill</c> with the LUCENE_30 instance on input containing surrogate pairs
        /// and lone surrogates: each fill is expected to use the whole buffer, even when that
        /// separates a high surrogate from its low surrogate.
        /// </summary>
        public virtual void TestFillJava14()
        {
            var input    = "1234\ud801\udc1c789123\ud801\ud801\udc1c\ud801";
            var instance = CharacterUtils.GetInstance(LuceneVersion.LUCENE_30);
            var reader   = new StringReader(input);
            var buffer   = CharacterUtils.NewCharacterBuffer(5);

            assertTrue(instance.Fill(buffer, reader));
            assertEquals(5, buffer.Length);
            assertEquals("1234\ud801", new string(buffer.Buffer, buffer.Offset, buffer.Length));

            assertTrue(instance.Fill(buffer, reader));
            assertEquals(5, buffer.Length);
            assertEquals("\udc1c7891", new string(buffer.Buffer));

            // Switch to a 6-char buffer for the remainder of the input.
            buffer = CharacterUtils.NewCharacterBuffer(6);
            assertTrue(instance.Fill(buffer, reader));
            assertEquals(6, buffer.Length);
            assertEquals("23\ud801\ud801\udc1c\ud801", new string(buffer.Buffer, buffer.Offset, buffer.Length));

            assertFalse(instance.Fill(buffer, reader));
        }
예제 #12
        /// <summary>
        /// Checks <c>Fill</c> on input without surrogates for both LUCENE_30 and the current
        /// test version: a 6-char buffer is expected to be filled completely first, then
        /// receive the remaining four characters, then report no more input.
        /// </summary>
        public virtual void TestFillNoHighSurrogate()
        {
            var versions = new LuceneVersion[] { LuceneVersion.LUCENE_30, TEST_VERSION_CURRENT };

            foreach (var version in versions)
            {
                var instance = CharacterUtils.GetInstance(version);
                var reader   = new StringReader("helloworld");
                var buffer   = CharacterUtils.NewCharacterBuffer(6);

                assertTrue(instance.Fill(buffer, reader));
                assertEquals(0, buffer.Offset);
                assertEquals(6, buffer.Length);
                assertEquals("hellow", new string(buffer.Buffer));

                assertFalse(instance.Fill(buffer, reader));
                assertEquals(4, buffer.Length);
                assertEquals(0, buffer.Offset);

                assertEquals("orld", new string(buffer.Buffer, buffer.Offset, buffer.Length));
                assertFalse(instance.Fill(buffer, reader));
            }
        }
예제 #13
        /// <summary>
        /// Creates an <see cref="EdgeNGramTokenFilter"/> for the given side and n-gram size range.
        /// </summary>
        /// <param name="version"> Lucene version; as of 4.4, <c>Side.BACK</c> is rejected </param>
        /// <param name="input"> <see cref="TokenStream"/> passed to the base constructor </param>
        /// <param name="side"> the side to generate n-grams from; must be a defined <c>Side</c> value </param>
        /// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
        /// <param name="maxGram"> the largest n-gram to generate; must not be smaller than <paramref name="minGram"/> </param>
        /// <exception cref="ArgumentException"> if any of the constraints above is violated </exception>
        public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, Side side, int minGram, int maxGram)
            : base(input)
        {
            // No null check on version: the Java original's check is not carried over here.

            if (version.OnOrAfter(LuceneVersion.LUCENE_44) && side == Side.BACK)
            {
                throw new ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward");
            }

            if (!Enum.IsDefined(typeof(Side), side))
            {
                throw new ArgumentException("sideLabel must be either front or back");
            }

            if (minGram < 1)
            {
                throw new ArgumentException("minGram must be greater than zero");
            }

            if (minGram > maxGram)
            {
                throw new ArgumentException("minGram must not be greater than maxGram");
            }

            this.version   = version;
            this.charUtils = version.OnOrAfter(LuceneVersion.LUCENE_44) ? CharacterUtils.GetInstance(version) : CharacterUtils.GetJava4Instance(version);
            this.minGram   = minGram;
            this.maxGram   = maxGram;
            this.side      = side;

            this.termAtt    = AddAttribute<ICharTermAttribute>();
            this.offsetAtt  = AddAttribute<IOffsetAttribute>();
            this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            this.posLenAtt  = AddAttribute<IPositionLengthAttribute>();
        }
예제 #14
        /// <summary>
        /// Checks <c>Fill</c> with the current-version instance on input containing surrogate
        /// pairs and lone surrogates: a fill is expected to stop short rather than end on the
        /// high surrogate of a pair, and to report an empty buffer once the input is exhausted.
        /// </summary>
        public virtual void TestFillJava15()
        {
            const string input    = "1234\ud801\udc1c789123\ud801\ud801\udc1c\ud801";
            var          instance = CharacterUtils.GetInstance(TEST_VERSION_CURRENT);
            var          reader   = new StringReader(input);
            var          buffer   = CharacterUtils.NewCharacterBuffer(5);

            // Only four chars: the fifth would be the first half of a surrogate pair.
            assertTrue(instance.Fill(buffer, reader));
            assertEquals(4, buffer.Length);
            assertEquals("1234", new string(buffer.Buffer, buffer.Offset, buffer.Length));

            assertTrue(instance.Fill(buffer, reader));
            assertEquals(5, buffer.Length);
            assertEquals("\ud801\udc1c789", new string(buffer.Buffer));

            assertTrue(instance.Fill(buffer, reader));
            assertEquals(4, buffer.Length);
            assertEquals("123\ud801", new string(buffer.Buffer, buffer.Offset, buffer.Length));

            assertFalse(instance.Fill(buffer, reader));
            assertEquals(3, buffer.Length);
            assertEquals("\ud801\udc1c\ud801", new string(buffer.Buffer, buffer.Offset, buffer.Length));

            assertFalse(instance.Fill(buffer, reader));
            assertEquals(0, buffer.Length);
        }
예제 #15
 /// <summary>
 /// Validates the arguments and initializes the character utilities and working
 /// buffers of this instance.
 /// </summary>
 /// <param name="version"> Lucene version; must be 4.4 or later </param>
 /// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
 /// <param name="maxGram"> the largest n-gram to generate; must not be smaller than <paramref name="minGram"/> </param>
 /// <param name="edgesOnly"> stored in the <c>edgesOnly</c> field </param>
 /// <exception cref="System.ArgumentException"> if <paramref name="version"/> is older than 4.4,
 /// <paramref name="minGram"/> is less than 1, or <paramref name="minGram"/> is greater
 /// than <paramref name="maxGram"/> </exception>
 private void init(LuceneVersion version, int minGram, int maxGram, bool edgesOnly)
 {
     if (!version.OnOrAfter(LuceneVersion.LUCENE_44))
     {
         throw new System.ArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer");
     }
     // The Java4 branch cannot be taken after the guard above.
     charUtils = version.OnOrAfter(LuceneVersion.LUCENE_44) ? CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance;
     if (minGram < 1)
     {
         throw new System.ArgumentException("minGram must be greater than zero");
     }
     if (minGram > maxGram)
     {
         throw new System.ArgumentException("minGram must not be greater than maxGram");
     }
     this.minGram   = minGram;
     this.maxGram   = maxGram;
     this.edgesOnly = edgesOnly;
     // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering
     // to not keep polling the Reader
     charBuffer     = CharacterUtils.NewCharacterBuffer(2 * maxGram + 1024);
     buffer         = new int[charBuffer.Buffer.Length];
     // Make the term att large enough
     termAtt.ResizeBuffer(2 * maxGram);
 }
예제 #16
        /// <summary>
        /// Creates an <see cref="EdgeNGramTokenFilter"/> for the given side and n-gram size range.
        /// </summary>
        /// <param name="version"> Lucene version; as of 4.4, <c>Side.BACK</c> is rejected </param>
        /// <param name="input"> <see cref="TokenStream"/> passed to the base constructor </param>
        /// <param name="side"> the side to generate n-grams from; must be a defined <c>Side</c> value </param>
        /// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
        /// <param name="maxGram"> the largest n-gram to generate; must not be smaller than <paramref name="minGram"/> </param>
        /// <exception cref="ArgumentOutOfRangeException"> if <paramref name="side"/> is not a defined
        /// value or <paramref name="minGram"/> is less than 1 </exception>
        /// <exception cref="ArgumentException"> if <c>Side.BACK</c> is used with 4.4 or later, or
        /// <paramref name="minGram"/> is greater than <paramref name="maxGram"/> </exception>
        public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, Side side, int minGram, int maxGram)
            : base(input)
        {
            // LUCENENET specific - version cannot be null because it is a value type.

            if (version.OnOrAfter(LuceneVersion.LUCENE_44) && side == Side.BACK)
            {
                throw new ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward");
            }

            if (!side.IsDefined())
            {
                throw new ArgumentOutOfRangeException(nameof(side), "sideLabel must be either front or back"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
            }

            if (minGram < 1)
            {
                throw new ArgumentOutOfRangeException(nameof(minGram), "minGram must be greater than zero"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
            }

            if (minGram > maxGram)
            {
                throw new ArgumentException("minGram must not be greater than maxGram");
            }

            this.version   = version;
            this.charUtils = version.OnOrAfter(LuceneVersion.LUCENE_44) ? CharacterUtils.GetInstance(version) : CharacterUtils.GetJava4Instance(version);
            this.minGram   = minGram;
            this.maxGram   = maxGram;
            this.side      = side;

            this.termAtt    = AddAttribute<ICharTermAttribute>();
            this.offsetAtt  = AddAttribute<IOffsetAttribute>();
            this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            this.posLenAtt  = AddAttribute<IPositionLengthAttribute>();
        }