CharacterUtils provides a unified interface to Character-related operations to implement backwards compatible character operations based on a Version instance. @lucene.internal
Example #1
0
 /// <summary>
 /// Create a new UpperCaseFilter, that normalizes token text to upper case.
 /// </summary>
 /// <param name="matchVersion"> See <a href="#version">above</a> </param>
 /// <param name="in"> TokenStream to filter </param>
 public UpperCaseFilter(LuceneVersion matchVersion, TokenStream @in)
     : base(@in)
 {
     termAtt = AddAttribute<ICharTermAttribute>();
     termAtt = AddAttribute<ICharTermAttribute>();
     charUtils = CharacterUtils.GetInstance(matchVersion);
 }
Example #2
0
 /// <summary>
 /// Creates NGramTokenFilter with given min and max n-grams. </summary>
 /// <param name="version"> Lucene version to enable correct position increments.
 ///                See <a href="#version">above</a> for details. </param>
 /// <param name="input"> <seealso cref="TokenStream"/> holding the input to be tokenized </param>
 /// <param name="minGram"> the smallest n-gram to generate </param>
 /// <param name="maxGram"> the largest n-gram to generate </param>
 public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram)
     : base(new CodepointCountFilter(version, input, minGram, int.MaxValue))
 {
     this.version = version;
     this.charUtils = version.OnOrAfter(Version.LUCENE_44) ? CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance;
     if (minGram < 1)
     {
         throw new System.ArgumentException("minGram must be greater than zero");
     }
     if (minGram > maxGram)
     {
         throw new System.ArgumentException("minGram must not be greater than maxGram");
     }
     this.minGram = minGram;
     this.maxGram = maxGram;
     if (version.OnOrAfter(Version.LUCENE_44))
     {
         posIncAtt = AddAttribute(typeof(PositionIncrementAttribute));
         posLenAtt = AddAttribute(typeof(PositionLengthAttribute));
     }
     else
     {
         posIncAtt = new PositionIncrementAttributeAnonymousInnerClassHelper(this);
         posLenAtt = new PositionLengthAttributeAnonymousInnerClassHelper(this);
     }
 }
Example #3
0
	  private void init(Version version, int minGram, int maxGram, bool edgesOnly)
	  {
		if (!version.onOrAfter(Version.LUCENE_44))
		{
		  throw new System.ArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer");
		}
		charUtils = version.onOrAfter(Version.LUCENE_44) ? CharacterUtils.getInstance(version) : CharacterUtils.Java4Instance;
		if (minGram < 1)
		{
		  throw new System.ArgumentException("minGram must be greater than zero");
		}
		if (minGram > maxGram)
		{
		  throw new System.ArgumentException("minGram must not be greater than maxGram");
		}
		this.minGram = minGram;
		this.maxGram = maxGram;
		this.edgesOnly = edgesOnly;
		charBuffer = CharacterUtils.newCharacterBuffer(2 * maxGram + 1024); // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering to not keep polling the Reader
		buffer = new int[charBuffer.Buffer.Length];
		// Make the term att large enough
		termAtt.resizeBuffer(2 * maxGram);
	  }
 /// <summary>
 /// LUCENENET Added in the .NET version to assist with setting the attributes
 /// from multiple constructors.
 /// </summary>
 /// <param name="matchVersion"></param>
 private void Init(LuceneVersion matchVersion)
 {
     charUtils = CharacterUtils.GetInstance(matchVersion);
     termAtt = AddAttribute<ICharTermAttribute>();
     offsetAtt = AddAttribute<IOffsetAttribute>();
 }
Example #5
0
 /// <summary>
 /// Creates a new <seealso cref="CharTokenizer"/> instance
 /// </summary>
 /// <param name="matchVersion">
 ///          Lucene version to match </param>
 /// <param name="factory">
 ///          the attribute factory to use for this <seealso cref="Tokenizer"/> </param>
 /// <param name="input">
 ///          the input to split up into tokens </param>
 protected CharTokenizer(LuceneVersion matchVersion, AttributeFactory factory, TextReader input)
     : base(factory, input)
 {
     _input    = input;
     charUtils = CharacterUtils.GetInstance(matchVersion);
 }
Example #6
0
 /// <summary>
 /// LUCENENET specific - Added in the .NET version to assist with setting the attributes
 /// from multiple constructors.
 /// </summary>
 /// <param name="matchVersion"></param>
 private void Init(LuceneVersion matchVersion)
 {
     charUtils = CharacterUtils.GetInstance(matchVersion);
     termAtt   = AddAttribute <ICharTermAttribute>();
     offsetAtt = AddAttribute <IOffsetAttribute>();
 }
        /// <summary>
        /// Creates NGramTokenFilter with given min and max n-grams. </summary>
        /// <param name="version"> Lucene version to enable correct position increments.
        ///                See <a href="#version">above</a> for details. </param>
        /// <param name="input"> <seealso cref="TokenStream"/> holding the input to be tokenized </param>
        /// <param name="minGram"> the smallest n-gram to generate </param>
        /// <param name="maxGram"> the largest n-gram to generate </param>
        public NGramTokenFilter(LuceneVersion version, TokenStream input, int minGram, int maxGram)
            : base(new CodepointCountFilter(version, input, minGram, int.MaxValue))
        {
            this.version = version;
            this.charUtils = version.OnOrAfter(
#pragma warning disable 612, 618
                LuceneVersion.LUCENE_44) ?
#pragma warning restore 612, 618
                CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance;
            if (minGram < 1)
            {
                throw new System.ArgumentException("minGram must be greater than zero");
            }
            if (minGram > maxGram)
            {
                throw new System.ArgumentException("minGram must not be greater than maxGram");
            }
            this.minGram = minGram;
            this.maxGram = maxGram;
#pragma warning disable 612, 618
            if (version.OnOrAfter(LuceneVersion.LUCENE_44))
#pragma warning restore 612, 618
            {
                posIncAtt = AddAttribute<IPositionIncrementAttribute>();
                posLenAtt = AddAttribute<IPositionLengthAttribute>();
            }
            else
            {
                posIncAtt = new PositionIncrementAttributeAnonymousInnerClassHelper(this);
                posLenAtt = new PositionLengthAttributeAnonymousInnerClassHelper(this);
            }
            termAtt = AddAttribute<ICharTermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
        }
Example #8
0
        /// <summary>
        /// Creates a new <seealso cref="CharTokenizer"/> instance
        /// </summary>
        /// <param name="matchVersion">
        ///          Lucene version to match </param>
        /// <param name="input">
        ///          the input to split up into tokens </param>
        protected CharTokenizer(LuceneVersion matchVersion, TextReader input)
            : base(input)
        {
            termAtt = AddAttribute<ICharTermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();

            charUtils = CharacterUtils.GetInstance(matchVersion);
        }
Example #9
0
 /// <summary>
 /// Creates a new <seealso cref="CharTokenizer"/> instance
 /// </summary>
 /// <param name="matchVersion">
 ///          Lucene version to match </param>
 /// <param name="factory">
 ///          the attribute factory to use for this <seealso cref="Tokenizer"/> </param>
 /// <param name="input">
 ///          the input to split up into tokens </param>
 protected CharTokenizer(LuceneVersion matchVersion, AttributeFactory factory, TextReader input)
     : base(factory, input)
 {
     _input = input;
     charUtils = CharacterUtils.GetInstance(matchVersion);
 }
        public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, Side side, int minGram, int maxGram)
            : base(input)
        {
            if (version == null)
            {
              throw new System.ArgumentException("version must not be null");
            }

            if (version.OnOrAfter(LuceneVersion.LUCENE_44) && side == Side.BACK)
            {
              throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward");
            }

            if (side == null)
            {
              throw new System.ArgumentException("sideLabel must be either front or back");
            }

            if (minGram < 1)
            {
              throw new System.ArgumentException("minGram must be greater than zero");
            }

            if (minGram > maxGram)
            {
              throw new System.ArgumentException("minGram must not be greater than maxGram");
            }

            this.version = version;
            this.charUtils = version.onOrAfter(LuceneVersion.LUCENE_44) ? CharacterUtils.getInstance(version) : CharacterUtils.Java4Instance;
            this.minGram = minGram;
            this.maxGram = maxGram;
            this.side = side;
        }
Example #11
0
 /// <summary>
 /// Creates a new <seealso cref="CharTokenizer"/> instance
 /// </summary>
 /// <param name="matchVersion">
 ///          Lucene version to match </param>
 /// <param name="factory">
 ///          the attribute factory to use for this <seealso cref="Tokenizer"/> </param>
 /// <param name="input">
 ///          the input to split up into tokens </param>
 public CharTokenizer(Version matchVersion, AttributeFactory factory, TextReader input)
     : base(factory, input)
 {
     _input    = input;
     charUtils = CharacterUtils.getInstance(matchVersion);
 }
Example #12
0
 /// <summary>
 /// Creates a new <seealso cref="CharTokenizer"/> instance
 /// </summary>
 /// <param name="matchVersion">
 ///          Lucene version to match </param>
 /// <param name="input">
 ///          the input to split up into tokens </param>
 public CharTokenizer(Version matchVersion, TextReader input)
     : base(input)
 {
     charUtils = CharacterUtils.getInstance(matchVersion);
 }
        public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, Side side, int minGram, int maxGram)
              : base(input)
        {

            //if (version == null)
            //{
            //    throw new System.ArgumentException("version must not be null");
            //}

            if (version.OnOrAfter(LuceneVersion.LUCENE_44) && side == Side.BACK)
            {
                throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward");
            }

            if (!Enum.IsDefined(typeof(Side), side))
            {
                throw new System.ArgumentException("sideLabel must be either front or back");
            }

            if (minGram < 1)
            {
                throw new System.ArgumentException("minGram must be greater than zero");
            }

            if (minGram > maxGram)
            {
                throw new System.ArgumentException("minGram must not be greater than maxGram");
            }

            this.version = version;
            this.charUtils = version.OnOrAfter(LuceneVersion.LUCENE_44) ? CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance;
            this.minGram = minGram;
            this.maxGram = maxGram;
            this.side = side;

            this.termAtt = AddAttribute<ICharTermAttribute>();
            this.offsetAtt = AddAttribute<IOffsetAttribute>();
            this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            this.posLenAtt = AddAttribute<IPositionLengthAttribute>();
        }
        private void TestConversions(CharacterUtils charUtils)
        {
            var orig = TestUtil.RandomUnicodeString(Random(), 100).toCharArray();

            var buf = new int[orig.Length];

            var restored = new char[buf.Length];

            var o1 = TestUtil.NextInt(Random(), 0, Math.Min(5, orig.Length));
            var o2 = TestUtil.NextInt(Random(), 0, o1);
            var o3 = TestUtil.NextInt(Random(), 0, o1);
            var codePointCount = charUtils.toCodePoints(orig, o1, orig.Length - o1, buf, o2);
            var charCount = charUtils.toChars(buf, o2, codePointCount, restored, o3);
            assertEquals(orig.Length - o1, charCount);
            assertArrayEquals(Arrays.CopyOfRange(orig, o1, o1 + charCount), Arrays.CopyOfRange(restored, o3, o3 + charCount));
        }
        private void Init(LuceneVersion version, int minGram, int maxGram, bool edgesOnly)
        {
#pragma warning disable 612, 618
            if (!version.OnOrAfter(LuceneVersion.LUCENE_44))
#pragma warning restore 612, 618
            {
                throw new System.ArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer");
            }
#pragma warning disable 612, 618
            charUtils = version.OnOrAfter(LuceneVersion.LUCENE_44) ?
#pragma warning restore 612, 618
                CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance;
            if (minGram < 1)
            {
                throw new System.ArgumentException("minGram must be greater than zero");
            }
            if (minGram > maxGram)
            {
                throw new System.ArgumentException("minGram must not be greater than maxGram");
            }
            termAtt = AddAttribute<ICharTermAttribute>();
            posIncAtt = AddAttribute<IPositionIncrementAttribute>();
            posLenAtt = AddAttribute<IPositionLengthAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            this.minGram = minGram;
            this.maxGram = maxGram;
            this.edgesOnly = edgesOnly;
            charBuffer = CharacterUtils.NewCharacterBuffer(2 * maxGram + 1024); // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering to not keep polling the Reader
            buffer = new int[charBuffer.Buffer.Length];



            // Make the term att large enough
            termAtt.ResizeBuffer(2 * maxGram);
        }