/// <summary>
/// Create a new <see cref="UpperCaseFilter"/>, that normalizes token text to upper case.
/// </summary>
/// <param name="matchVersion"> See <a href="#version">above</a>; used to select the <see cref="CharacterUtils"/> instance. </param>
/// <param name="in"> <see cref="TokenStream"/> to filter </param>
public UpperCaseFilter(LuceneVersion matchVersion, TokenStream @in)
    : base(@in)
{
    // Register the term attribute exactly once.
    termAtt = AddAttribute<ICharTermAttribute>();
    charUtils = CharacterUtils.GetInstance(matchVersion);
}
/// <summary>
/// Creates an <see cref="NGramTokenFilter"/> with the given minimum and maximum n-gram sizes.
/// The input is wrapped in a <see cref="CodepointCountFilter"/> (constructed with
/// <paramref name="minGram"/> and <see cref="int.MaxValue"/>) before being handed to the base class.
/// </summary>
/// <param name="version"> Lucene version to enable correct position increments.
///                       See <a href="#version">above</a> for details. </param>
/// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
/// <param name="maxGram"> the largest n-gram to generate; must not be smaller than <paramref name="minGram"/> </param>
/// <exception cref="System.ArgumentException"> if <paramref name="minGram"/> is less than 1 or
/// greater than <paramref name="maxGram"/> </exception>
public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram)
    : base(new CodepointCountFilter(version, input, minGram, int.MaxValue))
{
    // Both version-dependent decisions below hinge on the same check, so evaluate it once.
    bool isLucene44OrLater = version.OnOrAfter(Version.LUCENE_44);

    this.version = version;
    if (isLucene44OrLater)
    {
        this.charUtils = CharacterUtils.GetInstance(version);
    }
    else
    {
        this.charUtils = CharacterUtils.Java4Instance;
    }

    // Argument validation deliberately precedes any attribute registration.
    if (minGram < 1)
    {
        throw new System.ArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram)
    {
        throw new System.ArgumentException("minGram must not be greater than maxGram");
    }

    this.minGram = minGram;
    this.maxGram = maxGram;

    if (!isLucene44OrLater)
    {
        // Pre-4.4: use the private helper implementations instead of registered attributes.
        posIncAtt = new PositionIncrementAttributeAnonymousInnerClassHelper(this);
        posLenAtt = new PositionLengthAttributeAnonymousInnerClassHelper(this);
    }
    else
    {
        posIncAtt = AddAttribute(typeof(PositionIncrementAttribute));
        posLenAtt = AddAttribute(typeof(PositionLengthAttribute));
    }
}
/// <summary>
/// Validates the arguments, stores the gram settings and allocates the working buffers.
/// </summary>
/// <param name="version"> Lucene version; must be on or after <c>LUCENE_44</c> </param>
/// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
/// <param name="maxGram"> the largest n-gram to generate; must not be smaller than <paramref name="minGram"/> </param>
/// <param name="edgesOnly"> stored in the <c>edgesOnly</c> field </param>
/// <exception cref="System.ArgumentException"> if <paramref name="version"/> is older than 4.4,
/// if <paramref name="minGram"/> is less than 1, or if it is greater than <paramref name="maxGram"/> </exception>
private void init(Version version, int minGram, int maxGram, bool edgesOnly)
{
    if (!version.onOrAfter(Version.LUCENE_44))
    {
        throw new System.ArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer");
    }

    // The guard above has already rejected every version older than 4.4, so the
    // version-specific instance is the only one that can ever be selected here.
    charUtils = CharacterUtils.getInstance(version);

    if (minGram < 1)
    {
        throw new System.ArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram)
    {
        throw new System.ArgumentException("minGram must not be greater than maxGram");
    }

    this.minGram = minGram;
    this.maxGram = maxGram;
    this.edgesOnly = edgesOnly;

    // 2 * maxGram in case all code points require 2 chars, and + 1024 for buffering
    // so that the Reader does not have to be polled continuously.
    charBuffer = CharacterUtils.newCharacterBuffer(2 * maxGram + 1024);
    buffer = new int[charBuffer.Buffer.Length];

    // Make the term att large enough
    termAtt.resizeBuffer(2 * maxGram);
}
/// <summary>
/// LUCENENET: added in the .NET version so that several constructors can share
/// the attribute setup.
/// </summary>
/// <param name="matchVersion"> version passed to <see cref="CharacterUtils.GetInstance"/> </param>
private void Init(LuceneVersion matchVersion)
{
    // Attribute registration order is kept: term first, then offset.
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.offsetAtt = AddAttribute<IOffsetAttribute>();

    this.charUtils = CharacterUtils.GetInstance(matchVersion);
}
/// <summary>
/// Creates a new <see cref="CharTokenizer"/> instance using the supplied attribute factory.
/// </summary>
/// <param name="matchVersion">
///          Lucene version to match </param>
/// <param name="factory">
///          the attribute factory to use for this <see cref="Tokenizer"/> </param>
/// <param name="input">
///          the input to split up into tokens </param>
protected CharTokenizer(LuceneVersion matchVersion, AttributeFactory factory, TextReader input)
    : base(factory, input)
{
    this.charUtils = CharacterUtils.GetInstance(matchVersion);

    // Keep a reference to the reader in addition to handing it to the base class.
    this._input = input;
}
/// <summary>
/// LUCENENET specific: helper introduced in the .NET version so that every
/// constructor sets up the same fields.
/// </summary>
/// <param name="matchVersion"> version used to obtain the <see cref="CharacterUtils"/> instance </param>
private void Init(LuceneVersion matchVersion)
{
    this.charUtils = CharacterUtils.GetInstance(matchVersion);

    // Term attribute is registered before the offset attribute.
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.offsetAtt = AddAttribute<IOffsetAttribute>();
}
/// <summary>
/// Creates an <see cref="NGramTokenFilter"/> with the given minimum and maximum n-gram sizes.
/// The input is wrapped in a <see cref="CodepointCountFilter"/> (constructed with
/// <paramref name="minGram"/> and <see cref="int.MaxValue"/>) before being handed to the base class.
/// </summary>
/// <param name="version"> Lucene version to enable correct position increments.
///                       See <a href="#version">above</a> for details. </param>
/// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
/// <param name="maxGram"> the largest n-gram to generate; must not be smaller than <paramref name="minGram"/> </param>
/// <exception cref="System.ArgumentException"> if <paramref name="minGram"/> is less than 1 or
/// greater than <paramref name="maxGram"/> </exception>
public NGramTokenFilter(LuceneVersion version, TokenStream input, int minGram, int maxGram)
    : base(new CodepointCountFilter(version, input, minGram, int.MaxValue))
{
    // Both version-dependent decisions below hinge on the same check, so evaluate it
    // once and keep the obsolete-member warning suppression in a single place.
#pragma warning disable 612, 618
    bool isLucene44OrLater = version.OnOrAfter(LuceneVersion.LUCENE_44);
#pragma warning restore 612, 618

    this.version = version;
    if (isLucene44OrLater)
    {
        this.charUtils = CharacterUtils.GetInstance(version);
    }
    else
    {
        this.charUtils = CharacterUtils.Java4Instance;
    }

    // Argument validation deliberately precedes any attribute registration.
    if (minGram < 1)
    {
        throw new System.ArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram)
    {
        throw new System.ArgumentException("minGram must not be greater than maxGram");
    }

    this.minGram = minGram;
    this.maxGram = maxGram;

    if (!isLucene44OrLater)
    {
        // Pre-4.4: use the private helper implementations instead of registered attributes.
        posIncAtt = new PositionIncrementAttributeAnonymousInnerClassHelper(this);
        posLenAtt = new PositionLengthAttributeAnonymousInnerClassHelper(this);
    }
    else
    {
        posIncAtt = AddAttribute<IPositionIncrementAttribute>();
        posLenAtt = AddAttribute<IPositionLengthAttribute>();
    }

    termAtt = AddAttribute<ICharTermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
}
/// <summary>
/// Creates a new <see cref="CharTokenizer"/> instance reading from <paramref name="input"/>.
/// </summary>
/// <param name="matchVersion">
///          Lucene version to match </param>
/// <param name="input">
///          the input to split up into tokens </param>
protected CharTokenizer(LuceneVersion matchVersion, TextReader input)
    : base(input)
{
    this.charUtils = CharacterUtils.GetInstance(matchVersion);

    // Attribute registration order is kept: term first, then offset.
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.offsetAtt = AddAttribute<IOffsetAttribute>();
}
/// <summary>
/// Creates an <see cref="EdgeNGramTokenFilter"/> with the given side and n-gram size range.
/// </summary>
/// <param name="version"> the Lucene match version </param>
/// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
/// <param name="side"> the <see cref="Side"/> to use; <c>Side.BACK</c> is rejected
/// when <paramref name="version"/> is on or after <c>LUCENE_44</c> </param>
/// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
/// <param name="maxGram"> the largest n-gram to generate; must not be smaller than <paramref name="minGram"/> </param>
/// <exception cref="System.ArgumentException"> if any of the argument checks below fails </exception>
public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, Side side, int minGram, int maxGram)
    : base(input)
{
    // NOTE(review): if LuceneVersion and Side are value types (enums) in this port, the two
    // null comparisons below can never be true — confirm and replace with a defined-value check.
    if (version == null)
    {
        throw new System.ArgumentException("version must not be null");
    }

    if (version.OnOrAfter(LuceneVersion.LUCENE_44) && side == Side.BACK)
    {
        throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward");
    }

    if (side == null)
    {
        throw new System.ArgumentException("sideLabel must be either front or back");
    }

    if (minGram < 1)
    {
        throw new System.ArgumentException("minGram must be greater than zero");
    }

    if (minGram > maxGram)
    {
        throw new System.ArgumentException("minGram must not be greater than maxGram");
    }

    this.version = version;
    // Use the same .NET-cased members (OnOrAfter / GetInstance) as the check above,
    // rather than the Java-cased leftovers from the port.
    this.charUtils = version.OnOrAfter(LuceneVersion.LUCENE_44)
        ? CharacterUtils.GetInstance(version)
        : CharacterUtils.Java4Instance;
    this.minGram = minGram;
    this.maxGram = maxGram;
    this.side = side;
}
/// <summary>
/// Creates a new <see cref="CharTokenizer"/> instance using the supplied attribute factory.
/// </summary>
/// <param name="matchVersion">
///          Lucene version to match </param>
/// <param name="factory">
///          the attribute factory to use for this <see cref="Tokenizer"/> </param>
/// <param name="input">
///          the input to split up into tokens </param>
public CharTokenizer(Version matchVersion, AttributeFactory factory, TextReader input)
    : base(factory, input)
{
    this.charUtils = CharacterUtils.getInstance(matchVersion);

    // Keep a reference to the reader in addition to handing it to the base class.
    this._input = input;
}
/// <summary>
/// Creates a new <see cref="CharTokenizer"/> instance reading from <paramref name="input"/>.
/// </summary>
/// <param name="matchVersion">
///          Lucene version to match </param>
/// <param name="input">
///          the input to split up into tokens </param>
public CharTokenizer(Version matchVersion, TextReader input)
    : base(input)
{
    // NOTE(review): the factory-taking overload also stores the reader in _input;
    // confirm whether this overload is expected to do the same.
    this.charUtils = CharacterUtils.getInstance(matchVersion);
}
/// <summary>
/// Creates an <see cref="EdgeNGramTokenFilter"/> with the given side and n-gram size range,
/// and registers the term, offset, position-increment and position-length attributes.
/// </summary>
/// <param name="version"> the Lucene match version </param>
/// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
/// <param name="side"> a defined <see cref="Side"/> value; <c>Side.BACK</c> is rejected
/// when <paramref name="version"/> is on or after <c>LUCENE_44</c> </param>
/// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
/// <param name="maxGram"> the largest n-gram to generate; must not be smaller than <paramref name="minGram"/> </param>
/// <exception cref="System.ArgumentException"> if any of the argument checks below fails </exception>
public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, Side side, int minGram, int maxGram)
    : base(input)
{
    // No null check on version: it is intentionally disabled in this port
    // (presumably because LuceneVersion is a non-nullable value type here — confirm).
    bool isLucene44OrLater = version.OnOrAfter(LuceneVersion.LUCENE_44);

    if (isLucene44OrLater && side == Side.BACK)
    {
        throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward");
    }

    bool sideIsDefined = Enum.IsDefined(typeof(Side), side);
    if (!sideIsDefined)
    {
        throw new System.ArgumentException("sideLabel must be either front or back");
    }

    if (minGram < 1)
    {
        throw new System.ArgumentException("minGram must be greater than zero");
    }

    if (minGram > maxGram)
    {
        throw new System.ArgumentException("minGram must not be greater than maxGram");
    }

    this.version = version;
    if (isLucene44OrLater)
    {
        this.charUtils = CharacterUtils.GetInstance(version);
    }
    else
    {
        this.charUtils = CharacterUtils.Java4Instance;
    }
    this.minGram = minGram;
    this.maxGram = maxGram;
    this.side = side;

    // Registration order: term, offset, position increment, position length.
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.offsetAtt = AddAttribute<IOffsetAttribute>();
    this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    this.posLenAtt = AddAttribute<IPositionLengthAttribute>();
}
/// <summary>
/// Converts a random string to code points and back to chars at random offsets,
/// then asserts that the restored chars equal the converted slice of the source.
/// </summary>
/// <param name="charUtils"> the <see cref="CharacterUtils"/> instance under test </param>
private void TestConversions(CharacterUtils charUtils)
{
    var source = TestUtil.RandomUnicodeString(Random(), 100).toCharArray();
    var codePoints = new int[source.Length];
    var roundTripped = new char[codePoints.Length];

    // Offsets are drawn in this exact order so the random sequence is consumed as before.
    var sourceOffset = TestUtil.NextInt(Random(), 0, Math.Min(5, source.Length));
    var codePointOffset = TestUtil.NextInt(Random(), 0, sourceOffset);
    var restoredOffset = TestUtil.NextInt(Random(), 0, sourceOffset);

    var codePointCount = charUtils.toCodePoints(source, sourceOffset, source.Length - sourceOffset, codePoints, codePointOffset);
    var charCount = charUtils.toChars(codePoints, codePointOffset, codePointCount, roundTripped, restoredOffset);

    // Every char from sourceOffset to the end must have been converted back.
    assertEquals(source.Length - sourceOffset, charCount);

    var expectedSlice = Arrays.CopyOfRange(source, sourceOffset, sourceOffset + charCount);
    var actualSlice = Arrays.CopyOfRange(roundTripped, restoredOffset, restoredOffset + charCount);
    assertArrayEquals(expectedSlice, actualSlice);
}
/// <summary>
/// Validates the arguments, registers the attributes, stores the gram settings
/// and allocates the working buffers.
/// </summary>
/// <param name="version"> Lucene version; must be on or after <c>LUCENE_44</c> </param>
/// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
/// <param name="maxGram"> the largest n-gram to generate; must not be smaller than <paramref name="minGram"/> </param>
/// <param name="edgesOnly"> stored in the <c>edgesOnly</c> field </param>
/// <exception cref="System.ArgumentException"> if <paramref name="version"/> is older than 4.4,
/// if <paramref name="minGram"/> is less than 1, or if it is greater than <paramref name="maxGram"/> </exception>
private void Init(LuceneVersion version, int minGram, int maxGram, bool edgesOnly)
{
#pragma warning disable 612, 618
    if (!version.OnOrAfter(LuceneVersion.LUCENE_44))
#pragma warning restore 612, 618
    {
        throw new System.ArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer");
    }

    // The guard above has already rejected every version older than 4.4, so the
    // version-specific instance is the only one that can ever be selected here.
    charUtils = CharacterUtils.GetInstance(version);

    if (minGram < 1)
    {
        throw new System.ArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram)
    {
        throw new System.ArgumentException("minGram must not be greater than maxGram");
    }

    // Attributes are registered only after all arguments have been validated.
    termAtt = AddAttribute<ICharTermAttribute>();
    posIncAtt = AddAttribute<IPositionIncrementAttribute>();
    posLenAtt = AddAttribute<IPositionLengthAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();

    this.minGram = minGram;
    this.maxGram = maxGram;
    this.edgesOnly = edgesOnly;

    // 2 * maxGram in case all code points require 2 chars, and + 1024 for buffering
    // so that the Reader does not have to be polled continuously.
    charBuffer = CharacterUtils.NewCharacterBuffer(2 * maxGram + 1024);
    buffer = new int[charBuffer.Buffer.Length];

    // Make the term att large enough
    termAtt.ResizeBuffer(2 * maxGram);
}