/// <summary>
/// Builds an NGramTokenFilter producing n-grams whose sizes range from
/// <paramref name="minGram"/> to <paramref name="maxGram"/>.
/// </summary>
/// <param name="version"> Lucene version; selects the character utilities and how the
/// position increment / position length attributes are obtained </param>
/// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized; it is wrapped
/// in a CodepointCountFilter (lower bound minGram, upper bound int.MaxValue) before reaching the base class </param>
/// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
/// <param name="maxGram"> the largest n-gram to generate; must not be smaller than minGram </param>
/// <exception cref="System.ArgumentException"> if minGram is below 1 or greater than maxGram </exception>
public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram)
    : base(new CodepointCountFilter(version, input, minGram, int.MaxValue))
{
    this.version = version;

    // The same version test drives both the char-utils choice and the attribute wiring below.
    bool onOrAfter44 = version.OnOrAfter(Version.LUCENE_44);
    if (onOrAfter44)
    {
        this.charUtils = CharacterUtils.GetInstance(version);
    }
    else
    {
        this.charUtils = CharacterUtils.Java4Instance;
    }

    if (minGram < 1)
    {
        throw new System.ArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram)
    {
        throw new System.ArgumentException("minGram must not be greater than maxGram");
    }

    this.minGram = minGram;
    this.maxGram = maxGram;

    if (onOrAfter44)
    {
        // 4.4 and later: register the attributes on this stream.
        posIncAtt = AddAttribute(typeof(PositionIncrementAttribute));
        posLenAtt = AddAttribute(typeof(PositionLengthAttribute));
    }
    else
    {
        // Older versions: use the local helper implementations instead of registered attributes.
        posIncAtt = new PositionIncrementAttributeAnonymousInnerClassHelper(this);
        posLenAtt = new PositionLengthAttributeAnonymousInnerClassHelper(this);
    }
}
/// <summary>
/// Creates a ReverseStringFilter over the supplied <see cref="TokenStream"/> that
/// reverses every token and marks it.
/// <para>
/// Reversed tokens are prepended (marked) with the <paramref name="marker"/> character.
/// </para>
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version, stored for later use by the filter </param>
/// <param name="in"> <see cref="TokenStream"/> to filter </param>
/// <param name="marker"> character used to mark reversed tokens </param>
public ReverseStringFilter(Version matchVersion, TokenStream @in, char marker)
    : base(@in)
{
    // Plain field initialisation; the two assignments are independent of each other.
    this.marker = marker;
    this.matchVersion = matchVersion;
}
/// <summary>
/// Reverses the whole input buffer in place.
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version, forwarded unchanged </param>
/// <param name="buffer"> the char array to reverse </param>
public static void reverse(Version matchVersion, char[] buffer)
{
    // Delegate to the ranged overload, covering the buffer from index 0 to its end.
    int length = buffer.Length;
    reverse(matchVersion, buffer, 0, length);
}
/// <summary>
/// Partially reverses the given input buffer in-place from the given offset
/// up to the given length.
/// <para>
/// For match versions before LUCENE_31 the work is delegated to reverseUnicode3.
/// Otherwise the range is reversed while keeping each UTF-16 surrogate pair
/// (as detected by <c>char.IsSurrogatePair</c>) in high/low order.
/// </para>
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version; selects the reversal algorithm </param>
/// <param name="buffer"> the input char array to reverse </param>
/// <param name="start"> the offset from where to reverse the buffer </param>
/// <param name="len"> the length in the buffer up to where the
///        buffer should be reversed </param>
public static void reverse(Version matchVersion, char[] buffer, int start, int len)
{
    if (!matchVersion.OnOrAfter(Version.LUCENE_31))
    {
        reverseUnicode3(buffer, start, len);
        return;
    }
    /* modified version of Apache Harmony AbstractStringBuilder reverse0() */
    if (len < 2)
    {
        // zero or one char: nothing to reverse
        return;
    }
    // 'end' walks backwards from the last index of the range while 'i' walks forwards.
    int end = (start + len) - 1;
    // frontHigh / endLow hold the chars still waiting to be written to the opposite side.
    char frontHigh = buffer[start];
    char endLow = buffer[end];
    // When false, the pending char on that side must not be paired again in the next iteration.
    bool allowFrontSur = true, allowEndSur = true;
    int mid = start + (len >> 1);
    for (int i = start; i < mid; ++i, --end)
    {
        char frontLow = buffer[i + 1];
        char endHigh = buffer[end - 1];
        bool surAtFront = allowFrontSur && char.IsSurrogatePair(frontHigh, frontLow);
        if (surAtFront && (len < 3))
        {
            // nothing to do since surAtFront is allowed and 1 char left
            return;
        }
        bool surAtEnd = allowEndSur && char.IsSurrogatePair(endHigh, endLow);
        allowFrontSur = allowEndSur = true;
        if (surAtFront == surAtEnd)
        {
            if (surAtFront)
            {
                // both surrogates: move each pair as a unit (two chars per side),
                // keeping high-before-low order, then reload the pending chars
                buffer[end] = frontLow;
                buffer[--end] = frontHigh;
                buffer[i] = endHigh;
                buffer[++i] = endLow;
                frontHigh = buffer[i + 1];
                endLow = buffer[end - 1];
            }
            else
            {
                // neither surrogates: plain swap of one char per side
                buffer[end] = frontHigh;
                buffer[i] = endLow;
                frontHigh = frontLow;
                endLow = endHigh;
            }
        }
        else
        {
            if (surAtFront)
            {
                // surrogate only at the front: frontHigh stays pending for the next
                // iteration, where it must not be treated as the start of a pair again
                buffer[end] = frontLow;
                buffer[i] = endLow;
                endLow = endHigh;
                allowFrontSur = false;
            }
            else
            {
                // surrogate only at the end: endLow stays pending for the next
                // iteration, where it must not be treated as the end of a pair again
                buffer[end] = frontHigh;
                buffer[i] = endHigh;
                frontHigh = frontLow;
                allowEndSur = false;
            }
        }
    }
    if ((len & 0x01) == 1 && !(allowFrontSur && allowEndSur))
    {
        // only if odd length: a one-sided pair was handled last, so the still-pending
        // char is written into the remaining slot
        buffer[end] = allowFrontSur ? endLow : frontHigh;
    }
}
/// <summary>
/// Returns a fully initialized TokenFilterFactory for the given name and key-value arguments.
/// Resources are loaded through a <see cref="ClasspathResourceLoader"/> created for the
/// runtime type of this instance, so any required resources should be reachable from there.
/// </summary>
protected internal virtual TokenFilterFactory TokenFilterFactory(string name, Version version, params string[] keysAndValues)
{
    ClasspathResourceLoader loader = new ClasspathResourceLoader(GetType());
    return TokenFilterFactory(name, version, loader, keysAndValues);
}
/// <summary>
/// Returns a fully initialized CharFilterFactory for the given name, version,
/// resource loader and key-value arguments.
/// </summary>
protected internal virtual CharFilterFactory CharFilterFactory(string name, Version matchVersion, IResourceLoader loader, params string[] keysAndValues)
{
    // Resolve the factory class by name, then let AnalysisFactory build and configure it.
    var factoryClass = Lucene.Net.Analysis.Util.CharFilterFactory.LookupClass(name);
    var created = AnalysisFactory(factoryClass, matchVersion, loader, keysAndValues);
    return (CharFilterFactory)created;
}
/// <summary>
/// Returns a reversed copy of the given string.
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version, forwarded to the buffer overload </param>
/// <param name="input"> the string to reverse </param>
/// <returns> a new string holding the characters of <paramref name="input"/> in reversed order </returns>
public static string reverse(Version matchVersion, string input)
{
    // Work on a private copy of the characters, reverse it in place, then rebuild a string.
    char[] chars = input.ToCharArray();
    reverse(matchVersion, chars, 0, chars.Length);
    return new string(chars);
}
/// <summary>
/// Reads lines from a reader and adds every non-comment line as an entry to a new
/// CharArraySet (omitting leading and trailing whitespace). Each line should contain
/// only one word. Words need to be lowercase if the consuming Analyzer applies a
/// LowerCaseFilter (like StandardAnalyzer).
/// </summary>
/// <param name="reader"> reader containing the wordlist </param>
/// <param name="comment"> the string representing a comment </param>
/// <param name="matchVersion"> the Lucene <see cref="Version"/> </param>
/// <returns> a CharArraySet with the reader's words </returns>
public static CharArraySet GetWordSet(TextReader reader, string comment, Version matchVersion)
{
    // Fresh case-sensitive set (ignoreCase = false) with the default initial capacity.
    CharArraySet target = new CharArraySet(matchVersion, INITIAL_CAPACITY, false);
    return GetWordSet(reader, comment, target);
}
/// <summary>
/// Creates a tokenizer whose side is given by its textual label; the label is
/// converted with GetSide and passed on to the Side-based constructor.
/// </summary>
/// <param name="version"> the Lucene match version </param>
/// <param name="input"> reader holding the input to be tokenized </param>
/// <param name="sideLabel"> name of the side from which to chop off an n-gram </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
public Lucene43EdgeNGramTokenizer(Version version, Reader input, string sideLabel, int minGram, int maxGram)
    : this(version, input, GetSide(sideLabel), minGram, maxGram)
{
}
/// <summary>
/// Creates a tokenizer using the given attribute factory; all remaining
/// settings are validated and stored by init.
/// </summary>
/// <param name="version"> the Lucene match version </param>
/// <param name="factory"> attribute factory handed to the base tokenizer </param>
/// <param name="input"> reader holding the input to be tokenized </param>
/// <param name="side"> the side from which to chop off an n-gram </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
public Lucene43EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, Side side, int minGram, int maxGram)
    : base(factory, input)
{
    this.init(version, side, minGram, maxGram);
}
/// <summary>
/// Creates a tokenizer over the given reader; all remaining settings are
/// validated and stored by init.
/// </summary>
/// <param name="version"> the Lucene match version </param>
/// <param name="input"> reader holding the input to be tokenized </param>
/// <param name="side"> the side from which to chop off an n-gram </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
public Lucene43EdgeNGramTokenizer(Version version, Reader input, Side side, int minGram, int maxGram)
    : base(input)
{
    this.init(version, side, minGram, maxGram);
}
/// <summary>
/// Validates the constructor arguments and stores the tokenizer settings.
/// From Lucene 4.4 on, Side.BACK is rejected; for earlier versions maxGram is capped at 1024.
/// </summary>
/// <param name="version"> the Lucene match version; must not be null </param>
/// <param name="side"> the side from which to chop off an n-gram; must not be null </param>
/// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
/// <param name="maxGram"> the largest n-gram to generate; must not be smaller than minGram </param>
/// <exception cref="System.ArgumentException"> if any argument violates the rules above </exception>
private void init(Version version, Side side, int minGram, int maxGram)
{
    if (version == null)
    {
        throw new System.ArgumentException("version must not be null");
    }
    if (side == null)
    {
        throw new System.ArgumentException("sideLabel must be either front or back");
    }
    if (minGram < 1)
    {
        throw new System.ArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram)
    {
        throw new System.ArgumentException("minGram must not be greater than maxGram");
    }

    bool onOrAfter44 = version.OnOrAfter(Version.LUCENE_44);
    if (onOrAfter44 && side == Side.BACK)
    {
        throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4");
    }

    // Only pre-4.4 behaviour limits the gram size.
    int effectiveMaxGram = onOrAfter44 ? maxGram : Math.Min(maxGram, 1024);

    this.minGram = minGram;
    this.maxGram = effectiveMaxGram;
    this.side = side;
}
/// <summary>
/// Creates a CharArraySet from the words supplied by a reader. The reader is
/// always closed before this method returns, whether or not loading succeeds.
/// </summary>
/// <param name="stopwords"> the stopwords reader to load </param>
/// <param name="matchVersion"> the Lucene version for cross version compatibility </param>
/// <returns> a CharArraySet containing the distinct stopwords from the given reader </returns>
/// <exception cref="IOException"> if loading the stopwords throws an <see cref="IOException"/> </exception>
protected internal static CharArraySet loadStopwordSet(Reader stopwords, Version matchVersion)
{
    CharArraySet loaded;
    try
    {
        loaded = WordlistLoader.GetWordSet(stopwords, matchVersion);
    }
    finally
    {
        IOUtils.Close(stopwords);
    }
    return loaded;
}
/// <summary>
/// Creates a new analyzer without stopwords by passing a null stopword set
/// to the two-argument constructor.
/// </summary>
/// <param name="version"> the Lucene version for cross version compatibility </param>
protected internal StopwordAnalyzerBase(Version version)
    : this(version, null)
{
}
/// <summary>
/// Reverses, in place, the first <paramref name="len"/> chars of the buffer
/// (that is, the range starting at offset 0).
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version, forwarded unchanged </param>
/// <param name="buffer"> the char array to reverse </param>
/// <param name="len"> the length in the buffer up to where the buffer should be reversed </param>
public static void reverse(Version matchVersion, char[] buffer, int len)
{
    const int start = 0;
    reverse(matchVersion, buffer, start, len);
}
/// <summary>
/// Creates a new StandardTokenizer that uses the given <see cref="AttributeFactory"/>.
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version, passed to Init </param>
/// <param name="factory"> attribute factory handed to the base tokenizer </param>
/// <param name="input"> the input reader </param>
public StandardTokenizer(Version matchVersion, AttributeFactory factory, Reader input)
    : base(factory, input)
{
    this.Init(matchVersion);
}
/// <summary> /// Create set with enough capacity to hold startSize terms /// </summary> /// <param name="matchVersion"> /// compatibility match version see <a href="#version">Version /// note</a> above for details. </param> /// <param name="startSize"> /// the initial capacity </param> /// <param name="ignoreCase"> /// <code>false</code> if and only if the set should be case sensitive /// otherwise <code>true</code>. </param> public CharArraySet(Lucene.Net.Util.LuceneVersion matchVersion, int startSize, bool ignoreCase) : this(new CharArrayMap <>(matchVersion, startSize, ignoreCase))
/// <summary>
/// Creates a ReverseStringFilter over the supplied <see cref="TokenStream"/> that
/// reverses every token.
/// <para>
/// The reversed tokens are not marked: NOMARKER is passed as the marker character.
/// </para>
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version </param>
/// <param name="in"> <see cref="TokenStream"/> to filter </param>
public ReverseStringFilter(Version matchVersion, TokenStream @in)
    : this(matchVersion, @in, NOMARKER)
{
}
/// <summary>
/// Creates an edge n-gram tokenizer that generates n-grams in the given size range,
/// always taken from the front (Side.FRONT).
/// </summary>
/// <param name="version"> the Lucene match version </param>
/// <param name="factory"> <see cref="AttributeFactory"/> to use </param>
/// <param name="input"> <see cref="Reader"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
public Lucene43EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram)
    : this(version, factory, input, Side.FRONT, minGram, maxGram)
{
}
/// <summary>
/// Creates a new instance initialized with the given stopword set.
/// A null set is replaced by CharArraySet.EMPTY_SET; any other set is copied
/// and the copy is wrapped as unmodifiable.
/// </summary>
/// <param name="version"> the Lucene version for cross version compatibility </param>
/// <param name="stopwords"> the analyzer's stopword set; may be null </param>
protected internal StopwordAnalyzerBase(Version version, CharArraySet stopwords)
{
    matchVersion = version;

    // analyzers should use char array set for stopwords!
    if (stopwords == null)
    {
        this.stopwords = CharArraySet.EMPTY_SET;
    }
    else
    {
        this.stopwords = CharArraySet.UnmodifiableSet(CharArraySet.Copy(version, stopwords));
    }
}
/// <summary>
/// Creates a new <see cref="StandardTokenizer"/> and attaches the
/// <paramref name="input"/> to the newly created JFlex scanner.
/// See http://issues.apache.org/jira/browse/LUCENE-1068
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version, passed to Init </param>
/// <param name="input"> the input reader </param>
public StandardTokenizer(Version matchVersion, Reader input)
    : base(input)
{
    this.Init(matchVersion);
}
/// <summary>
/// Creates an NGramTokenFilter using DEFAULT_MIN_NGRAM_SIZE and
/// DEFAULT_MAX_NGRAM_SIZE as the n-gram size bounds.
/// </summary>
/// <param name="version"> Lucene version to enable correct position increments </param>
/// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
public NGramTokenFilter(Version version, TokenStream input)
    : this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
{
}
/// <summary>
/// Creates a CharArraySet from a file, decoding it as UTF-8. The reader opened
/// for the file is closed before this method returns, whether or not loading succeeds.
/// </summary>
/// <param name="stopwords"> the stopwords file to load </param>
/// <param name="matchVersion"> the Lucene version for cross version compatibility </param>
/// <returns> a CharArraySet containing the distinct stopwords from the given file </returns>
/// <exception cref="IOException"> if loading the stopwords throws an <see cref="IOException"/> </exception>
protected internal static CharArraySet LoadStopwordSet(File stopwords, Version matchVersion)
{
    // Stays null if opening the file fails, so the finally block closes whatever was opened.
    Reader reader = null;
    try
    {
        reader = IOUtils.GetDecodingReader(stopwords, StandardCharsets.UTF_8);
        CharArraySet words = WordlistLoader.GetWordSet(reader, matchVersion);
        return words;
    }
    finally
    {
        IOUtils.Close(reader);
    }
}
/// <summary>
/// Creates a new <see cref="StandardTokenizer"/>: registers the term, position
/// increment, offset and type attributes, then attaches the <paramref name="input"/>
/// to the newly created JFlex scanner via Init.
/// See http://issues.apache.org/jira/browse/LUCENE-1068
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version, passed to Init </param>
/// <param name="input"> the input reader </param>
public StandardTokenizer(Version matchVersion, Reader input)
    : base(input)
{
    // Attribute registration order is kept as-is.
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    this.offsetAtt = AddAttribute<IOffsetAttribute>();
    this.typeAtt = AddAttribute<ITypeAttribute>();

    this.Init(matchVersion);
}
/// <summary>
/// Selects the scanner implementation for the requested match version.
/// Versions are tested from newest to oldest and the first match wins;
/// anything older than LUCENE_31 gets the ClassicTokenizerImpl.
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version that selects the scanner </param>
private void Init(Version matchVersion)
{
    if (matchVersion.OnOrAfter(Version.LUCENE_47))
    {
        scanner = new StandardTokenizerImpl(input);
        return;
    }
    if (matchVersion.OnOrAfter(Version.LUCENE_40))
    {
        scanner = new StandardTokenizerImpl40(input);
        return;
    }
    if (matchVersion.OnOrAfter(Version.LUCENE_34))
    {
        scanner = new StandardTokenizerImpl34(input);
        return;
    }
    if (matchVersion.OnOrAfter(Version.LUCENE_31))
    {
        scanner = new StandardTokenizerImpl31(input);
        return;
    }

    scanner = new ClassicTokenizerImpl(input);
}
/// <summary>
/// Reads stopwords from a stopword list in Snowball format into a new CharArraySet.
/// <para>
/// The Snowball format is the following:
/// <list type="bullet">
/// <item><description>Lines may contain multiple words separated by whitespace.</description></item>
/// <item><description>The comment character is the vertical line (|).</description></item>
/// <item><description>Lines may contain trailing comments.</description></item>
/// </list>
/// </para>
/// </summary>
/// <param name="reader"> reader containing a Snowball stopword list </param>
/// <param name="matchVersion"> the Lucene <see cref="Version"/> </param>
/// <returns> a <see cref="CharArraySet"/> with the reader's words </returns>
public static CharArraySet getSnowballWordSet(Reader reader, Version matchVersion)
{
    // Fresh case-sensitive set (ignoreCase = false) with the default initial capacity.
    CharArraySet target = new CharArraySet(matchVersion, INITIAL_CAPACITY, false);
    return getSnowballWordSet(reader, target);
}
/// <summary>
/// Returns a fully initialized TokenFilterFactory for the given name and key-value arguments.
/// Resources are loaded through the loader returned by GetCurrentTypeResourceLoader,
/// so any required resources should be reachable from there.
/// </summary>
protected internal virtual TokenFilterFactory TokenFilterFactory(string name, Version version, params string[] keysAndValues)
{
    var loader = GetCurrentTypeResourceLoader();
    return TokenFilterFactory(name, version, loader, keysAndValues);
}
/// <summary>
/// Selects the scanner implementation for the requested match version and then
/// registers the term, position increment, offset and type attributes.
/// Versions are tested from newest to oldest and the first match wins;
/// anything older than LUCENE_31 gets the ClassicTokenizerImpl.
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version that selects the scanner </param>
private void Init(Version matchVersion)
{
    // Warnings 612/618 (use of obsolete members) are suppressed for the version checks below.
#pragma warning disable 612, 618
    if (matchVersion.OnOrAfter(Version.LUCENE_47))
    {
        scanner = new StandardTokenizerImpl(input);
    }
    else if (matchVersion.OnOrAfter(Version.LUCENE_40))
    {
        scanner = new StandardTokenizerImpl40(input);
    }
    else if (matchVersion.OnOrAfter(Version.LUCENE_34))
    {
        scanner = new StandardTokenizerImpl34(input);
    }
    else if (matchVersion.OnOrAfter(Version.LUCENE_31))
    {
        scanner = new StandardTokenizerImpl31(input);
    }
#pragma warning restore 612, 618
    else
    {
        scanner = new ClassicTokenizerImpl(input);
    }

    // Attribute registration happens after the scanner exists; order kept as-is.
    termAtt = AddAttribute<ICharTermAttribute>();
    posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
    typeAtt = AddAttribute<ITypeAttribute>();
}
/// <summary>
/// Loads a Snowball-format stopword list from a reader into a newly created CharArraySet.
/// <para>
/// Snowball format rules:
/// <list type="bullet">
/// <item><description>Lines may contain multiple words separated by whitespace.</description></item>
/// <item><description>The comment character is the vertical line (|).</description></item>
/// <item><description>Lines may contain trailing comments.</description></item>
/// </list>
/// </para>
/// </summary>
/// <param name="reader"> reader containing a Snowball stopword list </param>
/// <param name="matchVersion"> the Lucene <see cref="Version"/> </param>
/// <returns> a <see cref="CharArraySet"/> with the reader's words </returns>
public static CharArraySet getSnowballWordSet(Reader reader, Version matchVersion)
{
    // The destination set is case sensitive (ignoreCase = false) and starts at INITIAL_CAPACITY.
    CharArraySet destination = new CharArraySet(matchVersion, INITIAL_CAPACITY, false);
    return getSnowballWordSet(reader, destination);
}
/// <summary>
/// Builds a fully initialized CharFilterFactory from its name, the match version,
/// a resource loader and key-value arguments.
/// </summary>
protected internal virtual CharFilterFactory CharFilterFactory(string name, Version matchVersion, IResourceLoader loader, params string[] keysAndValues)
{
    // Look the factory class up by name, then have AnalysisFactory create and configure it.
    var resolvedClass = Lucene.Net.Analysis.Util.CharFilterFactory.LookupClass(name);
    var instance = AnalysisFactory(resolvedClass, matchVersion, loader, keysAndValues);
    return (CharFilterFactory)instance;
}
/// <summary>
/// Loads a word list from a reader into a newly created CharArraySet. Every
/// non-comment line becomes one entry (leading and trailing whitespace omitted),
/// so each line should contain only one word. Words need to be lowercase if the
/// consuming Analyzer applies a LowerCaseFilter (like StandardAnalyzer).
/// </summary>
/// <param name="reader"> reader containing the wordlist </param>
/// <param name="comment"> the string representing a comment </param>
/// <param name="matchVersion"> the Lucene <see cref="Version"/> </param>
/// <returns> a CharArraySet with the reader's words </returns>
public static CharArraySet GetWordSet(TextReader reader, string comment, Version matchVersion)
{
    // The destination set is case sensitive (ignoreCase = false) and starts at INITIAL_CAPACITY.
    CharArraySet destination = new CharArraySet(matchVersion, INITIAL_CAPACITY, false);
    return GetWordSet(reader, comment, destination);
}
/// <summary>
/// Instantiates the analysis factory <paramref name="clazz"/> by reflection, passing it an
/// argument map built from <paramref name="keysAndValues"/> plus a "luceneMatchVersion" entry,
/// and informs it of the resource loader if it implements IResourceLoaderAware.
/// </summary>
/// <param name="clazz"> factory type to instantiate; its constructor receives the argument map </param>
/// <param name="matchVersion"> version whose string form is stored under "luceneMatchVersion" </param>
/// <param name="loader"> resource loader handed to resource-loader-aware factories </param>
/// <param name="keysAndValues"> alternating keys and values; must have an even length </param>
/// <returns> the created (and, where applicable, informed) factory </returns>
/// <exception cref="System.ArgumentException"> if <paramref name="keysAndValues"/> has an odd length,
/// or if the factory constructor itself threw an ArgumentException </exception>
private AbstractAnalysisFactory AnalysisFactory(Type clazz, Version matchVersion, IResourceLoader loader, params string[] keysAndValues)
{
    if (keysAndValues.Length % 2 == 1)
    {
        throw new System.ArgumentException("invalid keysAndValues map");
    }

    IDictionary<string, string> args = new Dictionary<string, string>();
    for (int i = 0; i < keysAndValues.Length; i += 2)
    {
        string key = keysAndValues[i];
        if (args.ContainsKey(key))
        {
            fail("duplicate values for key: " + key);
        }
        args[key] = keysAndValues[i + 1];
    }

    // The match version is always supplied by this helper, never by the caller.
    if (args.ContainsKey("luceneMatchVersion"))
    {
        fail("duplicate values for key: luceneMatchVersion");
    }
    args["luceneMatchVersion"] = matchVersion.ToString();

    AbstractAnalysisFactory factory = null;
    try
    {
        factory = (AbstractAnalysisFactory)Activator.CreateInstance(
            clazz,
            BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Instance,
            null,
            new object[] { args },
            CultureInfo.InvariantCulture);
    }
    catch (TargetInvocationException e)
    {
        // to simplify tests that check for illegal parameters
        System.ArgumentException argumentException = e.InnerException as System.ArgumentException;
        if (argumentException != null)
        {
            throw argumentException;
        }

        // Rethrow with "throw;" so the original stack trace is preserved ("throw e;" would reset it).
        throw;
    }

    IResourceLoaderAware loaderAware = factory as IResourceLoaderAware;
    if (loaderAware != null)
    {
        loaderAware.Inform(loader);
    }
    return factory;
}
/// <summary>
/// Creates a FilteringTokenFilter with an explicit position-increment setting.
/// Construction is delegated to the (version, input) constructor; the flag is then
/// checked by CheckPositionIncrement and stored.
/// </summary>
/// <param name="version"> the Lucene match version </param>
/// <param name="enablePositionIncrements"> whether position increments are enabled; validated against <paramref name="version"/> </param>
/// <param name="input"> the <see cref="TokenStream"/> to consume </param>
public FilteringTokenFilter(Lucene.Net.Util.LuceneVersion version, bool enablePositionIncrements, TokenStream input)
    : this(version, input)
{
    // Validate first so the flag is only stored when the check passes.
    CheckPositionIncrement(version, enablePositionIncrements);

    this.enablePositionIncrements = enablePositionIncrements;
}
/// <summary>
/// Builds a fully initialized TokenFilterFactory from its name and key-value arguments.
/// The resource loader comes from GetCurrentTypeResourceLoader, so any required
/// resources should be reachable through it.
/// </summary>
protected internal virtual TokenFilterFactory TokenFilterFactory(string name, Version version, params string[] keysAndValues)
{
    var resourceLoader = GetCurrentTypeResourceLoader();
    return TokenFilterFactory(name, version, resourceLoader, keysAndValues);
}
/// <summary>
/// Creates an edge n-gram tokenizer that generates n-grams in the given size range,
/// always taken from the front (Side.FRONT).
/// </summary>
/// <param name="version"> the Lucene match version </param>
/// <param name="input"> <see cref="Reader"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
public Lucene43EdgeNGramTokenizer(Version version, Reader input, int minGram, int maxGram)
    : this(version, input, Side.FRONT, minGram, maxGram)
{
}