/// <summary>
/// Create a new UpperCaseFilter, that normalizes token text to upper case.
/// </summary>
/// <param name="matchVersion"> See <a href="#version">above</a> </param>
/// <param name="in"> TokenStream to filter </param>
public UpperCaseFilter(LuceneVersion matchVersion, TokenStream @in)
    : base(@in)
{
    // LUCENENET: this attribute was previously registered twice on the same
    // line; AddAttribute returns the existing instance on repeat calls, so the
    // duplicate was redundant and has been removed.
    termAtt = AddAttribute<ICharTermAttribute>();
    // Select the CharacterUtils implementation matching the compatibility version.
    charUtils = CharacterUtils.GetInstance(matchVersion);
}
/// <summary>
/// Builds a StandardFilter over the given stream, capturing the type and term
/// attributes it operates on.
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version </param>
/// <param name="in"> <seealso cref="TokenStream"/> to filter </param>
public StandardFilter(LuceneVersion matchVersion, TokenStream @in) : base(@in)
{
    this.matchVersion = matchVersion;
    // NOTE(review): attribute registration order can be observable through the
    // token stream's attribute iteration; do not reorder these calls.
    typeAtt = AddAttribute<ITypeAttribute>();
    termAtt = AddAttribute<ICharTermAttribute>();
}
/// <summary>
/// Creates NGramTokenFilter with given min and max n-grams. </summary>
/// <param name="version"> Lucene version to enable correct position increments.
/// See <a href="#version">above</a> for details. </param>
/// <param name="input"> <seealso cref="TokenStream"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
public NGramTokenFilter(LuceneVersion version, TokenStream input, int minGram, int maxGram)
    : base(new CodepointCountFilter(version, input, minGram, int.MaxValue))
{
    this.version = version;
    // Pre-4.4 code used Java 4 (BMP-only) character handling; 4.4+ is
    // codepoint-aware. The pragmas suppress the obsolete-member warning for
    // LUCENE_44.
    this.charUtils = version.OnOrAfter(
#pragma warning disable 612, 618
        LuceneVersion.LUCENE_44) ?
#pragma warning restore 612, 618
        CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance;
    // NOTE(review): the guards run after the base(...) call by necessity of C#
    // constructor chaining, but they still throw before the filter can be used.
    if (minGram < 1)
    {
        throw new System.ArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram)
    {
        throw new System.ArgumentException("minGram must not be greater than maxGram");
    }
    this.minGram = minGram;
    this.maxGram = maxGram;
#pragma warning disable 612, 618
    if (version.OnOrAfter(LuceneVersion.LUCENE_44))
#pragma warning restore 612, 618
    {
        // 4.4+ semantics: use real position attributes.
        posIncAtt = AddAttribute<IPositionIncrementAttribute>();
        posLenAtt = AddAttribute<IPositionLengthAttribute>();
    }
    else
    {
        // Back-compat shims that emulate pre-4.4 (broken) position behavior.
        posIncAtt = new PositionIncrementAttributeAnonymousInnerClassHelper(this);
        posLenAtt = new PositionLengthAttributeAnonymousInnerClassHelper(this);
    }
    termAtt = AddAttribute<ICharTermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
}
/// <summary>
/// Create a new <seealso cref="CodepointCountFilter"/>. This will filter out tokens whose
/// <seealso cref="CharTermAttribute"/> is either too short (<seealso cref="Character#CodePointCount(char[], int, int)"/>
/// &lt; min) or too long (<seealso cref="Character#codePointCount(char[], int, int)"/> &gt; max). </summary>
/// <param name="version"> the Lucene match version </param>
/// <param name="in"> the <seealso cref="TokenStream"/> to consume </param>
/// <param name="min"> the minimum length </param>
/// <param name="max"> the maximum length </param>
public CodepointCountFilter(LuceneVersion version, TokenStream @in, int min, int max)
    : base(version, @in)
{
    // LUCENENET: guard clauses added for parity with the other port of this
    // constructor in this codebase; an invalid range would otherwise silently
    // filter out every token.
    if (min < 0)
    {
        throw new System.ArgumentOutOfRangeException(nameof(min), "minimum length must be greater than or equal to zero");
    }
    if (min > max)
    {
        throw new System.ArgumentOutOfRangeException(nameof(min), "minimum length must not be greater than maximum length");
    }
    this.min = min;
    this.max = max;
    termAtt = AddAttribute<ICharTermAttribute>();
}
/// <summary>
/// Creates a new compound-word filter base over <paramref name="input"/>,
/// validating the word/subword size limits and capturing the attributes the
/// decompounding logic reads and writes.
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version </param>
/// <param name="input"> the <seealso cref="TokenStream"/> to process </param>
/// <param name="dictionary"> the word dictionary to match against </param>
/// <param name="minWordSize"> only words longer than this get processed </param>
/// <param name="minSubwordSize"> only subwords longer than this get to the output stream </param>
/// <param name="maxSubwordSize"> only subwords shorter than this get to the output stream </param>
/// <param name="onlyLongestMatch"> add only the longest matching subword to the stream </param>
protected CompoundWordTokenFilterBase(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch)
    : base(input)
{
    // NOTE(review): the `as` cast yields null (rather than throwing) if the
    // registered attribute implementation is not CharTermAttribute — TODO
    // confirm a direct cast would be safe here.
    termAtt = AddAttribute<ICharTermAttribute>() as CharTermAttribute;
    offsetAtt = AddAttribute<IOffsetAttribute>();
    posIncAtt = AddAttribute<IPositionIncrementAttribute>();
    this.matchVersion = matchVersion;
    // Queue of decompounded tokens pending emission.
    this.tokens = new LinkedList<CompoundToken>();
    if (minWordSize < 0)
    {
        throw new System.ArgumentException("minWordSize cannot be negative");
    }
    this.minWordSize = minWordSize;
    if (minSubwordSize < 0)
    {
        throw new System.ArgumentException("minSubwordSize cannot be negative");
    }
    this.minSubwordSize = minSubwordSize;
    if (maxSubwordSize < 0)
    {
        throw new System.ArgumentException("maxSubwordSize cannot be negative");
    }
    this.maxSubwordSize = maxSubwordSize;
    this.onlyLongestMatch = onlyLongestMatch;
    this.dictionary = dictionary;
}
/// <summary>
/// Create a new <seealso cref="FilteringTokenFilter"/>. </summary>
/// <param name="version"> the Lucene match version </param>
/// <param name="in"> the <seealso cref="TokenStream"/> to consume </param>
public FilteringTokenFilter(LuceneVersion version, TokenStream @in) : base(@in)
{
    posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    this.version = version;
    // Position increments are always preserved by default; subclasses/setters
    // may change this elsewhere.
    this.enablePositionIncrements = true;
}
/// <summary>
/// Creates an EdgeNGramTokenFilter that can generate n-grams in the sizes of
/// the given range, taken from one side of each input token.
/// </summary>
/// <param name="version"> the Lucene match version </param>
/// <param name="input"> <seealso cref="TokenStream"/> holding the input to be tokenized </param>
/// <param name="side"> the <see cref="Side"/> from which to chop off an n-gram </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, Side side, int minGram, int maxGram)
    : base(input)
{
    if (version == null)
    {
        throw new System.ArgumentException("version must not be null");
    }
    if (version.OnOrAfter(LuceneVersion.LUCENE_44) && side == Side.BACK)
    {
        throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward");
    }
    if (side == null)
    {
        throw new System.ArgumentException("sideLabel must be either front or back");
    }
    if (minGram < 1)
    {
        throw new System.ArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram)
    {
        throw new System.ArgumentException("minGram must not be greater than maxGram");
    }
    this.version = version;
    // LUCENENET: fixed Java-style member casing (onOrAfter/getInstance) left
    // over from the port; the .NET API used throughout this codebase is
    // OnOrAfter/GetInstance.
    this.charUtils = version.OnOrAfter(LuceneVersion.LUCENE_44) ? CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance;
    this.minGram = minGram;
    this.maxGram = maxGram;
    this.side = side;
}
/// <summary>
/// Create a new <seealso cref="TypeTokenFilter"/>. </summary>
/// <param name="version"> the Lucene match version </param>
/// <param name="input"> the <seealso cref="TokenStream"/> to consume </param>
/// <param name="stopTypes"> the types to filter </param>
/// <param name="useWhiteList"> if true, then tokens whose type is in stopTypes will
///                             be kept, otherwise they will be filtered out </param>
public TypeTokenFilter(LuceneVersion version, TokenStream input, IEnumerable<string> stopTypes, bool useWhiteList)
    : base(version, input)
{
    typeAttribute = AddAttribute<ITypeAttribute>();
    // Copy into a HashSet so later mutation of the caller's sequence has no
    // effect and membership tests are O(1).
    this.stopTypes = new HashSet<string>(stopTypes);
    this.useWhiteList = useWhiteList;
}
/// <summary>
/// Creates a new <seealso cref="DictionaryCompoundWordTokenFilter"/>
/// </summary>
/// <param name="matchVersion">
///          Lucene version to enable correct Unicode 4.0 behavior in the
///          dictionaries if Version > 3.0. See <a
///          href="CompoundWordTokenFilterBase.html#version"
///          >CompoundWordTokenFilterBase</a> for details. </param>
/// <param name="input">
///          the <seealso cref="TokenStream"/> to process </param>
/// <param name="dictionary">
///          the word dictionary to match against. </param>
/// <param name="minWordSize">
///          only words longer than this get processed </param>
/// <param name="minSubwordSize">
///          only subwords longer than this get to the output stream </param>
/// <param name="maxSubwordSize">
///          only subwords shorter than this get to the output stream </param>
/// <param name="onlyLongestMatch">
///          Add only the longest matching subword to the stream </param>
public DictionaryCompoundWordTokenFilter(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch)
    : base(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch)
{
    // NOTE(review): the null check necessarily runs after the base ctor (C#
    // chaining), but the exception still prevents the filter from being used.
    if (dictionary == null)
    {
        throw new System.ArgumentException("dictionary cannot be null");
    }
}
/// <summary>
/// Creates a new <seealso cref="DictionaryCompoundWordTokenFilter"/>
/// </summary>
/// <param name="matchVersion">
///          Lucene version to enable correct Unicode 4.0 behavior in the
///          dictionaries if Version > 3.0. See <a
///          href="CompoundWordTokenFilterBase.html#version"
///          >CompoundWordTokenFilterBase</a> for details. </param>
/// <param name="input">
///          the <seealso cref="TokenStream"/> to process </param>
/// <param name="dictionary">
///          the word dictionary to match against. </param>
public DictionaryCompoundWordTokenFilter(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary)
    : base(matchVersion, input, dictionary)
{
    // Reject a null dictionary up front; the base class would otherwise fail
    // later with a less specific error.
    if (dictionary == null)
    {
        throw new System.ArgumentException("dictionary cannot be null");
    }
}
/// <summary>
/// Creates index upgrader on the given directory, using an <seealso cref="IndexWriter"/> using the given
/// {@code matchVersion}. You have the possibility to upgrade indexes with multiple commit points by removing
/// all older ones. If {@code infoStream} is not {@code null}, all logging output will be sent to this stream.
/// </summary>
/// <param name="dir"> the index directory to upgrade </param>
/// <param name="matchVersion"> Lucene compatibility version for the internal writer config </param>
/// <param name="infoStream"> optional log sink; ignored when null </param>
/// <param name="deletePriorCommits"> whether older commit points are removed </param>
public IndexUpgrader(Directory dir, LuceneVersion matchVersion, TextWriter infoStream, bool deletePriorCommits)
    : this(dir, new IndexWriterConfig(matchVersion, null), deletePriorCommits)
{
    // Only override the config's info stream when the caller supplied one.
    if (null != infoStream)
    {
        this.Iwc.SetInfoStream(infoStream);
    }
}
/// <summary>
/// Creates a new <seealso cref="CharTokenizer"/> instance
/// </summary>
/// <param name="matchVersion">
///          Lucene version to match </param>
/// <param name="input">
///          the input to split up into tokens </param>
protected CharTokenizer(LuceneVersion matchVersion, TextReader input)
    : base(input)
{
    termAtt = AddAttribute<ICharTermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
    // Version-appropriate character handling (BMP-only vs. codepoint-aware).
    charUtils = CharacterUtils.GetInstance(matchVersion);
}
/// <summary>
/// Construct a token stream filtering the given input using a Set of common
/// words to create bigrams. Outputs both unigrams with position increment and
/// bigrams with position increment 0 type=gram where one or both of the words
/// in a potential bigram are in the set of common words .
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version </param>
/// <param name="input"> TokenStream input in filter chain </param>
/// <param name="commonWords"> The set of common words. </param>
public CommonGramsFilter(LuceneVersion matchVersion, TokenStream input, CharArraySet commonWords)
    : base(input)
{
    termAttribute = AddAttribute<ICharTermAttribute>();
    offsetAttribute = AddAttribute<IOffsetAttribute>();
    typeAttribute = AddAttribute<ITypeAttribute>();
    posIncAttribute = AddAttribute<IPositionIncrementAttribute>();
    posLenAttribute = AddAttribute<IPositionLengthAttribute>();
    // The set is used as-is (not copied); callers must not modify it afterwards.
    this.commonWords = commonWords;
}
/// <summary>
/// Create a new TrimFilter that trims leading/trailing whitespace from tokens.
/// </summary>
/// <param name="version"> the Lucene match version </param>
/// <param name="in"> the <seealso cref="TokenStream"/> to consume </param>
/// <param name="updateOffsets"> whether to also shrink token offsets; rejected for 4.4+ </param>
public TrimFilter(LuceneVersion version, TokenStream @in, bool updateOffsets)
    : base(@in)
{
    // Offset updating was removed in Lucene 4.4 because it produced broken offsets.
    if (updateOffsets && version.OnOrAfter(LuceneVersion.LUCENE_44))
    {
        throw new System.ArgumentException("updateOffsets=true is not supported anymore as of Lucene 4.4");
    }
    termAtt = AddAttribute<ICharTermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
    this.updateOffsets = updateOffsets;
}
private bool hasIllegalOffsets = false; // only if the length changed before this filter

/// <summary>
/// Creates a new ThaiWordFilter with the specified match version. </summary>
/// <param name="matchVersion"> Lucene compatibility version; pre-3.1 streams are lower-cased first </param>
/// <param name="input"> the <seealso cref="TokenStream"/> to consume </param>
/// <exception cref="System.NotSupportedException"> if the runtime lacks Thai segmentation support </exception>
public ThaiWordFilter(LuceneVersion matchVersion, TokenStream input)
    : base(matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) ? input : new LowerCaseFilter(matchVersion, input))
{
    // Thai segmentation requires dictionary-based break iteration from the runtime.
    if (!DBBI_AVAILABLE)
    {
        throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
    }
    // Pre-3.1 behavior did not emit position increments for split tokens.
    handlePosIncr = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
    termAtt = AddAttribute<ICharTermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
    posAtt = AddAttribute<IPositionIncrementAttribute>();
}
/// <summary>
/// Initialize this factory via a set of key-value pairs.
/// </summary>
/// <param name="args"> factory arguments; the class-name entry is consumed here </param>
protected internal AbstractAnalysisFactory(IDictionary<string, string> args)
{
    ExplicitLuceneMatchVersion = false;
    // Keep a read-only snapshot of the raw arguments for diagnostics.
    originalArgs = Collections.UnmodifiableMap(args);
    string version = Get(args, LUCENE_MATCH_VERSION_PARAM);
    // LUCENENET TODO: What should we do if the version is null?
    //luceneMatchVersion = version == null ? (LuceneVersion?)null : LuceneVersionHelpers.ParseLeniently(version);
    // NOTE(review): a missing version silently falls back to LUCENE_CURRENT
    // rather than being rejected — intentional per the TODO above, but callers
    // relying on strict version checking should verify this.
    luceneMatchVersion = version == null ?
#pragma warning disable 612, 618
        LuceneVersion.LUCENE_CURRENT :
#pragma warning restore 612, 618
        LuceneVersionHelpers.ParseLeniently(version);
    args.Remove(CLASS_NAME); // consume the class arg
}
/// <summary>
/// Create a new <seealso cref="LengthFilter"/>. This will filter out tokens whose
/// <seealso cref="CharTermAttribute"/> is either too short (<seealso cref="CharTermAttribute#length()"/>
/// &lt; min) or too long (<seealso cref="CharTermAttribute#length()"/> &gt; max). </summary>
/// <param name="version"> the Lucene match version </param>
/// <param name="in"> the <seealso cref="TokenStream"/> to consume </param>
/// <param name="min"> the minimum length </param>
/// <param name="max"> the maximum length </param>
public LengthFilter(LuceneVersion version, TokenStream @in, int min, int max)
    : base(version, @in)
{
    // LUCENENET: ArgumentOutOfRangeException(string) treats its single argument
    // as the parameter *name*, not the message — pass both explicitly. The
    // min > max message also previously had min/max inverted.
    if (min < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(min), "minimum length must be greater than or equal to zero");
    }
    if (min > max)
    {
        throw new ArgumentOutOfRangeException(nameof(min), "minimum length must not be greater than maximum length");
    }
    this.min = min;
    this.max = max;
    this.termAtt = AddAttribute<ICharTermAttribute>();
}
/// <summary>
/// Create a new <seealso cref="CodepointCountFilter"/>. This will filter out tokens whose
/// <seealso cref="CharTermAttribute"/> is either too short (<seealso cref="Character#CodePointCount(char[], int, int)"/>
/// &lt; min) or too long (<seealso cref="Character#codePointCount(char[], int, int)"/> &gt; max). </summary>
/// <param name="version"> the Lucene match version </param>
/// <param name="in"> the <seealso cref="TokenStream"/> to consume </param>
/// <param name="min"> the minimum length </param>
/// <param name="max"> the maximum length </param>
public CodepointCountFilter(LuceneVersion version, TokenStream @in, int min, int max)
    : base(version, @in)
{
    // LUCENENET: The guard clauses were copied here from the version of Lucene.
    // Apparently, the tests were not ported from 4.8.0 because they expected this and the
    // original tests did not. Adding them anyway because there is no downside to this.
    // ArgumentOutOfRangeException(string) treats its single argument as the
    // parameter *name*, not the message — pass both explicitly. The min > max
    // message also previously had min/max inverted.
    if (min < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(min), "minimum length must be greater than or equal to zero");
    }
    if (min > max)
    {
        throw new ArgumentOutOfRangeException(nameof(min), "minimum length must not be greater than maximum length");
    }
    this.min = min;
    this.max = max;
    termAtt = AddAttribute<ICharTermAttribute>();
}
/// <summary>
/// Builds an analyzer with the stop words from the given reader. </summary>
/// <seealso cref="WordlistLoader.GetWordSet(TextReader, LuceneVersion)"/>
/// <param name="matchVersion"> Lucene compatibility version - See <see cref="StandardAnalyzer"/> </param>
/// <param name="stopwords"> <see cref="TextReader"/> to read stop words from </param>
// Delegates to the CharArraySet overload after loading the stopword set.
public StandardAnalyzer(LuceneVersion matchVersion, TextReader stopwords)
    : this(matchVersion, LoadStopwordSet(stopwords, matchVersion))
{
}
/// <summary>
/// Creates a <see cref="MultiFieldQueryParser"/>. Allows passing of a map with term to
/// Boost, and the boost to apply to each term.
///
/// <para/>
/// It will, when <see cref="QueryParserBase.Parse(string)"/> is called, construct a query like this
/// (assuming the query consists of two terms and you specify the two fields
/// <c>title</c> and <c>body</c>):
/// <para/>
///
/// <code>
/// (title:term1 body:term1) (title:term2 body:term2)
/// </code>
///
/// <para/>
/// When <see cref="QueryParserBase.DefaultOperator"/> is set to <see cref="QueryParserBase.AND_OPERATOR"/>, the result will be:
/// <para/>
///
/// <code>
/// +(title:term1 body:term1) +(title:term2 body:term2)
/// </code>
///
/// <para/>
/// When you pass a boost (title=>5 body=>10) you can get
/// <para/>
///
/// <code>
/// +(title:term1^5.0 body:term1^10.0) +(title:term2^5.0 body:term2^10.0)
/// </code>
///
/// <para/>
/// In other words, all the query's terms must appear, but it doesn't matter
/// in what fields they appear.
/// <para/>
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version </param>
/// <param name="fields"> the fields each term is expanded across </param>
/// <param name="analyzer"> analyzer used to find terms in the query text </param>
/// <param name="boosts"> per-field boost map applied to expanded terms </param>
public MultiFieldQueryParser(LuceneVersion matchVersion, string[] fields, Analyzer analyzer, IDictionary<string, float> boosts)
    : this(matchVersion, fields, analyzer)
{
    this.m_boosts = boosts;
}
/// <summary>
/// Creates a <see cref="Lucene43EdgeNGramTokenizer"/>, resolving the textual
/// side label ("front"/"back") to a <see cref="Side"/> value.
/// </summary>
public Lucene43EdgeNGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, string sideLabel, int minGram, int maxGram)
    : this(version, factory, input, GetSide(sideLabel), minGram, maxGram)
{
}
/// <summary>
/// Creates a <see cref="Lucene43EdgeNGramTokenizer"/> over <paramref name="input"/>;
/// all validation and setup happens in <c>Init</c>.
/// </summary>
public Lucene43EdgeNGramTokenizer(LuceneVersion version, TextReader input, Side side, int minGram, int maxGram)
    : base(input)
{
    Init(version, side, minGram, maxGram);
}
/// <summary>
/// Creates a new <see cref="SimpleAnalyzer"/> </summary>
/// <param name="matchVersion"> <see cref="LuceneVersion"/> to match </param>
public SimpleAnalyzer(LuceneVersion matchVersion)
{
    // Stored for use when the analyzer builds its tokenizer chain.
    this.matchVersion = matchVersion;
}
/// <summary>
/// Builds an analyzer with the default stop words (<see cref="STOP_WORDS_SET"/>). </summary>
/// <param name="matchVersion"> Lucene compatibility version - See <see cref="StandardAnalyzer"/> </param>
public StandardAnalyzer(LuceneVersion matchVersion)
    : this(matchVersion, STOP_WORDS_SET)
{
}
/// <summary>
/// Creates a new instance of the <see cref="StandardTokenizer"/>. Attaches
/// the <paramref name="input"/> to the newly created JFlex-generated (then ported to .NET) scanner.
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version - See <see cref="StandardTokenizer"/> </param>
/// <param name="input"> The input reader
///
/// See http://issues.apache.org/jira/browse/LUCENE-1068 </param>
public StandardTokenizer(LuceneVersion matchVersion, TextReader input)
    : base(input)
{
    // Scanner construction/version handling lives in Init.
    Init(matchVersion);
}
/// <summary>
/// Builds an analyzer with the default stop words: <seealso cref="#DEFAULT_STOPWORD_FILE"/>.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
public FinnishAnalyzer(LuceneVersion matchVersion)
    : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
{
}
/// <summary>
/// Builds an analyzer with the given stop words.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
/// <param name="stopwords"> a stopword set </param>
// Delegates with an empty stem-exclusion set.
public FinnishAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords)
    : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
{
}
/// <summary>
/// Builds an analyzer with the given stop words
/// </summary>
/// <param name="matchVersion">
///          lucene compatibility version </param>
/// <param name="stopwords">
///          a stopword set </param>
/// <param name="stemExclutionSet">
///          a stemming exclusion set </param>
// NOTE(review): "stemExclutionSet" is misspelled, but renaming the parameter
// would break callers using named arguments, so it is left as-is.
public FrenchAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclutionSet)
    : base(matchVersion, stopwords)
{
    // Defensive copy frozen read-only so external mutation cannot affect stemming.
    this.excltable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stemExclutionSet));
}
/// <summary>
/// Create a new <see cref="KeepWordFilter"/>.
/// <para><c>NOTE</c>: The words set passed to this constructor will be directly
/// used by this filter and should not be modified.
/// </para>
/// </summary>
/// <param name="version"> the Lucene match version </param>
/// <param name="in"> the <see cref="TokenStream"/> to consume </param>
/// <param name="words"> the words to keep </param>
public KeepWordFilter(LuceneVersion version, TokenStream @in, CharArraySet words)
    : base(version, @in)
{
    this.words = words;
    termAtt = AddAttribute<ICharTermAttribute>();
}
/// <summary>
/// Create a new <see cref="KeepWordFilter"/> with an explicit
/// <paramref name="enablePositionIncrements"/> flag (legacy overload).
/// The words set is used directly and should not be modified by the caller.
/// </summary>
public KeepWordFilter(LuceneVersion version, bool enablePositionIncrements, TokenStream @in, CharArraySet words)
    : base(version, enablePositionIncrements, @in)
{
    this.words = words;
    termAtt = AddAttribute<ICharTermAttribute>();
}
/// <summary>
/// Creates a <see cref="Lucene43EdgeNGramTokenizer"/> with a custom attribute
/// factory; all validation and setup happens in <c>Init</c>.
/// </summary>
public Lucene43EdgeNGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, Side side, int minGram, int maxGram)
    : base(factory, input)
{
    Init(version, side, minGram, maxGram);
}
/// <summary>
/// Builds the named analyzer with no stop words. </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
/// <param name="name"> the Snowball stemmer name </param>
public SnowballAnalyzer(LuceneVersion matchVersion, string name)
{
    this.name = name;
    this.matchVersion = matchVersion;
}
/// <summary>
/// Create a new instance, loading from a previously built
/// <see cref="AnalyzingInfixSuggester"/> directory, if it exists.
/// This directory must be
/// private to the infix suggester (i.e., not an external
/// Lucene index). Note that <see cref="Dispose()"/>
/// will also dispose the provided directory.
/// </summary>
// Uses the same analyzer for both indexing and querying, with the default
// minimum prefix length.
public AnalyzingInfixSuggester(LuceneVersion matchVersion, Directory dir, Analyzer analyzer)
    : this(matchVersion, dir, analyzer, analyzer, DEFAULT_MIN_PREFIX_CHARS)
{
}
/// <summary>
/// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
/// </summary>
/// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
/// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
// Defaults to taking n-grams from the front of each token.
public Lucene43EdgeNGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram)
    : this(version, input, Side.FRONT, minGram, maxGram)
{
}
/// <summary>
/// Builds an analyzer with the given stop words. </summary>
/// <param name="matchVersion"> Lucene compatibility version - See <see cref="StandardAnalyzer"/> </param>
/// <param name="stopWords"> stop words </param>
public StandardAnalyzer(LuceneVersion matchVersion, CharArraySet stopWords)
    : base(matchVersion, stopWords)
{
}
/// <summary>
/// Construct a new LetterTokenizer using a given
/// <seealso cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/>.
/// </summary>
/// <param name="matchVersion">
///          Lucene version to match. See <a href="#version">above</a> </param>
/// <param name="factory">
///          the attribute factory to use for this <seealso cref="Tokenizer"/> </param>
/// <param name="in">
///          the input to split up into tokens </param>
public LetterTokenizer(LuceneVersion matchVersion, AttributeSource.AttributeFactory factory, TextReader @in)
    : base(matchVersion, factory, @in)
{
}
/// <summary>
/// Builds the named analyzer with the given stop words. </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
/// <param name="name"> the Snowball stemmer name </param>
/// <param name="stopWords"> stop words; copied and frozen read-only </param>
public SnowballAnalyzer(LuceneVersion matchVersion, string name, CharArraySet stopWords)
    : this(matchVersion, name)
{
    stopSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stopWords));
}
/// <summary>
/// Builds an analyzer which removes words in
/// <see cref="ENGLISH_STOP_WORDS_SET"/>. </summary>
/// <param name="matchVersion"> See <see cref="LuceneVersion"/> </param>
public StopAnalyzer(LuceneVersion matchVersion)
    : this(matchVersion, ENGLISH_STOP_WORDS_SET)
{
}
/// <summary>
/// Creates a new <see cref="StandardTokenizer"/> with a given <see cref="AttributeSource.AttributeFactory"/>
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version </param>
/// <param name="factory"> attribute factory for this tokenizer </param>
/// <param name="input"> the input reader </param>
public StandardTokenizer(LuceneVersion matchVersion, AttributeFactory factory, TextReader input)
    : base(factory, input)
{
    Init(matchVersion);
}
/// <summary>
/// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
/// </summary>
/// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
/// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
/// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
// Defaults to taking n-grams from the front of each token.
public Lucene43EdgeNGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, int minGram, int maxGram)
    : this(version, factory, input, Side.FRONT, minGram, maxGram)
{
}
// NOTE: This was moved into the QueryParserBase class.
// * The default operator for parsing queries.
// * Use <see cref="QueryParser.DefaultOperator"/> to change it.
// */
//public enum Operator
//{
//    OR,
//    AND
//}

/// <summary>
/// Constructs a query parser.
/// </summary>
/// <param name="matchVersion">Lucene version to match.</param>
/// <param name="f">the default field for query terms.</param>
/// <param name="a">used to find terms in the query text.</param>
// The generated-parser constructor needs a char stream, so bootstrap with an
// empty one; Init then applies the real configuration.
public QueryParser(LuceneVersion matchVersion, string f, Analyzer a)
    : this(new FastCharStream(new StringReader("")))
{
    Init(matchVersion, f, a);
}
/// <summary>
/// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
/// </summary>
/// <param name="version"> the <a href="#version">Lucene match version</a> </param>
/// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
// The trailing `true` selects edge-only gram generation in the base class.
public EdgeNGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram)
    : base(version, input, minGram, maxGram, true)
{
}
/// <summary>
/// Builds an analyzer with the stop words from the given file. </summary>
/// <seealso cref="WordlistLoader.GetWordSet(TextReader, LuceneVersion)"/>
/// <param name="matchVersion"> See <see cref="LuceneVersion"/> </param>
/// <param name="stopwordsFile"> File to load stop words from </param>
public StopAnalyzer(LuceneVersion matchVersion, FileInfo stopwordsFile)
    : this(matchVersion, LoadStopwordSet(stopwordsFile, matchVersion))
{
}
/// <summary>
/// create a new index writer config with random defaults </summary>
// Delegates to the overload below using the test framework's shared Random.
public static IndexWriterConfig NewIndexWriterConfig(LuceneVersion v, Analyzer a)
{
    return NewIndexWriterConfig(Random(), v, a);
}
/// <summary>
/// Construct a new LetterTokenizer.
/// </summary>
/// <param name="matchVersion">
///          Lucene version to match. See <a href="#version">above</a> </param>
/// <param name="in">
///          the input to split up into tokens </param>
public LetterTokenizer(LuceneVersion matchVersion, TextReader @in)
    : base(matchVersion, @in)
{
}
/// <summary>
/// Builds an analyzer with the default stop words: <seealso cref="#DEFAULT_STOPWORD_FILE"/>.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
public DanishAnalyzer(LuceneVersion matchVersion)
    : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
{
}
/// <summary>
/// Verifies that <paramref name="enablePositionIncrements"/> is legal for the
/// requested <paramref name="version"/>: disabling position increments is
/// rejected for Lucene 4.4+ because it can produce broken token streams.
/// </summary>
private static void CheckPositionIncrement(LuceneVersion version, bool enablePositionIncrements)
{
    if (enablePositionIncrements)
    {
        return; // enabling increments is always allowed
    }
#pragma warning disable 612, 618
    bool disallowed = version.OnOrAfter(LuceneVersion.LUCENE_44);
#pragma warning restore 612, 618
    if (disallowed)
    {
        throw new System.ArgumentException("enablePositionIncrements=false is not supported anymore as of Lucene 4.4 as it can create broken token streams");
    }
}
/// <summary>
/// Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
/// provided this analyzer will add a <seealso cref="SetKeywordMarkerFilter"/> before
/// stemming.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
/// <param name="stopwords"> a stopword set </param>
/// <param name="stemExclusionSet"> a set of terms not to be stemmed </param>
public DanishAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet)
    : base(matchVersion, stopwords)
{
    // Defensive copy frozen read-only so external mutation cannot affect stemming.
    this.stemExclusionSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stemExclusionSet));
}
// NOTE: This was moved into the QueryParserBase class.
///* The default operator_Renamed for parsing queries.
// * Use {@link QueryParser#setDefaultOperator} to change it.
// */
//public enum Operator
//{
//    OR,
//    AND
//}

/// <summary>
/// Constructs a query parser.
/// </summary>
/// <param name="matchVersion">Lucene version to match.</param>
/// <param name="f">the default field for query terms.</param>
/// <param name="a">used to find terms in the query text.</param>
// The generated-parser constructor needs a char stream, so bootstrap with an
// empty one; Init then applies the real configuration.
public QueryParser(LuceneVersion matchVersion, string f, Analyzer a)
    : this(new FastCharStream(new StringReader("")))
{
    Init(matchVersion, f, a);
}
/// <summary>
/// Create a new instance, loading from a previously built
/// directory, if it exists.
/// </summary>
// Defaults: position-linear blending with the default expansion factor.
public BlendedInfixSuggester(LuceneVersion matchVersion, Directory dir, Analyzer analyzer)
    : base(matchVersion, dir, analyzer)
{
    this.blenderType = BlenderType.POSITION_LINEAR;
    this.numFactor = DEFAULT_NUM_FACTOR;
}
/// <summary>
/// Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
/// provided this analyzer will add a <see cref="SetKeywordMarkerFilter"/> before
/// stemming.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
/// <param name="stopwords"> a stopword set </param>
/// <param name="stemExclusionSet"> a set of terms not to be stemmed </param>
public NorwegianAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet)
    : base(matchVersion, stopwords)
{
    // Defensive copy frozen read-only so external mutation cannot affect stemming.
    this.stemExclusionSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stemExclusionSet));
}
/// <summary>
/// Creates a MultiFieldQueryParser.
///
/// <para/>
/// It will, when <see cref="QueryParserBase.Parse(string)"/> is called, construct a query like this
/// (assuming the query consists of two terms and you specify the two fields
/// <c>title</c> and <c>body</c>):
/// <para/>
///
/// <code>
/// (title:term1 body:term1) (title:term2 body:term2)
/// </code>
///
/// <para/>
/// When <see cref="QueryParserBase.DefaultOperator"/> is set to <see cref="QueryParserBase.AND_OPERATOR"/>, the result will be:
/// <para/>
///
/// <code>
/// +(title:term1 body:term1) +(title:term2 body:term2)
/// </code>
///
/// <para/>
/// In other words, all the query's terms must appear, but it doesn't matter
/// in what fields they appear.
/// <para/>
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version </param>
/// <param name="fields"> the fields each term is expanded across </param>
/// <param name="analyzer"> analyzer used to find terms in the query text </param>
// Passes null as the default field; each term is expanded over m_fields instead.
public MultiFieldQueryParser(LuceneVersion matchVersion, string[] fields, Analyzer analyzer)
    : base(matchVersion, null, analyzer)
{
    this.m_fields = fields;
}
/// <summary>
/// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
/// </summary>
/// <param name="version"> the <a href="#version">Lucene match version</a> </param>
/// <param name="factory"> <seealso cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/> to use </param>
/// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
// The trailing `true` selects edge-only gram generation in the base class.
public EdgeNGramTokenizer(LuceneVersion version, AttributeSource.AttributeFactory factory, TextReader input, int minGram, int maxGram)
    : base(version, factory, input, minGram, maxGram, true)
{
}
/// <summary>
/// Builds an analyzer with the default stop words: <see cref="DEFAULT_STOPWORD_FILE"/>.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
public NorwegianAnalyzer(LuceneVersion matchVersion)
    : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
{
}
/// <summary>
/// create a new index writer config with random defaults using the specified random </summary>
/// <param name="r"> source of randomness driving every configuration choice below </param>
/// <param name="v"> Lucene compatibility version for the config </param>
/// <param name="a"> analyzer installed in the config </param>
// NOTE(review): the sequence of draws from <paramref name="r"/> is part of the
// test framework's reproducibility contract — do not reorder the blocks below.
public static IndexWriterConfig NewIndexWriterConfig(Random r, LuceneVersion v, Analyzer a)
{
    IndexWriterConfig c = new IndexWriterConfig(v, a);
    c.SetSimilarity(ClassEnvRule.Similarity);
    if (VERBOSE)
    {
        // Even though TestRuleSetupAndRestoreClassEnv calls
        // InfoStream.setDefault, we do it again here so that
        // the PrintStreamInfoStream.messageID increments so
        // that when there are separate instances of
        // IndexWriter created we see "IW 0", "IW 1", "IW 2",
        // ... instead of just always "IW 0":
        c.InfoStream = new TestRuleSetupAndRestoreClassEnv.ThreadNameFixingPrintStreamInfoStream(Console.Out);
    }
    // Randomly pick a merge scheduler.
    if (r.NextBoolean())
    {
        c.SetMergeScheduler(new SerialMergeScheduler());
    }
    else if (Rarely(r))
    {
        int maxThreadCount = TestUtil.NextInt(Random(), 1, 4);
        int maxMergeCount = TestUtil.NextInt(Random(), maxThreadCount, maxThreadCount + 4);
        ConcurrentMergeScheduler cms = new ConcurrentMergeScheduler();
        cms.SetMaxMergesAndThreads(maxMergeCount, maxThreadCount);
        c.SetMergeScheduler(cms);
    }
    // Randomly vary the max buffered docs.
    if (r.NextBoolean())
    {
        if (Rarely(r))
        {
            // crazy value
            c.SetMaxBufferedDocs(TestUtil.NextInt(r, 2, 15));
        }
        else
        {
            // reasonable value
            c.SetMaxBufferedDocs(TestUtil.NextInt(r, 16, 1000));
        }
    }
    // Randomly vary the term index interval.
    if (r.NextBoolean())
    {
        if (Rarely(r))
        {
            // crazy value
            c.SetTermIndexInterval(r.NextBoolean() ? TestUtil.NextInt(r, 1, 31) : TestUtil.NextInt(r, 129, 1000));
        }
        else
        {
            // reasonable value
            c.SetTermIndexInterval(TestUtil.NextInt(r, 32, 128));
        }
    }
    // Randomly vary the indexer thread pool / max thread states.
    if (r.NextBoolean())
    {
        int maxNumThreadStates = Rarely(r) ? TestUtil.NextInt(r, 5, 20) : TestUtil.NextInt(r, 1, 4); // reasonable value - crazy value
        if (Rarely(r))
        {
            // Retrieve the package-private setIndexerThreadPool
            // method:
            MethodInfo setIndexerThreadPoolMethod = typeof(IndexWriterConfig).GetMethod("SetIndexerThreadPool", new Type[] { typeof(DocumentsWriterPerThreadPool) });
            //setIndexerThreadPoolMethod.setAccessible(true);
            Type clazz = typeof(RandomDocumentsWriterPerThreadPool);
            ConstructorInfo ctor = clazz.GetConstructor(new[] { typeof(int), typeof(Random) });
            //ctor.Accessible = true;
            // random thread pool
            setIndexerThreadPoolMethod.Invoke(c, new[] { ctor.Invoke(new object[] { maxNumThreadStates, r }) });
        }
        else
        {
            // random thread pool
            c.SetMaxThreadStates(maxNumThreadStates);
        }
    }
    c.SetMergePolicy(NewMergePolicy(r));
    if (Rarely(r))
    {
        c.SetMergedSegmentWarmer(new SimpleMergedSegmentWarmer(c.InfoStream));
    }
    c.SetUseCompoundFile(r.NextBoolean());
    c.SetReaderPooling(r.NextBoolean());
    c.SetReaderTermsIndexDivisor(TestUtil.NextInt(r, 1, 4));
    c.SetCheckIntegrityAtMerge(r.NextBoolean());
    return c;
}
/// <summary>
/// Builds an analyzer with the given stop words.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
/// <param name="stopwords"> a stopword set </param>
// Delegates with an empty stem-exclusion set.
public NorwegianAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords)
    : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
{
}
/// <summary>
/// Builds an analyzer with the given stop words.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
/// <param name="stopwords"> a stopword set </param>
// Delegates with an empty stem-exclusion set.
public DanishAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords)
    : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
{
}
/// <summary>
/// Test helper subclass constructor; simply forwards to the
/// <see cref="LetterTokenizer"/> base constructor.
/// </summary>
public LetterTokenizerAnonymousInnerClassHelper(LuceneVersion TEST_VERSION_CURRENT, TextReader reader)
    : base(TEST_VERSION_CURRENT, reader)
{
}
/// <summary>
/// Returns the <seealso cref="CharacterUtils"/> implementation appropriate for
/// the given <seealso cref="LuceneVersion"/>: the codepoint-aware (Java 5 style)
/// implementation for 3.1 and later, otherwise the BMP-only (Java 4 style) one.
/// </summary>
/// <param name="matchVersion"> a version instance </param>
/// <returns> the matching <seealso cref="CharacterUtils"/> implementation </returns>
public static CharacterUtils GetInstance(LuceneVersion matchVersion)
{
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
    {
        return JAVA_5;
    }
    return JAVA_4;
}
/// <summary>
/// Create a new instance, loading from a previously built
/// directory, if it exists.
/// </summary>
/// <param name="blenderType"> Type of blending strategy, see BlenderType for more precisions </param>
/// <param name="numFactor"> Factor to multiply the number of searched elements before ponderate </param>
/// <exception cref="IOException"> If there are problems opening the underlying Lucene index. </exception>
public BlendedInfixSuggester(LuceneVersion matchVersion, Directory dir, Analyzer indexAnalyzer, Analyzer queryAnalyzer, int minPrefixChars, BlenderType blenderType, int numFactor)
    : base(matchVersion, dir, indexAnalyzer, queryAnalyzer, minPrefixChars)
{
    this.blenderType = blenderType;
    this.numFactor = numFactor;
}