/// <summary>
/// Builds a <see cref="PatternKeywordMarkerFilter"/> that marks the current token
/// as a keyword (via the keyword attribute) when the token's term buffer matches
/// the supplied pattern.
/// </summary>
/// <param name="in">the <see cref="TokenStream"/> to filter</param>
/// <param name="pattern">the pattern to apply to the incoming term buffer</param>
public PatternKeywordMarkerFilter(TokenStream @in, Pattern pattern)
    : base(@in)
{
    this.termAtt = AddAttribute<ICharTermAttribute>();

    // The matcher is created once against an empty input.
    matcher = pattern.matcher("");
}
/// <summary>
/// Wraps <paramref name="input"/>, sets <c>Pos</c> to zero and obtains the payload
/// and term attributes from the wrapped stream.
/// </summary>
/// <param name="input">the stream to wrap</param>
public SimplePayloadFilter(TokenStream input)
    : base(input)
{
    this.Pos = 0;
    this.PayloadAttr = input.AddAttribute<IPayloadAttribute>();
    this.TermAttr = input.AddAttribute<ICharTermAttribute>();
}
/// <summary>
/// Initializes a MockTokenFilter over <paramref name="input"/>.
/// </summary>
/// <param name="input">the <see cref="TokenStream"/> to filter</param>
/// <param name="filter">DFA representing the terms that should be removed</param>
public MockTokenFilter(TokenStream input, CharacterRunAutomaton filter)
    : base(input)
{
    Filter = filter;
    this.TermAtt = AddAttribute<ICharTermAttribute>();
    this.PosIncrAtt = AddAttribute<IPositionIncrementAttribute>();
}
/// <summary>
/// Shared constructor for compound-word filters: registers the term, offset and
/// position-increment attributes, creates the token list and stores the size limits
/// and dictionary.
/// </summary>
/// <param name="matchVersion">version value stored on the filter</param>
/// <param name="input">the stream to wrap</param>
/// <param name="dictionary">dictionary stored on the filter</param>
/// <param name="minWordSize">must not be negative</param>
/// <param name="minSubwordSize">must not be negative</param>
/// <param name="maxSubwordSize">must not be negative</param>
/// <param name="onlyLongestMatch">flag stored on the filter</param>
/// <exception cref="System.ArgumentException">if any of the three size arguments is negative</exception>
protected CompoundWordTokenFilterBase(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch)
    : base(input)
{
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.offsetAtt = AddAttribute<IOffsetAttribute>();
    this.posIncAtt = AddAttribute<IPositionIncrementAttribute>();

    this.matchVersion = matchVersion;
    tokens = new LinkedList<CompoundToken>();

    // The checks run in the same order as before, so the first offending
    // argument still determines which message is reported.
    if (minWordSize < 0)
    {
        throw new System.ArgumentException("minWordSize cannot be negative");
    }
    if (minSubwordSize < 0)
    {
        throw new System.ArgumentException("minSubwordSize cannot be negative");
    }
    if (maxSubwordSize < 0)
    {
        throw new System.ArgumentException("maxSubwordSize cannot be negative");
    }

    this.minWordSize = minWordSize;
    this.minSubwordSize = minSubwordSize;
    this.maxSubwordSize = maxSubwordSize;
    this.onlyLongestMatch = onlyLongestMatch;
    this.dictionary = dictionary;
}
/// <summary>
/// Create a new UpperCaseFilter, that normalizes token text to upper case.
/// </summary>
/// <param name="matchVersion">the match version used to select the <c>CharacterUtils</c> instance</param>
/// <param name="in">the <see cref="TokenStream"/> to filter</param>
public UpperCaseFilter(LuceneVersion matchVersion, TokenStream @in)
    : base(@in)
{
    // The attribute was previously requested and assigned twice in a row;
    // a single registration is sufficient.
    termAtt = AddAttribute<ICharTermAttribute>();
    charUtils = CharacterUtils.GetInstance(matchVersion);
}
/// <summary>
/// Wraps <paramref name="in"/>, builds the stop table from <c>STOP_WORDS</c> and
/// registers the term attribute.
/// </summary>
/// <param name="in">the stream to filter</param>
public ChineseFilter(TokenStream @in)
    : base(@in)
{
    var stopWordList = Arrays.AsList(STOP_WORDS);
    this.stopTable = new CharArraySet(LuceneVersion.LUCENE_CURRENT, stopWordList, false);
    this.termAtt = AddAttribute<ICharTermAttribute>();
}
/// <summary>
/// Creates a new NorwegianMinimalStemFilter backed by a
/// <c>NorwegianMinimalStemmer</c> constructed with <paramref name="flags"/>.
/// </summary>
/// <param name="input">the stream to filter</param>
/// <param name="flags">
/// passed unchanged to the stemmer; the earlier documentation describes it as
/// <c>NorwegianLightStemmer.BOKMAAL</c>, <c>NorwegianLightStemmer.NYNORSK</c>, or both
/// </param>
public NorwegianMinimalStemFilter(TokenStream input, int flags)
    : base(input)
{
    stemmer = new NorwegianMinimalStemmer(flags);
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.keywordAttr = AddAttribute<IKeywordAttribute>();
}
/// <summary>
/// Create a new <see cref="CodepointCountFilter"/>. This will filter out tokens whose
/// term is either too short (code point count less than <paramref name="min"/>) or
/// too long (code point count greater than <paramref name="max"/>).
/// </summary>
/// <param name="version">the Lucene match version</param>
/// <param name="in">the <see cref="TokenStream"/> to consume</param>
/// <param name="min">the minimum length</param>
/// <param name="max">the maximum length</param>
public CodepointCountFilter(LuceneVersion version, TokenStream @in, int min, int max)
    : base(version, @in)
{
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.min = min;
    this.max = max;
}
/// <summary>
/// Create a new <see cref="ASCIIFoldingFilter"/>.
/// </summary>
/// <param name="input">the <see cref="TokenStream"/> to filter</param>
/// <param name="preserveOriginal">
/// should the original tokens be kept on the input stream with a 0 position
/// increment from the folded tokens?
/// </param>
public ASCIIFoldingFilter(TokenStream input, bool preserveOriginal)
    : base(input)
{
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.posIncAttr = AddAttribute<IPositionIncrementAttribute>();
    this.preserveOriginal = preserveOriginal;
}
/// <summary>
/// Create a new IndonesianStemFilter.
/// <para>
/// If <paramref name="stemDerivational"/> is false, only inflectional suffixes
/// (particles and possessive pronouns) are stemmed.
/// </para>
/// </summary>
/// <param name="input">the stream to filter</param>
/// <param name="stemDerivational">flag stored on the filter, see above</param>
public IndonesianStemFilter(TokenStream input, bool stemDerivational)
    : base(input)
{
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.keywordAtt = AddAttribute<IKeywordAttribute>();
    this.stemDerivational = stemDerivational;
}
/// <summary>
/// Wraps <paramref name="input"/> and stores the supplied stemmer; registers the
/// term and keyword attributes.
/// </summary>
/// <param name="input">the stream to filter</param>
/// <param name="stemmer">the stemmer instance stored on the filter</param>
public SnowballFilter(TokenStream input, SnowballProgram stemmer)
    : base(input)
{
    termAtt = AddAttribute<ICharTermAttribute>();
    keywordAttr = AddAttribute<IKeywordAttribute>();
    this.stemmer = stemmer;
}
/// <summary>
/// Creates a tokenizer over <paramref name="reader"/> that keeps the supplied
/// token array and registers the term and position-increment attributes.
/// </summary>
/// <param name="reader">reader handed to the base tokenizer</param>
/// <param name="tokens">token array stored on the tokenizer</param>
public CannedTokenizer(System.IO.TextReader reader, TokenAndPos[] tokens)
    : base(reader)
{
    termAtt = AddAttribute<ICharTermAttribute>();
    posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    this.tokens = tokens;
}
/// <summary>
/// Create filter using the supplied stemming table.
/// </summary>
/// <param name="in">input token stream</param>
/// <param name="stemmer">stemmer</param>
/// <param name="minLength">
/// For performance reasons words shorter than minLength characters are not
/// processed, but simply returned.
/// </param>
public StempelFilter(TokenStream @in, StempelStemmer stemmer, int minLength)
    : base(@in)
{
    termAtt = AddAttribute<ICharTermAttribute>();
    keywordAtt = AddAttribute<IKeywordAttribute>();
    this.stemmer = stemmer;
    this.minLength = minLength;
}
/// <summary>
/// Create a new StemmerOverrideFilter, performing dictionary-based stemming
/// with the provided dictionary.
/// <para>
/// Any dictionary-stemmed terms will be marked with the keyword attribute
/// so that they will not be stemmed with stemmers down the chain.
/// </para>
/// </summary>
/// <param name="input">the stream to filter</param>
/// <param name="stemmerOverrideMap">the override map; its <c>BytesReader</c> is read once here</param>
public StemmerOverrideFilter(TokenStream input, StemmerOverrideMap stemmerOverrideMap)
    : base(input)
{
    this.stemmerOverrideMap = stemmerOverrideMap;
    this.fstReader = stemmerOverrideMap.BytesReader;

    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.keywordAtt = AddAttribute<IKeywordAttribute>();
}
/// <summary>
/// Constructs an instance to replace either the first, or all occurrences.
/// </summary>
/// <param name="in">the <see cref="TokenStream"/> to process</param>
/// <param name="pattern">the pattern (a <see cref="Regex"/> object) to apply to each token</param>
/// <param name="replacement">
/// the "replacement string" to substitute; if null a blank string will be used.
/// Note that this is not the literal string that will be used, '$' and '\' have
/// special meaning (the earlier documentation points to <c>Matcher#quoteReplacement</c>).
/// </param>
/// <param name="all">if true, all matches will be replaced, otherwise just the first match</param>
public PatternReplaceFilter(TokenStream @in, Regex pattern, string replacement, bool all)
    : base(@in)
{
    this.replacement = replacement ?? "";
    this.all = all;
    this.pattern = pattern;
    this.termAtt = AddAttribute<ICharTermAttribute>();
}
/// <summary>
/// Creates a tokenizer over <paramref name="input"/> and registers the term, offset,
/// position-increment and position-length attributes (in that order).
/// </summary>
/// <param name="input">reader handed to the base tokenizer</param>
public GraphTokenizer(TextReader input)
    : base(input)
{
    this.TermAtt = AddAttribute<ICharTermAttribute>();
    this.OffsetAtt = AddAttribute<IOffsetAttribute>();
    this.PosIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    this.PosLengthAtt = AddAttribute<IPositionLengthAttribute>();
}
/// <summary>
/// Wraps <paramref name="input"/>, stores the delimiter character and payload
/// encoder, and registers the term and payload attributes.
/// </summary>
/// <param name="input">the stream to filter</param>
/// <param name="delimiter">delimiter character stored on the filter</param>
/// <param name="encoder">payload encoder stored on the filter</param>
public DelimitedPayloadTokenFilter(TokenStream input, char delimiter, IPayloadEncoder encoder)
    : base(input)
{
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.payAtt = AddAttribute<IPayloadAttribute>();
    this.delimiter = delimiter;
    this.encoder = encoder;
}
/// <summary>
/// Creates a tokenizer over <paramref name="reader"/> and stores the value, random
/// source, document percentage and maximum term frequency; registers the term attribute.
/// </summary>
/// <param name="reader">reader handed to the base tokenizer</param>
/// <param name="val">stored in <c>Value</c></param>
/// <param name="random">stored in <c>Random</c></param>
/// <param name="percentDocs">stored in <c>PercentDocs</c></param>
/// <param name="maxTF">stored in <c>MaxTF</c></param>
public RepeatingTokenizer(TextReader reader, string val, Random random, float percentDocs, int maxTF)
    : base(reader)
{
    Value = val;
    this.Random = random;
    PercentDocs = percentDocs;
    MaxTF = maxTF;
    TermAtt = AddAttribute<ICharTermAttribute>();
}
/// <summary>
/// Sole constructor. Stores the stop-word set and registers the term,
/// position-increment, keyword and offset attributes (in that order).
/// </summary>
/// <param name="input">the stream to filter</param>
/// <param name="stopWords">stop-word set stored on the filter</param>
public SuggestStopFilter(TokenStream input, CharArraySet stopWords)
    : base(input)
{
    this.stopWords = stopWords;

    termAtt = AddAttribute<ICharTermAttribute>();
    posIncAtt = AddAttribute<IPositionIncrementAttribute>();
    keywordAtt = AddAttribute<IKeywordAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
}
/// <summary>
/// Wraps <paramref name="input"/> and stores the length limit; registers the term
/// and keyword attributes.
/// </summary>
/// <param name="input">the stream to filter</param>
/// <param name="length">length limit stored on the filter; must be at least 1</param>
/// <exception cref="System.ArgumentOutOfRangeException">if <paramref name="length"/> is less than 1</exception>
public TruncateTokenFilter(TokenStream input, int length)
    : base(input)
{
    if (length < 1)
    {
        // The single-string constructor of ArgumentOutOfRangeException interprets
        // its argument as the parameter name, so the text must go in the message slot.
        throw new System.ArgumentOutOfRangeException(nameof(length), "length parameter must be a positive number: " + length);
    }

    this.length = length;
    this.termAttribute = AddAttribute<ICharTermAttribute>();
    this.keywordAttr = AddAttribute<IKeywordAttribute>();
}
/// <summary>
/// Pops one input token's worth of tokens off the filter and verifies them: the
/// folded form always comes first, and the unfolded form follows only when the
/// filter preserves originals and folding changed the text.
/// </summary>
/// <param name="expectedUnfolded">the expected original term text</param>
/// <param name="expectedFolded">the expected folded term text</param>
/// <param name="filter">the filter to advance</param>
/// <param name="termAtt">term attribute read after each advance</param>
internal virtual void assertNextTerms(string expectedUnfolded, string expectedFolded, ASCIIFoldingFilter filter, ICharTermAttribute termAtt)
{
    assertTrue(filter.IncrementToken());
    assertEquals(expectedFolded, termAtt.ToString());

    // No second token is expected unless originals are preserved and differ.
    if (!filter.PreserveOriginal || expectedUnfolded.Equals(expectedFolded))
    {
        return;
    }

    assertTrue(filter.IncrementToken());
    assertEquals(expectedUnfolded, termAtt.ToString());
}
/// <summary>
/// Construct a token stream filtering the given input using a set of common
/// words to create bigrams. Outputs both unigrams with position increment and
/// bigrams with position increment 0 type=gram where one or both of the words
/// in a potential bigram are in the set of common words.
/// </summary>
/// <param name="matchVersion">not read by this constructor</param>
/// <param name="input">TokenStream input in filter chain</param>
/// <param name="commonWords">the set of common words</param>
public CommonGramsFilter(LuceneVersion matchVersion, TokenStream input, CharArraySet commonWords)
    : base(input)
{
    this.termAttribute = AddAttribute<ICharTermAttribute>();
    this.offsetAttribute = AddAttribute<IOffsetAttribute>();
    this.typeAttribute = AddAttribute<ITypeAttribute>();
    this.posIncAttribute = AddAttribute<IPositionIncrementAttribute>();
    this.posLenAttribute = AddAttribute<IPositionLengthAttribute>();

    this.commonWords = commonWords;
}
/// <summary>
/// Wraps <paramref name="input"/>, stores the field name, zeroes the <c>Pos</c> and
/// <c>i</c> counters and obtains the position-increment, payload and term attributes
/// from the wrapped stream.
/// </summary>
/// <param name="input">the stream to wrap</param>
/// <param name="fieldName">stored in <c>FieldName</c></param>
public MockPayloadFilter(TokenStream input, string fieldName)
    : base(input)
{
    FieldName = fieldName;
    this.Pos = 0;
    this.i = 0;

    this.PosIncrAttr = input.AddAttribute<IPositionIncrementAttribute>();
    this.PayloadAttr = input.AddAttribute<IPayloadAttribute>();
    this.TermAttr = input.AddAttribute<ICharTermAttribute>();
}
/// <summary>
/// Creates a new ThaiTokenizer, supplying the AttributeFactory. The base class is
/// given a sentence break iterator; a word break iterator is wrapped in a
/// <c>ThaiWordBreaker</c> here.
/// </summary>
/// <param name="factory">attribute factory handed to the base class</param>
/// <param name="reader">reader handed to the base class</param>
/// <exception cref="System.NotSupportedException">if <c>DBBI_AVAILABLE</c> is false</exception>
public ThaiTokenizer(AttributeFactory factory, TextReader reader)
    : base(factory, reader, BreakIterator.CreateSentenceInstance(Locale.GetUS()))
{
    if (!DBBI_AVAILABLE)
    {
        throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
    }

    var wordIterator = BreakIterator.CreateWordInstance(Locale.GetUS());
    this.wordBreaker = new ThaiWordBreaker(wordIterator);
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.offsetAtt = AddAttribute<IOffsetAttribute>();
}
/// <summary>
/// Creates a stream over the supplied tokens and registers the term, offset,
/// position-increment, flags, type and payload attributes (in that order).
/// </summary>
/// <param name="tokens">token array stored on the stream</param>
public IterTokenStream(params Token[] tokens)
{
    this.tokens = tokens;

    termAtt = AddAttribute<ICharTermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
    posIncAtt = AddAttribute<IPositionIncrementAttribute>();
    flagsAtt = AddAttribute<IFlagsAttribute>();
    typeAtt = AddAttribute<ITypeAttribute>();
    payloadAtt = AddAttribute<IPayloadAttribute>();
}
/// <summary>
/// Wraps <paramref name="in"/> and registers the term and offset attributes.
/// </summary>
/// <param name="version">match version; offset updating is rejected from LUCENE_44 on</param>
/// <param name="in">the stream to filter</param>
/// <param name="updateOffsets">flag stored on the filter</param>
/// <exception cref="System.ArgumentException">
/// if <paramref name="updateOffsets"/> is true and <paramref name="version"/> is on or after LUCENE_44
/// </exception>
public TrimFilter(LuceneVersion version, TokenStream @in, bool updateOffsets)
    : base(@in)
{
    bool offsetsRejected = updateOffsets && version.OnOrAfter(LuceneVersion.LUCENE_44);
    if (offsetsRejected)
    {
        throw new System.ArgumentException("updateOffsets=true is not supported anymore as of Lucene 4.4");
    }

    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.offsetAtt = AddAttribute<IOffsetAttribute>();
    this.updateOffsets = updateOffsets;
}
// only if the length changed before this filter
private bool hasIllegalOffsets = false;

/// <summary>
/// Creates a new ThaiWordFilter with the specified match version. For versions
/// before LUCENE_31 the input is first wrapped in a <c>LowerCaseFilter</c>.
/// </summary>
/// <param name="matchVersion">match version; also decides <c>handlePosIncr</c></param>
/// <param name="input">the stream to filter</param>
/// <exception cref="System.NotSupportedException">if <c>DBBI_AVAILABLE</c> is false</exception>
public ThaiWordFilter(LuceneVersion matchVersion, TokenStream input)
    : base(matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) ? input : new LowerCaseFilter(matchVersion, input))
{
    if (!DBBI_AVAILABLE)
    {
        throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
    }

    this.handlePosIncr = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.offsetAtt = AddAttribute<IOffsetAttribute>();
    this.posAtt = AddAttribute<IPositionIncrementAttribute>();
}
/// <summary>
/// Creates a tokenizer with the given attribute factory and input, registers the
/// term and offset attributes, and resizes the term buffer to
/// <paramref name="bufferSize"/>.
/// </summary>
/// <param name="factory">attribute factory handed to the base class</param>
/// <param name="input">input handed to the base class</param>
/// <param name="bufferSize">term buffer size; must be greater than zero</param>
/// <exception cref="System.ArgumentException">if <paramref name="bufferSize"/> is zero or negative</exception>
public KeywordTokenizer(AttributeSource.AttributeFactory factory, Reader input, int bufferSize)
    : base(factory, input)
{
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.offsetAtt = AddAttribute<IOffsetAttribute>();

    if (bufferSize <= 0)
    {
        throw new System.ArgumentException("bufferSize must be > 0");
    }

    this.termAtt.ResizeBuffer(bufferSize);
}
/// <summary>
/// Create a new <see cref="LengthFilter"/>. This will filter out tokens whose
/// term is either too short (length less than <paramref name="min"/>) or too
/// long (length greater than <paramref name="max"/>).
/// </summary>
/// <param name="version">the Lucene match version</param>
/// <param name="in">the <see cref="TokenStream"/> to consume</param>
/// <param name="min">the minimum length; must be zero or greater</param>
/// <param name="max">the maximum length; must not be less than <paramref name="min"/></param>
/// <exception cref="ArgumentOutOfRangeException">
/// if <paramref name="min"/> is negative or greater than <paramref name="max"/>
/// </exception>
public LengthFilter(LuceneVersion version, TokenStream @in, int min, int max)
    : base(version, @in)
{
    // ArgumentOutOfRangeException(string) treats its argument as the parameter
    // name, so the two-argument overload is required to carry a message.
    if (min < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(min), "minimum length must be greater than or equal to zero");
    }
    if (min > max)
    {
        // The previous text said "greater than", which is the opposite of the condition.
        throw new ArgumentOutOfRangeException(nameof(max), "maximum length must not be less than minimum length");
    }

    this.min = min;
    this.max = max;
    this.termAtt = AddAttribute<ICharTermAttribute>();
}
/// <summary>
/// If inputText is non-null, and the TokenStream has offsets, we include the
/// surface form in each arc's label.
/// </summary>
/// <param name="inputText">stored in <c>InputText</c>; may be null</param>
/// <param name="in">the stream whose attributes are read</param>
/// <param name="out">the writer stored on this instance</param>
public TokenStreamToDot(string inputText, TokenStream @in, TextWriter @out)
{
    this.@in = @in;
    this.@out = @out;
    InputText = inputText;

    TermAtt = @in.AddAttribute<ICharTermAttribute>();
    PosIncAtt = @in.AddAttribute<IPositionIncrementAttribute>();
    PosLengthAtt = @in.AddAttribute<IPositionLengthAttribute>();

    // The offset attribute is only picked up when the stream already has one.
    OffsetAtt = @in.HasAttribute<IOffsetAttribute>()
        ? @in.AddAttribute<IOffsetAttribute>()
        : null;
}
/// <summary>
/// Wraps <paramref name="input"/>, keeps a reference to the enclosing test instance
/// and registers the term and type attributes.
/// </summary>
/// <param name="outerInstance">the enclosing test instance</param>
/// <param name="input">the stream to filter</param>
internal WordTokenFilter(TypeAsPayloadTokenFilterTest outerInstance, TokenStream input)
    : base(input)
{
    this.outerInstance = outerInstance;
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.typeAtt = AddAttribute<ITypeAttribute>();
}
/// <summary>
/// Pops one input token's worth of tokens off the filter and verifies them: the
/// folded form always comes first, and the unfolded form follows only when the
/// filter preserves originals and the two forms differ under ordinal comparison.
/// </summary>
/// <param name="expectedUnfolded">the expected original term text</param>
/// <param name="expectedFolded">the expected folded term text</param>
/// <param name="filter">the filter to advance</param>
/// <param name="termAtt">term attribute read after each advance</param>
internal virtual void assertNextTerms(string expectedUnfolded, string expectedFolded, ASCIIFoldingFilter filter, ICharTermAttribute termAtt)
{
    assertTrue(filter.IncrementToken());
    assertEquals(expectedFolded, termAtt.ToString());

    // No second token is expected unless originals are preserved and differ.
    if (!filter.PreserveOriginal || expectedUnfolded.Equals(expectedFolded, StringComparison.Ordinal))
    {
        return;
    }

    assertTrue(filter.IncrementToken());
    assertEquals(expectedUnfolded, termAtt.ToString());
}
/// <summary>
/// Builds <paramref name="len"/> randomly generated token descriptions and registers
/// the attributes used to expose them.
/// <para>
/// For every index a term (and its bytes) is picked from the sample arrays by the
/// same random index, and a position increment is drawn (at least 1 for the first
/// token, at least 0 afterwards, at most 10). When
/// <paramref name="offsetsGoBackwards"/> is true the start and end offsets are
/// independent random integers; otherwise each start offset is the previous start
/// offset plus a non-negative random step and each end offset is its start offset
/// plus a non-negative random length.
/// </para>
/// <para>
/// Positions are then accumulated from the increments (the first position is its
/// increment minus one). Payloads are either one shared random payload for all
/// tokens (when <c>Rarely()</c> returns true) or a fresh random payload per token.
/// Finally the position-to-indices and start-offset-to-indices maps and the
/// per-term frequency map are filled, a <c>PermissiveOffsetAttribute</c>
/// implementation is added, and the term, position-increment, offset and payload
/// attributes are registered.
/// </para>
/// </summary>
/// <param name="baseTermVectorsFormatTestCase">test case used as the source of random payloads</param>
/// <param name="len">number of tokens to generate</param>
/// <param name="sampleTerms">candidate term texts</param>
/// <param name="sampleTermBytes">candidate term bytes, indexed in parallel with <paramref name="sampleTerms"/></param>
/// <param name="offsetsGoBackwards">when true, offsets are drawn without any ordering constraint</param>
// NOTE: the order of the random draws below determines the generated data for a
// given seed, so statements must not be reordered.
protected internal RandomTokenStream(BaseTermVectorsFormatTestCase baseTermVectorsFormatTestCase, int len, string[] sampleTerms, BytesRef[] sampleTermBytes, bool offsetsGoBackwards) { terms = new string[len]; termBytes = new BytesRef[len]; positionsIncrements = new int[len]; positions = new int[len]; startOffsets = new int[len]; endOffsets = new int[len]; payloads = new BytesRef[len]; for (int i = 0; i < len; ++i) { int o = Random.Next(sampleTerms.Length); terms[i] = sampleTerms[o]; termBytes[i] = sampleTermBytes[o]; positionsIncrements[i] = TestUtil.NextInt32(Random, i == 0 ? 1 : 0, 10); if (offsetsGoBackwards) { startOffsets[i] = Random.Next(); endOffsets[i] = Random.Next(); } else { if (i == 0) { startOffsets[i] = TestUtil.NextInt32(Random, 0, 1 << 16); } else { startOffsets[i] = startOffsets[i - 1] + TestUtil.NextInt32(Random, 0, Rarely() ? 1 << 16 : 20); } endOffsets[i] = startOffsets[i] + TestUtil.NextInt32(Random, 0, Rarely() ? 1 << 10 : 20); } } for (int i = 0; i < len; ++i) { if (i == 0) { positions[i] = positionsIncrements[i] - 1; } else { positions[i] = positions[i - 1] + positionsIncrements[i]; } } if (Rarely()) { Arrays.Fill(payloads, baseTermVectorsFormatTestCase.RandomPayload()); } else { for (int i = 0; i < len; ++i) { payloads[i] = baseTermVectorsFormatTestCase.RandomPayload(); } } positionToTerms = new Dictionary <int?, ISet <int?> >(len); startOffsetToTerms = new Dictionary <int?, ISet <int?> >(len); for (int i = 0; i < len; ++i) { if (!positionToTerms.TryGetValue(positions[i], out ISet <int?> positionTerms)) { positionToTerms[positions[i]] = positionTerms = new JCG.HashSet <int?>(1); } positionTerms.Add(i); if (!startOffsetToTerms.TryGetValue(startOffsets[i], out ISet <int?> startOffsetTerms)) { startOffsetToTerms[startOffsets[i]] = startOffsetTerms = new JCG.HashSet <int?>(1); } startOffsetTerms.Add(i); } freqs = new Dictionary <string, int?>(); foreach (string term in terms) { if (freqs.TryGetValue(term, out int?freq)) { freqs[term] = freq + 
1; } else { freqs[term] = 1; } } AddAttributeImpl(new PermissiveOffsetAttribute()); termAtt = AddAttribute <ICharTermAttribute>(); piAtt = AddAttribute <IPositionIncrementAttribute>(); oAtt = AddAttribute <IOffsetAttribute>(); pAtt = AddAttribute <IPayloadAttribute>(); }
/// <summary>
/// Exercises <c>CaptureState</c> / <c>RestoreState</c> on <c>AttributeSource</c>:
/// restoring into the originating source, into an identically configured copy,
/// into a source with an extra attribute in a different registration order, and
/// into a source that lacks one of the captured attributes (which must throw).
/// </summary>
public virtual void TestCaptureState()
{
    // init a first instance
    AttributeSource src = new AttributeSource();
    ICharTermAttribute termAtt = src.AddAttribute<ICharTermAttribute>();
    ITypeAttribute typeAtt = src.AddAttribute<ITypeAttribute>();
    termAtt.Append("TestTerm");
    typeAtt.Type = "TestType";
    int hashCode = src.GetHashCode();

    AttributeSource.State state = src.CaptureState();

    // modify the attributes
    termAtt.SetEmpty().Append("AnotherTestTerm");
    typeAtt.Type = "AnotherTestType";
    Assert.IsTrue(hashCode != src.GetHashCode(), "Hash code should be different");

    // Expected value goes first so that failure messages read correctly.
    src.RestoreState(state);
    Assert.AreEqual("TestTerm", termAtt.ToString());
    Assert.AreEqual("TestType", typeAtt.Type);
    Assert.AreEqual(hashCode, src.GetHashCode(), "Hash code should be equal after restore");

    // restore into an exact configured copy
    AttributeSource copy = new AttributeSource();
    copy.AddAttribute<ICharTermAttribute>();
    copy.AddAttribute<ITypeAttribute>();
    copy.RestoreState(state);
    Assert.AreEqual(src.GetHashCode(), copy.GetHashCode(), "Both AttributeSources should have same hashCode after restore");
    Assert.AreEqual(src, copy, "Both AttributeSources should be equal after restore");

    // init a second instance (with attributes in different order and one additional attribute)
    AttributeSource src2 = new AttributeSource();
    typeAtt = src2.AddAttribute<ITypeAttribute>();
    IFlagsAttribute flagsAtt = src2.AddAttribute<IFlagsAttribute>();
    termAtt = src2.AddAttribute<ICharTermAttribute>();
    flagsAtt.Flags = 12345;

    src2.RestoreState(state);
    Assert.AreEqual("TestTerm", termAtt.ToString());
    Assert.AreEqual("TestType", typeAtt.Type);
    Assert.AreEqual(12345, flagsAtt.Flags, "FlagsAttribute should not be touched");

    // init a third instance missing one Attribute
    AttributeSource src3 = new AttributeSource();
    termAtt = src3.AddAttribute<ICharTermAttribute>();
    try
    {
        src3.RestoreState(state);
        Assert.Fail("The third instance is missing the TypeAttribute, so restoreState() should throw IllegalArgumentException");
    }
    catch (System.ArgumentException)
    {
        // pass - no exception variable is declared, so no warning suppression is needed
    }
}
/// <summary>
/// Registers the term, offset and position-increment attributes (in that order).
/// </summary>
public BugReproTokenStream()
{
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.offsetAtt = AddAttribute<IOffsetAttribute>();
    this.posIncAtt = AddAttribute<IPositionIncrementAttribute>();
}
/// <summary>
/// Creates a tokenizer over <paramref name="reader"/>, handing the base class a
/// sentence break iterator, and registers the term and offset attributes.
/// </summary>
/// <param name="reader">reader handed to the base class</param>
public WholeSentenceTokenizer(TextReader reader)
    : base(reader, BreakIterator.CreateSentenceInstance(Locale.GetUS()))
{
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.offsetAtt = AddAttribute<IOffsetAttribute>();
}
/// <summary>
/// Stores the attribute source and looks up its term and offset attributes
/// with <c>GetAttribute</c>.
/// </summary>
/// <param name="attSource">the attribute source to read from</param>
public InputWindowToken(AttributeSource attSource)
{
    this.attSource = attSource;
    termAtt = attSource.GetAttribute<ICharTermAttribute>();
    offsetAtt = attSource.GetAttribute<IOffsetAttribute>();
}
/// <summary>
/// Create a new <see cref="KeepWordFilter"/>.
/// <para>
/// <c>NOTE</c>: The words set passed to this constructor will be directly
/// used by this filter and should not be modified.
/// </para>
/// </summary>
/// <param name="version">the Lucene match version</param>
/// <param name="in">the <see cref="TokenStream"/> to consume</param>
/// <param name="words">the words to keep</param>
public KeepWordFilter(LuceneVersion version, TokenStream @in, CharArraySet words)
    : base(version, @in)
{
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.words = words;
}
/// <summary>
/// Create a new <see cref="KeepWordFilter"/>, forwarding
/// <paramref name="enablePositionIncrements"/> to the base class.
/// </summary>
/// <param name="version">the Lucene match version</param>
/// <param name="enablePositionIncrements">passed through to the base constructor</param>
/// <param name="in">the <see cref="TokenStream"/> to consume</param>
/// <param name="words">the word set stored on the filter</param>
public KeepWordFilter(LuceneVersion version, bool enablePositionIncrements, TokenStream @in, CharArraySet words)
    : base(version, enablePositionIncrements, @in)
{
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.words = words;
}
/// <summary>
/// Wraps <paramref name="input"/> and registers the term and keyword attributes.
/// </summary>
/// <param name="input">the stream to filter</param>
public GreekStemFilter(TokenStream input)
    : base(input)
{
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.keywordAttr = AddAttribute<IKeywordAttribute>();
}
/// <summary>
/// Creates a new <see cref="HyphenatedWordsFilter"/> and registers the term and
/// offset attributes.
/// </summary>
/// <param name="in"><see cref="TokenStream"/> that will be filtered</param>
public HyphenatedWordsFilter(TokenStream @in)
    : base(@in)
{
    this.termAttribute = AddAttribute<ICharTermAttribute>();
    this.offsetAttribute = AddAttribute<IOffsetAttribute>();
}
/// <summary>
/// Wraps <paramref name="in"/>, builds the stop table from <c>STOP_WORDS</c> and
/// registers the term attribute.
/// </summary>
/// <param name="in">the stream to filter</param>
public ChineseFilter(TokenStream @in)
    : base(@in)
{
    var stopWordList = Arrays.AsList(STOP_WORDS);
    this.stopTable = new CharArraySet(LuceneVersion.LUCENE_CURRENT, stopWordList, false);
    this.termAtt = AddAttribute<ICharTermAttribute>();
}
/// <summary>
/// Registers the term attribute and keeps it in <c>TermAtt</c>.
/// </summary>
public MyTokenStream()
{
    this.TermAtt = AddAttribute<ICharTermAttribute>();
}
/// <summary>
/// Wraps <paramref name="input"/>, draws a 64-bit seed from
/// <paramref name="random"/> and registers the term attribute.
/// </summary>
/// <param name="random">source of the seed; one value is drawn here</param>
/// <param name="input">the stream to filter</param>
public MockGraphTokenFilter(Random random, TokenStream input)
    : base(input)
{
    this.seed = random.NextInt64();
    this.termAtt = AddAttribute<ICharTermAttribute>();
}
/// <summary>
/// Randomized synonym test: generates a random document and a random set of
/// synonym rules, feeds the rules to a <c>SynonymMap.Builder</c>, computes the
/// expected output with <c>SlowSynMatcher</c> and hands both to <c>Verify</c>.
/// </summary>
public virtual void TestRandom()
{
    int alphabetSize = TestUtil.NextInt32(Random, 2, 7);

    int docLen = AtLeast(3000);
    //final int docLen = 50;

    string document = GetRandomString('a', alphabetSize, docLen);

    if (Verbose)
    {
        Console.WriteLine("TEST: doc=" + document);
    }

    int numSyn = AtLeast(5);
    //final int numSyn = 2;

    IDictionary<string, OneSyn> synMap = new Dictionary<string, OneSyn>();
    IList<OneSyn> syns = new JCG.List<OneSyn>();
    bool dedup = Random.nextBoolean();
    if (Verbose)
    {
        Console.WriteLine(" dedup=" + dedup);
    }

    b = new SynonymMap.Builder(dedup);
    for (int synIDX = 0; synIDX < numSyn; synIDX++)
    {
        string synIn = GetRandomString('a', alphabetSize, TestUtil.NextInt32(Random, 1, 5)).Trim();
        if (!synMap.TryGetValue(synIn, out OneSyn s) || s is null)
        {
            // First rule for this input: create and register its record.
            s = new OneSyn();
            s.@in = synIn;
            syns.Add(s);
            s.@out = new JCG.List<string>();
            synMap[synIn] = s;
            s.keepOrig = Random.nextBoolean();
        }

        string synOut = GetRandomString('0', 10, TestUtil.NextInt32(Random, 1, 5)).Trim();
        // Restored statement: the source text here had been mangled into an
        // "[email protected]" placeholder by an e-mail obfuscator.
        s.@out.Add(synOut);
        Add(synIn, synOut, s.keepOrig);
        if (Verbose)
        {
            Console.WriteLine(" syns[" + synIDX + "] = " + s.@in + " -> " + s.@out + " keepOrig=" + s.keepOrig);
        }
    }

    // The tokenizer is fully consumed and disposed before the filter is built.
    // NOTE(review): presumably Verify supplies fresh input to tokensIn - confirm.
    tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
    tokensIn.Reset();
    assertTrue(tokensIn.IncrementToken());
    assertFalse(tokensIn.IncrementToken());
    tokensIn.End();
    tokensIn.Dispose();

    tokensOut = new SynonymFilter(tokensIn, b.Build(), true);
    termAtt = tokensOut.AddAttribute<ICharTermAttribute>();
    posIncrAtt = tokensOut.AddAttribute<IPositionIncrementAttribute>();
    posLenAtt = tokensOut.AddAttribute<IPositionLengthAttribute>();
    offsetAtt = tokensOut.AddAttribute<IOffsetAttribute>();

    if (dedup)
    {
        PruneDups(syns);
    }

    string expected = SlowSynMatcher(document, syns, 5);

    if (Verbose)
    {
        Console.WriteLine("TEST: expected=" + expected);
    }

    Verify(document, expected);
}
/// <summary>
/// Wraps <paramref name="input"/> and registers the term and keyword attributes.
/// </summary>
/// <param name="input">the stream to filter</param>
public PortugueseMinimalStemFilter(TokenStream input)
    : base(input)
{
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.keywordAttr = AddAttribute<IKeywordAttribute>();
}
/// <summary>
/// Wraps <paramref name="input"/>, keeps a reference to the enclosing test instance
/// and registers the term and position-increment attributes.
/// </summary>
/// <param name="outerInstance">the enclosing test instance</param>
/// <param name="input">the stream to filter</param>
internal LargePosIncTokenFilter(TestWordDelimiterFilter outerInstance, TokenStream input)
    : base(input)
{
    this.outerInstance = outerInstance;
    termAtt = AddAttribute<ICharTermAttribute>();
    posIncAtt = AddAttribute<IPositionIncrementAttribute>();
}
/// <summary>
/// Wraps <paramref name="input"/> and registers the term attribute.
/// </summary>
/// <param name="input">the stream to filter</param>
public CrazyTokenFilter(TokenStream input)
    : base(input)
{
    this.termAtt = AddAttribute<ICharTermAttribute>();
}
/// <summary>
/// Runs a whitespace-tokenized string of Latin-1 accented words and letters through
/// <c>ASCIIFoldingFilter</c> (with a randomly chosen preserve-original setting) and
/// checks every token against its expected folded form, in input order.
/// </summary>
public virtual void TestLatin1Accents()
{
    TokenStream stream = new MockTokenizer(new StringReader("Des mot clés À LA CHAÎNE À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï IJ Ð Ñ" + " Ò Ó Ô Õ Ö Ø Œ Þ Ù Ú Û Ü Ý Ÿ à á â ã ä å æ ç è é ê ë ì í î ï ij" + " ð ñ ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ fi fl"), MockTokenizer.WHITESPACE, false);
    ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream, Random.nextBoolean());

    ICharTermAttribute termAtt = filter.GetAttribute<ICharTermAttribute>();
    filter.Reset();

    // { unfolded, folded } pairs, listed in the order the tokens appear in the input.
    string[,] expectations =
    {
        { "Des", "Des" },
        { "mot", "mot" },
        { "clés", "cles" },
        { "À", "A" },
        { "LA", "LA" },
        { "CHAÎNE", "CHAINE" },
        { "À", "A" },
        { "Á", "A" },
        { "Â", "A" },
        { "Ã", "A" },
        { "Ä", "A" },
        { "Å", "A" },
        { "Æ", "AE" },
        { "Ç", "C" },
        { "È", "E" },
        { "É", "E" },
        { "Ê", "E" },
        { "Ë", "E" },
        { "Ì", "I" },
        { "Í", "I" },
        { "Î", "I" },
        { "Ï", "I" },
        { "IJ", "IJ" },
        { "Ð", "D" },
        { "Ñ", "N" },
        { "Ò", "O" },
        { "Ó", "O" },
        { "Ô", "O" },
        { "Õ", "O" },
        { "Ö", "O" },
        { "Ø", "O" },
        { "Œ", "OE" },
        { "Þ", "TH" },
        { "Ù", "U" },
        { "Ú", "U" },
        { "Û", "U" },
        { "Ü", "U" },
        { "Ý", "Y" },
        { "Ÿ", "Y" },
        { "à", "a" },
        { "á", "a" },
        { "â", "a" },
        { "ã", "a" },
        { "ä", "a" },
        { "å", "a" },
        { "æ", "ae" },
        { "ç", "c" },
        { "è", "e" },
        { "é", "e" },
        { "ê", "e" },
        { "ë", "e" },
        { "ì", "i" },
        { "í", "i" },
        { "î", "i" },
        { "ï", "i" },
        { "ij", "ij" },
        { "ð", "d" },
        { "ñ", "n" },
        { "ò", "o" },
        { "ó", "o" },
        { "ô", "o" },
        { "õ", "o" },
        { "ö", "o" },
        { "ø", "o" },
        { "œ", "oe" },
        { "ß", "ss" },
        { "þ", "th" },
        { "ù", "u" },
        { "ú", "u" },
        { "û", "u" },
        { "ü", "u" },
        { "ý", "y" },
        { "ÿ", "y" },
        { "fi", "fi" },
        { "fl", "fl" },
    };

    for (int row = 0; row < expectations.GetLength(0); row++)
    {
        assertNextTerms(expectations[row, 0], expectations[row, 1], filter, termAtt);
    }

    // The stream must be exhausted once every expected token has been seen.
    assertFalse(filter.IncrementToken());
}
/// <summary>
/// Wraps <paramref name="input"/> and registers the term and keyword attributes.
/// </summary>
/// <param name="input">the stream to filter</param>
public HungarianLightStemFilter(TokenStream input)
    : base(input)
{
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.keywordAttr = AddAttribute<IKeywordAttribute>();
}
/// <summary>
/// Analyzes the text of a textable query node and rebuilds the node from the
/// tokens the analyzer produces.
/// <para/>
/// Wildcard, fuzzy and regexp nodes, nodes whose parent is a range node, and
/// nodes that are not <see cref="ITextableQueryNode"/> are returned untouched.
/// For every other node the result depends on the analyzed token stream:
/// <list type="bullet">
///     <item><description>no term attribute or no tokens: a <see cref="NoTokenFoundQueryNode"/>;</description></item>
///     <item><description>exactly one token: the same field node with its text replaced by the token;</description></item>
///     <item><description>several tokens at a single position: a grouped boolean node holding one field node per token;</description></item>
///     <item><description>several positions, node not quoted: a grouped boolean node combining the terms according to the default operator;</description></item>
///     <item><description>quoted node with stacked tokens: a <see cref="MultiPhraseQueryNode"/>;</description></item>
///     <item><description>quoted node without stacked tokens: a <see cref="TokenizedPhraseQueryNode"/>.</description></item>
/// </list>
/// </summary>
/// <param name="node"> the node to process </param>
/// <returns> the node that replaces <paramref name="node"/> in the query tree </returns>
/// <exception cref="Exception"> wraps an <see cref="IOException"/> raised while the token stream is opened </exception>
protected override IQueryNode PostProcessNode(IQueryNode node)
{
    // Guard clause: only plain textable nodes are analyzed.
    if (!(node is ITextableQueryNode)
        || node is WildcardQueryNode
        || node is FuzzyQueryNode
        || node is RegexpQueryNode
        || node.Parent is IRangeQueryNode)
    {
        return node;
    }

    FieldQueryNode fieldNode = (FieldQueryNode)node;
    string text = fieldNode.GetTextAsString();
    string field = fieldNode.GetFieldAsString();

    CachingTokenFilter buffer = null;
    IPositionIncrementAttribute posIncrAtt = null;
    int numTokens = 0;
    int positionCount = 0;
    bool severalTokensAtSamePosition = false;

    // First pass: consume the stream once through the caching filter to count
    // tokens and positions; the cached tokens are replayed further below.
    TokenStream source = null;
    try
    {
        source = this.analyzer.GetTokenStream(field, text);
        source.Reset();
        buffer = new CachingTokenFilter(source);

        if (buffer.HasAttribute<IPositionIncrementAttribute>())
        {
            posIncrAtt = buffer.GetAttribute<IPositionIncrementAttribute>();
        }

        try
        {
            while (buffer.IncrementToken())
            {
                numTokens++;
                int positionIncrement = (posIncrAtt != null) ? posIncrAtt.PositionIncrement : 1;
                if (positionIncrement != 0)
                {
                    positionCount += positionIncrement;
                }
                else
                {
                    severalTokensAtSamePosition = true;
                }
            }
        }
        catch (IOException)
        {
            // ignore
        }
    }
    catch (IOException e)
    {
        throw new Exception(e.ToString(), e);
    }
    finally
    {
        IOUtils.DisposeWhileHandlingException(source);
    }

    // rewind the buffer stream
    buffer.Reset();

    if (!buffer.HasAttribute<ICharTermAttribute>())
    {
        return new NoTokenFoundQueryNode();
    }

    ICharTermAttribute termAtt = buffer.GetAttribute<ICharTermAttribute>();

    if (numTokens == 0)
    {
        return new NoTokenFoundQueryNode();
    }

    if (numTokens == 1)
    {
        string term;
        TryReadNextTerm(buffer, termAtt, out term);
        fieldNode.Text = term.AsCharSequence();
        return fieldNode;
    }

    if (severalTokensAtSamePosition || !(node is QuotedFieldQueryNode))
    {
        if (positionCount == 1 || !(node is QuotedFieldQueryNode))
        {
            // no phrase query:
            if (positionCount == 1)
            {
                // simple case: only one position, with synonyms
                List<IQueryNode> children = new List<IQueryNode>();
                for (int i = 0; i < numTokens; i++)
                {
                    string term;
                    TryReadNextTerm(buffer, termAtt, out term);
                    children.Add(new FieldQueryNode(field, term, -1, -1));
                }

                return new GroupQueryNode(
                    new StandardBooleanQueryNode(children, positionCount == 1));
            }
            else
            {
                // multiple positions
                IQueryNode q = new StandardBooleanQueryNode(Collections.EmptyList<IQueryNode>(), false);
                IQueryNode currentQuery = null;
                for (int i = 0; i < numTokens; i++)
                {
                    string term;
                    TryReadNextTerm(buffer, termAtt, out term);

                    if (posIncrAtt != null && posIncrAtt.PositionIncrement == 0)
                    {
                        // Token stacked on the previous position: collect it together
                        // with the pending query in a nested boolean node.
                        if (!(currentQuery is BooleanQueryNode))
                        {
                            IQueryNode t = currentQuery;
                            currentQuery = new StandardBooleanQueryNode(Collections.EmptyList<IQueryNode>(), true);
                            ((BooleanQueryNode)currentQuery).Add(t);
                        }
                        ((BooleanQueryNode)currentQuery).Add(new FieldQueryNode(field, term, -1, -1));
                    }
                    else
                    {
                        // New position: flush whatever was pending for the previous one.
                        if (currentQuery != null)
                        {
                            if (this.defaultOperator == Operator.OR)
                            {
                                q.Add(currentQuery);
                            }
                            else
                            {
                                q.Add(new ModifierQueryNode(currentQuery, Modifier.MOD_REQ));
                            }
                        }
                        currentQuery = new FieldQueryNode(field, term, -1, -1);
                    }
                }

                // Flush the query built for the last position.
                if (this.defaultOperator == Operator.OR)
                {
                    q.Add(currentQuery);
                }
                else
                {
                    q.Add(new ModifierQueryNode(currentQuery, Modifier.MOD_REQ));
                }

                if (q is BooleanQueryNode)
                {
                    q = new GroupQueryNode(q);
                }
                return q;
            }
        }
        else
        {
            // phrase query:
            MultiPhraseQueryNode mpq = new MultiPhraseQueryNode();
            List<FieldQueryNode> multiTerms = new List<FieldQueryNode>();
            int position = -1;
            int termGroupCount = 0;

            for (int i = 0; i < numTokens; i++)
            {
                string term;
                bool read = TryReadNextTerm(buffer, termAtt, out term);
                // The increment is only consulted when the token was actually read.
                int positionIncrement = (read && posIncrAtt != null) ? posIncrAtt.PositionIncrement : 1;

                if (positionIncrement > 0 && multiTerms.Count > 0)
                {
                    foreach (FieldQueryNode termNode in multiTerms)
                    {
                        if (this.positionIncrementsEnabled)
                        {
                            termNode.PositionIncrement = position;
                        }
                        else
                        {
                            termNode.PositionIncrement = termGroupCount;
                        }
                        mpq.Add(termNode);
                    }

                    // Only increment once for each "group" of
                    // terms that were in the same position:
                    termGroupCount++;

                    multiTerms.Clear();
                }

                position += positionIncrement;
                multiTerms.Add(new FieldQueryNode(field, term, -1, -1));
            }

            // Flush the terms collected for the last position.
            foreach (FieldQueryNode termNode in multiTerms)
            {
                if (this.positionIncrementsEnabled)
                {
                    termNode.PositionIncrement = position;
                }
                else
                {
                    termNode.PositionIncrement = termGroupCount;
                }
                mpq.Add(termNode);
            }

            return mpq;
        }
    }
    else
    {
        TokenizedPhraseQueryNode pq = new TokenizedPhraseQueryNode();
        int position = -1;

        for (int i = 0; i < numTokens; i++)
        {
            string term;
            bool read = TryReadNextTerm(buffer, termAtt, out term);
            int positionIncrement = (read && posIncrAtt != null) ? posIncrAtt.PositionIncrement : 1;

            FieldQueryNode newFieldNode = new FieldQueryNode(field, term, -1, -1);

            if (this.positionIncrementsEnabled)
            {
                position += positionIncrement;
                newFieldNode.PositionIncrement = position;
            }
            else
            {
                newFieldNode.PositionIncrement = i;
            }

            pq.Add(newFieldNode);
        }

        return pq;
    }
}

/// <summary>
/// Advances <paramref name="buffer"/> by one token and reads its term text.
/// An <see cref="IOException"/> is swallowed because the caller already knows
/// how many tokens the stream holds; in that case <paramref name="term"/> is
/// left <c>null</c>.
/// </summary>
/// <param name="buffer"> the rewound token stream to advance </param>
/// <param name="termAtt"> the term attribute belonging to <paramref name="buffer"/> </param>
/// <param name="term"> receives the token text, or <c>null</c> when reading failed </param>
/// <returns> <c>true</c> when the token was read, <c>false</c> when an <see cref="IOException"/> was swallowed </returns>
private static bool TryReadNextTerm(TokenStream buffer, ICharTermAttribute termAtt, out string term)
{
    term = null;
    try
    {
        bool hasNext = buffer.IncrementToken();
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(hasNext == true);
        }
        term = termAtt.ToString();
        return true;
    }
    catch (IOException)
    {
        // safe to ignore, because we know the number of tokens
        return false;
    }
}
/*
 * Need to worry about multiple scenarios:
 *  - need to go for the longest match
 *    a b => foo      #shouldn't match if "a b" is followed by "c d"
 *    a b c d => bar
 *  - need to backtrack - retry matches for tokens already read
 *     a b c d => foo
 *       b c => bar
 *     If the input stream is "a b c x", one will consume "a b c d"
 *     trying to match the first rule... all but "a" should be
 *     pushed back so a match may be made on "b c".
 *  - don't try and match generated tokens (thus need separate queue)
 *    matching is not recursive.
 *  - handle optional generation of original tokens in all these cases,
 *    merging token streams to preserve token positions.
 *  - preserve original positionIncrement of first matched token
 */
/// <summary>
/// Advances to the next token: a pending generated (replacement) token if there
/// is one, otherwise the next input token, which is first matched against the
/// synonym map. On a match the replacement tokens are generated (merged with
/// the original tokens when the matched rule asks for them) and returned one
/// by one on this and subsequent calls.
/// </summary>
/// <returns> <c>true</c> if a token was produced, <c>false</c> once the input is exhausted </returns>
public override bool IncrementToken()
{
    while (true)
    {
        // if there are any generated tokens, return them... don't try any
        // matches against them, as we specifically don't want recursion.
        if (replacement != null && replacement.MoveNext())
        {
            Copy(this, replacement.Current);
            return true;
        }

        // common case fast-path of first token not matching anything
        AttributeSource firstTok = NextTok();
        if (firstTok == null)
        {
            return false;
        }
        var termAtt = firstTok.AddAttribute<ICharTermAttribute>();
        SlowSynonymMap result = map.Submap != null
            ? map.Submap.Get(termAtt.Buffer, 0, termAtt.Length)
            : null;
        if (result == null)
        {
            Copy(this, firstTok);
            return true;
        }

        // fast-path failed, clone ourselves if needed
        if (firstTok == this)
        {
            firstTok = CloneAttributes();
        }

        // OK, we matched a token, so find the longest match.
        matched = new LinkedList<AttributeSource>();

        result = Match(result);

        if (result == null)
        {
            // no match, simply return the first token read.
            Copy(this, firstTok);
            return true;
        }

        // reuse, or create new one each time?
        List<AttributeSource> generated = new List<AttributeSource>(result.Synonyms.Length + matched.Count + 1);

        //
        // there was a match... let's generate the new tokens, merging
        // in the matched tokens (position increments need adjusting)
        //
        AttributeSource lastTok = matched.Count == 0 ? firstTok : matched.Last.Value;
        bool includeOrig = result.IncludeOrig;

        AttributeSource origTok = includeOrig ? firstTok : null;
        IPositionIncrementAttribute firstPosIncAtt = firstTok.AddAttribute<IPositionIncrementAttribute>();
        int origPos = firstPosIncAtt.PositionIncrement; // position of origTok in the original stream
        int repPos = 0; // curr position in replacement token stream
        int pos = 0; // current position in merged token stream

        for (int i = 0; i < result.Synonyms.Length; i++)
        {
            Token repTok = result.Synonyms[i];
            AttributeSource newTok = firstTok.CloneAttributes();
            ICharTermAttribute newTermAtt = newTok.AddAttribute<ICharTermAttribute>();
            IOffsetAttribute newOffsetAtt = newTok.AddAttribute<IOffsetAttribute>();
            IPositionIncrementAttribute newPosIncAtt = newTok.AddAttribute<IPositionIncrementAttribute>();

            IOffsetAttribute lastOffsetAtt = lastTok.AddAttribute<IOffsetAttribute>();

            // The replacement spans from the start of the first matched token
            // to the end of the last one.
            newOffsetAtt.SetOffset(newOffsetAtt.StartOffset, lastOffsetAtt.EndOffset);
            newTermAtt.CopyBuffer(repTok.Buffer, 0, repTok.Length);
            repPos += repTok.PositionIncrement;
            if (i == 0) // make position of first token equal to original
            {
                repPos = origPos;
            }

            // if necessary, insert original tokens and adjust position increment
            while (origTok != null && origPos <= repPos)
            {
                IPositionIncrementAttribute origPosInc = origTok.AddAttribute<IPositionIncrementAttribute>();
                origPosInc.PositionIncrement = origPos - pos;
                generated.Add(origTok);
                pos += origPosInc.PositionIncrement;

                // Dequeue the next original token. RemoveFirst() drops the head
                // node directly instead of searching the list for it by value.
                origTok = null;
                if (matched.Count > 0)
                {
                    origTok = matched.First.Value;
                    matched.RemoveFirst();
                }

                if (origTok != null)
                {
                    origPosInc = origTok.AddAttribute<IPositionIncrementAttribute>();
                    origPos += origPosInc.PositionIncrement;
                }
            }

            newPosIncAtt.PositionIncrement = repPos - pos;
            generated.Add(newTok);
            pos += newPosIncAtt.PositionIncrement;
        }

        // finish up any leftover original tokens
        while (origTok != null)
        {
            IPositionIncrementAttribute origPosInc = origTok.AddAttribute<IPositionIncrementAttribute>();
            origPosInc.PositionIncrement = origPos - pos;
            generated.Add(origTok);
            pos += origPosInc.PositionIncrement;

            origTok = null;
            if (matched.Count > 0)
            {
                origTok = matched.First.Value;
                matched.RemoveFirst();
            }

            if (origTok != null)
            {
                origPosInc = origTok.AddAttribute<IPositionIncrementAttribute>();
                origPos += origPosInc.PositionIncrement;
            }
        }

        // what if we replaced a longer sequence with a shorter one?
        // a/0 b/5 =>  foo/0
        // should I re-create the gap on the next buffered token?

        replacement = generated.GetEnumerator();
        // Now return to the top of the loop to read and return the first
        // generated token.. The reason this is done is that we may have generated
        // nothing at all, and may need to continue with more matching logic.
    }
}
/// <summary>
/// Initializes a <see cref="SetKeywordMarkerFilter"/> that flags the current
/// token as a keyword, through the <see cref="KeywordAttribute"/>, whenever
/// its term buffer is found in <paramref name="keywordSet"/>.
/// </summary>
/// <param name="in"> the <see cref="TokenStream"/> to filter </param>
/// <param name="keywordSet"> the set of keywords the current term buffer is looked up in </param>
public SetKeywordMarkerFilter(TokenStream @in, CharArraySet keywordSet)
    : base(@in)
{
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.keywordSet = keywordSet;
}
/// <summary>
/// Builds a <see cref="MoreLikeThisQuery"/> from the given XML element.
/// <para/>
/// The element text is the "like" text. The attributes read are
/// <c>fieldNames</c> (comma-delimited, replaces the default field names when
/// not blank), <c>stopWords</c> (analyzed once per field and merged into one
/// set), <c>maxQueryTerms</c>, <c>minTermFrequency</c>,
/// <c>percentTermsToMatch</c> (divided by 100), <c>minDocFreq</c> (applied
/// only when not negative) and <c>boost</c> (defaults to 1.0).
/// </summary>
/// <param name="e"> the element describing the query </param>
/// <returns> the configured <see cref="MoreLikeThisQuery"/> </returns>
/// <exception cref="ParserException"> if analyzing the stop words raises an <see cref="IOException"/> </exception>
public virtual Query GetQuery(XmlElement e)
{
    // Field names: start from the defaults, override from the attribute when given.
    string[] fields = defaultFieldNames;
    string fieldsList = e.GetAttribute("fieldNames"); // a comma-delimited list of fields
    if (fieldsList != null && fieldsList.Trim().Length > 0)
    {
        fields = fieldsList.Trim().Split(',').TrimEnd();
        // trim the fieldnames
        for (int idx = 0; idx < fields.Length; idx++)
        {
            fields[idx] = fields[idx].Trim();
        }
    }

    // Parse any "stopWords" attribute
    // TODO MoreLikeThis needs to ideally have per-field stopWords lists - until then
    // I use all analyzers/fields to generate multi-field compatible stop list
    string stopWords = e.GetAttribute("stopWords");
    ISet<string> stopSet = null;
    if (stopWords != null && fields != null)
    {
        stopSet = new JCG.HashSet<string>();
        foreach (string field in fields)
        {
            TokenStream stream = null;
            try
            {
                stream = analyzer.GetTokenStream(field, stopWords);
                ICharTermAttribute termAtt = stream.AddAttribute<ICharTermAttribute>();
                stream.Reset();
                while (stream.IncrementToken())
                {
                    stopSet.Add(termAtt.ToString());
                }
                stream.End();
            }
            catch (IOException ex)
            {
                throw new ParserException("IoException parsing stop words list in "
                    + GetType().Name + ":" + ex.Message);
            }
            finally
            {
                IOUtils.DisposeWhileHandlingException(stream);
            }
        }
    }

    MoreLikeThisQuery query = new MoreLikeThisQuery(DOMUtils.GetText(e), fields, analyzer, fields[0])
    {
        MaxQueryTerms = DOMUtils.GetAttribute(e, "maxQueryTerms", DEFAULT_MAX_QUERY_TERMS),
        MinTermFrequency = DOMUtils.GetAttribute(e, "minTermFrequency", DEFAULT_MIN_TERM_FREQUENCY),
        PercentTermsToMatch = DOMUtils.GetAttribute(e, "percentTermsToMatch", DEFAULT_PERCENT_TERMS_TO_MATCH) / 100,
        StopWords = stopSet
    };

    // A negative value means "not specified": leave the query's own setting alone.
    int minDocFreq = DOMUtils.GetAttribute(e, "minDocFreq", -1);
    if (minDocFreq >= 0)
    {
        query.MinDocFreq = minDocFreq;
    }

    query.Boost = DOMUtils.GetAttribute(e, "boost", 1.0f);

    return query;
}
/// <summary>
/// Creates a new <see cref="PorterStemFilter"/> on top of
/// <paramref name="in"/> and obtains the term and keyword attributes
/// that this filter works with.
/// </summary>
/// <param name="in"> the <see cref="TokenStream"/> to filter </param>
public PorterStemFilter(TokenStream @in)
    : base(@in)
{
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.keywordAttr = AddAttribute<IKeywordAttribute>();
}
/// <summary>
/// Creates a new <see cref="CollationKeyFilter"/> that keeps a reference to
/// <paramref name="collator"/> and obtains the term attribute of the stream.
/// </summary>
/// <param name="input"> Source token stream </param>
/// <param name="collator"> CollationKey generator </param>
public CollationKeyFilter(TokenStream input, Collator collator)
    : base(input)
{
    termAtt = AddAttribute<ICharTermAttribute>();
    this.collator = collator;
}
/// <summary>
/// Copies the text of the current token into <paramref name="t"/>: the
/// characters of the scanner buffer from <c>zzStartRead</c> up to (but not
/// including) <c>zzMarkedPos</c>.
/// </summary>
/// <param name="t"> the term attribute that receives the token text </param>
public void GetText(ICharTermAttribute t)
{
    int start = zzStartRead;
    int length = zzMarkedPos - start;
    t.CopyBuffer(zzBuffer, start, length);
}
/// <summary>
/// Remembers the <see cref="ICharTermAttribute"/> of
/// <paramref name="tokenStream"/> for later use.
/// See <see cref="IScorer.Init(TokenStream)"/>.
/// </summary>
/// <param name="tokenStream"> the stream whose term attribute is captured </param>
/// <returns> always <c>null</c> </returns>
public virtual TokenStream Init(TokenStream tokenStream)
{
    this.termAtt = tokenStream.AddAttribute<ICharTermAttribute>();
    return null;
}
/// <summary>
/// Obtains the term, offset and type attributes and stores them in the
/// corresponding fields.
/// </summary>
private void Init()
{
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.offsetAtt = AddAttribute<IOffsetAttribute>();
    this.typeAtt = AddAttribute<ITypeAttribute>();
}
/// <summary>
/// Creates a filter that drops from the input <see cref="TokenStream"/> every
/// word contained in the given <see cref="CharArraySet"/>.
/// </summary>
/// <param name="matchVersion">
///          Lucene version to enable correct Unicode 4.0 behavior in the stop
///          set if Version > 3.0. See <see cref="LuceneVersion"/> for details. </param>
/// <param name="in">
///          Input <see cref="TokenStream"/> </param>
/// <param name="stopWords">
///          A <see cref="CharArraySet"/> representing the stopwords. </param>
/// <seealso cref="MakeStopSet(LuceneVersion, string[])"/>
public StopFilter(LuceneVersion matchVersion, TokenStream @in, CharArraySet stopWords)
    : base(matchVersion, @in)
{
    this.stopWords = stopWords;
    this.termAtt = AddAttribute<ICharTermAttribute>();
}