private void Init(int bufferSize) { this.done = false; termAtt = AddAttribute<ITermAttribute>(); offsetAtt = AddAttribute<IOffsetAttribute>(); termAtt.ResizeTermBuffer(bufferSize); }
///<summary> /// Constructor for the Lucene Tokenizer adapter class. /// </summary> /// <param name="isMaxWordLength">When true, the segmenter performs maximum word-length segmentation; when false, it performs the finest-grained segmentation.</param> public IKTokenizer(TextReader inreader, bool isMaxWordLength) : base(inreader) { offsetAtt = AddAttribute<IOffsetAttribute>(); termAtt = AddAttribute<ITermAttribute>(); _IKImplement = new IKSegmentation(inreader, isMaxWordLength); }
public CamelCaseFilter(TokenStream stream) : base(stream) { _termAttribute = AddAttribute<ITermAttribute>(); _offsetAttribute = AddAttribute<IOffsetAttribute>(); _positionIncrementAttribute = AddAttribute<IPositionIncrementAttribute>(); }
void Init() { InitPanGuSegment(); termAtt = AddAttribute<ITermAttribute>(); offsetAtt = AddAttribute<IOffsetAttribute>(); posIncrAtt = AddAttribute<IPositionIncrementAttribute>(); typeAtt = AddAttribute<ITypeAttribute>(); }
/* * Calling base(input) is skipped here, because after that call input's position would have been moved. * by zh */ public MMSegTokenizer(Seg seg, TextReader input) : base(input) { mmSeg = new MMSeg(input, seg); termAtt = AddAttribute<ITermAttribute>(); offsetAtt = AddAttribute<IOffsetAttribute>(); typeAtt = AddAttribute<ITypeAttribute>(); }
public CutLeterDigitFilter(TokenStream input) : base(input) { reusableToken = new Token(); termAtt = AddAttribute<ITermAttribute>(); offsetAtt = AddAttribute<IOffsetAttribute>(); typeAtt = AddAttribute<ITypeAttribute>(); }
void Init(string content) { _enumerationPositions = _aufzaehlungDetector.FindAufzaehlungsspans(content).ToArray(); _offsetAttribute = AddAttribute<IOffsetAttribute>(); _flagsAttribute = AddAttribute<IFlagsAttribute>(); _isInitialized = true; }
public GraphTokenizer(TextReader input) : base(input) { TermAtt = AddAttribute<ICharTermAttribute>(); OffsetAtt = AddAttribute<IOffsetAttribute>(); PosIncrAtt = AddAttribute<IPositionIncrementAttribute>(); PosLengthAtt = AddAttribute<IPositionLengthAttribute>(); }
/// <summary> /// Sole constructor. </summary> public SuggestStopFilter(TokenStream input, CharArraySet stopWords) : base(input) { this.stopWords = stopWords; this.termAtt = AddAttribute<ICharTermAttribute>(); this.posIncAtt = AddAttribute<IPositionIncrementAttribute>(); this.keywordAtt = AddAttribute<IKeywordAttribute>(); this.offsetAtt = AddAttribute<IOffsetAttribute>(); }
public CannedBinaryTokenStream(params BinaryToken[] tokens) : base() { this.Tokens = tokens; TermAtt = AddAttribute<IBinaryTermAttribute>(); PosIncrAtt = AddAttribute<IPositionIncrementAttribute>(); PosLengthAtt = AddAttribute<IPositionLengthAttribute>(); OffsetAtt = AddAttribute<IOffsetAttribute>(); }
private void Init(System.IO.TextReader _input, HebMorph.DataStructures.DictRadix<int> _prefixesTree) { termAtt = AddAttribute <ITermAttribute>(); offsetAtt = AddAttribute <IOffsetAttribute>(); //posIncrAtt = (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute)); typeAtt = AddAttribute <ITypeAttribute>(); input = _input; hebMorphTokenizer = new HebMorph.Tokenizer(_input); prefixesTree = _prefixesTree; }
public JiebaTokenizer(JiebaSegmenter seg, string input) { segmenter = seg; termAtt = AddAttribute<ITermAttribute>(); offsetAtt = AddAttribute<IOffsetAttribute>(); typeAtt = AddAttribute<ITypeAttribute>(); var text = input; tokens = segmenter.Tokenize(text, TokenizerMode.Search).ToList(); }
/// <summary> /// Construct a token stream filtering the given input using a set of common /// words to create bigrams. Outputs both unigrams (with their normal position increment) and /// bigrams (with position increment 0 and type "gram") whenever one or both of the words /// in a potential bigram are in the set of common words. /// </summary> /// <param name="matchVersion"> Lucene version to enable correct behavior </param> /// <param name="input"> TokenStream input in filter chain </param> /// <param name="commonWords"> The set of common words. </param> public CommonGramsFilter(LuceneVersion matchVersion, TokenStream input, CharArraySet commonWords) : base(input) { termAttribute = AddAttribute<ICharTermAttribute>(); offsetAttribute = AddAttribute<IOffsetAttribute>(); typeAttribute = AddAttribute<ITypeAttribute>(); posIncAttribute = AddAttribute<IPositionIncrementAttribute>(); posLenAttribute = AddAttribute<IPositionLengthAttribute>(); this.commonWords = commonWords; }
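A minimal consumption sketch for a chain ending in this filter, assuming Lucene.NET 4.8-style namespaces and a WhitespaceTokenizer as the source; the stopword list and input text below are illustrative only, not taken from the snippet above.

using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.CommonGrams;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;

static class CommonGramsDemo
{
    public static void Main()
    {
        const LuceneVersion version = LuceneVersion.LUCENE_48;
        var commonWords = new CharArraySet(version, new[] { "the", "of" }, true);

        TokenStream ts = new WhitespaceTokenizer(version, new StringReader("the quick fox"));
        ts = new CommonGramsFilter(version, ts, commonWords);

        // Attributes are acquired before Reset/IncrementToken, just as the constructors above do.
        var termAtt = ts.AddAttribute<ICharTermAttribute>();
        var posIncAtt = ts.AddAttribute<IPositionIncrementAttribute>();
        var typeAtt = ts.AddAttribute<ITypeAttribute>();

        ts.Reset();
        while (ts.IncrementToken())
        {
            // Bigrams such as "the_quick" come out with position increment 0 and type "gram".
            string term = new string(termAtt.Buffer, 0, termAtt.Length);
            Console.WriteLine($"{term} posInc={posIncAtt.PositionIncrement} type={typeAtt.Type}");
        }
        ts.End();
        ts.Dispose();
    }
}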
public TrimFilter(LuceneVersion version, TokenStream @in, bool updateOffsets) : base(@in) { if (updateOffsets && version.OnOrAfter(LuceneVersion.LUCENE_44)) { throw new System.ArgumentException("updateOffsets=true is not supported anymore as of Lucene 4.4"); } termAtt = AddAttribute<ICharTermAttribute>(); offsetAtt = AddAttribute<IOffsetAttribute>(); this.updateOffsets = updateOffsets; }
public IterTokenStream(params Token[] tokens) : base() { this.tokens = tokens; this.termAtt = AddAttribute<ICharTermAttribute>(); this.offsetAtt = AddAttribute<IOffsetAttribute>(); this.posIncAtt = AddAttribute<IPositionIncrementAttribute>(); this.flagsAtt = AddAttribute<IFlagsAttribute>(); this.typeAtt = AddAttribute<ITypeAttribute>(); this.payloadAtt = AddAttribute<IPayloadAttribute>(); }
/// <summary> /// Creates a new ThaiTokenizer, supplying the AttributeFactory </summary> public ThaiTokenizer(AttributeFactory factory, TextReader reader) : base(factory, reader, BreakIterator.CreateSentenceInstance(Locale.GetUS())) { if (!DBBI_AVAILABLE) { throw new System.NotSupportedException("This JRE does not have support for Thai segmentation"); } wordBreaker = new ThaiWordBreaker(BreakIterator.CreateWordInstance(Locale.GetUS())); termAtt = AddAttribute<ICharTermAttribute>(); offsetAtt = AddAttribute<IOffsetAttribute>(); }
public KeywordTokenizer(AttributeSource.AttributeFactory factory, Reader input, int bufferSize) : base(factory, input) { termAtt = AddAttribute<ICharTermAttribute>(); offsetAtt = AddAttribute<IOffsetAttribute>(); if (bufferSize <= 0) { throw new System.ArgumentException("bufferSize must be > 0"); } termAtt.ResizeBuffer(bufferSize); }
private bool hasIllegalOffsets = false; // only if the length changed before this filter /// <summary> /// Creates a new ThaiWordFilter with the specified match version. </summary> public ThaiWordFilter(LuceneVersion matchVersion, TokenStream input) : base(matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) ? input : new LowerCaseFilter(matchVersion, input)) { if (!DBBI_AVAILABLE) { throw new System.NotSupportedException("This JRE does not have support for Thai segmentation"); } handlePosIncr = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31); termAtt = AddAttribute<ICharTermAttribute>(); offsetAtt = AddAttribute<IOffsetAttribute>(); posAtt = AddAttribute<IPositionIncrementAttribute>(); }
private void Init(System.IO.TextReader input, HebMorph.StreamLemmatizer _lemmatizer, HebMorph.LemmaFilters.LemmaFilterBase _lemmaFilter, bool AlwaysSaveMarkedOriginal) { termAtt = AddAttribute <ITermAttribute>(); offsetAtt = AddAttribute<IOffsetAttribute>(); posIncrAtt = AddAttribute<IPositionIncrementAttribute>(); typeAtt = AddAttribute <ITypeAttribute>(); //payAtt = (PayloadAttribute)AddAttribute(typeof(PayloadAttribute)); this.input = input; this._streamLemmatizer = _lemmatizer; this._streamLemmatizer.SetStream(input); this.alwaysSaveMarkedOriginal = AlwaysSaveMarkedOriginal; this.lemmaFilter = _lemmaFilter; }
/* * Creates NGramTokenFilter with given min and max n-grams. * <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param> * <param name="minGram">the smallest n-gram to generate</param> * <param name="maxGram">the largest n-gram to generate</param> */ public NGramTokenFilter(TokenStream input, int minGram, int maxGram) : base(input) { if (minGram < 1) { throw new System.ArgumentException("minGram must be greater than zero"); } if (minGram > maxGram) { throw new System.ArgumentException("minGram must not be greater than maxGram"); } this.minGram = minGram; this.maxGram = maxGram; this.termAtt = AddAttribute<ITermAttribute>(); this.offsetAtt = AddAttribute<IOffsetAttribute>(); }
/// <summary> /// If inputText is non-null, and the TokenStream has /// offsets, we include the surface form in each arc's /// label. /// </summary> public TokenStreamToDot(string inputText, TokenStream @in, TextWriter @out) { this.@in = @in; this.@out = @out; this.InputText = inputText; TermAtt = @in.AddAttribute<ICharTermAttribute>(); PosIncAtt = @in.AddAttribute<IPositionIncrementAttribute>(); PosLengthAtt = @in.AddAttribute<IPositionLengthAttribute>(); if (@in.HasAttribute<IOffsetAttribute>()) { OffsetAtt = @in.AddAttribute<IOffsetAttribute>(); } else { OffsetAtt = null; } }
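A hedged usage sketch for this debugging helper from the test framework, assuming the .NET port keeps the Java toDot() entry point as ToDot() and that it resets and consumes the stream itself; the input text is illustrative.

using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Util;

static class TokenStreamToDotDemo
{
    public static void Main()
    {
        string text = "a quick test";
        TokenStream ts = new WhitespaceTokenizer(LuceneVersion.LUCENE_48, new StringReader(text));

        var w = new StringWriter();
        // ToDot() walks the stream and writes GraphViz "dot" text; because offsets are
        // available here, each arc label also carries the surface form sliced from `text`.
        new TokenStreamToDot(text, ts, w).ToDot();
        Console.WriteLine(w.ToString());
    }
}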
/// <summary> /// Creates a new PatternTokenizer returning tokens from group (-1 for split functionality) </summary> public PatternTokenizer(AttributeFactory factory, TextReader input, Regex pattern, int group) : base(factory, input) { this.termAtt = AddAttribute<ICharTermAttribute>(); this.offsetAtt = AddAttribute<IOffsetAttribute>(); this.group = group; // Use "" instead of str so we don't consume chars // (fillBuffer) from the input when throwing the ArgumentException below: this.matcher = pattern.Match(""); this.pattern = pattern; // the group count depends entirely on the pattern; GetGroupNumbers() includes group 0 var groupCount = pattern.GetGroupNumbers().Length; if (group >= 0 && group > groupCount) { throw new System.ArgumentException("invalid group specified: pattern only has: " + groupCount + " capturing groups"); } }
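Since the group-count check above leans on plain .NET Regex, a small standalone sketch (no Lucene types; pattern and input are illustrative) shows what that count actually contains.

using System;
using System.Text.RegularExpressions;

static class GroupCountDemo
{
    public static void Main()
    {
        var pattern = new Regex(@"'([^']+)'");

        // GetGroupNumbers() includes group 0 (the whole match), so a single capturing
        // group yields a length of 2 -- the same count the constructor above compares
        // the requested group against.
        Console.WriteLine(pattern.GetGroupNumbers().Length); // 2

        // group = 1 would extract the quoted text; group = -1 means "split on matches".
        Match m = pattern.Match("say 'hello' to 'world'");
        while (m.Success)
        {
            Console.WriteLine(m.Groups[1].Value); // hello, world
            m = m.NextMatch();
        }
    }
}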
public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) : base(suffix) { this.suffix = suffix; this.prefix = prefix; prefixExhausted = false; termAtt = AddAttribute<ICharTermAttribute>(); posIncrAtt = AddAttribute<IPositionIncrementAttribute>(); payloadAtt = AddAttribute<IPayloadAttribute>(); offsetAtt = AddAttribute<IOffsetAttribute>(); typeAtt = AddAttribute<ITypeAttribute>(); flagsAtt = AddAttribute<IFlagsAttribute>(); p_termAtt = prefix.AddAttribute<ICharTermAttribute>(); p_posIncrAtt = prefix.AddAttribute<IPositionIncrementAttribute>(); p_payloadAtt = prefix.AddAttribute<IPayloadAttribute>(); p_offsetAtt = prefix.AddAttribute<IOffsetAttribute>(); p_typeAtt = prefix.AddAttribute<ITypeAttribute>(); p_flagsAtt = prefix.AddAttribute<IFlagsAttribute>(); }
public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) : base(suffix) { Suffix = suffix; Prefix = prefix; _prefixExhausted = false; // ReSharper disable DoNotCallOverridableMethodsInConstructor _termAtt = AddAttribute<ITermAttribute>(); _posIncrAtt = AddAttribute<IPositionIncrementAttribute>(); _payloadAtt = AddAttribute<IPayloadAttribute>(); _offsetAtt = AddAttribute<IOffsetAttribute>(); _typeAtt = AddAttribute<ITypeAttribute>(); _flagsAtt = AddAttribute<IFlagsAttribute>(); // ReSharper restore DoNotCallOverridableMethodsInConstructor _pTermAtt = prefix.AddAttribute<ITermAttribute>(); _pPosIncrAtt = prefix.AddAttribute<IPositionIncrementAttribute>(); _pPayloadAtt = prefix.AddAttribute<IPayloadAttribute>(); _pOffsetAtt = prefix.AddAttribute<IOffsetAttribute>(); _pTypeAtt = prefix.AddAttribute<ITypeAttribute>(); _pFlagsAtt = prefix.AddAttribute<IFlagsAttribute>(); }
public PanGuTokenizer(TextReader input) : base(input) { termAttribute = AddAttribute<ITermAttribute>(); offsetAttribute = AddAttribute<IOffsetAttribute>(); inputText = base.input.ReadToEnd(); if (string.IsNullOrEmpty(inputText)) { char[] readBuf = new char[1024]; int relCount = base.input.Read(readBuf, 0, readBuf.Length); StringBuilder inputStr = new StringBuilder(readBuf.Length); while (relCount > 0) { inputStr.Append(readBuf, 0, relCount); relCount = input.Read(readBuf, 0, readBuf.Length); } if (inputStr.Length > 0) { inputText = inputStr.ToString(); } } if (string.IsNullOrEmpty(inputText)) { words = new WordInfo[0]; } else { global::PanGu.Segment segment = new Segment(); ICollection<WordInfo> wordInfos = segment.DoSegment(inputText); words = new WordInfo[wordInfos.Count]; wordInfos.CopyTo(words, 0); } }
public static void VerifyEquals(Fields d1, Fields d2) { if (d1 == null) { Assert.IsTrue(d2 == null || d2.Count == 0); return; } Assert.IsTrue(d2 != null); IEnumerator <string> fieldsEnum2 = d2.GetEnumerator(); foreach (string field1 in d1) { fieldsEnum2.MoveNext(); string field2 = fieldsEnum2.Current; Assert.AreEqual(field1, field2); Terms terms1 = d1.GetTerms(field1); Assert.IsNotNull(terms1); TermsEnum termsEnum1 = terms1.GetIterator(null); Terms terms2 = d2.GetTerms(field2); Assert.IsNotNull(terms2); TermsEnum termsEnum2 = terms2.GetIterator(null); DocsAndPositionsEnum dpEnum1 = null; DocsAndPositionsEnum dpEnum2 = null; DocsEnum dEnum1 = null; DocsEnum dEnum2 = null; BytesRef term1; while ((term1 = termsEnum1.Next()) != null) { BytesRef term2 = termsEnum2.Next(); Assert.AreEqual(term1, term2); Assert.AreEqual(termsEnum1.TotalTermFreq, termsEnum2.TotalTermFreq); dpEnum1 = termsEnum1.DocsAndPositions(null, dpEnum1); dpEnum2 = termsEnum2.DocsAndPositions(null, dpEnum2); if (dpEnum1 != null) { Assert.IsNotNull(dpEnum2); int docID1 = dpEnum1.NextDoc(); dpEnum2.NextDoc(); // docIDs are not supposed to be equal //int docID2 = dpEnum2.NextDoc(); //Assert.AreEqual(docID1, docID2); Assert.IsTrue(docID1 != DocIdSetIterator.NO_MORE_DOCS); int freq1 = dpEnum1.Freq; int freq2 = dpEnum2.Freq; Assert.AreEqual(freq1, freq2); IOffsetAttribute offsetAtt1 = dpEnum1.Attributes.HasAttribute <IOffsetAttribute>() ? dpEnum1.Attributes.GetAttribute <IOffsetAttribute>() : null; IOffsetAttribute offsetAtt2 = dpEnum2.Attributes.HasAttribute <IOffsetAttribute>() ? dpEnum2.Attributes.GetAttribute <IOffsetAttribute>() : null; if (offsetAtt1 != null) { Assert.IsNotNull(offsetAtt2); } else { Assert.IsNull(offsetAtt2); } for (int posUpto = 0; posUpto < freq1; posUpto++) { int pos1 = dpEnum1.NextPosition(); int pos2 = dpEnum2.NextPosition(); Assert.AreEqual(pos1, pos2); if (offsetAtt1 != null) { Assert.AreEqual(offsetAtt1.StartOffset, offsetAtt2.StartOffset); Assert.AreEqual(offsetAtt1.EndOffset, offsetAtt2.EndOffset); } } Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum1.NextDoc()); Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum2.NextDoc()); } else { dEnum1 = TestUtil.Docs(Random(), termsEnum1, null, dEnum1, DocsFlags.FREQS); dEnum2 = TestUtil.Docs(Random(), termsEnum2, null, dEnum2, DocsFlags.FREQS); Assert.IsNotNull(dEnum1); Assert.IsNotNull(dEnum2); int docID1 = dEnum1.NextDoc(); dEnum2.NextDoc(); // docIDs are not supposed to be equal //int docID2 = dEnum2.NextDoc(); //Assert.AreEqual(docID1, docID2); Assert.IsTrue(docID1 != DocIdSetIterator.NO_MORE_DOCS); int freq1 = dEnum1.Freq; int freq2 = dEnum2.Freq; Assert.AreEqual(freq1, freq2); Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dEnum1.NextDoc()); Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dEnum2.NextDoc()); } } Assert.IsNull(termsEnum2.Next()); } Assert.IsFalse(fieldsEnum2.MoveNext()); }
public InputWindowToken(AttributeSource attSource) { this.attSource = attSource; this.termAtt = attSource.GetAttribute <ICharTermAttribute>(); this.offsetAtt = attSource.GetAttribute <IOffsetAttribute>(); }
/// <summary> /// Construct a new SegmentingTokenizerBase, also supplying the <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory"/> /// </summary> public SegmentingTokenizerBase(AttributeFactory factory, TextReader reader, BreakIterator iterator) : base(factory, reader) { offsetAtt = AddAttribute <IOffsetAttribute>(); this.iterator = iterator; }
protected internal RandomTokenStream(BaseTermVectorsFormatTestCase baseTermVectorsFormatTestCase, int len, string[] sampleTerms, BytesRef[] sampleTermBytes, bool offsetsGoBackwards) { terms = new string[len]; termBytes = new BytesRef[len]; positionsIncrements = new int[len]; positions = new int[len]; startOffsets = new int[len]; endOffsets = new int[len]; payloads = new BytesRef[len]; for (int i = 0; i < len; ++i) { int o = Random.Next(sampleTerms.Length); terms[i] = sampleTerms[o]; termBytes[i] = sampleTermBytes[o]; positionsIncrements[i] = TestUtil.NextInt32(Random, i == 0 ? 1 : 0, 10); if (offsetsGoBackwards) { startOffsets[i] = Random.Next(); endOffsets[i] = Random.Next(); } else { if (i == 0) { startOffsets[i] = TestUtil.NextInt32(Random, 0, 1 << 16); } else { startOffsets[i] = startOffsets[i - 1] + TestUtil.NextInt32(Random, 0, Rarely() ? 1 << 16 : 20); } endOffsets[i] = startOffsets[i] + TestUtil.NextInt32(Random, 0, Rarely() ? 1 << 10 : 20); } } for (int i = 0; i < len; ++i) { if (i == 0) { positions[i] = positionsIncrements[i] - 1; } else { positions[i] = positions[i - 1] + positionsIncrements[i]; } } if (Rarely()) { Arrays.Fill(payloads, baseTermVectorsFormatTestCase.RandomPayload()); } else { for (int i = 0; i < len; ++i) { payloads[i] = baseTermVectorsFormatTestCase.RandomPayload(); } } positionToTerms = new Dictionary <int, ISet <int> >(len); startOffsetToTerms = new Dictionary <int, ISet <int> >(len); for (int i = 0; i < len; ++i) { if (!positionToTerms.TryGetValue(positions[i], out ISet <int> positionTerms)) { positionToTerms[positions[i]] = positionTerms = new JCG.HashSet <int>(1); } positionTerms.Add(i); if (!startOffsetToTerms.TryGetValue(startOffsets[i], out ISet <int> startOffsetTerms)) { startOffsetToTerms[startOffsets[i]] = startOffsetTerms = new JCG.HashSet <int>(1); } startOffsetTerms.Add(i); } freqs = new Dictionary <string, int>(); foreach (string term in terms) { if (freqs.TryGetValue(term, out int freq)) { freqs[term] = freq + 1; } else { freqs[term] = 1; } } AddAttributeImpl(new PermissiveOffsetAttribute()); termAtt = AddAttribute <ICharTermAttribute>(); piAtt = AddAttribute <IPositionIncrementAttribute>(); oAtt = AddAttribute <IOffsetAttribute>(); pAtt = AddAttribute <IPayloadAttribute>(); }
/// <summary> /// Creates a new HyphenatedWordsFilter /// </summary> /// <param name="in"> TokenStream that will be filtered </param> public HyphenatedWordsFilter(TokenStream @in) : base(@in) { termAttribute = AddAttribute<ICharTermAttribute>(); offsetAttribute = AddAttribute<IOffsetAttribute>(); }
public PositionsTokenStream() { term = AddAttribute <ICharTermAttribute>(); payload = AddAttribute <IPayloadAttribute>(); offset = AddAttribute <IOffsetAttribute>(); }
public RegexTokenizer(String str, Regex regex, bool toLowerCase) { this.str = str; this.matcher = regex.Match(str); this.toLowerCase = toLowerCase; this.termAtt = AddAttribute<ITermAttribute>(); this.offsetAtt = AddAttribute<IOffsetAttribute>(); }
public TestFilter(TokenStream @in) : base(@in) { termAtt = AddAttribute<ICharTermAttribute>(); posIncrAtt = AddAttribute<IPositionIncrementAttribute>(); offsetAtt = AddAttribute<IOffsetAttribute>(); typeAtt = AddAttribute<ITypeAttribute>(); }
public IdentifierTokenizer(TextReader input) : base(input) { _offsetAtt = AddAttribute <IOffsetAttribute>(); _termAtt = AddAttribute <ITermAttribute>(); }
public BugReproTokenStream() { TermAtt = AddAttribute <ICharTermAttribute>(); OffsetAtt = AddAttribute <IOffsetAttribute>(); PosIncAtt = AddAttribute <IPositionIncrementAttribute>(); }
public override bool IncrementToken() { if (hasMoreTokensInClone) { int start = breaker.Current; int end = breaker.Next(); if (end != BreakIterator.Done) { clonedToken.CopyTo(this); termAtt.CopyBuffer(clonedTermAtt.Buffer, start, end - start); if (hasIllegalOffsets) { offsetAtt.SetOffset(clonedOffsetAtt.StartOffset, clonedOffsetAtt.EndOffset); } else { offsetAtt.SetOffset(clonedOffsetAtt.StartOffset + start, clonedOffsetAtt.StartOffset + end); } if (handlePosIncr) { posAtt.PositionIncrement = 1; } return(true); } hasMoreTokensInClone = false; } if (!m_input.IncrementToken()) { return(false); } if (termAtt.Length == 0 || !thaiPattern.IsMatch(string.Empty + termAtt[0])) { return(true); } hasMoreTokensInClone = true; // if length by start + end offsets doesn't match the term text then assume // this is a synonym and don't adjust the offsets. hasIllegalOffsets = offsetAtt.EndOffset - offsetAtt.StartOffset != termAtt.Length; // we lazy init the cloned token, as in ctor not all attributes may be added if (clonedToken == null) { clonedToken = CloneAttributes(); clonedTermAtt = clonedToken.GetAttribute <ICharTermAttribute>(); clonedOffsetAtt = clonedToken.GetAttribute <IOffsetAttribute>(); } else { this.CopyTo(clonedToken); } // reinit CharacterIterator charIterator.SetText(clonedTermAtt.Buffer, 0, clonedTermAtt.Length); breaker.SetText(new string(charIterator.Text, charIterator.Start, charIterator.Length)); int end2 = breaker.Next(); if (end2 != BreakIterator.Done) { termAtt.Length = end2; if (hasIllegalOffsets) { offsetAtt.SetOffset(clonedOffsetAtt.StartOffset, clonedOffsetAtt.EndOffset); } else { offsetAtt.SetOffset(clonedOffsetAtt.StartOffset, clonedOffsetAtt.StartOffset + end2); } // position increment keeps as it is for first token return(true); } return(false); }
/// <summary> /// Iterates over the given token stream and adds the resulting terms to the index; /// Equivalent to adding a tokenized, indexed, termVectorStored, unstored, /// Lucene <see cref="Documents.Field"/>. /// Finally closes the token stream. Note that untokenized keywords can be added with this method via /// <see cref="KeywordTokenStream{T}(ICollection{T})"/>, the Lucene <c>KeywordTokenizer</c> or similar utilities. /// </summary> /// <param name="fieldName"> a name to be associated with the text </param> /// <param name="stream"> the token stream to retrieve tokens from. </param> /// <param name="boost"> the boost factor for hits for this field </param> /// <param name="positionIncrementGap"> the position increment gap if fields with the same name are added more than once </param> /// <param name="offsetGap"> the offset gap if fields with the same name are added more than once </param> /// <seealso cref="Documents.Field.Boost"/> public virtual void AddField(string fieldName, TokenStream stream, float boost, int positionIncrementGap, int offsetGap) { try { if (fieldName == null) { throw new ArgumentException("fieldName must not be null"); } if (stream == null) { throw new ArgumentException("token stream must not be null"); } if (boost <= 0.0f) { throw new ArgumentException("boost factor must be greater than 0.0"); } int numTokens = 0; int numOverlapTokens = 0; int pos = -1; BytesRefHash terms; SliceByteStartArray sliceArray; Info info = null; long sumTotalTermFreq = 0; int offset = 0; if (fields.TryGetValue(fieldName, out info)) { numTokens = info.numTokens; numOverlapTokens = info.numOverlapTokens; pos = info.lastPosition + positionIncrementGap; offset = info.lastOffset + offsetGap; terms = info.terms; boost *= info.boost; sliceArray = info.sliceArray; sumTotalTermFreq = info.sumTotalTermFreq; } else { sliceArray = new SliceByteStartArray(BytesRefHash.DEFAULT_CAPACITY); terms = new BytesRefHash(byteBlockPool, BytesRefHash.DEFAULT_CAPACITY, sliceArray); } if (!fieldInfos.ContainsKey(fieldName)) { fieldInfos[fieldName] = new FieldInfo(fieldName, true, fieldInfos.Count, false, false, false, this.storeOffsets ? 
IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, DocValuesType.NONE, DocValuesType.NONE, null); } ITermToBytesRefAttribute termAtt = stream.GetAttribute <ITermToBytesRefAttribute>(); IPositionIncrementAttribute posIncrAttribute = stream.AddAttribute <IPositionIncrementAttribute>(); IOffsetAttribute offsetAtt = stream.AddAttribute <IOffsetAttribute>(); BytesRef @ref = termAtt.BytesRef; stream.Reset(); while (stream.IncrementToken()) { termAtt.FillBytesRef(); // if (DEBUG) System.err.println("token='" + term + "'"); numTokens++; int posIncr = posIncrAttribute.PositionIncrement; if (posIncr == 0) { numOverlapTokens++; } pos += posIncr; int ord = terms.Add(@ref); if (ord < 0) { ord = (-ord) - 1; postingsWriter.Reset(sliceArray.end[ord]); } else { sliceArray.start[ord] = postingsWriter.StartNewSlice(); } sliceArray.freq[ord]++; sumTotalTermFreq++; if (!storeOffsets) { postingsWriter.WriteInt32(pos); } else { postingsWriter.WriteInt32(pos); postingsWriter.WriteInt32(offsetAtt.StartOffset + offset); postingsWriter.WriteInt32(offsetAtt.EndOffset + offset); } sliceArray.end[ord] = postingsWriter.CurrentOffset; } stream.End(); // ensure infos.numTokens > 0 invariant; needed for correct operation of terms() if (numTokens > 0) { fields[fieldName] = new Info(terms, sliceArray, numTokens, numOverlapTokens, boost, pos, offsetAtt.EndOffset + offset, sumTotalTermFreq); sortedFields = null; // invalidate sorted view, if any } } // can never happen catch (Exception e) { throw new Exception(e.ToString(), e); } finally { try { if (stream != null) { stream.Dispose(); } } catch (IOException e2) { throw new Exception(e2.ToString(), e2); } } }
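A hedged end-to-end sketch of driving the AddField overload above; the analyzer, field name, text, and query values are illustrative, and the namespaces assume the Lucene.NET 4.8 package layout (MemoryIndex lives in the Lucene.Net.Memory package).

using System;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Index;
using Lucene.Net.Index.Memory;
using Lucene.Net.Search;
using Lucene.Net.Util;

static class MemoryIndexAddFieldDemo
{
    public static void Main()
    {
        var analyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48);
        var index = new MemoryIndex();

        // boost = 1.0f, positionIncrementGap = 0, offsetGap = 1, mirroring the parameters documented above.
        index.AddField("body", analyzer.GetTokenStream("body", "the quick brown fox"), 1.0f, 0, 1);

        // Search returns a relevance score greater than 0 when the single in-memory document matches.
        float score = index.Search(new TermQuery(new Term("body", "fox")));
        Console.WriteLine(score);
    }
}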
protected internal RandomTokenStream(BaseTermVectorsFormatTestCase outerInstance, int len, string[] sampleTerms, BytesRef[] sampleTermBytes, bool offsetsGoBackwards) { this.OuterInstance = outerInstance; Terms = new string[len]; TermBytes = new BytesRef[len]; PositionsIncrements = new int[len]; Positions = new int[len]; StartOffsets = new int[len]; EndOffsets = new int[len]; Payloads = new BytesRef[len]; for (int i = 0; i < len; ++i) { int o = Random().Next(sampleTerms.Length); Terms[i] = sampleTerms[o]; TermBytes[i] = sampleTermBytes[o]; PositionsIncrements[i] = TestUtil.NextInt(Random(), i == 0 ? 1 : 0, 10); if (offsetsGoBackwards) { StartOffsets[i] = Random().Next(); EndOffsets[i] = Random().Next(); } else { if (i == 0) { StartOffsets[i] = TestUtil.NextInt(Random(), 0, 1 << 16); } else { StartOffsets[i] = StartOffsets[i - 1] + TestUtil.NextInt(Random(), 0, Rarely() ? 1 << 16 : 20); } EndOffsets[i] = StartOffsets[i] + TestUtil.NextInt(Random(), 0, Rarely() ? 1 << 10 : 20); } } for (int i = 0; i < len; ++i) { if (i == 0) { Positions[i] = PositionsIncrements[i] - 1; } else { Positions[i] = Positions[i - 1] + PositionsIncrements[i]; } } if (Rarely()) { Arrays.Fill(Payloads, outerInstance.RandomPayload()); } else { for (int i = 0; i < len; ++i) { Payloads[i] = outerInstance.RandomPayload(); } } PositionToTerms = new Dictionary <int?, ISet <int?> >(len); StartOffsetToTerms = new Dictionary <int?, ISet <int?> >(len); for (int i = 0; i < len; ++i) { if (!PositionToTerms.ContainsKey(Positions[i])) { PositionToTerms[Positions[i]] = new HashSet <int?>();//size1 } PositionToTerms[Positions[i]].Add(i); if (!StartOffsetToTerms.ContainsKey(StartOffsets[i])) { StartOffsetToTerms[StartOffsets[i]] = new HashSet <int?>();//size1 } StartOffsetToTerms[StartOffsets[i]].Add(i); } Freqs = new Dictionary <string, int?>(); foreach (string term in Terms) { if (Freqs.ContainsKey(term)) { Freqs[term] = Freqs[term] + 1; } else { Freqs[term] = 1; } } AddAttributeImpl(new PermissiveOffsetAttributeImpl()); TermAtt = AddAttribute <ICharTermAttribute>(); PiAtt = AddAttribute <IPositionIncrementAttribute>(); OAtt = AddAttribute <IOffsetAttribute>(); PAtt = AddAttribute <IPayloadAttribute>(); }
public IdentifierTokenizer(AttributeFactory factory, TextReader input) : base(factory, input) { _offsetAtt = AddAttribute <IOffsetAttribute>(); _termAtt = AddAttribute <ITermAttribute>(); }
/// <summary> /// <para>Get the next token from the input stream. /// </para> /// <para>If the next token has <c>positionIncrement > 1</c>, /// <c>positionIncrement - 1</c> <see cref="fillerToken"/>s are /// inserted first. /// </para> /// </summary> /// <param name="target"> Where to put the new token; if null, a new instance is created. </param> /// <returns> On success, the populated token; null otherwise </returns> /// <exception cref="IOException"> if the input stream has a problem </exception> private InputWindowToken GetNextToken(InputWindowToken target) { InputWindowToken newTarget = target; if (numFillerTokensToInsert > 0) { if (null == target) { newTarget = new InputWindowToken(nextInputStreamToken.CloneAttributes()); } else { nextInputStreamToken.CopyTo(target.attSource); } // A filler token occupies no space newTarget.offsetAtt.SetOffset(newTarget.offsetAtt.StartOffset, newTarget.offsetAtt.StartOffset); newTarget.termAtt.CopyBuffer(fillerToken, 0, fillerToken.Length); newTarget.isFiller = true; --numFillerTokensToInsert; } else if (isNextInputStreamToken) { if (null == target) { newTarget = new InputWindowToken(nextInputStreamToken.CloneAttributes()); } else { nextInputStreamToken.CopyTo(target.attSource); } isNextInputStreamToken = false; newTarget.isFiller = false; } else if (!exhausted) { if (m_input.IncrementToken()) { if (null == target) { newTarget = new InputWindowToken(CloneAttributes()); } else { this.CopyTo(target.attSource); } if (posIncrAtt.PositionIncrement > 1) { // Each output shingle must contain at least one input token, // so no more than (maxShingleSize - 1) filler tokens will be inserted. numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement - 1, maxShingleSize - 1); // Save the current token as the next input stream token if (null == nextInputStreamToken) { nextInputStreamToken = CloneAttributes(); } else { this.CopyTo(nextInputStreamToken); } isNextInputStreamToken = true; // A filler token occupies no space newTarget.offsetAtt.SetOffset(offsetAtt.StartOffset, offsetAtt.StartOffset); newTarget.termAtt.CopyBuffer(fillerToken, 0, fillerToken.Length); newTarget.isFiller = true; --numFillerTokensToInsert; } else { newTarget.isFiller = false; } } else { exhausted = true; m_input.End(); endState = CaptureState(); numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement, maxShingleSize - 1); if (numFillerTokensToInsert > 0) { nextInputStreamToken = new AttributeSource(this.GetAttributeFactory()); nextInputStreamToken.AddAttribute <ICharTermAttribute>(); IOffsetAttribute newOffsetAtt = nextInputStreamToken.AddAttribute <IOffsetAttribute>(); newOffsetAtt.SetOffset(offsetAtt.EndOffset, offsetAtt.EndOffset); // Recurse/loop just once: return(GetNextToken(target)); } else { newTarget = null; } } } else { newTarget = null; } return(newTarget); }
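To see the filler tokens described above in output, a hedged sketch where a StopFilter opens a position gap in front of a ShingleFilter; Lucene.NET 4.8-style namespaces are assumed, and the stopword list and input text are illustrative.

using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.Shingle;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;

static class FillerTokenDemo
{
    public static void Main()
    {
        const LuceneVersion version = LuceneVersion.LUCENE_48;
        var stopWords = new CharArraySet(version, new[] { "the" }, true);

        TokenStream ts = new WhitespaceTokenizer(version, new StringReader("please divide the sentence"));
        ts = new StopFilter(version, ts, stopWords);   // removing "the" leaves a position gap
        ts = new ShingleFilter(ts, 2, 2);              // bigram shingles (unigrams are also emitted by default)

        var termAtt = ts.AddAttribute<ICharTermAttribute>();
        ts.Reset();
        while (ts.IncrementToken())
        {
            // Shingles spanning the removed stopword contain the default filler "_",
            // e.g. "divide _" and "_ sentence" -- the filler insertion GetNextToken performs above.
            Console.WriteLine(new string(termAtt.Buffer, 0, termAtt.Length));
        }
        ts.End();
        ts.Dispose();
    }
}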
public IdentifierTokenizer(AttributeSource source, TextReader input) : base(source, input) { _offsetAtt = AddAttribute <IOffsetAttribute>(); _termAtt = AddAttribute <ITermAttribute>(); }
public override void ProcessFields(IIndexableField[] fields, int count) { fieldState.Reset(); bool doInvert = consumer.Start(fields, count); for (int i = 0; i < count; i++) { IIndexableField field = fields[i]; IIndexableFieldType fieldType = field.IndexableFieldType; // TODO FI: this should be "genericized" to querying // consumer if it wants to see this particular field // tokenized. if (fieldType.IsIndexed && doInvert) { bool analyzed = fieldType.IsTokenized && docState.analyzer != null; // if the field omits norms, the boost cannot be indexed. if (fieldType.OmitNorms && field.Boost != 1.0f) { throw new NotSupportedException("You cannot set an index-time boost: norms are omitted for field '" + field.Name + "'"); } // only bother checking offsets if something will consume them. // TODO: after we fix analyzers, also check if termVectorOffsets will be indexed. bool checkOffsets = fieldType.IndexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS; int lastStartOffset = 0; if (i > 0) { fieldState.Position += analyzed ? docState.analyzer.GetPositionIncrementGap(fieldInfo.Name) : 0; } /* * To assist people in tracking down problems in analysis components, we wish to write the field name to the infostream * when we fail. We expect some caller to eventually deal with the real exception, so we don't want any 'catch' clauses, * but rather a finally that takes note of the problem. */ bool succeededInProcessingField = false; TokenStream stream = field.GetTokenStream(docState.analyzer); // reset the TokenStream to the first token stream.Reset(); try { bool hasMoreTokens = stream.IncrementToken(); fieldState.AttributeSource = stream; IOffsetAttribute offsetAttribute = fieldState.AttributeSource.AddAttribute <IOffsetAttribute>(); IPositionIncrementAttribute posIncrAttribute = fieldState.AttributeSource.AddAttribute <IPositionIncrementAttribute>(); if (hasMoreTokens) { consumer.Start(field); do { // If we hit an exception in stream.next below // (which is fairly common, eg if analyzer // chokes on a given document), then it's // non-aborting and (above) this one document // will be marked as deleted, but still // consume a docID int posIncr = posIncrAttribute.PositionIncrement; if (posIncr < 0) { throw new ArgumentException("position increment must be >=0 (got " + posIncr + ") for field '" + field.Name + "'"); } if (fieldState.Position == 0 && posIncr == 0) { throw new ArgumentException("first position increment must be > 0 (got 0) for field '" + field.Name + "'"); } int position = fieldState.Position + posIncr; if (position > 0) { // NOTE: confusing: this "mirrors" the // position++ we do below position--; } else if (position < 0) { throw new ArgumentException("position overflow for field '" + field.Name + "'"); } // position is legal, we can safely place it in fieldState now. // not sure if anything will use fieldState after non-aborting exc... 
fieldState.Position = position; if (posIncr == 0) { fieldState.NumOverlap++; } if (checkOffsets) { int startOffset = fieldState.Offset + offsetAttribute.StartOffset; int endOffset = fieldState.Offset + offsetAttribute.EndOffset; if (startOffset < 0 || endOffset < startOffset) { throw new ArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, " + "startOffset=" + startOffset + ",endOffset=" + endOffset + " for field '" + field.Name + "'"); } if (startOffset < lastStartOffset) { throw new ArgumentException("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset + " for field '" + field.Name + "'"); } lastStartOffset = startOffset; } bool success = false; try { // If we hit an exception in here, we abort // all buffered documents since the last // flush, on the likelihood that the // internal state of the consumer is now // corrupt and should not be flushed to a // new segment: consumer.Add(); success = true; } finally { if (!success) { docState.docWriter.SetAborting(); } } fieldState.Length++; fieldState.Position++; } while (stream.IncrementToken()); } // trigger streams to perform end-of-stream operations stream.End(); // TODO: maybe add some safety? then again, its already checked // when we come back around to the field... fieldState.Position += posIncrAttribute.PositionIncrement; fieldState.Offset += offsetAttribute.EndOffset; if (docState.maxTermPrefix != null) { string msg = "Document contains at least one immense term in field=\"" + fieldInfo.Name + "\" (whose UTF8 encoding is longer than the max length " + DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8 + "), all of which were skipped. Please correct the analyzer to not produce such terms. The prefix of the first immense term is: '" + docState.maxTermPrefix + "...'"; if (docState.infoStream.IsEnabled("IW")) { docState.infoStream.Message("IW", "ERROR: " + msg); } docState.maxTermPrefix = null; throw new ArgumentException(msg); } /* if success was false above there is an exception coming through and we won't get here.*/ succeededInProcessingField = true; } finally { if (!succeededInProcessingField) { IOUtils.DisposeWhileHandlingException(stream); } else { stream.Dispose(); } if (!succeededInProcessingField && docState.infoStream.IsEnabled("DW")) { docState.infoStream.Message("DW", "An exception was thrown while processing field " + fieldInfo.Name); } } fieldState.Offset += analyzed ? docState.analyzer.GetOffsetGap(fieldInfo.Name) : 0; fieldState.Boost *= field.Boost; } // LUCENE-2387: don't hang onto the field, so GC can // reclaim fields[i] = null; } consumer.Finish(); endConsumer.Finish(); }
public FastStringTokenizer(String str, bool isLetter, bool toLowerCase, ISet<string> stopWords) { this.str = str; this.isLetter = isLetter; this.toLowerCase = toLowerCase; this.stopWords = stopWords; this.termAtt = AddAttribute<ITermAttribute>(); this.offsetAtt = AddAttribute<IOffsetAttribute>(); }
public override void CopyTo(Attribute target) { IOffsetAttribute t = (IOffsetAttribute)target; t.SetOffset(startOffset, endOffset); }
internal SingleTokenAttributeSource() { termAttribute = AddAttribute<ITermAttribute>(); offsetAttribute = AddAttribute<IOffsetAttribute>(); }
private void Init() { termAtt = AddAttribute <ICharTermAttribute>(); offsetAtt = AddAttribute <IOffsetAttribute>(); typeAtt = AddAttribute <ITypeAttribute>(); }
/// <summary> /// Creates a new <see cref="HyphenatedWordsFilter"/> /// </summary> /// <param name="in"> <see cref="TokenStream"/> that will be filtered </param> public HyphenatedWordsFilter(TokenStream @in) : base(@in) { termAttribute = AddAttribute <ICharTermAttribute>(); offsetAttribute = AddAttribute <IOffsetAttribute>(); }
public DocsAndPositionsEnumAnonymousClass( TokenStream ts, CharacterRunAutomaton[] matchers, ICharTermAttribute charTermAtt, IOffsetAttribute offsetAtt) { this.matchers = matchers; this.charTermAtt = charTermAtt; this.offsetAtt = offsetAtt; stream = ts; matchDescriptions = new BytesRef[matchers.Length]; }