internal virtual void AddToken(Token token, float score)
{
    if (numTokens < MAX_NUM_TOKENS_PER_GROUP)
    {
        if (numTokens == 0)
        {
            startOffset = matchStartOffset = token.StartOffset();
            endOffset = matchEndOffset = token.EndOffset();
            tot += score;
        }
        else
        {
            startOffset = Math.Min(startOffset, token.StartOffset());
            endOffset = Math.Max(endOffset, token.EndOffset());
            if (score > 0)
            {
                if (tot == 0)
                {
                    matchStartOffset = token.StartOffset();
                    matchEndOffset = token.EndOffset();
                }
                else
                {
                    matchStartOffset = Math.Min(matchStartOffset, token.StartOffset());
                    matchEndOffset = Math.Max(matchEndOffset, token.EndOffset());
                }
                tot += score;
            }
        }
        tokens[numTokens] = token;
        scores[numTokens] = score;
        numTokens++;
    }
}
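// Illustration only: AddToken widens [startOffset, endOffset] for every token added to
// the group, but widens [matchStartOffset, matchEndOffset] only for tokens with a
// non-zero score, so the match span ends up covering just the scored (query-matching)
// terms. A hypothetical walk-through (the tokens and offsets below are made up):
//
//   AddToken(new Token("quick", 4, 9),   0.0f); // group span 4-9;  tot still 0
//   AddToken(new Token("brown", 10, 15), 1.0f); // group span 4-15; match span becomes 10-15
//   AddToken(new Token("fox",   16, 19), 0.0f); // group span 4-19; match span stays 10-15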
public override Token Next()
{
    if (inPhrase)
    {
        inPhrase = false;
        return new Token("phrase2", savedStart, savedEnd);
    }
    else
    {
        for (Token token = input.Next(); token != null; token = input.Next())
        {
            if (token.TermText().Equals("phrase"))
            {
                inPhrase = true;
                savedStart = token.StartOffset();
                savedEnd = token.EndOffset();
                return new Token("phrase1", savedStart, savedEnd);
            }
            else if (!token.TermText().Equals("stop"))
            {
                return token;
            }
        }
    }
    return null;
}
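// Illustration only: the Next() above belongs to a test TokenFilter (in Lucene's
// QueryParser tests a filter like this is named QPTestFilter; the class name and
// constructor used here are assumptions). Any token with text "phrase" is expanded
// into two tokens, "phrase1" and "phrase2", sharing the same offsets, and "stop"
// tokens are dropped. A minimal sketch of consuming it:
TokenStream ts = new QPTestFilter(
    new WhitespaceTokenizer(new System.IO.StringReader("a phrase stop b")));
for (Token t = ts.Next(); t != null; t = ts.Next())
{
    // prints: a [0,1], phrase1 [2,8], phrase2 [2,8], b [14,15]
    System.Console.Out.WriteLine(t.TermText() + " [" + t.StartOffset() + "," + t.EndOffset() + "]");
}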
/* (non-Javadoc)
 * @see Lucene.Net.Highlight.TextFragmenter#isNewFragment(org.apache.lucene.analysis.Token)
 */
public virtual bool IsNewFragment(Token token)
{
    bool isNewFrag = token.EndOffset() >= (fragmentSize * currentNumFrags);
    if (isNewFrag)
    {
        currentNumFrags++;
    }
    return isNewFrag;
}
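// Illustration only (a minimal sketch, not part of the surrounding file): IsNewFragment
// starts a new fragment whenever a token's end offset crosses the next multiple of
// fragmentSize. With the contrib Highlighter this fragmenter is typically installed via
// SetTextFragmenter; the query, analyzer, field name and text variables below are
// assumptions, and exact overloads depend on the Lucene.Net version.
QueryScorer scorer = new QueryScorer(query);
Highlighter highlighter = new Highlighter(scorer);
highlighter.SetTextFragmenter(new SimpleFragmenter(50)); // fragments of roughly 50 chars
TokenStream tokens = analyzer.TokenStream("contents", new System.IO.StringReader(text));
System.String snippet = highlighter.GetBestFragment(tokens, text);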
/// <summary>
/// Returns the next, stemmed, input Token.
/// </summary>
/// <returns>
/// The stemmed form of a token.
/// </returns>
/// <throws>IOException</throws>
public override Token Next()
{
    Token token = input.Next();
    if (token == null)
    {
        return null;
    }
    else
    {
        string str = stemmer.stem(token.TermText());
        // The original Java code compared object references here; the port uses a value
        // comparison, which has the same effect of passing unchanged terms through as-is.
        if (!str.Equals(token.TermText()))
        {
            return new Token(str, token.StartOffset(), token.EndOffset(), token.Type());
        }
        return token;
    }
}
internal override void addTerm(Token t, RawPostingList p0)
{
    System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.addTerm start"));

    TermVectorsTermsWriter.PostingList p = (TermVectorsTermsWriter.PostingList) p0;
    p.freq++;

    if (doVectorOffsets)
    {
        int startOffset = fieldState.offset + t.StartOffset();
        int endOffset = fieldState.offset + t.EndOffset();
        termsHashPerField.writeVInt(1, startOffset - p.lastOffset);
        termsHashPerField.writeVInt(1, endOffset - startOffset);
        p.lastOffset = endOffset;
    }

    if (doVectorPositions)
    {
        termsHashPerField.writeVInt(0, fieldState.position - p.lastPosition);
        p.lastPosition = fieldState.position;
    }
}
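// Illustration only: addTerm above stores offsets and positions as deltas
// (startOffset - lastOffset, position - lastPosition) written as VInts. A VInt packs an
// int into 7-bit groups, low bits first, with the high bit of each byte flagging
// "more bytes follow". A minimal standalone sketch of that wire format (not Lucene.Net's
// internal writer):
static void WriteVInt(System.IO.Stream outStream, int i)
{
    uint u = (uint) i;
    while (u >= 0x80)
    {
        outStream.WriteByte((byte) ((u & 0x7F) | 0x80)); // 7 payload bits + continuation bit
        u >>= 7;
    }
    outStream.WriteByte((byte) u); // final byte, high bit clear
}
// Small deltas cost a single byte, which is why consecutive offsets and positions are
// delta-encoded before being written.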
/// <summary>Returns the next input Token, after being stemmed.</summary>
public override Token Next()
{
    Token token = input.Next();
    if (token == null)
    {
        return null;
    }
    stemmer.SetCurrent(token.TermText());
    try
    {
        stemMethod.Invoke(stemmer, (System.Object[]) EMPTY_ARGS);
    }
    catch (System.Exception e)
    {
        throw new System.SystemException(e.ToString());
    }
    Token newToken = new Token(stemmer.GetCurrent(), token.StartOffset(), token.EndOffset(), token.Type());
    newToken.SetPositionIncrement(token.GetPositionIncrement());
    return newToken;
}
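// Illustration only: this Next() is the stemming step of the contrib SnowballFilter,
// which invokes the "stem" method of a Snowball-generated stemmer via reflection.
// A minimal sketch of wiring it into an analysis chain (the language name, sample text,
// and expected output are assumptions):
TokenStream stemmed = new SnowballFilter(
    new LowerCaseTokenizer(new System.IO.StringReader("running runs ran")),
    "English");
for (Token t = stemmed.Next(); t != null; t = stemmed.Next())
{
    System.Console.Out.WriteLine(t.TermText()); // e.g. "run", "run", "ran"
}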
/// <summary>This is the hotspot of indexing: it's called once
/// for every term of every document. Its job is to
/// update the postings byte stream (Postings hash)
/// based on the occurrence of a single term.
/// </summary>
private void AddPosition(Token token)
{
    Payload payload = token.GetPayload();

    // Get the text of this term.  Term can either
    // provide a String token or offset into a char[]
    // array
    char[] tokenText = token.TermBuffer();
    int tokenTextLen = token.TermLength();

    int code = 0;

    // Compute hashcode
    int downto = tokenTextLen;
    while (downto > 0)
        code = (code * 31) + tokenText[--downto];

    // System.out.println("  addPosition: buffer=" + new String(tokenText, 0, tokenTextLen) + " pos=" + position + " offsetStart=" + (offset+token.startOffset()) + " offsetEnd=" + (offset + token.endOffset()) + " docID=" + docID + " doPos=" + doVectorPositions + " doOffset=" + doVectorOffsets);

    int hashPos = code & postingsHashMask;

    System.Diagnostics.Debug.Assert(!postingsCompacted);

    // Locate Posting in hash
    Enclosing_Instance.p = postingsHash[hashPos];

    if (Enclosing_Instance.p != null && !Enclosing_Instance.PostingEquals(tokenText, tokenTextLen))
    {
        // Conflict: keep searching different locations in
        // the hash table.
        int inc = ((code >> 8) + code) | 1;
        do
        {
            code += inc;
            hashPos = code & postingsHashMask;
            Enclosing_Instance.p = postingsHash[hashPos];
        }
        while (Enclosing_Instance.p != null && !Enclosing_Instance.PostingEquals(tokenText, tokenTextLen));
    }

    int proxCode;

    // If we hit an exception below, it's possible the
    // posting list or term vectors data will be
    // partially written and thus inconsistent if
    // flushed, so we have to abort all documents
    // since the last flush:
    try
    {
        if (Enclosing_Instance.p != null)
        {
            // term seen since last flush
            if (Enclosing_Instance.docID != Enclosing_Instance.p.lastDocID)
            {
                // term not yet seen in this doc

                // System.out.println("    seen before (new docID=" + docID + ") freqUpto=" + p.freqUpto + " proxUpto=" + p.proxUpto);

                System.Diagnostics.Debug.Assert(Enclosing_Instance.p.docFreq > 0);

                // Now that we know doc freq for previous doc,
                // write it & lastDocCode
                Enclosing_Instance.freqUpto = Enclosing_Instance.p.freqUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_MASK;
                Enclosing_Instance.freq = Enclosing_Instance.postingsPool.buffers[Enclosing_Instance.p.freqUpto >> Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SHIFT];
                if (1 == Enclosing_Instance.p.docFreq)
                    Enclosing_Instance.WriteFreqVInt(Enclosing_Instance.p.lastDocCode | 1);
                else
                {
                    Enclosing_Instance.WriteFreqVInt(Enclosing_Instance.p.lastDocCode);
                    Enclosing_Instance.WriteFreqVInt(Enclosing_Instance.p.docFreq);
                }
                Enclosing_Instance.p.freqUpto = Enclosing_Instance.freqUpto + (Enclosing_Instance.p.freqUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_NOT_MASK);

                if (doVectors)
                {
                    Enclosing_Instance.vector = AddNewVector();
                    if (doVectorOffsets)
                    {
                        offsetStartCode = offsetStart = offset + token.StartOffset();
                        offsetEnd = offset + token.EndOffset();
                    }
                }

                proxCode = position;

                Enclosing_Instance.p.docFreq = 1;

                // Store code so we can write this after we're
                // done with this new doc
                Enclosing_Instance.p.lastDocCode = (Enclosing_Instance.docID - Enclosing_Instance.p.lastDocID) << 1;
                Enclosing_Instance.p.lastDocID = Enclosing_Instance.docID;
            }
            else
            {
                // term already seen in this doc

                // System.out.println("    seen before (same docID=" + docID + ") proxUpto=" + p.proxUpto);

                Enclosing_Instance.p.docFreq++;

                proxCode = position - Enclosing_Instance.p.lastPosition;

                if (doVectors)
                {
                    Enclosing_Instance.vector = Enclosing_Instance.p.vector;
                    if (Enclosing_Instance.vector == null)
                        Enclosing_Instance.vector = AddNewVector();
                    if (doVectorOffsets)
                    {
                        offsetStart = offset + token.StartOffset();
                        offsetEnd = offset + token.EndOffset();
                        offsetStartCode = offsetStart - Enclosing_Instance.vector.lastOffset;
                    }
                }
            }
        }
        else
        {
            // term not seen before

            // System.out.println("  never seen docID=" + docID);

            // Refill?
            if (0 == Enclosing_Instance.postingsFreeCount)
            {
                Enclosing_Instance.Enclosing_Instance.GetPostings(Enclosing_Instance.postingsFreeList);
                Enclosing_Instance.postingsFreeCount = Enclosing_Instance.postingsFreeList.Length;
            }

            int textLen1 = 1 + tokenTextLen;
            if (textLen1 + Enclosing_Instance.charPool.byteUpto > Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SIZE)
            {
                if (textLen1 > Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SIZE)
                {
                    // Just skip this term, to remain as robust as
                    // possible during indexing.  A TokenFilter
                    // can be inserted into the analyzer chain if
                    // other behavior is wanted (pruning the term
                    // to a prefix, throwing an exception, etc).
                    if (Enclosing_Instance.maxTermPrefix == null)
                        Enclosing_Instance.maxTermPrefix = new System.String(tokenText, 0, 30);

                    // Still increment position:
                    position++;
                    return;
                }
                Enclosing_Instance.charPool.NextBuffer();
            }

            char[] text = Enclosing_Instance.charPool.buffer;
            int textUpto = Enclosing_Instance.charPool.byteUpto;

            // Pull next free Posting from free list
            Enclosing_Instance.p = Enclosing_Instance.postingsFreeList[--Enclosing_Instance.postingsFreeCount];

            Enclosing_Instance.p.textStart = textUpto + Enclosing_Instance.charPool.byteOffset;
            Enclosing_Instance.charPool.byteUpto += textLen1;

            Array.Copy(tokenText, 0, text, textUpto, tokenTextLen);

            text[textUpto + tokenTextLen] = (char) 0xffff;

            System.Diagnostics.Debug.Assert(postingsHash[hashPos] == null);

            postingsHash[hashPos] = Enclosing_Instance.p;
            numPostings++;

            if (numPostings == postingsHashHalfSize)
                RehashPostings(2 * postingsHashSize);

            // Init first slice for freq & prox streams
            int firstSize = Lucene.Net.Index.DocumentsWriter.levelSizeArray[0];

            int upto1 = Enclosing_Instance.postingsPool.NewSlice(firstSize);
            Enclosing_Instance.p.freqStart = Enclosing_Instance.p.freqUpto = Enclosing_Instance.postingsPool.byteOffset + upto1;

            int upto2 = Enclosing_Instance.postingsPool.NewSlice(firstSize);
            Enclosing_Instance.p.proxStart = Enclosing_Instance.p.proxUpto = Enclosing_Instance.postingsPool.byteOffset + upto2;

            Enclosing_Instance.p.lastDocCode = Enclosing_Instance.docID << 1;
            Enclosing_Instance.p.lastDocID = Enclosing_Instance.docID;
            Enclosing_Instance.p.docFreq = 1;

            if (doVectors)
            {
                Enclosing_Instance.vector = AddNewVector();
                if (doVectorOffsets)
                {
                    offsetStart = offsetStartCode = offset + token.StartOffset();
                    offsetEnd = offset + token.EndOffset();
                }
            }

            proxCode = position;
        }

        Enclosing_Instance.proxUpto = Enclosing_Instance.p.proxUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_MASK;
        Enclosing_Instance.prox = Enclosing_Instance.postingsPool.buffers[Enclosing_Instance.p.proxUpto >> Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SHIFT];
        System.Diagnostics.Debug.Assert(Enclosing_Instance.prox != null);

        if (payload != null && payload.length > 0)
        {
            Enclosing_Instance.WriteProxVInt((proxCode << 1) | 1);
            Enclosing_Instance.WriteProxVInt(payload.length);
            Enclosing_Instance.WriteProxBytes(payload.data, payload.offset, payload.length);
            fieldInfo.storePayloads = true;
        }
        else
            Enclosing_Instance.WriteProxVInt(proxCode << 1);

        Enclosing_Instance.p.proxUpto = Enclosing_Instance.proxUpto + (Enclosing_Instance.p.proxUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_NOT_MASK);

        Enclosing_Instance.p.lastPosition = position++;

        if (doVectorPositions)
        {
            Enclosing_Instance.posUpto = Enclosing_Instance.vector.posUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_MASK;
            Enclosing_Instance.pos = Enclosing_Instance.vectorsPool.buffers[Enclosing_Instance.vector.posUpto >> Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SHIFT];
            Enclosing_Instance.WritePosVInt(proxCode);
            Enclosing_Instance.vector.posUpto = Enclosing_Instance.posUpto + (Enclosing_Instance.vector.posUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_NOT_MASK);
        }

        if (doVectorOffsets)
        {
            Enclosing_Instance.offsetUpto = Enclosing_Instance.vector.offsetUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_MASK;
            Enclosing_Instance.offsets = Enclosing_Instance.vectorsPool.buffers[Enclosing_Instance.vector.offsetUpto >> Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SHIFT];
            Enclosing_Instance.WriteOffsetVInt(offsetStartCode);
            Enclosing_Instance.WriteOffsetVInt(offsetEnd - offsetStart);
            Enclosing_Instance.vector.lastOffset = offsetEnd;
            Enclosing_Instance.vector.offsetUpto = Enclosing_Instance.offsetUpto + (Enclosing_Instance.vector.offsetUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_NOT_MASK);
        }
    }
    catch (System.Exception t)
    {
        throw new AbortException(t, Enclosing_Instance.Enclosing_Instance);
    }
}
internal static void Test(System.IO.TextReader reader, bool verbose, long bytes)
{
    Analyzer analyzer = new SimpleAnalyzer();
    TokenStream stream = analyzer.TokenStream(null, reader);

    System.DateTime start = System.DateTime.Now;

    int count = 0;
    for (Token t = stream.Next(); t != null; t = stream.Next())
    {
        if (verbose)
        {
            System.Console.Out.WriteLine("Text=" + t.TermText() + " start=" + t.StartOffset() + " end=" + t.EndOffset());
        }
        count++;
    }

    System.DateTime end = System.DateTime.Now;

    // Ticks are 100-nanosecond units; convert to milliseconds so the figures below are correct.
    long time = (end.Ticks - start.Ticks) / System.TimeSpan.TicksPerMillisecond;
    System.Console.Out.WriteLine(time + " milliseconds to extract " + count + " tokens");
    System.Console.Out.WriteLine((time * 1000.0) / count + " microseconds/token");
    System.Console.Out.WriteLine((bytes * 1000.0 * 60.0 * 60.0) / (time * 1000000.0) + " megabytes/hour");
}
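// Illustration only: a minimal way to drive the benchmark above from an in-memory
// string, passing the text length as the byte count (file-based callers would pass
// the file size instead).
string sample = "The quick brown fox jumps over the lazy dog";
Test(new System.IO.StringReader(sample), true, sample.Length);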
internal override void processFields(Fieldable[] fields, int count)
{
    fieldState.reset(docState.doc.GetBoost());

    int maxFieldLength = docState.maxFieldLength;

    bool doInvert = consumer.start(fields, count);

    for (int i = 0; i < count; i++)
    {
        Fieldable field = fields[i];

        // TODO FI: this should be "genericized" to querying
        // consumer if it wants to see this particular field
        // tokenized.
        if (field.IsIndexed() && doInvert)
        {
            if (fieldState.length > 0)
            {
                fieldState.position += docState.analyzer.GetPositionIncrementGap(fieldInfo.name);
            }

            if (!field.IsTokenized())
            {
                // un-tokenized field
                string stringValue = field.StringValue();
                int valueLength = stringValue.Length;
                Token token = perThread.localToken.Reinit(stringValue, fieldState.offset, fieldState.offset + valueLength);
                bool success = false;
                try
                {
                    consumer.add(token);
                    success = true;
                }
                finally
                {
                    if (!success)
                    {
                        docState.docWriter.SetAborting();
                    }
                }
                fieldState.offset += valueLength;
                fieldState.length++;
                fieldState.position++;
            }
            else
            {
                // tokenized field
                TokenStream stream;
                TokenStream streamValue = field.TokenStreamValue();

                if (streamValue != null)
                {
                    stream = streamValue;
                }
                else
                {
                    // the field does not have a TokenStream,
                    // so we have to obtain one from the analyzer
                    System.IO.TextReader reader; // find or make Reader
                    System.IO.TextReader readerValue = field.ReaderValue();

                    if (readerValue != null)
                    {
                        reader = readerValue;
                    }
                    else
                    {
                        string stringValue = field.StringValue();
                        if (stringValue == null)
                        {
                            throw new System.ArgumentException("field must have either TokenStream, string or Reader value");
                        }
                        perThread.stringReader.Init(stringValue);
                        reader = perThread.stringReader;
                    }

                    // Tokenize field and add to postingTable
                    stream = docState.analyzer.ReusableTokenStream(fieldInfo.name, reader);
                }

                // reset the TokenStream to the first token
                stream.Reset();

                try
                {
                    int offsetEnd = fieldState.offset - 1;
                    Token localToken = perThread.localToken;
                    for (; ; )
                    {
                        // If we hit an exception in stream.next below
                        // (which is fairly common, eg if analyzer
                        // chokes on a given document), then it's
                        // non-aborting and (above) this one document
                        // will be marked as deleted, but still
                        // consume a docID
                        Token token = stream.Next(localToken);

                        if (token == null)
                        {
                            break;
                        }
                        fieldState.position += (token.GetPositionIncrement() - 1);
                        bool success = false;
                        try
                        {
                            // If we hit an exception in here, we abort
                            // all buffered documents since the last
                            // flush, on the likelihood that the
                            // internal state of the consumer is now
                            // corrupt and should not be flushed to a
                            // new segment:
                            consumer.add(token);
                            success = true;
                        }
                        finally
                        {
                            if (!success)
                            {
                                docState.docWriter.SetAborting();
                            }
                        }
                        fieldState.position++;
                        offsetEnd = fieldState.offset + token.EndOffset();
                        if (++fieldState.length >= maxFieldLength)
                        {
                            if (docState.infoStream != null)
                            {
                                docState.infoStream.WriteLine("maxFieldLength " + maxFieldLength + " reached for field " + fieldInfo.name + ", ignoring following tokens");
                            }
                            break;
                        }
                    }
                    fieldState.offset = offsetEnd + 1;
                }
                finally
                {
                    stream.Close();
                }
            }

            fieldState.boost *= field.GetBoost();
        }
    }

    consumer.finish();
    endConsumer.finish();
}
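// Illustration only: the maxFieldLength check above silently truncates very long
// fields. Callers control the limit on the IndexWriter; a minimal sketch, assuming a
// RAMDirectory and SimpleAnalyzer and that exact overloads depend on the Lucene.Net
// version in use:
Lucene.Net.Store.Directory dir = new Lucene.Net.Store.RAMDirectory();
IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true);
writer.SetMaxFieldLength(10000); // index at most 10,000 tokens per field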
// Tokenizes the fields of a document into Postings.
private void InvertDocument(Document doc)
{
    foreach (Field field in doc.Fields())
    {
        System.String fieldName = field.Name();
        int fieldNumber = fieldInfos.FieldNumber(fieldName);

        int length = fieldLengths[fieldNumber]; // length of field
        int position = fieldPositions[fieldNumber]; // position in field
        if (length > 0)
        {
            position += analyzer.GetPositionIncrementGap(fieldName);
        }
        int offset = fieldOffsets[fieldNumber]; // offset field

        if (field.IsIndexed())
        {
            if (!field.IsTokenized())
            {
                // un-tokenized field
                System.String stringValue = field.StringValue();
                if (field.IsStoreOffsetWithTermVector())
                {
                    AddPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.Length));
                }
                else
                {
                    AddPosition(fieldName, stringValue, position++, null);
                }
                offset += stringValue.Length;
                length++;
            }
            else
            {
                System.IO.TextReader reader; // find or make Reader
                if (field.ReaderValue() != null)
                {
                    reader = field.ReaderValue();
                }
                else if (field.StringValue() != null)
                {
                    reader = new System.IO.StringReader(field.StringValue());
                }
                else
                {
                    throw new System.ArgumentException("field must have either String or Reader value");
                }

                // Tokenize field and add to postingTable
                TokenStream stream = analyzer.TokenStream(fieldName, reader);
                try
                {
                    Token lastToken = null;
                    for (Token t = stream.Next(); t != null; t = stream.Next())
                    {
                        position += (t.GetPositionIncrement() - 1);

                        if (field.IsStoreOffsetWithTermVector())
                        {
                            AddPosition(fieldName, t.TermText(), position++, new TermVectorOffsetInfo(offset + t.StartOffset(), offset + t.EndOffset()));
                        }
                        else
                        {
                            AddPosition(fieldName, t.TermText(), position++, null);
                        }

                        lastToken = t;
                        if (++length > maxFieldLength)
                        {
                            if (infoStream != null)
                            {
                                infoStream.WriteLine("maxFieldLength " + maxFieldLength + " reached, ignoring following tokens");
                            }
                            break;
                        }
                    }

                    if (lastToken != null)
                    {
                        offset += lastToken.EndOffset() + 1;
                    }
                }
                finally
                {
                    stream.Close();
                }
            }

            fieldLengths[fieldNumber] = length; // save field length
            fieldPositions[fieldNumber] = position; // save field position
            fieldBoosts[fieldNumber] *= field.GetBoost();
            fieldOffsets[fieldNumber] = offset;
        }
    }
}
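// Illustration only: both inverters above turn position increments into absolute
// positions with "position += (increment - 1)" followed by "position++". A small
// standalone sketch of that bookkeeping (plain ints, not Lucene.Net types), showing
// how an increment of 2 (e.g. a removed stop word before the token) leaves a gap:
int[] increments = { 1, 2, 1 }; // increments for three emitted tokens; the middle one follows a removed stop word
int pos = 0;
foreach (int inc in increments)
{
    pos += inc - 1;
    System.Console.Out.WriteLine("token indexed at position " + pos); // prints 0, 2, 3
    pos++;
}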