Пример #1
0
 /// <summary>
 /// Records a token and its score in this group, widening the group's
 /// overall offset range and, for scoring tokens, its match offset range.
 /// Tokens beyond MAX_NUM_TOKENS_PER_GROUP are silently dropped.
 /// </summary>
 internal virtual void  AddToken(Token token, float score)
 {
     if (numTokens >= MAX_NUM_TOKENS_PER_GROUP)
     {
         return; // group is full; ignore the token
     }

     int tokenStart = token.StartOffset();
     int tokenEnd   = token.EndOffset();

     if (numTokens == 0)
     {
         // First token: both ranges start out covering exactly this token,
         // and its score seeds the total unconditionally.
         startOffset = matchStartOffset = tokenStart;
         endOffset   = matchEndOffset   = tokenEnd;
         tot        += score;
     }
     else
     {
         startOffset = Math.Min(startOffset, tokenStart);
         endOffset   = Math.Max(endOffset, tokenEnd);

         if (score > 0)
         {
             if (tot == 0)
             {
                 // First token with a positive score: restart the match range.
                 matchStartOffset = tokenStart;
                 matchEndOffset   = tokenEnd;
             }
             else
             {
                 matchStartOffset = Math.Min(matchStartOffset, tokenStart);
                 matchEndOffset   = Math.Max(matchEndOffset, tokenEnd);
             }
             tot += score;
         }
     }

     tokens[numTokens] = token;
     scores[numTokens] = score;
     numTokens++;
 }
Пример #2
0
		/// <summary>
		/// Adds a token and its score to this group.  Widens the group's
		/// overall offset range to cover the token and, for tokens with a
		/// positive score, widens the match offset range as well.  Tokens
		/// beyond MAX_NUM_TOKENS_PER_GROUP are silently ignored.
		/// </summary>
		internal virtual void  AddToken(Token token, float score)
		{
			if (numTokens < MAX_NUM_TOKENS_PER_GROUP)
			{
				if (numTokens == 0)
				{
					// First token: both ranges start out covering it.
					// NOTE(review): the first token's score is added to tot even
					// when it is zero or negative, unlike later tokens below.
					startOffset = matchStartOffset = token.StartOffset();
					endOffset = matchEndOffset = token.EndOffset();
					tot += score;
				}
				else
				{
					startOffset = Math.Min(startOffset, token.StartOffset());
					endOffset = Math.Max(endOffset, token.EndOffset());
					if (score > 0)
					{
						if (tot == 0)
						{
							// First token with a positive score: reset the
							// match range to exactly this token.
							matchStartOffset = token.StartOffset();
							matchEndOffset = token.EndOffset();
						}
						else
						{
							matchStartOffset = Math.Min(matchStartOffset, token.StartOffset());
							matchEndOffset = Math.Max(matchEndOffset, token.EndOffset());
						}
						tot += score;
					}
				}
				tokens[numTokens] = token;
				scores[numTokens] = score;
				numTokens++;
			}
		}
Пример #3
0
            /// <summary>
            /// Compares two Tokens by start offset: returns 1 when o1 starts
            /// after o2, -1 when it starts before, and 0 when they coincide.
            /// </summary>
            public virtual int Compare(System.Object o1, System.Object o2)
            {
                int start1 = ((Token)o1).StartOffset();
                int start2 = ((Token)o2).StartOffset();

                if (start1 > start2)
                {
                    return 1;
                }
                if (start1 < start2)
                {
                    return -1;
                }
                return 0;
            }
Пример #4
0
 /// <summary>
 /// Returns the next token, splitting any "phrase" token into "phrase1"
 /// followed by "phrase2" (both carrying the original offsets) and
 /// silently dropping "stop" tokens.  Returns null when input is exhausted.
 /// </summary>
 public override Token Next()
 {
     // A "phrase" token was seen on the previous call: emit its second half.
     if (inPhrase)
     {
         inPhrase = false;
         return new Token("phrase2", savedStart, savedEnd);
     }

     Token token = input.Next();
     while (token != null)
     {
         string text = token.TermText();
         if (text.Equals("phrase"))
         {
             // Remember the offsets so "phrase2" can reuse them next call.
             inPhrase   = true;
             savedStart = token.StartOffset();
             savedEnd   = token.EndOffset();
             return new Token("phrase1", savedStart, savedEnd);
         }
         if (!text.Equals("stop"))
         {
             return token;
         }
         token = input.Next();
     }
     return null;
 }
Пример #5
0
        /// <summary>
        /// Returns the next, stemmed, input Token.
        /// </summary>
        /// <returns>
        /// The stemmed form of a token, or null when the input is exhausted.
        /// </returns>
        /// <throws>IOException</throws>
        public override Token Next()
        {
            Token token = input.Next();

            if (token == null)
            {
                return(null);
            }
            else
            {
                string str = stemmer.stem(token.TermText());
                // Only allocate a new Token when stemming actually changed the
                // term text; unchanged tokens are passed through untouched.
                // (An earlier revision used object-reference comparison here,
                // relying on the stemmer returning the same string instance
                // when nothing changed; this version uses value equality.)
                if (!str.Equals(token.TermText()))
                {
                    return(new Token(str, token.StartOffset(), token.EndOffset(), token.Type()));
                }
                return(token);
            }
        }
Пример #6
0
        /// <summary>
        /// Records one more occurrence of term <paramref name="t"/> for the
        /// current field's term vector: bumps the posting's frequency and,
        /// when enabled, appends delta-encoded offset and position data.
        /// </summary>
        internal override void addTerm(Token t, RawPostingList p0)
        {
            System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.addTerm start"));

            TermVectorsTermsWriter.PostingList posting = (TermVectorsTermsWriter.PostingList)p0;
            posting.freq++;

            if (doVectorOffsets)
            {
                int offsetBase = fieldState.offset;
                int tokenStart = offsetBase + t.StartOffset();
                int tokenEnd   = offsetBase + t.EndOffset();

                // Start offset is delta-encoded against the previous end
                // offset; length is encoded as (end - start).
                termsHashPerField.writeVInt(1, tokenStart - posting.lastOffset);
                termsHashPerField.writeVInt(1, tokenEnd - tokenStart);
                posting.lastOffset = tokenEnd;
            }

            if (doVectorPositions)
            {
                // Positions are delta-encoded against the previous position.
                termsHashPerField.writeVInt(0, fieldState.position - posting.lastPosition);
                posting.lastPosition = fieldState.position;
            }
        }
        /// <summary>
        /// Records one more occurrence of term <paramref name="t"/> for the
        /// current field's term vector: increments the posting's frequency
        /// and, when enabled, appends delta-encoded offset and position data.
        /// </summary>
        internal override void addTerm(Token t, RawPostingList p0)
        {
            System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.addTerm start"));

            TermVectorsTermsWriter.PostingList p = (TermVectorsTermsWriter.PostingList)p0;
            p.freq++;

            if (doVectorOffsets)
            {
                // Token offsets are shifted by the field's running offset and
                // delta-encoded against the previous end offset.
                int startOffset = fieldState.offset + t.StartOffset();
                int endOffset = fieldState.offset + t.EndOffset();
                termsHashPerField.writeVInt(1, startOffset - p.lastOffset);
                termsHashPerField.writeVInt(1, endOffset - startOffset);
                p.lastOffset = endOffset;
            }

            if (doVectorPositions)
            {
                // Positions are delta-encoded against the previous position.
                termsHashPerField.writeVInt(0, fieldState.position - p.lastPosition);
                p.lastPosition = fieldState.position;
            }
        }
Пример #8
0
        /// <summary>Returns the next input Token, after being stemmed </summary>
        /// <returns>
        /// A new Token carrying the stemmed term text (offsets, type and
        /// position increment copied from the input token), or null when the
        /// input stream is exhausted.
        /// </returns>
        public override Token Next()
        {
            Token token = input.Next();

            if (token == null)
            {
                return(null);
            }
            stemmer.SetCurrent(token.TermText());
            try
            {
                // The stem method was resolved reflectively; it is invoked
                // with no arguments (EMPTY_ARGS).
                stemMethod.Invoke(stemmer, (System.Object[])EMPTY_ARGS);
            }
            catch (System.Exception e)
            {
                // BUG FIX: pass the original exception as InnerException so
                // its type and stack trace are preserved instead of being
                // flattened into the message string.
                throw new System.SystemException(e.ToString(), e);
            }

            Token newToken = new Token(stemmer.GetCurrent(), token.StartOffset(), token.EndOffset(), token.Type());

            newToken.SetPositionIncrement(token.GetPositionIncrement());
            return(newToken);
        }
Пример #9
0
		/// <summary>
		/// Returns true when <paramref name="token"/> does not overlap this
		/// group, i.e. it starts at or after the group's current end offset.
		/// </summary>
		internal virtual bool IsDistinct(Token token)
		{
			return token.StartOffset() >= endOffset;
		}
Пример #10
0
 /// <summary>
 /// True when the token lies entirely past this group: its start offset
 /// is at or beyond the group's current end offset.
 /// </summary>
 internal virtual bool IsDistinct(Token token)
 {
     int tokenStart = token.StartOffset();
     return tokenStart >= endOffset;
 }
Пример #11
0
				/// <summary>This is the hotspot of indexing: it's called once
				/// for every term of every document.  Its job is to *
				/// update the postings byte stream (Postings hash) *
				/// based on the occurrence of a single term. 
				/// </summary>
				private void  AddPosition(Token token)
				{
					
					Payload payload = token.GetPayload();
					
					// Get the text of this term.  Term can either
					// provide a String token or offset into a char[]
					// array
					char[] tokenText = token.TermBuffer();
					int tokenTextLen = token.TermLength();
					
					int code = 0;
					
					// Compute hashcode
					int downto = tokenTextLen;
					while (downto > 0)
						code = (code * 31) + tokenText[--downto];
					
					// System.out.println("  addPosition: buffer=" + new String(tokenText, 0, tokenTextLen) + " pos=" + position + " offsetStart=" + (offset+token.startOffset()) + " offsetEnd=" + (offset + token.endOffset()) + " docID=" + docID + " doPos=" + doVectorPositions + " doOffset=" + doVectorOffsets);
					
					int hashPos = code & postingsHashMask;
					
					System.Diagnostics.Debug.Assert(!postingsCompacted);
					
					// Locate Posting in hash.
					// NOTE(review): Enclosing_Instance looks like the Java-to-C#
					// converted outer-class reference; scratch fields (p, freq,
					// prox, ...) live on it rather than as locals.
					Enclosing_Instance.p = postingsHash[hashPos];
					
					if (Enclosing_Instance.p != null && !Enclosing_Instance.PostingEquals(tokenText, tokenTextLen))
					{
						// Conflict: keep searching different locations in
						// the hash table (open addressing with a probe step
						// derived from the hash code, forced odd).
						int inc = ((code >> 8) + code) | 1;
						do 
						{
							code += inc;
							hashPos = code & postingsHashMask;
							Enclosing_Instance.p = postingsHash[hashPos];
						}
						while (Enclosing_Instance.p != null && !Enclosing_Instance.PostingEquals(tokenText, tokenTextLen));
					}
					
					int proxCode;
					
					// If we hit an exception below, it's possible the
					// posting list or term vectors data will be
					// partially written and thus inconsistent if
					// flushed, so we have to abort all documents
					// since the last flush:
					
					try
					{
						
						if (Enclosing_Instance.p != null)
						{
							// term seen since last flush
							
							if (Enclosing_Instance.docID != Enclosing_Instance.p.lastDocID)
							{
								// term not yet seen in this doc
								
								// System.out.println("    seen before (new docID=" + docID + ") freqUpto=" + p.freqUpto +" proxUpto=" + p.proxUpto);

								System.Diagnostics.Debug.Assert(Enclosing_Instance.p.docFreq > 0);
								
								// Now that we know doc freq for previous doc,
								// write it & lastDocCode
								Enclosing_Instance.freqUpto = Enclosing_Instance.p.freqUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_MASK;
								Enclosing_Instance.freq = Enclosing_Instance.postingsPool.buffers[Enclosing_Instance.p.freqUpto >> Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SHIFT];
								// Low bit of lastDocCode flags the common
								// docFreq == 1 case so the freq is omitted.
								if (1 == Enclosing_Instance.p.docFreq)
									Enclosing_Instance.WriteFreqVInt(Enclosing_Instance.p.lastDocCode | 1);
								else
								{
									Enclosing_Instance.WriteFreqVInt(Enclosing_Instance.p.lastDocCode);
									Enclosing_Instance.WriteFreqVInt(Enclosing_Instance.p.docFreq);
								}
								Enclosing_Instance.p.freqUpto = Enclosing_Instance.freqUpto + (Enclosing_Instance.p.freqUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_NOT_MASK);
								
								if (doVectors)
								{
									Enclosing_Instance.vector = AddNewVector();
									if (doVectorOffsets)
									{
										offsetStartCode = offsetStart = offset + token.StartOffset();
										offsetEnd = offset + token.EndOffset();
									}
								}
								
								proxCode = position;
								
								Enclosing_Instance.p.docFreq = 1;
								
								// Store code so we can write this after we're
								// done with this new doc
								Enclosing_Instance.p.lastDocCode = (Enclosing_Instance.docID - Enclosing_Instance.p.lastDocID) << 1;
								Enclosing_Instance.p.lastDocID = Enclosing_Instance.docID;
							}
							else
							{
								// term already seen in this doc
								// System.out.println("    seen before (same docID=" + docID + ") proxUpto=" + p.proxUpto);
								Enclosing_Instance.p.docFreq++;
								
								proxCode = position - Enclosing_Instance.p.lastPosition;
								
								if (doVectors)
								{
									Enclosing_Instance.vector = Enclosing_Instance.p.vector;
									if (Enclosing_Instance.vector == null)
										Enclosing_Instance.vector = AddNewVector();
									if (doVectorOffsets)
									{
										// Offset start is delta-encoded against
										// the vector's last written end offset.
										offsetStart = offset + token.StartOffset();
										offsetEnd = offset + token.EndOffset();
										offsetStartCode = offsetStart - Enclosing_Instance.vector.lastOffset;
									}
								}
							}
						}
						else
						{
							// term not seen before
							// System.out.println("    never seen docID=" + docID);
							
							// Refill?
							if (0 == Enclosing_Instance.postingsFreeCount)
							{
								Enclosing_Instance.Enclosing_Instance.GetPostings(Enclosing_Instance.postingsFreeList);
								Enclosing_Instance.postingsFreeCount = Enclosing_Instance.postingsFreeList.Length;
							}
							
							// +1 leaves room for the 0xffff terminator char.
							int textLen1 = 1 + tokenTextLen;
							if (textLen1 + Enclosing_Instance.charPool.byteUpto > Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SIZE)
							{
								if (textLen1 > Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SIZE)
								{
									// Just skip this term, to remain as robust as
									// possible during indexing.  A TokenFilter
									// can be inserted into the analyzer chain if
									// other behavior is wanted (pruning the term
									// to a prefix, throwing an exception, etc).
									if (Enclosing_Instance.maxTermPrefix == null)
										Enclosing_Instance.maxTermPrefix = new System.String(tokenText, 0, 30);
									
									// Still increment position:
									position++;
									return ;
								}
								Enclosing_Instance.charPool.NextBuffer();
							}
							char[] text = Enclosing_Instance.charPool.buffer;
							int textUpto = Enclosing_Instance.charPool.byteUpto;
							
							// Pull next free Posting from free list
							Enclosing_Instance.p = Enclosing_Instance.postingsFreeList[--Enclosing_Instance.postingsFreeCount];
							
							Enclosing_Instance.p.textStart = textUpto + Enclosing_Instance.charPool.byteOffset;
							Enclosing_Instance.charPool.byteUpto += textLen1;
							
							Array.Copy(tokenText, 0, text, textUpto, tokenTextLen);
							
							// Terminate the stored term text with a 0xffff
							// sentinel char.
							text[textUpto + tokenTextLen] = (char) (0xffff);
							
							System.Diagnostics.Debug.Assert(postingsHash [hashPos] == null);
							
							postingsHash[hashPos] = Enclosing_Instance.p;
							numPostings++;
							
							// Grow the hash at 50% load to keep probing cheap.
							if (numPostings == postingsHashHalfSize)
								RehashPostings(2 * postingsHashSize);
							
							// Init first slice for freq & prox streams
							int firstSize = Lucene.Net.Index.DocumentsWriter.levelSizeArray[0];
							
							int upto1 = Enclosing_Instance.postingsPool.NewSlice(firstSize);
							Enclosing_Instance.p.freqStart = Enclosing_Instance.p.freqUpto = Enclosing_Instance.postingsPool.byteOffset + upto1;
							
							int upto2 = Enclosing_Instance.postingsPool.NewSlice(firstSize);
							Enclosing_Instance.p.proxStart = Enclosing_Instance.p.proxUpto = Enclosing_Instance.postingsPool.byteOffset + upto2;
							
							Enclosing_Instance.p.lastDocCode = Enclosing_Instance.docID << 1;
							Enclosing_Instance.p.lastDocID = Enclosing_Instance.docID;
							Enclosing_Instance.p.docFreq = 1;
							
							if (doVectors)
							{
								Enclosing_Instance.vector = AddNewVector();
								if (doVectorOffsets)
								{
									offsetStart = offsetStartCode = offset + token.StartOffset();
									offsetEnd = offset + token.EndOffset();
								}
							}
							
							proxCode = position;
						}
						
						Enclosing_Instance.proxUpto = Enclosing_Instance.p.proxUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_MASK;
						Enclosing_Instance.prox = Enclosing_Instance.postingsPool.buffers[Enclosing_Instance.p.proxUpto >> Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SHIFT];
						System.Diagnostics.Debug.Assert(Enclosing_Instance.prox != null);
						
						// Low bit of the prox code flags whether a payload
						// follows.
						if (payload != null && payload.length > 0)
						{
							Enclosing_Instance.WriteProxVInt((proxCode << 1) | 1);
							Enclosing_Instance.WriteProxVInt(payload.length);
							Enclosing_Instance.WriteProxBytes(payload.data, payload.offset, payload.length);
							fieldInfo.storePayloads = true;
						}
						else
							Enclosing_Instance.WriteProxVInt(proxCode << 1);
						
						Enclosing_Instance.p.proxUpto = Enclosing_Instance.proxUpto + (Enclosing_Instance.p.proxUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_NOT_MASK);
						
						Enclosing_Instance.p.lastPosition = position++;
						
						if (doVectorPositions)
						{
							Enclosing_Instance.posUpto = Enclosing_Instance.vector.posUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_MASK;
							Enclosing_Instance.pos = Enclosing_Instance.vectorsPool.buffers[Enclosing_Instance.vector.posUpto >> Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SHIFT];
							Enclosing_Instance.WritePosVInt(proxCode);
							Enclosing_Instance.vector.posUpto = Enclosing_Instance.posUpto + (Enclosing_Instance.vector.posUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_NOT_MASK);
						}
						
						if (doVectorOffsets)
						{
							Enclosing_Instance.offsetUpto = Enclosing_Instance.vector.offsetUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_MASK;
							Enclosing_Instance.offsets = Enclosing_Instance.vectorsPool.buffers[Enclosing_Instance.vector.offsetUpto >> Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SHIFT];
							Enclosing_Instance.WriteOffsetVInt(offsetStartCode);
							Enclosing_Instance.WriteOffsetVInt(offsetEnd - offsetStart);
							Enclosing_Instance.vector.lastOffset = offsetEnd;
							Enclosing_Instance.vector.offsetUpto = Enclosing_Instance.offsetUpto + (Enclosing_Instance.vector.offsetUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_NOT_MASK);
						}
					}
					catch (System.Exception t)
					{
						// Partially-written postings cannot be flushed safely;
						// abort all documents since the last flush.
						throw new AbortException(t, Enclosing_Instance.Enclosing_Instance);
					}
				}
Пример #12
0
        /// <summary>
        /// Tokenizes <paramref name="reader"/> with a SimpleAnalyzer, optionally
        /// printing each token, and reports tokenization throughput.
        /// </summary>
        /// <param name="reader">Source text to tokenize.</param>
        /// <param name="verbose">When true, prints every token's text and offsets.</param>
        /// <param name="bytes">Input size in bytes, used for the MB/hour figure.</param>
        internal static void  Test(System.IO.TextReader reader, bool verbose, long bytes)
        {
            Analyzer    analyzer = new SimpleAnalyzer();
            TokenStream stream   = analyzer.TokenStream(null, reader);

            System.DateTime start = System.DateTime.Now;

            int count = 0;

            for (Token t = stream.Next(); t != null; t = stream.Next())
            {
                if (verbose)
                {
                    System.Console.Out.WriteLine("Text=" + t.TermText() + " start=" + t.StartOffset() + " end=" + t.EndOffset());
                }
                count++;
            }

            System.DateTime end = System.DateTime.Now;

            // BUG FIX: DateTime.Ticks are 100-nanosecond units, not
            // milliseconds; the raw tick difference made every figure below
            // wrong by a factor of 10,000.  Convert to milliseconds first.
            long time = (end.Ticks - start.Ticks) / System.TimeSpan.TicksPerMillisecond;

            System.Console.Out.WriteLine(time + " milliseconds to extract " + count + " tokens");
            System.Console.Out.WriteLine((time * 1000.0) / count + " microseconds/token");
            System.Console.Out.WriteLine((bytes * 1000.0 * 60.0 * 60.0) / (time * 1000000.0) + " megabytes/hour");
        }
Пример #13
0
        /// <summary>
        /// Tokenizes the fields of a document into Postings.  For each indexed
        /// field, adds one position per term (the whole string value for
        /// un-tokenized fields), updating the per-field length, position,
        /// boost and offset bookkeeping arrays.
        /// </summary>
        private void  InvertDocument(Document doc)
        {
            foreach (Field field in doc.Fields())
            {
                System.String fieldName   = field.Name();
                int           fieldNumber = fieldInfos.FieldNumber(fieldName);

                int length   = fieldLengths[fieldNumber];               // length of field
                int position = fieldPositions[fieldNumber];             // position in field
                if (length > 0)
                {
                    // Separate multiple values of the same field by the
                    // analyzer's position increment gap.
                    position += analyzer.GetPositionIncrementGap(fieldName);
                }
                int offset = fieldOffsets[fieldNumber];                 // offset field

                if (field.IsIndexed())
                {
                    if (!field.IsTokenized())
                    {
                        // un-tokenized field: the whole string value is a
                        // single term.
                        System.String stringValue = field.StringValue();
                        if (field.IsStoreOffsetWithTermVector())
                        {
                            AddPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.Length));
                        }
                        else
                        {
                            AddPosition(fieldName, stringValue, position++, null);
                        }
                        offset += stringValue.Length;
                        length++;
                    }
                    else
                    {
                        System.IO.TextReader reader;                         // find or make Reader
                        if (field.ReaderValue() != null)
                        {
                            reader = field.ReaderValue();
                        }
                        else if (field.StringValue() != null)
                        {
                            reader = new System.IO.StringReader(field.StringValue());
                        }
                        else
                        {
                            throw new System.ArgumentException("field must have either String or Reader value");
                        }

                        // Tokenize field and add to postingTable
                        TokenStream stream = analyzer.TokenStream(fieldName, reader);
                        try
                        {
                            Token lastToken = null;
                            for (Token t = stream.Next(); t != null; t = stream.Next())
                            {
                                // Honor position increments > 1 (e.g. gaps left
                                // by removed tokens); position++ below supplies
                                // the final +1.
                                position += (t.GetPositionIncrement() - 1);

                                if (field.IsStoreOffsetWithTermVector())
                                {
                                    AddPosition(fieldName, t.TermText(), position++, new TermVectorOffsetInfo(offset + t.StartOffset(), offset + t.EndOffset()));
                                }
                                else
                                {
                                    AddPosition(fieldName, t.TermText(), position++, null);
                                }

                                lastToken = t;
                                if (++length > maxFieldLength)
                                {
                                    if (infoStream != null)
                                    {
                                        infoStream.WriteLine("maxFieldLength " + maxFieldLength + " reached, ignoring following tokens");
                                    }
                                    break;
                                }
                            }

                            if (lastToken != null)
                            {
                                // Advance the field's running offset past the
                                // last token, plus a one-character gap.
                                offset += lastToken.EndOffset() + 1;
                            }
                        }
                        finally
                        {
                            stream.Close();
                        }
                    }

                    fieldLengths[fieldNumber]   = length;                   // save field length
                    fieldPositions[fieldNumber] = position;                 // save field position
                    fieldBoosts[fieldNumber]   *= field.GetBoost();
                    fieldOffsets[fieldNumber]   = offset;
                }
            }
        }