Example #1
        /// <summary>Returns the next token in the stream, or null at EOS.
        /// <para>Removes <c>'s</c> from the end of words.</para>
        /// <para>Removes dots from acronyms.</para>
        /// </summary>
        public override Token Next(/* in */ Token reusableToken)
        {
            System.Diagnostics.Debug.Assert(reusableToken != null);
            Token nextToken = input.Next(reusableToken);

            if (nextToken == null)
            {
                return null;
            }

            char[] buffer       = nextToken.TermBuffer();
            int    bufferLength = nextToken.TermLength();

            System.String type = nextToken.Type();

            if (type == APOSTROPHE_TYPE &&
                bufferLength >= 2 &&
                buffer[bufferLength - 2] == '\'' &&
                (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S'))
            {
                // Strip last 2 characters off
                nextToken.SetTermLength(bufferLength - 2);
            }
            else if (type == ACRONYM_TYPE)
            {
                // remove dots
                int upto = 0;
                for (int i = 0; i < bufferLength; i++)
                {
                    char c = buffer[i];
                    if (c != '.')
                    {
                        buffer[upto++] = c;
                    }
                }
                nextToken.SetTermLength(upto);
            }

            return nextToken;
        }
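
For reference, here is a minimal stand-alone sketch (not Lucene.Net API) of the same two in-place buffer edits, with the APOSTROPHE_TYPE/ACRONYM_TYPE dispatch left out so only the transformations themselves are shown; the helper names StripPossessive and StripDots are invented for illustration:

        // Illustrative helpers only (hypothetical names, not Lucene.Net API).
        // Both edit the buffer in place and return the new logical length,
        // mirroring SetTermLength() in the filter above.
        static int StripPossessive(char[] buffer, int length)
        {
            // "John's" -> "John": drop a trailing 's or 'S.
            if (length >= 2 && buffer[length - 2] == '\'' &&
                (buffer[length - 1] == 's' || buffer[length - 1] == 'S'))
            {
                return length - 2;
            }
            return length;
        }

        static int StripDots(char[] buffer, int length)
        {
            // "I.B.M." -> "IBM": compact the buffer, skipping dots.
            int upto = 0;
            for (int i = 0; i < length; i++)
            {
                if (buffer[i] != '.')
                {
                    buffer[upto++] = buffer[i];
                }
            }
            return upto;
        }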
Example #2
        /// <summary>Returns the next token in the stream, or null at EOS.
        /// <para>Removes <c>'s</c> from the end of words.</para>
        /// <para>Removes dots from acronyms.</para>
        /// </summary>
        public override Token Next(Token result)
        {
            Token t = input.Next(result);

            if (t == null)
            {
                return null;
            }

            char[] buffer       = t.TermBuffer();
            int    bufferLength = t.TermLength();

            System.String type = t.Type();

            if (type == APOSTROPHE_TYPE &&
                bufferLength >= 2 &&
                buffer[bufferLength - 2] == '\'' &&
                (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S'))
            {
                // Strip last 2 characters off
                t.SetTermLength(bufferLength - 2);
            }
            else if (type == ACRONYM_TYPE)
            {
                // remove dots
                int upto = 0;
                for (int i = 0; i < bufferLength; i++)
                {
                    char c = buffer[i];
                    if (c != '.')
                    {
                        buffer[upto++] = c;
                    }
                }
                t.SetTermLength(upto);
            }

            return t;
        }
Example #3
        // Primary entry point (for first TermsHash)
        internal override void add(Token token)
        {
            System.Diagnostics.Debug.Assert(!postingsCompacted);

            // We are first in the chain, so we must "intern" the
            // term text into a textStart address.

            // Get the text of this term.
            char[] tokenText    = token.TermBuffer();
            int    tokenTextLen = token.TermLength();

            // Compute the hash code & replace any invalid UTF-16 sequences
            int downto = tokenTextLen;
            int code   = 0;

            while (downto > 0)
            {
                char ch = tokenText[--downto];

                if (ch >= UnicodeUtil.UNI_SUR_LOW_START && ch <= UnicodeUtil.UNI_SUR_LOW_END)
                {
                    if (0 == downto)
                    {
                        // Unpaired
                        ch = tokenText[downto] = (char)UnicodeUtil.UNI_REPLACEMENT_CHAR;
                    }
                    else
                    {
                        char ch2 = tokenText[downto - 1];
                        if (ch2 >= UnicodeUtil.UNI_SUR_HIGH_START && ch2 <= UnicodeUtil.UNI_SUR_HIGH_END)
                        {
                            // OK: high followed by low.  This is a valid
                            // surrogate pair.
                            code = ((code * 31) + ch) * 31 + ch2;
                            downto--;
                            continue;
                        }
                        else
                        {
                            // Unpaired
                            ch = tokenText[downto] = (char)UnicodeUtil.UNI_REPLACEMENT_CHAR;
                        }
                    }
                }
                else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && ch <= UnicodeUtil.UNI_SUR_HIGH_END)
                {
                    // Unpaired
                    ch = tokenText[downto] = (char)UnicodeUtil.UNI_REPLACEMENT_CHAR;
                }

                code = (code * 31) + ch;
            }

            int hashPos = code & postingsHashMask;

            // Locate RawPostingList in hash
            p = postingsHash[hashPos];

            if (p != null && !postingEquals(tokenText, tokenTextLen))
            {
                // Conflict: keep searching different locations in
                // the hash table.
                int inc = ((code >> 8) + code) | 1;
                do
                {
                    code   += inc;
                    hashPos = code & postingsHashMask;
                    p       = postingsHash[hashPos];
                } while (p != null && !postingEquals(tokenText, tokenTextLen));
            }

            if (p == null)
            {
                // First time we are seeing this token since we last
                // flushed the hash.
                int textLen1 = 1 + tokenTextLen;
                if (textLen1 + charPool.charUpto > DocumentsWriter.CHAR_BLOCK_SIZE)
                {
                    if (textLen1 > DocumentsWriter.CHAR_BLOCK_SIZE)
                    {
                        // Just skip this term, to remain as robust as
                        // possible during indexing.  A TokenFilter
                        // can be inserted into the analyzer chain if
                        // other behavior is wanted (pruning the term
                        // to a prefix, throwing an exception, etc).

                        if (docState.maxTermPrefix == null)
                        {
                            docState.maxTermPrefix = new System.String(tokenText, 0, 30);
                        }

                        consumer.skippingLongTerm(token);
                        return;
                    }
                    charPool.nextBuffer();
                }

                // Refill?
                if (0 == perThread.freePostingsCount)
                {
                    perThread.morePostings();
                }

                // Pull next free RawPostingList from free list
                p = perThread.freePostings[--perThread.freePostingsCount];
                System.Diagnostics.Debug.Assert(p != null);

                char[] text     = charPool.buffer;
                int    textUpto = charPool.charUpto;
                p.textStart        = textUpto + charPool.charOffset;
                charPool.charUpto += textLen1;
                System.Array.Copy(tokenText, 0, text, textUpto, tokenTextLen);
                text[textUpto + tokenTextLen] = (char)0xffff;

                System.Diagnostics.Debug.Assert(postingsHash[hashPos] == null);
                postingsHash[hashPos] = p;
                numPostings++;

                if (numPostings == postingsHashHalfSize)
                {
                    rehashPostings(2 * postingsHashSize);
                }

                // Init stream slices
                if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
                {
                    intPool.nextBuffer();
                }

                if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt * ByteBlockPool.FIRST_LEVEL_SIZE)
                {
                    bytePool.NextBuffer();
                }

                intUptos         = intPool.buffer;
                intUptoStart     = intPool.intUpto;
                intPool.intUpto += streamCount;

                p.intStart = intUptoStart + intPool.intOffset;

                for (int i = 0; i < streamCount; i++)
                {
                    int upto = bytePool.NewSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
                    intUptos[intUptoStart + i] = upto + bytePool.byteOffset;
                }
                p.byteStart = intUptos[intUptoStart];

                consumer.newTerm(token, p);
            }
            else
            {
                intUptos     = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
                intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
                consumer.addTerm(token, p);
            }

            if (doNextCall)
            {
                nextPerField.add(token, p.textStart);
            }
        }
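
The lookup above is open addressing over a power-of-two table: the initial slot is code & postingsHashMask, and on a collision the probe step ((code >> 8) + code) | 1 is forced odd, so it is co-prime with the table size and eventually visits every slot. A stand-alone sketch of just that probe sequence (hypothetical ProbeSequence helper, not Lucene.Net API):

        // Illustrative only: enumerates the slots the add() method above
        // would scan for a given hash code.  tableSize must be a power of two.
        static System.Collections.Generic.IEnumerable<int> ProbeSequence(int code, int tableSize)
        {
            int mask = tableSize - 1;
            yield return code & mask;            // initial slot
            int inc = ((code >> 8) + code) | 1;  // odd => co-prime with tableSize
            while (true)
            {
                code += inc;
                yield return code & mask;        // next slot on collision
            }
        }

Because inc is odd and tableSize is a power of two, the first tableSize values of this sequence are a permutation of all slots, so a probe never cycles without covering the whole table.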
Example #4
				/// <summary>This is the hotspot of indexing: it's called once
				/// for every term of every document.  Its job is to
				/// update the postings byte stream (postings hash)
				/// based on the occurrence of a single term.
				/// </summary>
				private void  AddPosition(Token token)
				{
					
					Payload payload = token.GetPayload();
					
					// Get the text of this term.  Term can either
					// provide a String token or offset into a char[]
					// array
					char[] tokenText = token.TermBuffer();
					int tokenTextLen = token.TermLength();
					
					int code = 0;
					
					// Compute hashcode
					int downto = tokenTextLen;
					while (downto > 0)
						code = (code * 31) + tokenText[--downto];
					
					// System.out.println("  addPosition: buffer=" + new String(tokenText, 0, tokenTextLen) + " pos=" + position + " offsetStart=" + (offset+token.startOffset()) + " offsetEnd=" + (offset + token.endOffset()) + " docID=" + docID + " doPos=" + doVectorPositions + " doOffset=" + doVectorOffsets);
					
					int hashPos = code & postingsHashMask;
					
					System.Diagnostics.Debug.Assert(!postingsCompacted);
					
					// Locate Posting in hash
					Enclosing_Instance.p = postingsHash[hashPos];
					
					if (Enclosing_Instance.p != null && !Enclosing_Instance.PostingEquals(tokenText, tokenTextLen))
					{
						// Conflict: keep searching different locations in
						// the hash table.
						int inc = ((code >> 8) + code) | 1;
						do 
						{
							code += inc;
							hashPos = code & postingsHashMask;
							Enclosing_Instance.p = postingsHash[hashPos];
						}
						while (Enclosing_Instance.p != null && !Enclosing_Instance.PostingEquals(tokenText, tokenTextLen));
					}
					
					int proxCode;
					
					// If we hit an exception below, it's possible the
					// posting list or term vectors data will be
					// partially written and thus inconsistent if
					// flushed, so we have to abort all documents
					// since the last flush:
					
					try
					{
						
						if (Enclosing_Instance.p != null)
						{
							// term seen since last flush
							
							if (Enclosing_Instance.docID != Enclosing_Instance.p.lastDocID)
							{
								// term not yet seen in this doc
								
								// System.out.println("    seen before (new docID=" + docID + ") freqUpto=" + p.freqUpto +" proxUpto=" + p.proxUpto);

								System.Diagnostics.Debug.Assert(Enclosing_Instance.p.docFreq > 0);
								
								// Now that we know doc freq for previous doc,
								// write it & lastDocCode
								Enclosing_Instance.freqUpto = Enclosing_Instance.p.freqUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_MASK;
								Enclosing_Instance.freq = Enclosing_Instance.postingsPool.buffers[Enclosing_Instance.p.freqUpto >> Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SHIFT];
								if (1 == Enclosing_Instance.p.docFreq)
									Enclosing_Instance.WriteFreqVInt(Enclosing_Instance.p.lastDocCode | 1);
								else
								{
									Enclosing_Instance.WriteFreqVInt(Enclosing_Instance.p.lastDocCode);
									Enclosing_Instance.WriteFreqVInt(Enclosing_Instance.p.docFreq);
								}
								Enclosing_Instance.p.freqUpto = Enclosing_Instance.freqUpto + (Enclosing_Instance.p.freqUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_NOT_MASK);
								
								if (doVectors)
								{
									Enclosing_Instance.vector = AddNewVector();
									if (doVectorOffsets)
									{
										offsetStartCode = offsetStart = offset + token.StartOffset();
										offsetEnd = offset + token.EndOffset();
									}
								}
								
								proxCode = position;
								
								Enclosing_Instance.p.docFreq = 1;
								
								// Store code so we can write this after we're
								// done with this new doc
								Enclosing_Instance.p.lastDocCode = (Enclosing_Instance.docID - Enclosing_Instance.p.lastDocID) << 1;
								Enclosing_Instance.p.lastDocID = Enclosing_Instance.docID;
							}
							else
							{
								// term already seen in this doc
								// System.out.println("    seen before (same docID=" + docID + ") proxUpto=" + p.proxUpto);
								Enclosing_Instance.p.docFreq++;
								
								proxCode = position - Enclosing_Instance.p.lastPosition;
								
								if (doVectors)
								{
									Enclosing_Instance.vector = Enclosing_Instance.p.vector;
									if (Enclosing_Instance.vector == null)
										Enclosing_Instance.vector = AddNewVector();
									if (doVectorOffsets)
									{
										offsetStart = offset + token.StartOffset();
										offsetEnd = offset + token.EndOffset();
										offsetStartCode = offsetStart - Enclosing_Instance.vector.lastOffset;
									}
								}
							}
						}
						else
						{
							// term not seen before
							// System.out.println("    never seen docID=" + docID);
							
							// Refill?
							if (0 == Enclosing_Instance.postingsFreeCount)
							{
								Enclosing_Instance.Enclosing_Instance.GetPostings(Enclosing_Instance.postingsFreeList);
								Enclosing_Instance.postingsFreeCount = Enclosing_Instance.postingsFreeList.Length;
							}
							
							int textLen1 = 1 + tokenTextLen;
							if (textLen1 + Enclosing_Instance.charPool.byteUpto > Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SIZE)
							{
								if (textLen1 > Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SIZE)
								{
									// Just skip this term, to remain as robust as
									// possible during indexing.  A TokenFilter
									// can be inserted into the analyzer chain if
									// other behavior is wanted (pruning the term
									// to a prefix, throwing an exception, etc).
									if (Enclosing_Instance.maxTermPrefix == null)
										Enclosing_Instance.maxTermPrefix = new System.String(tokenText, 0, 30);
									
									// Still increment position:
									position++;
									return;
								}
								Enclosing_Instance.charPool.NextBuffer();
							}
							char[] text = Enclosing_Instance.charPool.buffer;
							int textUpto = Enclosing_Instance.charPool.byteUpto;
							
							// Pull next free Posting from free list
							Enclosing_Instance.p = Enclosing_Instance.postingsFreeList[--Enclosing_Instance.postingsFreeCount];
							
							Enclosing_Instance.p.textStart = textUpto + Enclosing_Instance.charPool.byteOffset;
							Enclosing_Instance.charPool.byteUpto += textLen1;
							
							Array.Copy(tokenText, 0, text, textUpto, tokenTextLen);
							
							text[textUpto + tokenTextLen] = (char) (0xffff);
							
							System.Diagnostics.Debug.Assert(postingsHash [hashPos] == null);
							
							postingsHash[hashPos] = Enclosing_Instance.p;
							numPostings++;
							
							if (numPostings == postingsHashHalfSize)
								RehashPostings(2 * postingsHashSize);
							
							// Init first slice for freq & prox streams
							int firstSize = Lucene.Net.Index.DocumentsWriter.levelSizeArray[0];
							
							int upto1 = Enclosing_Instance.postingsPool.NewSlice(firstSize);
							Enclosing_Instance.p.freqStart = Enclosing_Instance.p.freqUpto = Enclosing_Instance.postingsPool.byteOffset + upto1;
							
							int upto2 = Enclosing_Instance.postingsPool.NewSlice(firstSize);
							Enclosing_Instance.p.proxStart = Enclosing_Instance.p.proxUpto = Enclosing_Instance.postingsPool.byteOffset + upto2;
							
							Enclosing_Instance.p.lastDocCode = Enclosing_Instance.docID << 1;
							Enclosing_Instance.p.lastDocID = Enclosing_Instance.docID;
							Enclosing_Instance.p.docFreq = 1;
							
							if (doVectors)
							{
								Enclosing_Instance.vector = AddNewVector();
								if (doVectorOffsets)
								{
									offsetStart = offsetStartCode = offset + token.StartOffset();
									offsetEnd = offset + token.EndOffset();
								}
							}
							
							proxCode = position;
						}
						
						Enclosing_Instance.proxUpto = Enclosing_Instance.p.proxUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_MASK;
						Enclosing_Instance.prox = Enclosing_Instance.postingsPool.buffers[Enclosing_Instance.p.proxUpto >> Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SHIFT];
						System.Diagnostics.Debug.Assert(Enclosing_Instance.prox != null);
						
						if (payload != null && payload.length > 0)
						{
							Enclosing_Instance.WriteProxVInt((proxCode << 1) | 1);
							Enclosing_Instance.WriteProxVInt(payload.length);
							Enclosing_Instance.WriteProxBytes(payload.data, payload.offset, payload.length);
							fieldInfo.storePayloads = true;
						}
						else
							Enclosing_Instance.WriteProxVInt(proxCode << 1);
						
						Enclosing_Instance.p.proxUpto = Enclosing_Instance.proxUpto + (Enclosing_Instance.p.proxUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_NOT_MASK);
						
						Enclosing_Instance.p.lastPosition = position++;
						
						if (doVectorPositions)
						{
							Enclosing_Instance.posUpto = Enclosing_Instance.vector.posUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_MASK;
							Enclosing_Instance.pos = Enclosing_Instance.vectorsPool.buffers[Enclosing_Instance.vector.posUpto >> Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SHIFT];
							Enclosing_Instance.WritePosVInt(proxCode);
							Enclosing_Instance.vector.posUpto = Enclosing_Instance.posUpto + (Enclosing_Instance.vector.posUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_NOT_MASK);
						}
						
						if (doVectorOffsets)
						{
							Enclosing_Instance.offsetUpto = Enclosing_Instance.vector.offsetUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_MASK;
							Enclosing_Instance.offsets = Enclosing_Instance.vectorsPool.buffers[Enclosing_Instance.vector.offsetUpto >> Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SHIFT];
							Enclosing_Instance.WriteOffsetVInt(offsetStartCode);
							Enclosing_Instance.WriteOffsetVInt(offsetEnd - offsetStart);
							Enclosing_Instance.vector.lastOffset = offsetEnd;
							Enclosing_Instance.vector.offsetUpto = Enclosing_Instance.offsetUpto + (Enclosing_Instance.vector.offsetUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_NOT_MASK);
						}
					}
					catch (System.Exception t)
					{
						throw new AbortException(t, Enclosing_Instance.Enclosing_Instance);
					}
				}
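
The freq-stream writes above (WriteFreqVInt(lastDocCode | 1) versus WriteFreqVInt(lastDocCode) followed by the frequency) encode each document as a doc-ID delta shifted left by one, with the low bit flagging the common freq == 1 case. A stand-alone sketch under assumed names (WriteDocAndFreq, writing to a plain Stream rather than the postings byte pool):

        // Illustrative only (hypothetical names): the doc/freq encoding the
        // method above emits.  The doc delta is left-shifted by one; a set
        // low bit means "freq == 1, no separate freq value follows".
        static void WriteDocAndFreq(System.IO.Stream output, int docDelta, int freq)
        {
            if (freq == 1)
            {
                WriteVInt(output, (docDelta << 1) | 1);
            }
            else
            {
                WriteVInt(output, docDelta << 1);
                WriteVInt(output, freq);
            }
        }

        static void WriteVInt(System.IO.Stream output, int value)
        {
            // Standard Lucene variable-length int: 7 bits per byte,
            // high bit set on every byte except the last.
            while ((value & ~0x7F) != 0)
            {
                output.WriteByte((byte)((value & 0x7F) | 0x80));
                value = (int)((uint)value >> 7);
            }
            output.WriteByte((byte)value);
        }

Deltas rather than absolute doc IDs keep the values small, so most postings entries fit in a single VInt byte.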