Example 1
        public void  Read(IndexInput input, FieldInfos fieldInfos)
        {
            this.term = null;             // invalidate cache
            int start       = input.ReadVInt();
            int length      = input.ReadVInt();
            int totalLength = start + length;

            if (preUTF8Strings)
            {
                text.SetLength(totalLength);
                input.ReadChars(text.result, start, length);
            }
            else
            {
                if (dirty)
                {
                    // Fully convert all bytes since bytes is dirty
                    UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes);
                    bytes.SetLength(totalLength);
                    input.ReadBytes(bytes.result, start, length);
                    UnicodeUtil.UTF8toUTF16(bytes.result, 0, totalLength, text);
                    dirty = false;
                }
                else
                {
                    // Incrementally convert only the UTF8 bytes that are new:
                    bytes.SetLength(totalLength);
                    input.ReadBytes(bytes.result, start, length);
                    UnicodeUtil.UTF8toUTF16(bytes.result, start, length, text);
                }
            }
            this.field = fieldInfos.FieldName(input.ReadVInt());
        }
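The start/length pair read above is Lucene's prefix coding for the terms dictionary: start is the number of leading units shared with the previously read term, and length is the size of the new suffix, so only the suffix is actually read from the input. A minimal standalone sketch of that decoding idea (not the Lucene API itself):

 // Standalone sketch of prefix-coded term decoding, assuming each term is
 // stored as (sharedPrefixLength, suffixLength, suffixBytes).
 static byte[] DecodeNextTerm(byte[] previousTerm, int sharedPrefix, byte[] suffix)
 {
     byte[] term = new byte[sharedPrefix + suffix.Length];
     System.Array.Copy(previousTerm, 0, term, 0, sharedPrefix);        // reuse the shared prefix
     System.Array.Copy(suffix, 0, term, sharedPrefix, suffix.Length);  // append the new suffix
     return term;
 }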
Example 2
 /// <summary>Writes a string.</summary>
 /// <seealso cref="IndexInput.ReadString()">
 /// </seealso>
 public virtual void  WriteString(string s)
 {
     UnicodeUtil.UTF8Result utf8Result = new UnicodeUtil.UTF8Result();
     UnicodeUtil.UTF16toUTF8(s, 0, s.Length, utf8Result);
     WriteVInt(utf8Result.length);
     WriteBytes(utf8Result.result, 0, utf8Result.length);
 }
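For context, a minimal round-trip sketch using the UTF8Result/UTF16Result buffers these examples rely on (assuming the older Lucene.NET UnicodeUtil signatures shown in Examples 2 and 3); encoding with UTF16toUTF8 and decoding with UTF8toUTF16 should reproduce the original string:

 // Round-trip sketch (assumes the UTF8Result/UTF16Result API shown in these examples).
 string s = "héllo wörld";
 UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result();
 UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
 UnicodeUtil.UTF16toUTF8(s, 0, s.Length, utf8);                // chars -> UTF-8 bytes
 UnicodeUtil.UTF8toUTF16(utf8.result, 0, utf8.length, utf16);  // UTF-8 bytes -> chars
 string roundTripped = new string(utf16.result, 0, utf16.length);
 // roundTripped is expected to equal s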
Example 3
 /// <summary>Decompress the byte array previously returned by
 /// compressString back into a String
 /// </summary>
 public static System.String DecompressString(byte[] value_Renamed)
 {
     UnicodeUtil.UTF16Result result = new UnicodeUtil.UTF16Result();
     byte[] bytes = Decompress(value_Renamed);
     UnicodeUtil.UTF8toUTF16(bytes, 0, bytes.Length, result);
     return(new System.String(result.result, 0, result.length));
 }
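A usage sketch pairing this method with the CompressString helper shown later in Example 11; in the Java original these live on CompressionTools, and the class name and single-argument overload are assumed here:

 // Round-trip sketch; CompressionTools is the assumed owning class of these helpers.
 string original = "some stored field value";
 byte[] packed = CompressionTools.CompressString(original);    // UTF-16 -> UTF-8 -> deflate
 string restored = CompressionTools.DecompressString(packed);  // inflate -> UTF-8 -> UTF-16
 // restored is expected to equal original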
Example 4
        public override void SetUp()
        {
            base.SetUp();
            dir = NewDirectory();
            fieldName = Random.NextBoolean() ? "field" : ""; // sometimes use an empty string as field name
            RandomIndexWriter writer = new RandomIndexWriter(Random, dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random, MockTokenizer.KEYWORD, false)).SetMaxBufferedDocs(TestUtil.NextInt32(Random, 50, 1000)));
            Document doc = new Document();
            Field field = NewStringField(fieldName, "", Field.Store.NO);
            doc.Add(field);
            List<string> terms = new List<string>();
            int num = AtLeast(200);
            for (int i = 0; i < num; i++)
            {
                string s = TestUtil.RandomUnicodeString(Random);
                field.SetStringValue(s);
                terms.Add(s);
                writer.AddDocument(doc);
            }

            if (Verbose)
            {
                // utf16 order
                terms.Sort();
                Console.WriteLine("UTF16 order:");
                foreach (string s in terms)
                {
                    Console.WriteLine("  " + UnicodeUtil.ToHexString(s));
                }
            }

            reader = writer.GetReader();
            searcher1 = NewSearcher(reader);
            searcher2 = NewSearcher(reader);
            writer.Dispose();
        }
Example 5
 internal virtual System.String BytesToString(byte[] bytes)
 {
     lock (this)
     {
         System.String s = System.Text.Encoding.UTF8.GetString(bytes);
         UnicodeUtil.UTF16toUTF8(s, 0, s.Length, utf8Result);
         try
         {
             return(System.Text.Encoding.UTF8.GetString(utf8Result.result, 0, utf8Result.length));
         }
         catch (System.IO.IOException uee)
         {
             return(null);
         }
     }
 }
Example 6
        /// <summary>Called when we are done adding docs to this term </summary>
        internal override void  Finish()
        {
            long skipPointer = skipListWriter.WriteSkip(out_Renamed);

            // TODO: this is abstraction violation -- we should not
            // peek up into parents terms encoding format
            termInfo.Set(df, parent.freqStart, parent.proxStart, (int)(skipPointer - parent.freqStart));

            // TODO: we could do this incrementally
            UnicodeUtil.UTF16toUTF8(parent.currentTerm, parent.currentTermStart, utf8);

            if (df > 0)
            {
                parent.termsOut.Add(fieldInfo.number, utf8.result, utf8.length, termInfo);
            }

            lastDocID = 0;
            df        = 0;
        }
Example 7
        // Currently used only by assert statement
        private int CompareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength)
        {
            if (lastFieldNumber != fieldNumber)
            {
                int cmp = String.CompareOrdinal(fieldInfos.FieldName(lastFieldNumber), fieldInfos.FieldName(fieldNumber));
                // If there is a field named "" (empty string) then we
                // will get 0 on this comparison, yet, it's "OK".  But
                // it's not OK if two different field numbers map to
                // the same name.
                if (cmp != 0 || lastFieldNumber != -1)
                {
                    return(cmp);
                }
            }

            UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1);
            UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2);

            int len;

            if (utf16Result1.length < utf16Result2.length)
            {
                len = utf16Result1.length;
            }
            else
            {
                len = utf16Result2.length;
            }

            for (int i = 0; i < len; i++)
            {
                char ch1 = utf16Result1.result[i];
                char ch2 = utf16Result2.result[i];
                if (ch1 != ch2)
                {
                    return(ch1 - ch2);
                }
            }
            return(utf16Result1.length - utf16Result2.length);
        }
Example 8
        /// <summary> Add a complete document specified by all its term vectors. If the document
        /// has no term vectors, add a value for tvx.
        ///
        /// </summary>
        /// <param name="vectors">
        /// </param>
        /// <throws>  IOException </throws>
        public void  AddAllDocVectors(TermFreqVector[] vectors)
        {
            tvx.WriteLong(tvd.GetFilePointer());
            tvx.WriteLong(tvf.GetFilePointer());

            if (vectors != null)
            {
                int numFields = vectors.Length;
                tvd.WriteVInt(numFields);

                long[] fieldPointers = new long[numFields];

                for (int i = 0; i < numFields; i++)
                {
                    fieldPointers[i] = tvf.GetFilePointer();

                    int fieldNumber = fieldInfos.FieldNumber(vectors[i].GetField());

                    // 1st pass: write field numbers to tvd
                    tvd.WriteVInt(fieldNumber);

                    int numTerms = vectors[i].Size();
                    tvf.WriteVInt(numTerms);

                    TermPositionVector tpVector;

                    byte bits;
                    bool storePositions;
                    bool storeOffsets;

                    if (vectors[i] is TermPositionVector)
                    {
                        // May have positions & offsets
                        tpVector       = (TermPositionVector)vectors[i];
                        storePositions = tpVector.Size() > 0 && tpVector.GetTermPositions(0) != null;
                        storeOffsets   = tpVector.Size() > 0 && tpVector.GetOffsets(0) != null;
                        bits           = (byte)((storePositions ? TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR : (byte)0) + (storeOffsets ? TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR : (byte)0));
                    }
                    else
                    {
                        tpVector       = null;
                        bits           = 0;
                        storePositions = false;
                        storeOffsets   = false;
                    }

                    tvf.WriteVInt(bits);

                    string[] terms = vectors[i].GetTerms();
                    int[]    freqs = vectors[i].GetTermFrequencies();

                    int utf8Upto = 0;
                    utf8Results[1].length = 0;

                    for (int j = 0; j < numTerms; j++)
                    {
                        UnicodeUtil.UTF16toUTF8(terms[j], 0, terms[j].Length, utf8Results[utf8Upto]);

                        int start = StringHelper.bytesDifference(
                            utf8Results[1 - utf8Upto].result, utf8Results[1 - utf8Upto].length, utf8Results[utf8Upto].result, utf8Results[utf8Upto].length);

                        int length = utf8Results[utf8Upto].length - start;
                        tvf.WriteVInt(start);                                        // write shared prefix length
                        tvf.WriteVInt(length);                                       // write delta length
                        tvf.WriteBytes(utf8Results[utf8Upto].result, start, length); // write delta bytes
                        utf8Upto = 1 - utf8Upto;

                        int termFreq = freqs[j];

                        tvf.WriteVInt(termFreq);

                        if (storePositions)
                        {
                            int[] positions = tpVector.GetTermPositions(j);
                            if (positions == null)
                            {
                                throw new System.SystemException("Trying to write positions that are null!");
                            }
                            System.Diagnostics.Debug.Assert(positions.Length == termFreq);

                            // use delta encoding for positions
                            int lastPosition = 0;
                            for (int k = 0; k < positions.Length; k++)
                            {
                                int position = positions[k];
                                tvf.WriteVInt(position - lastPosition);
                                lastPosition = position;
                            }
                        }

                        if (storeOffsets)
                        {
                            TermVectorOffsetInfo[] offsets = tpVector.GetOffsets(j);
                            if (offsets == null)
                            {
                                throw new System.SystemException("Trying to write offsets that are null!");
                            }
                            System.Diagnostics.Debug.Assert(offsets.Length == termFreq);

                            // use delta encoding for offsets
                            int lastEndOffset = 0;
                            for (int k = 0; k < offsets.Length; k++)
                            {
                                int startOffset = offsets[k].GetStartOffset();
                                int endOffset   = offsets[k].GetEndOffset();
                                tvf.WriteVInt(startOffset - lastEndOffset);
                                tvf.WriteVInt(endOffset - startOffset);
                                lastEndOffset = endOffset;
                            }
                        }
                    }
                }

                // 2nd pass: write field pointers to tvd
                long lastFieldPointer = fieldPointers[0];
                for (int i = 1; i < numFields; i++)
                {
                    long fieldPointer = fieldPointers[i];
                    tvd.WriteVLong(fieldPointer - lastFieldPointer);
                    lastFieldPointer = fieldPointer;
                }
            }
            else
            {
                tvd.WriteVInt(0);
            }
        }
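The inner loop above delta-encodes each term against the previous one by writing the shared-prefix length, the suffix length, and only the suffix bytes, alternating between two UTF8Result buffers so the previous term never has to be copied. A short sketch of the prefix computation that StringHelper.bytesDifference is used for here (assumed semantics: length of the common byte prefix):

 // Sketch of the shared-prefix computation (assumed equivalent of StringHelper.bytesDifference).
 static int CommonBytePrefix(byte[] prev, int prevLen, byte[] curr, int currLen)
 {
     int limit = prevLen < currLen ? prevLen : currLen;
     int i = 0;
     while (i < limit && prev[i] == curr[i])
     {
         i++;
     }
     return i; // number of leading bytes the two terms share
 }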
Example 9
 internal void  Add(Term term, TermInfo ti)
 {
     UnicodeUtil.UTF16toUTF8(term.Text, 0, term.Text.Length, utf8Result);
     Add(fieldInfos.FieldNumber(term.Field), utf8Result.result, utf8Result.length, ti);
 }
Example 10
        /// <summary>Called once per field per document if term vectors
        /// are enabled, to write the vectors to
        /// RAMOutputStream, which is then quickly flushed to
        /// the real term vectors files in the Directory.
        /// </summary>
        internal override void  Finish()
        {
            System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.finish start"));

            int numPostings = termsHashPerField.numPostings;

            System.Diagnostics.Debug.Assert(numPostings >= 0);

            if (!doVectors || numPostings == 0)
            {
                return;
            }

            if (numPostings > maxNumPostings)
            {
                maxNumPostings = numPostings;
            }

            IndexOutput tvf = perThread.doc.tvf;

            // This is called once, after inverting all occurrences
            // of a given field in the doc.  At this point we flush
            // our hash into the DocWriter.

            System.Diagnostics.Debug.Assert(fieldInfo.storeTermVector);
            System.Diagnostics.Debug.Assert(perThread.VectorFieldsInOrder(fieldInfo));

            perThread.doc.AddField(termsHashPerField.fieldInfo.number);

            RawPostingList[] postings = termsHashPerField.SortPostings();

            tvf.WriteVInt(numPostings);
            byte bits = (byte)(0x0);

            if (doVectorPositions)
            {
                bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
            }
            if (doVectorOffsets)
            {
                bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
            }
            tvf.WriteByte(bits);

            int encoderUpto        = 0;
            int lastTermBytesCount = 0;

            ByteSliceReader reader = perThread.vectorSliceReader;

            char[][] charBuffers = perThread.termsHashPerThread.charPool.buffers;
            for (int j = 0; j < numPostings; j++)
            {
                TermVectorsTermsWriter.PostingList posting = (TermVectorsTermsWriter.PostingList)postings[j];
                int freq = posting.freq;

                char[] text2  = charBuffers[posting.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
                int    start2 = posting.textStart & DocumentsWriter.CHAR_BLOCK_MASK;

                // We swap between two encoders to save copying
                // last Term's byte array
                UnicodeUtil.UTF8Result utf8Result = perThread.utf8Results[encoderUpto];

                // TODO: we could do this incrementally
                UnicodeUtil.UTF16toUTF8(text2, start2, utf8Result);
                int termBytesCount = utf8Result.length;

                // TODO: UTF16toUTF8 could tell us this prefix
                // Compute common prefix between last term and
                // this term
                int prefix = 0;
                if (j > 0)
                {
                    byte[] lastTermBytes = perThread.utf8Results[1 - encoderUpto].result;
                    byte[] termBytes     = perThread.utf8Results[encoderUpto].result;
                    while (prefix < lastTermBytesCount && prefix < termBytesCount)
                    {
                        if (lastTermBytes[prefix] != termBytes[prefix])
                        {
                            break;
                        }
                        prefix++;
                    }
                }
                encoderUpto        = 1 - encoderUpto;
                lastTermBytesCount = termBytesCount;

                int suffix = termBytesCount - prefix;
                tvf.WriteVInt(prefix);
                tvf.WriteVInt(suffix);
                tvf.WriteBytes(utf8Result.result, prefix, suffix);
                tvf.WriteVInt(freq);

                if (doVectorPositions)
                {
                    termsHashPerField.InitReader(reader, posting, 0);
                    reader.WriteTo(tvf);
                }

                if (doVectorOffsets)
                {
                    termsHashPerField.InitReader(reader, posting, 1);
                    reader.WriteTo(tvf);
                }
            }

            termsHashPerField.Reset();
            perThread.termsHashPerThread.Reset(false);
        }
Example 11
 /// <summary>Compresses the String value using the specified
 /// compressionLevel (constants are defined in
 /// java.util.zip.Deflater).
 /// </summary>
 public static byte[] CompressString(System.String value_Renamed, int compressionLevel)
 {
     UnicodeUtil.UTF8Result result = new UnicodeUtil.UTF8Result();
     UnicodeUtil.UTF16toUTF8(value_Renamed, 0, value_Renamed.Length, result);
     return(Compress(result.result, 0, result.length, compressionLevel));
 }
Example 12
        /* Walk through all unique text tokens (Posting
         * instances) found in this field and serialize them
         * into a single RAM segment. */
        void AppendPostings(DocumentsWriter.FlushState flushState,
                            FreqProxTermsWriterPerField[] fields,
                            TermInfosWriter termsOut,
                            IndexOutput freqOut,
                            IndexOutput proxOut,
                            DefaultSkipListWriter skipListWriter)
        {
            int fieldNumber = fields[0].fieldInfo.number;
            int numFields   = fields.Length;

            FreqProxFieldMergeState[] mergeStates = new FreqProxFieldMergeState[numFields];

            for (int i = 0; i < numFields; i++)
            {
                FreqProxFieldMergeState fms = mergeStates[i] = new FreqProxFieldMergeState(fields[i]);

                System.Diagnostics.Debug.Assert(fms.field.fieldInfo == fields[0].fieldInfo);

                // Should always be true
                bool result = fms.nextTerm();
                System.Diagnostics.Debug.Assert(result);
            }

            int  skipInterval       = termsOut.skipInterval;
            bool currentFieldOmitTf = fields[0].fieldInfo.omitTf;

            // If current field omits tf then it cannot store
            // payloads.  We silently drop the payloads in this case:
            bool currentFieldStorePayloads = currentFieldOmitTf ? false : fields[0].fieldInfo.storePayloads;

            FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields];

            while (numFields > 0)
            {
                // Get the next term to merge
                termStates[0] = mergeStates[0];
                int numToMerge = 1;

                for (int i = 1; i < numFields; i++)
                {
                    char[] text       = mergeStates[i].text;
                    int    textOffset = mergeStates[i].textOffset;
                    int    cmp        = compareText(text, textOffset, termStates[0].text, termStates[0].textOffset);

                    if (cmp < 0)
                    {
                        termStates[0] = mergeStates[i];
                        numToMerge    = 1;
                    }
                    else if (cmp == 0)
                    {
                        termStates[numToMerge++] = mergeStates[i];
                    }
                }

                int df = 0;
                int lastPayloadLength = -1;

                int lastDoc = 0;

                char[] text_Renamed = termStates[0].text;
                int    start        = termStates[0].textOffset;

                long freqPointer = freqOut.GetFilePointer();
                long proxPointer;
                if (proxOut != null)
                {
                    proxPointer = proxOut.GetFilePointer();
                }
                else
                {
                    proxPointer = 0;
                }

                skipListWriter.ResetSkip();

                // Now termStates has numToMerge FieldMergeStates
                // which all share the same term.  Now we must
                // interleave the docID streams.
                while (numToMerge > 0)
                {
                    if ((++df % skipInterval) == 0)
                    {
                        skipListWriter.SetSkipData(lastDoc, currentFieldStorePayloads, lastPayloadLength);
                        skipListWriter.BufferSkip(df);
                    }

                    FreqProxFieldMergeState minState = termStates[0];
                    for (int i = 1; i < numToMerge; i++)
                    {
                        if (termStates[i].docID < minState.docID)
                        {
                            minState = termStates[i];
                        }
                    }

                    int doc         = minState.docID;
                    int termDocFreq = minState.termFreq;

                    System.Diagnostics.Debug.Assert(doc < flushState.numDocsInRAM);
                    System.Diagnostics.Debug.Assert(doc > lastDoc || df == 1);

                    ByteSliceReader prox = minState.prox;

                    // Carefully copy over the prox + payload info,
                    // changing the format to match Lucene's segment
                    // format.
                    if (!currentFieldOmitTf)
                    {
                        // omitTf == false so we do write positions & payload
                        System.Diagnostics.Debug.Assert(proxOut != null);
                        for (int j = 0; j < termDocFreq; j++)
                        {
                            int code = prox.ReadVInt();
                            if (currentFieldStorePayloads)
                            {
                                int payloadLength;
                                if ((code & 1) != 0)
                                {
                                    // This position has a payload
                                    payloadLength = prox.ReadVInt();
                                }
                                else
                                {
                                    payloadLength = 0;
                                }
                                if (payloadLength != lastPayloadLength)
                                {
                                    proxOut.WriteVInt(code | 1);
                                    proxOut.WriteVInt(payloadLength);
                                    lastPayloadLength = payloadLength;
                                }
                                else
                                {
                                    proxOut.WriteVInt(code & (~1));
                                }
                                if (payloadLength > 0)
                                {
                                    copyBytes(prox, proxOut, payloadLength);
                                }
                            }
                            else
                            {
                                System.Diagnostics.Debug.Assert(0 == (code & 1));
                                proxOut.WriteVInt(code >> 1);
                            }
                        } //End for

                        int newDocCode = (doc - lastDoc) << 1;

                        if (1 == termDocFreq)
                        {
                            freqOut.WriteVInt(newDocCode | 1);
                        }
                        else
                        {
                            freqOut.WriteVInt(newDocCode);
                            freqOut.WriteVInt(termDocFreq);
                        }
                    }
                    else
                    {
                        // omitTf==true: we store only the docs, without
                        // term freq, positions, payloads
                        freqOut.WriteVInt(doc - lastDoc);
                    }

                    lastDoc = doc;

                    if (!minState.nextDoc())
                    {
                        // Remove from termStates
                        int upto = 0;
                        for (int i = 0; i < numToMerge; i++)
                        {
                            if (termStates[i] != minState)
                            {
                                termStates[upto++] = termStates[i];
                            }
                        }
                        numToMerge--;
                        System.Diagnostics.Debug.Assert(upto == numToMerge);

                        // Advance this state to the next term

                        if (!minState.nextTerm())
                        {
                            // OK, no more terms, so remove from mergeStates
                            // as well
                            upto = 0;
                            for (int i = 0; i < numFields; i++)
                            {
                                if (mergeStates[i] != minState)
                                {
                                    mergeStates[upto++] = mergeStates[i];
                                }
                            }
                            numFields--;
                            System.Diagnostics.Debug.Assert(upto == numFields);
                        }
                    }
                }

                System.Diagnostics.Debug.Assert(df > 0);

                // Done merging this term

                long skipPointer = skipListWriter.WriteSkip(freqOut);

                // Write term
                termInfo.Set(df, freqPointer, proxPointer, (int)(skipPointer - freqPointer));

                // TODO: we could do this incrementally
                UnicodeUtil.UTF16toUTF8(text_Renamed, start, termsUTF8);

                // TODO: we could save O(n) re-scan of the term by
                // computing the shared prefix with the last term
                // while during the UTF8 encoding
                termsOut.Add(fieldNumber,
                             termsUTF8.result,
                             termsUTF8.length,
                             termInfo);
            }
        }
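In the freq stream written above, the doc delta is shifted left by one bit and the low bit flags the common termDocFreq == 1 case, so single-occurrence postings cost one VInt instead of two. A sketch of just that encoding step, factored out of the loop above:

 // Sketch of the doc/freq encoding used in AppendPostings (docCode low bit flags freq == 1).
 static void WriteDocAndFreq(IndexOutput freqOut, int doc, int lastDoc, int termDocFreq)
 {
     int docCode = (doc - lastDoc) << 1;   // delta-encode the docID
     if (termDocFreq == 1)
     {
         freqOut.WriteVInt(docCode | 1);   // low bit set: freq is implicitly 1
     }
     else
     {
         freqOut.WriteVInt(docCode);
         freqOut.WriteVInt(termDocFreq);   // explicit frequency follows
     }
 }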
Example 13
 protected override AcceptStatus Accept(BytesRef term)
 {
     UnicodeUtil.UTF8toUTF16(term.Bytes, term.Offset, term.Length, utf16);
     return runAutomaton.Run(utf16.Chars, 0, utf16.Length) ? AcceptStatus.YES : AcceptStatus.NO;
 }