Exemplo n.º 1
0
        public virtual void TestRandomUnicodeStrings()
        {
            // Round-trip test: fills a fixed-size UTF-16 buffer via FillUnicode, encodes it
            // with UnicodeUtil.UTF16toUTF8, decodes it again with UnicodeUtil.UTF8toUTF16 and
            // compares the decoded chars with the "expected" buffer that FillUnicode produced.
            // When FillUnicode reports no illegal input, the UTF-8 bytes are additionally
            // compared with the bytes produced by the string GetBytes helper.
            const int charCount = 20;

            char[] buffer   = new char[charCount];
            char[] expected = new char[charCount];

            BytesRef utf8  = new BytesRef(charCount);
            CharsRef utf16 = new CharsRef(charCount);

            int num = AtLeast(100000);

            for (int iter = 0; iter < num; iter++)
            {
                bool hasIllegal = FillUnicode(buffer, expected, 0, charCount);

                UnicodeUtil.UTF16toUTF8(buffer, 0, charCount, utf8);
                if (!hasIllegal)
                {
                    // The reference encoding is only comparable when the input was well-formed.
#pragma warning disable 612, 618
                    var b = (new string(buffer, 0, charCount)).GetBytes(IOUtils.CHARSET_UTF_8);
#pragma warning restore 612, 618
                    Assert.AreEqual(b.Length, utf8.Length);
                    for (int i = 0; i < b.Length; i++)
                    {
                        Assert.AreEqual(b[i], utf8.Bytes[i]);
                    }
                }

                UnicodeUtil.UTF8toUTF16(utf8.Bytes, 0, utf8.Length, utf16);

                // Expected value goes first so a failure message reports the right roles.
                Assert.AreEqual(charCount, utf16.Length);
                for (int i = 0; i < charCount; i++)
                {
                    Assert.AreEqual(expected[i], utf16.Chars[i]);
                }
            }
        }
Exemplo n.º 2
0
 /// <summary>
 /// Writes a string as its UTF-8 byte count (via <c>WriteVInt</c>)
 /// followed by the UTF-8 encoded bytes (via <c>WriteBytes</c>).
 /// </summary>
 /// <param name="s">The string to encode and write.</param>
 /// <seealso cref="IndexInput.ReadString()"/>
 public virtual void  WriteString(System.String s)
 {
     UnicodeUtil.UTF8Result encoded = new UnicodeUtil.UTF8Result();
     UnicodeUtil.UTF16toUTF8(s, 0, s.Length, encoded);

     // Length prefix first, then exactly that many bytes from the scratch buffer.
     int byteCount = encoded.length;
     WriteVInt(byteCount);
     WriteBytes(encoded.result, 0, byteCount);
 }
Exemplo n.º 3
0
            /// <summary>
            /// Registers an input sequence together with its stemmer override output in this builder.
            /// </summary>
            /// <param name="input"> the input char sequence </param>
            /// <param name="output"> the stemmer override output char sequence </param>
            /// <returns> <c>false</c> iff the input has already been added to this builder, otherwise <c>true</c>. </returns>
            public virtual bool add(ICharSequence input, ICharSequence output)
            {
                int length = input.length();

                if (!ignoreCase)
                {
                    // Case-sensitive: encode the input as-is.
                    UnicodeUtil.UTF16toUTF8(input, 0, length, spare);
                }
                else
                {
                    // Case-insensitive: lowercase code point by code point into the
                    // reusable char scratch buffer, then encode that buffer.
                    charsSpare.grow(length);
                    char[] lowered = charsSpare.chars;
                    int pos = 0;
                    while (pos < length)
                    {
                        pos += char.toChars(char.ToLower(char.codePointAt(input, pos)), lowered, pos);
                    }
                    UnicodeUtil.UTF16toUTF8(lowered, 0, length, spare);
                }

                // A negative result from the hash means nothing new was added.
                if (hash.add(spare) < 0)
                {
                    return false;
                }

                outputValues.Add(output);
                return true;
            }
Exemplo n.º 4
0
        /// <summary>
        /// Encodes <paramref name="value"/> as UTF-8 and compresses the encoded bytes
        /// using the specified <paramref name="compressionLevel"/>.
        /// </summary>
        /// <param name="value">The <see cref="string"/> to compress.</param>
        /// <param name="compressionLevel">The level handed through to <c>Compress</c>.</param>
        /// <returns>The byte array returned by <c>Compress</c> for the encoded value.</returns>
        public static byte[] CompressString(string value, CompressionLevel compressionLevel)
        {
            BytesRef utf8 = new BytesRef();
            char[] chars = value.ToCharArray();

            UnicodeUtil.UTF16toUTF8(chars, 0, chars.Length, utf8);

            // Only the first utf8.Length bytes of the scratch buffer are compressed.
            return Compress(utf8.Bytes, 0, utf8.Length, compressionLevel);
        }
Exemplo n.º 5
0
        /// <summary>
        /// Reads the next prefix-compressed term from <paramref name="input"/> into this buffer.
        /// The stream supplies the length of the prefix that is kept (<c>start</c>), the
        /// length of the new suffix (<c>length</c>), the suffix data itself and finally a
        /// field number that is resolved through <paramref name="fieldInfos"/>.
        /// </summary>
        /// <param name="input">The stream to read the term from.</param>
        /// <param name="fieldInfos">Used to map the stored field number to a field name.</param>
        public void  Read(IndexInput input, FieldInfos fieldInfos)
        {
            this.term = null; // invalidate cache

            int start       = input.ReadVInt();
            int length      = input.ReadVInt();
            int totalLength = start + length;

            if (preUTF8Strings)
            {
                // Suffix is stored as chars: read it straight into the text buffer.
                text.SetLength(totalLength);
                input.ReadChars(text.result, start, length);
            }
            else
            {
                bool wasDirty = dirty;

                if (wasDirty)
                {
                    // The byte buffer is out of date: rebuild it fully from the text first.
                    UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes);
                }

                // Both paths append the new suffix bytes after the kept prefix.
                bytes.SetLength(totalLength);
                input.ReadBytes(bytes.result, start, length);

                if (wasDirty)
                {
                    // Re-decode everything, then mark the byte buffer as in sync.
                    UnicodeUtil.UTF8toUTF16(bytes.result, 0, totalLength, text);
                    dirty = false;
                }
                else
                {
                    // Incrementally convert only the UTF8 bytes that are new.
                    UnicodeUtil.UTF8toUTF16(bytes.result, start, length, text);
                }
            }

            this.field = fieldInfos.FieldName(input.ReadVInt());
        }
Exemplo n.º 6
0
            /// <summary>
            /// Adds an input sequence and its stemmer override output to this builder.
            /// </summary>
            /// <param name="input"> the input char sequence </param>
            /// <param name="output"> the stemmer override output char sequence </param>
            /// <returns> <c>false</c> iff the input has already been added to this builder, otherwise <c>true</c>. </returns>
            public virtual bool add(CharSequence input, CharSequence output)
            {
                int length = input.length();

                if (!ignoreCase)
                {
                    // No case folding requested: encode the raw input.
                    UnicodeUtil.UTF16toUTF8(input, 0, length, spare);
                }
                else
                {
                    // Fold to lowercase on the fly, one code point at a time, into the
                    // shared char scratch buffer before encoding.
                    charsSpare.grow(length);
                    char[] folded = charsSpare.chars;
                    int index = 0;
                    while (index < length)
                    {
                        index += char.toChars(char.ToLower(char.codePointAt(input, index)), folded, index);
                    }
                    UnicodeUtil.UTF16toUTF8(folded, 0, length, spare);
                }

                bool isNew = hash.add(spare) >= 0;
                if (isNew)
                {
                    // Outputs are only recorded for inputs the hash did not know yet.
                    outputValues.Add(output);
                }
                return isNew;
            }
Exemplo n.º 7
0
            /// <summary>
            /// Adds an input string and its stemmer override output to this builder.
            /// </summary>
            /// <param name="input"> the input string </param>
            /// <param name="output"> the stemmer override output string </param>
            /// <returns> <c>false</c> if the input has already been added to this builder, otherwise <c>true</c>. </returns>
            public virtual bool Add(string input, string output)
            {
                int length = input.Length;

                if (!ignoreCase)
                {
                    // Case-sensitive: encode the characters unchanged.
                    UnicodeUtil.UTF16toUTF8(input.ToCharArray(), 0, length, spare);
                }
                else
                {
                    // Case-insensitive: lowercase each code point (invariant culture)
                    // into the reusable char buffer, then encode that buffer.
                    charsSpare.Grow(length);
                    char[] lowered = charsSpare.Chars;
                    int pos = 0;
                    while (pos < length)
                    {
                        var codePoint = Character.CodePointAt(input, pos);
                        var lowerCodePoint = Character.ToLower(codePoint, CultureInfo.InvariantCulture);
                        pos += Character.ToChars(lowerCodePoint, lowered, pos);
                    }
                    UnicodeUtil.UTF16toUTF8(lowered, 0, length, spare);
                }

                // A negative result from the hash means the key was not newly added.
                if (hash.Add(spare) < 0)
                {
                    return false;
                }

                outputValues.Add(output);
                return true;
            }
Exemplo n.º 8
0
        /// <summary>
        /// Writes a string.
        /// <para/>
        /// Writes strings as UTF-8 encoded bytes. First the length, in bytes, is
        /// written as a <see cref="WriteVInt(int)"/>, followed by the bytes.
        /// </summary>
        /// <param name="s">The string to encode and write.</param>
        /// <seealso cref="DataInput.ReadString()"/>
        public virtual void WriteString(string s)
        {
            BytesRef encoded = new BytesRef(10);
            char[] chars = s.ToCharArray();

            UnicodeUtil.UTF16toUTF8(chars, 0, chars.Length, encoded);

            // Length prefix first, then exactly that many bytes of the scratch buffer.
            int byteCount = encoded.Length;
            WriteVInt(byteCount);
            WriteBytes(encoded.Bytes, 0, byteCount);
        }
Exemplo n.º 9
0
        /// <summary>
        /// Writes a string.
        /// <para/>
        /// The string is encoded as UTF-8. Its length in bytes is written first
        /// as a <see cref="WriteVInt32"/>, and the encoded bytes follow.
        /// </summary>
        /// <param name="s">The string to encode and write.</param>
        /// <seealso cref="DataInput.ReadString()"/>
        public virtual void WriteString(string s)
        {
            BytesRef encoded = new BytesRef(10);
            UnicodeUtil.UTF16toUTF8(s, 0, s.Length, encoded);

            // Length prefix first, then exactly that many bytes of the scratch buffer.
            int byteCount = encoded.Length;
            WriteVInt32(byteCount);
            WriteBytes(encoded.Bytes, 0, byteCount);
        }
Exemplo n.º 10
0
            // NOTE: while it's tempting to make this public, since
            // caller's parser likely knows the
            // numInput/numOutputWords, sneaky exceptions, much later
            // on, will result if these values are wrong; so we always
            // recompute ourselves to be safe:

            /// <summary>
            /// Records a mapping from <paramref name="input"/> to <paramref name="output"/>.
            /// The output is encoded as UTF-8 and interned in <c>words</c>; the resulting
            /// ordinal is appended to the entry stored for the input in <c>workingSet</c>
            /// (an entry is created, keyed by a deep copy of the input, when none exists).
            /// <c>maxHorizontalContext</c> is raised to the larger of the two word counts.
            /// </summary>
            /// <param name="input">The input chars; must be non-empty.</param>
            /// <param name="numInputWords">Number of words in <paramref name="input"/>; must be &gt; 0.</param>
            /// <param name="output">The output chars; must be non-empty.</param>
            /// <param name="numOutputWords">Number of words in <paramref name="output"/>; must be &gt; 0.</param>
            /// <param name="includeOrig">OR-ed into the entry's <c>includeOrig</c> flag.</param>
            /// <exception cref="ArgumentOutOfRangeException">
            /// A word count is not positive, or <paramref name="input"/> / <paramref name="output"/> is empty.
            /// </exception>
            internal virtual void Add(CharsRef input, int numInputWords, CharsRef output, int numOutputWords, bool includeOrig)
            {
                if (numInputWords <= 0)
                {
                    throw new ArgumentOutOfRangeException(nameof(numInputWords), "numInputWords must be > 0 (got " + numInputWords + ")"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
                }
                if (input.Length <= 0)
                {
                    // nameof(input.Length) would yield "Length", which is not a parameter of this method.
                    throw new ArgumentOutOfRangeException(nameof(input), "input.Length must be > 0 (got " + input.Length + ")"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
                }
                if (numOutputWords <= 0)
                {
                    throw new ArgumentOutOfRangeException(nameof(numOutputWords), "numOutputWords must be > 0 (got " + numOutputWords + ")"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
                }
                if (output.Length <= 0)
                {
                    // Same as above: report the offending parameter, not the member name "Length".
                    throw new ArgumentOutOfRangeException(nameof(output), "output.Length must be > 0 (got " + output.Length + ")"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
                }

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(!HasHoles(input), "input has holes: {0}", input);
                    Debugging.Assert(!HasHoles(output), "output has holes: {0}", output);
                }

                // Convert the output to UTF-8 and look it up in the hash.
                UnicodeUtil.UTF16toUTF8(output.Chars, output.Offset, output.Length, utf8Scratch);
                int ord = words.Add(utf8Scratch);

                if (ord < 0)
                {
                    // already exists in our hash: decode the existing ordinal
                    ord = (-ord) - 1;
                }

                if (!workingSet.TryGetValue(input, out MapEntry e) || e is null)
                {
                    e = new MapEntry();
                    workingSet[CharsRef.DeepCopyOf(input)] = e; // make a copy, since we will keep around in our map
                }

                e.ords.Add(ord);
                e.includeOrig       |= includeOrig;
                maxHorizontalContext = Math.Max(maxHorizontalContext, numInputWords);
                maxHorizontalContext = Math.Max(maxHorizontalContext, numOutputWords);
            }
Exemplo n.º 11
0
            // NOTE: while it's tempting to make this public, since
            // caller's parser likely knows the
            // numInput/numOutputWords, sneaky exceptions, much later
            // on, will result if these values are wrong; so we always
            // recompute ourselves to be safe:

            /// <summary>
            /// Records a mapping from <paramref name="input"/> to <paramref name="output"/>.
            /// The output is encoded as UTF-8 and interned in <c>words</c>; the resulting
            /// ordinal is appended to the entry stored for the input in <c>workingSet</c>
            /// (an entry is created, keyed by a deep copy of the input, when none exists).
            /// <c>maxHorizontalContext</c> is raised to the larger of the two word counts.
            /// </summary>
            /// <param name="input">The input chars; must be non-empty.</param>
            /// <param name="numInputWords">Number of words in <paramref name="input"/>; must be &gt; 0.</param>
            /// <param name="output">The output chars; must be non-empty.</param>
            /// <param name="numOutputWords">Number of words in <paramref name="output"/>; must be &gt; 0.</param>
            /// <param name="includeOrig">OR-ed into the entry's <c>includeOrig</c> flag.</param>
            /// <exception cref="System.ArgumentException">
            /// A word count is not positive, or <paramref name="input"/> / <paramref name="output"/> is empty.
            /// </exception>
            internal virtual void Add(CharsRef input, int numInputWords, CharsRef output, int numOutputWords, bool includeOrig)
            {
                if (numInputWords <= 0)
                {
                    throw new System.ArgumentException("numInputWords must be > 0 (got " + numInputWords + ")");
                }
                if (input.Length <= 0)
                {
                    throw new System.ArgumentException("input.length must be > 0 (got " + input.Length + ")");
                }
                if (numOutputWords <= 0)
                {
                    throw new System.ArgumentException("numOutputWords must be > 0 (got " + numOutputWords + ")");
                }
                if (output.Length <= 0)
                {
                    throw new System.ArgumentException("output.length must be > 0 (got " + output.Length + ")");
                }

                Debug.Assert(!HasHoles(input), "input has holes: " + input);
                Debug.Assert(!HasHoles(output), "output has holes: " + output);

                // Convert the output to UTF-8 and look it up in the hash.
                UnicodeUtil.UTF16toUTF8(output.Chars, output.Offset, output.Length, utf8Scratch);
                int ord = words.Add(utf8Scratch);

                if (ord < 0)
                {
                    // already exists in our hash: decode the existing ordinal
                    ord = (-ord) - 1;
                }

                // Single lookup instead of ContainsKey followed by the indexer.
                MapEntry e;
                if (!workingSet.TryGetValue(input, out e) || e == null)
                {
                    e = new MapEntry();
                    workingSet[CharsRef.DeepCopyOf(input)] = e; // make a copy, since we will keep around in our map
                }

                e.ords.Add(ord);
                e.includeOrig       |= includeOrig;
                maxHorizontalContext = Math.Max(maxHorizontalContext, numInputWords);
                maxHorizontalContext = Math.Max(maxHorizontalContext, numOutputWords);
            }
Exemplo n.º 12
0
        /// <summary>
        /// Re-initializes this instance for a new <paramref name="token"/>: stores the token,
        /// resets <c>shapeRel</c> to <c>SpatialRelation.NULL_VALUE</c>, replaces the byte form
        /// with the token's UTF-8 encoding and then calls <c>B_fixLeaf</c>.
        /// </summary>
        /// <param name="token">The new token string; must not be <c>null</c>.</param>
        public virtual void Reset(string token)
        {
            Debug.Assert(Level != 0);
            this.token = token;
            shapeRel   = SpatialRelation.NULL_VALUE;

            // Encode straight into an exactly-sized array. The previous code converted the
            // scratch BytesRef's whole backing array, whose capacity is not guaranteed to
            // equal the encoded length, so b_len could include trailing padding bytes.
            bytes = System.Text.Encoding.UTF8.GetBytes(token);

            b_off = 0;
            b_len = bytes.Length;
            B_fixLeaf();
        }
        /// <summary>
        /// Called when we are done adding docs to this term. Writes the skip data,
        /// fills in <c>termInfo</c>, hands the term to the parent's terms output when at
        /// least one document was added, and resets the per-term counters.
        /// </summary>
        internal override void  Finish()
        {
            long skipPointer = skipListWriter.WriteSkip(out_Renamed);

            // TODO: this is abstraction violation -- we should not
            // peek up into parents terms encoding format
            int skipOffset = (int)(skipPointer - parent.freqStart);
            termInfo.Set(df, parent.freqStart, parent.proxStart, skipOffset);

            // TODO: we could do this incrementally
            UnicodeUtil.UTF16toUTF8(parent.currentTerm, parent.currentTermStart, utf8);

            bool hasDocs = df > 0;
            if (hasDocs)
            {
                parent.termsOut.Add(fieldInfo.number, utf8.result, utf8.length, termInfo);
            }

            // Reset per-term state for the next term.
            df        = 0;
            lastDocID = 0;
        }
Exemplo n.º 14
0
        public virtual void TestAllUnicodeChars()
        {
            // Encodes every Unicode scalar value (surrogate range excluded) with
            // UnicodeUtil.UTF16toUTF8, decodes it back with UnicodeUtil.UTF8toUTF16 and
            // cross-checks both directions against the platform's UTF-8 encoding.
            BytesRef utf8  = new BytesRef(10);
            CharsRef utf16 = new CharsRef(10);

            char[] chars = new char[2];

            // Inclusive upper bound: U+10FFFF is the last valid code point and was
            // previously skipped by a '<' comparison.
            for (int ch = 0; ch <= 0x0010FFFF; ch++)
            {
                if (ch == 0xd800)
                // Skip invalid code points
                {
                    ch = 0xe000;
                }

                int len = 0;
                if (ch <= 0xffff)
                {
                    chars[len++] = (char)ch;
                }
                else
                {
                    // Supplementary code point: split into a high/low surrogate pair.
                    chars[len++] = (char)(((ch - 0x0010000) >> 10) + UnicodeUtil.UNI_SUR_HIGH_START);
                    chars[len++] = (char)(((ch - 0x0010000) & 0x3FFL) + UnicodeUtil.UNI_SUR_LOW_START);
                }

                UnicodeUtil.UTF16toUTF8(chars, 0, len, utf8);

                string s1 = new string(chars, 0, len);
                string s2 = Encoding.UTF8.GetString(utf8.Bytes, utf8.Offset, utf8.Length);
                Assert.AreEqual(s1, s2, "codepoint " + ch);

                UnicodeUtil.UTF8toUTF16(utf8.Bytes, 0, utf8.Length, utf16);
                Assert.AreEqual(s1, new string(utf16.Chars, 0, utf16.Length), "codepoint " + ch);

                // The platform encoding is the reference, so it is passed as "expected".
                var b = s1.GetBytes(Encoding.UTF8);
                Assert.AreEqual(b.Length, utf8.Length);
                for (int j = 0; j < utf8.Length; j++)
                {
                    Assert.AreEqual(b[j], utf8.Bytes[j]);
                }
            }
        }
Exemplo n.º 15
0
 /// <summary>
 /// Returns the byte form of this token, creating it from <c>token</c> (UTF-8)
 /// on first use and caching it.
 /// </summary>
 /// <remarks>Note: doesn't contain a trailing leaf byte.</remarks>
 /// <returns>The cached byte array; callers receive the same instance on every call.</returns>
 /// <exception cref="InvalidOperationException">
 /// Bytes are already present but do not span the whole array
 /// (<c>b_off != 0</c> or <c>b_len != bytes.Length</c>).
 /// </exception>
 public virtual byte[] GetTokenBytes()
 {
     if (bytes != null)
     {
         if (b_off != 0 || b_len != bytes.Length)
         {
             throw new InvalidOperationException("Not supported if byte[] needs to be recreated.");
         }
         return bytes;
     }

     // Encode straight into an exactly-sized array. The previous code converted the
     // scratch BytesRef's whole backing array, whose capacity is not guaranteed to
     // equal the encoded length, so b_len could include trailing padding bytes.
     bytes = System.Text.Encoding.UTF8.GetBytes(token);
     b_off = 0;
     b_len = bytes.Length;
     return bytes;
 }
Exemplo n.º 16
0
 /// <summary>
 /// Encodes the string as UTF-8 and compresses the encoded bytes using the
 /// specified compressionLevel (the original documentation refers to the
 /// constants defined in java.util.zip.Deflater).
 /// </summary>
 /// <param name="value_Renamed">The string to compress.</param>
 /// <param name="compressionLevel">The level handed through to <c>Compress</c>.</param>
 /// <returns>The byte array returned by <c>Compress</c> for the encoded string.</returns>
 public static byte[] CompressString(System.String value_Renamed, int compressionLevel)
 {
     UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result();
     UnicodeUtil.UTF16toUTF8(value_Renamed, 0, value_Renamed.Length, utf8);

     // Only the first utf8.length bytes of the scratch buffer are compressed.
     byte[] compressed = Compress(utf8.result, 0, utf8.length, compressionLevel);
     return compressed;
 }
Exemplo n.º 17
0
 /// <summary>
 /// Encodes <paramref name="s"/> as UTF-8 into <paramref name="scratch"/> and then
 /// delegates to the <c>Write(output, scratch)</c> overload.
 /// </summary>
 /// <param name="output">The destination passed on to the overload.</param>
 /// <param name="s">The string to encode.</param>
 /// <param name="scratch">Reusable buffer that receives the encoded bytes; its previous content is overwritten.</param>
 public static void Write(DataOutput output, string s, BytesRef scratch)
 {
     char[] chars = s.ToCharArray();
     UnicodeUtil.UTF16toUTF8(chars, 0, chars.Length, scratch);

     Write(output, scratch);
 }
Exemplo n.º 18
0
        /// <summary>
        /// Adds a complete document specified by all its term vectors. The current
        /// <c>tvd</c> and <c>tvf</c> file pointers are always recorded in <c>tvx</c>; when
        /// the document has no term vectors a field count of zero is written to <c>tvd</c>.
        /// </summary>
        /// <param name="vectors">
        /// One term vector per field, or <c>null</c> when the document has none.
        /// </param>
        /// <exception cref="System.SystemException">
        /// Positions or offsets were announced for a field but are <c>null</c> for one of its terms.
        /// </exception>
        public void  AddAllDocVectors(ITermFreqVector[] vectors)
        {
            tvx.WriteLong(tvd.FilePointer);
            tvx.WriteLong(tvf.FilePointer);

            if (vectors == null)
            {
                // No term vectors for this document.
                tvd.WriteVInt(0);
                return;
            }

            int numFields = vectors.Length;
            tvd.WriteVInt(numFields);

            var fieldPointers = new long[numFields];

            for (int fieldIndex = 0; fieldIndex < numFields; fieldIndex++)
            {
                ITermFreqVector vector = vectors[fieldIndex];

                fieldPointers[fieldIndex] = tvf.FilePointer;

                // 1st pass: write field numbers to tvd
                int fieldNumber = fieldInfos.FieldNumber(vector.Field);
                tvd.WriteVInt(fieldNumber);

                int numTerms = vector.Size;
                tvf.WriteVInt(numTerms);

                // Only a TermPositionVector may carry positions & offsets; whether they
                // are stored is decided by looking at the first term.
                TermPositionVector tpVector = vector as TermPositionVector;
                bool storePositions = false;
                bool storeOffsets   = false;
                byte bits           = 0;

                if (tpVector != null)
                {
                    storePositions = tpVector.Size > 0 && tpVector.GetTermPositions(0) != null;
                    storeOffsets   = tpVector.Size > 0 && tpVector.GetOffsets(0) != null;
                    bits           = (byte)((storePositions ? TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR : (byte)0) + (storeOffsets ? TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR : (byte)0));
                }

                tvf.WriteVInt(bits);

                System.String[] terms = vector.GetTerms();
                int[]           freqs = vector.GetTermFrequencies();

                // Two encoder buffers are used alternately so the previous term's bytes
                // stay available for the shared-prefix computation.
                int utf8Upto = 0;
                utf8Results[1].length = 0;

                for (int termIndex = 0; termIndex < numTerms; termIndex++)
                {
                    var current  = utf8Results[utf8Upto];
                    var previous = utf8Results[1 - utf8Upto];

                    System.String termText = terms[termIndex];
                    UnicodeUtil.UTF16toUTF8(termText, 0, termText.Length, current);

                    int start  = StringHelper.BytesDifference(previous.result, previous.length, current.result, current.length);
                    int length = current.length - start;
                    tvf.WriteVInt(start);                        // write shared prefix length
                    tvf.WriteVInt(length);                       // write delta length
                    tvf.WriteBytes(current.result, start, length); // write delta bytes
                    utf8Upto = 1 - utf8Upto;

                    int termFreq = freqs[termIndex];
                    tvf.WriteVInt(termFreq);

                    if (storePositions)
                    {
                        int[] positions = tpVector.GetTermPositions(termIndex);
                        if (positions == null)
                        {
                            throw new System.SystemException("Trying to write positions that are null!");
                        }
                        System.Diagnostics.Debug.Assert(positions.Length == termFreq);

                        // use delta encoding for positions
                        int previousPosition = 0;
                        for (int p = 0; p < positions.Length; p++)
                        {
                            int position = positions[p];
                            tvf.WriteVInt(position - previousPosition);
                            previousPosition = position;
                        }
                    }

                    if (storeOffsets)
                    {
                        TermVectorOffsetInfo[] offsets = tpVector.GetOffsets(termIndex);
                        if (offsets == null)
                        {
                            throw new System.SystemException("Trying to write offsets that are null!");
                        }
                        System.Diagnostics.Debug.Assert(offsets.Length == termFreq);

                        // use delta encoding for offsets
                        int previousEnd = 0;
                        for (int o = 0; o < offsets.Length; o++)
                        {
                            TermVectorOffsetInfo info = offsets[o];
                            int startOffset = info.StartOffset;
                            int endOffset   = info.EndOffset;
                            tvf.WriteVInt(startOffset - previousEnd);
                            tvf.WriteVInt(endOffset - startOffset);
                            previousEnd = endOffset;
                        }
                    }
                }
            }

            // 2nd pass: write field pointers to tvd (as deltas; the first one is not written)
            if (numFields > 1)
            {
                long previousPointer = fieldPointers[0];
                for (int fieldIndex = 1; fieldIndex < numFields; fieldIndex++)
                {
                    long pointer = fieldPointers[fieldIndex];
                    tvd.WriteVLong(pointer - previousPointer);
                    previousPointer = pointer;
                }
            }
        }
Exemplo n.º 19
0
 /// <summary>
 /// Encodes the first <c>TermLength</c> chars of <c>TermBuffer</c> as UTF-8
 /// into <c>Bytes</c>.
 /// </summary>
 public virtual void FillBytesRef()
 {
     var termChars  = TermBuffer;
     var termLength = TermLength;

     UnicodeUtil.UTF16toUTF8(termChars, 0, termLength, Bytes);
 }
Exemplo n.º 20
0
        /// <summary>Called once per field per document if term vectors
        /// are enabled, to write the vectors to
        /// RAMOutputStream, which is then quickly flushed to
        /// the real term vectors files in the Directory.
        /// <para/>
        /// Does nothing when vectors are disabled for the field or no postings
        /// were collected. Otherwise writes, to the per-document output: the
        /// posting count, a flags byte, and for every sorted posting the
        /// prefix-compressed UTF-8 term, its frequency and (when enabled) its
        /// position and offset streams. Finally resets the per-field and
        /// per-thread term hashes.
        /// </summary>
        internal override void  Finish()
        {
            // Test hook; only evaluated when Debug.Assert is compiled in.
            System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.finish start"));

            int numPostings = termsHashPerField.numPostings;

            System.Diagnostics.Debug.Assert(numPostings >= 0);

            // Nothing to write for this field.
            if (!doVectors || numPostings == 0)
            {
                return;
            }

            // Track the largest posting count seen so far.
            if (numPostings > maxNumPostings)
            {
                maxNumPostings = numPostings;
            }

            IndexOutput tvf = perThread.doc.perDocTvf;

            // This is called once, after inverting all occurrences
            // of a given field in the doc.  At this point we flush
            // our hash into the DocWriter.

            System.Diagnostics.Debug.Assert(fieldInfo.storeTermVector);
            System.Diagnostics.Debug.Assert(perThread.VectorFieldsInOrder(fieldInfo));

            perThread.doc.AddField(termsHashPerField.fieldInfo.number);

            RawPostingList[] postings = termsHashPerField.SortPostings();

            // Header for this field: number of terms, then the flags byte.
            tvf.WriteVInt(numPostings);
            byte bits = (byte)(0x0);

            if (doVectorPositions)
            {
                bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
            }
            if (doVectorOffsets)
            {
                bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
            }
            tvf.WriteByte(bits);

            // Index (0 or 1) of the encoder buffer used for the current term, and the
            // encoded length of the previous term (kept in the other buffer).
            int encoderUpto        = 0;
            int lastTermBytesCount = 0;

            ByteSliceReader reader = perThread.vectorSliceReader;

            char[][] charBuffers = perThread.termsHashPerThread.charPool.buffers;
            for (int j = 0; j < numPostings; j++)
            {
                TermVectorsTermsWriter.PostingList posting = (TermVectorsTermsWriter.PostingList)postings[j];
                int freq = posting.freq;

                // Locate the term's chars: the high bits of textStart select the block,
                // the low bits give the offset inside that block.
                char[] text2  = charBuffers[posting.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
                int    start2 = posting.textStart & DocumentsWriter.CHAR_BLOCK_MASK;

                // We swap between two encoders to save copying
                // last Term's byte array
                UnicodeUtil.UTF8Result utf8Result = perThread.utf8Results[encoderUpto];

                // TODO: we could do this incrementally
                UnicodeUtil.UTF16toUTF8(text2, start2, utf8Result);
                int termBytesCount = utf8Result.length;

                // TODO: UTF16toUTF8 could tell us this prefix
                // Compute common prefix between last term and
                // this term
                int prefix = 0;
                if (j > 0)
                {
                    byte[] lastTermBytes = perThread.utf8Results[1 - encoderUpto].result;
                    byte[] termBytes     = perThread.utf8Results[encoderUpto].result;
                    while (prefix < lastTermBytesCount && prefix < termBytesCount)
                    {
                        if (lastTermBytes[prefix] != termBytes[prefix])
                        {
                            break;
                        }
                        prefix++;
                    }
                }
                // Swap encoders; utf8Result still refers to the buffer holding this term.
                encoderUpto        = 1 - encoderUpto;
                lastTermBytesCount = termBytesCount;

                // Write shared-prefix length, suffix length, suffix bytes, then frequency.
                int suffix = termBytesCount - prefix;
                tvf.WriteVInt(prefix);
                tvf.WriteVInt(suffix);
                tvf.WriteBytes(utf8Result.result, prefix, suffix);
                tvf.WriteVInt(freq);

                // Stream 0 of the posting is copied when positions are enabled.
                if (doVectorPositions)
                {
                    termsHashPerField.InitReader(reader, posting, 0);
                    reader.WriteTo(tvf);
                }

                // Stream 1 of the posting is copied when offsets are enabled.
                if (doVectorOffsets)
                {
                    termsHashPerField.InitReader(reader, posting, 1);
                    reader.WriteTo(tvf);
                }
            }

            termsHashPerField.Reset();

            // NOTE: we clear, per-field, at the thread level,
            // because term vectors fully write themselves on each
            // field; this saves RAM (eg if large doc has two large
            // fields w/ term vectors on) because we recycle/reuse
            // all RAM after each field:
            perThread.termsHashPerThread.Reset(false);
        }
Exemplo n.º 21
0
 /// <summary>
 /// Encodes the code point <paramref name="code"/> as UTF-8 into the shared
 /// scratch buffer <c>b</c> and runs <paramref name="a"/> over those bytes.
 /// </summary>
 /// <param name="a">The byte automaton to run.</param>
 /// <param name="code">The code point to test.</param>
 /// <returns>The result of <c>a.Run</c> on the encoded bytes.</returns>
 private bool Matches(ByteRunAutomaton a, int code)
 {
     char[] utf16 = Character.ToChars(code);
     int charCount = utf16.Length;

     UnicodeUtil.UTF16toUTF8(utf16, 0, charCount, b);

     bool accepted = a.Run(b.Bytes, 0, b.Length);
     return accepted;
 }
Exemplo n.º 22
0
 /// <summary>
 /// Encodes the term's text as UTF-8 into <c>utf8Result</c> and forwards the
 /// field number, encoded bytes and <paramref name="ti"/> to the byte-based
 /// <c>Add</c> overload.
 /// </summary>
 /// <param name="term">The term whose text and field are written.</param>
 /// <param name="ti">The term info passed through to the overload.</param>
 internal void  Add(Term term, TermInfo ti)
 {
     var text = term.Text;
     UnicodeUtil.UTF16toUTF8(text, 0, text.Length, utf8Result);

     var fieldNumber = fieldInfos.FieldNumber(term.Field);
     Add(fieldNumber, utf8Result.result, utf8Result.length, ti);
 }