Beispiel #1
0
 /// <summary>
 /// Writes a string by converting it to UTF-8, emitting the encoded byte
 /// count via <c>WriteVInt</c> and then the encoded bytes via <c>WriteBytes</c>.
 /// </summary>
 /// <param name="s">The string to write. Its <c>Length</c> is read, so it must not be null.</param>
 /// <seealso cref="IndexInput.ReadString()"/>
 public virtual void WriteString(string s)
 {
     UnicodeUtil.UTF8Result encoded = new UnicodeUtil.UTF8Result();
     UnicodeUtil.UTF16toUTF8(s, 0, s.Length, encoded);

     // The length prefix counts encoded bytes, not UTF-16 chars.
     int byteCount = encoded.length;
     WriteVInt(byteCount);
     WriteBytes(encoded.result, 0, byteCount);
 }
        /// <summary>Called once per field per document if term vectors
        /// are enabled, to write the vectors to
        /// RAMOutputStream, which is then quickly flushed to
        /// the real term vectors files in the Directory.
        /// </summary>
        /// <remarks>
        /// For the current field this writes, in order, to <c>perThread.doc.tvf</c>:
        /// the number of postings (VInt), a flags byte saying whether positions
        /// and/or offsets are stored, and then one record per posting consisting of
        /// the shared-prefix length, the suffix length, the suffix bytes (UTF-8),
        /// the term frequency, and the optional position and offset data.
        /// Returns without writing anything when vectors are disabled for this
        /// field or when there are no postings.
        /// </remarks>
        internal override void  Finish()
        {
            System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.finish start"));

            int numPostings = termsHashPerField.numPostings;

            System.Diagnostics.Debug.Assert(numPostings >= 0);

            // Nothing to write if vectors are off for this field or no terms were collected.
            if (!doVectors || numPostings == 0)
            {
                return;
            }

            // Track the largest posting count seen so far.
            if (numPostings > maxNumPostings)
            {
                maxNumPostings = numPostings;
            }

            IndexOutput tvf = perThread.doc.tvf;

            // This is called once, after inverting all occurrences
            // of a given field in the doc.  At this point we flush
            // our hash into the DocWriter.

            System.Diagnostics.Debug.Assert(fieldInfo.storeTermVector);
            System.Diagnostics.Debug.Assert(perThread.VectorFieldsInOrder(fieldInfo));

            perThread.doc.AddField(termsHashPerField.fieldInfo.number);

            // NOTE(review): the prefix compression below only pays off if SortPostings
            // orders postings by term text -- confirm in TermsHashPerField.
            RawPostingList[] postings = termsHashPerField.SortPostings();

            tvf.WriteVInt(numPostings);

            // Flags byte: one bit each for "positions stored" and "offsets stored".
            byte bits = (byte)(0x0);

            if (doVectorPositions)
            {
                bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
            }
            if (doVectorOffsets)
            {
                bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
            }
            tvf.WriteByte(bits);

            // Index (0 or 1) of the UTF8Result used for the current term; the other
            // slot still holds the previous term's bytes.
            int encoderUpto        = 0;
            int lastTermBytesCount = 0;

            ByteSliceReader reader = perThread.vectorSliceReader;

            char[][] charBuffers = perThread.termsHashPerThread.charPool.buffers;
            for (int j = 0; j < numPostings; j++)
            {
                TermVectorsTermsWriter.PostingList posting = (TermVectorsTermsWriter.PostingList)postings[j];
                int freq = posting.freq;

                // textStart encodes both the buffer index (high bits) and the
                // offset inside that buffer (low bits).
                char[] text2  = charBuffers[posting.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
                int    start2 = posting.textStart & DocumentsWriter.CHAR_BLOCK_MASK;

                // We swap between two encoders to save copying
                // last Term's byte array
                UnicodeUtil.UTF8Result utf8Result = perThread.utf8Results[encoderUpto];

                // TODO: we could do this incrementally
                UnicodeUtil.UTF16toUTF8(text2, start2, utf8Result);
                int termBytesCount = utf8Result.length;

                // TODO: UTF16toUTF8 could tell us this prefix
                // Compute common prefix between last term and
                // this term
                int prefix = 0;
                if (j > 0)
                {
                    // The previous term lives in the other encoder slot.
                    byte[] lastTermBytes = perThread.utf8Results[1 - encoderUpto].result;
                    byte[] termBytes     = perThread.utf8Results[encoderUpto].result;
                    while (prefix < lastTermBytesCount && prefix < termBytesCount)
                    {
                        if (lastTermBytes[prefix] != termBytes[prefix])
                        {
                            break;
                        }
                        prefix++;
                    }
                }
                // Flip slots so the next iteration encodes into the other buffer and
                // this term's bytes remain available as "last term".
                encoderUpto        = 1 - encoderUpto;
                lastTermBytesCount = termBytesCount;

                // Term record: shared-prefix length, suffix length, suffix bytes, frequency.
                int suffix = termBytesCount - prefix;
                tvf.WriteVInt(prefix);
                tvf.WriteVInt(suffix);
                tvf.WriteBytes(utf8Result.result, prefix, suffix);
                tvf.WriteVInt(freq);

                // Positions are copied from the posting's stream 0 ...
                if (doVectorPositions)
                {
                    termsHashPerField.InitReader(reader, posting, 0);
                    reader.WriteTo(tvf);
                }

                // ... and offsets from stream 1.
                if (doVectorOffsets)
                {
                    termsHashPerField.InitReader(reader, posting, 1);
                    reader.WriteTo(tvf);
                }
            }

            // Clear per-field and per-thread hash state now that this field is written.
            termsHashPerField.Reset();
            perThread.termsHashPerThread.Reset(false);
        }
Beispiel #3
0
 /// <summary>
 /// Encodes <paramref name="value_Renamed"/> as UTF-8 and compresses the
 /// encoded bytes with the given compression level.
 /// </summary>
 /// <param name="value_Renamed">The string to compress. Its <c>Length</c> is read, so it must not be null.</param>
 /// <param name="compressionLevel">Passed unchanged to <c>Compress</c>
 /// (the original documentation refers to the constants of java.util.zip.Deflater).</param>
 /// <returns>Whatever <c>Compress</c> returns for the UTF-8 bytes.</returns>
 public static byte[] CompressString(System.String value_Renamed, int compressionLevel)
 {
     UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result();
     UnicodeUtil.UTF16toUTF8(value_Renamed, 0, value_Renamed.Length, utf8);

     // Only the first utf8.length bytes of the buffer are valid output.
     byte[] encodedBytes = utf8.result;
     int encodedLength = utf8.length;
     return Compress(encodedBytes, 0, encodedLength, compressionLevel);
 }
Beispiel #4
0
 /// <summary>
 /// Writes a string by converting it to UTF-8, emitting the encoded byte
 /// count via <c>WriteVInt</c> and then the encoded bytes via <c>WriteBytes</c>.
 /// </summary>
 /// <param name="s">The string to write. Its <c>Length</c> is read, so it must not be null.</param>
 /// <seealso cref="IndexInput.ReadString()"/>
 public virtual void WriteString(System.String s)
 {
     UnicodeUtil.UTF8Result encoded = new UnicodeUtil.UTF8Result();
     UnicodeUtil.UTF16toUTF8(s, 0, s.Length, encoded);

     // The length prefix counts encoded bytes, not UTF-16 chars.
     int byteCount = encoded.length;
     WriteVInt(byteCount);
     WriteBytes(encoded.result, 0, byteCount);
 }
Beispiel #5
0
		/// <summary>
		/// Encodes <paramref name="value_Renamed"/> as UTF-8 and compresses the
		/// encoded bytes with the given compression level.
		/// </summary>
		/// <param name="value_Renamed">The string to compress. Its <c>Length</c> is read, so it must not be null.</param>
		/// <param name="compressionLevel">Passed unchanged to <c>Compress</c>
		/// (the original documentation refers to the constants of java.util.zip.Deflater).</param>
		/// <returns>Whatever <c>Compress</c> returns for the UTF-8 bytes.</returns>
		public static byte[] CompressString(System.String value_Renamed, int compressionLevel)
		{
			UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result();
			UnicodeUtil.UTF16toUTF8(value_Renamed, 0, value_Renamed.Length, utf8);

			// Only the first utf8.length bytes of the buffer are valid output.
			byte[] encodedBytes = utf8.result;
			int encodedLength = utf8.length;
			return Compress(encodedBytes, 0, encodedLength, compressionLevel);
		}
Beispiel #6
0
		/// <summary>
		/// Repeatedly fills a 20-char buffer with random content (keeping a random
		/// prefix from the previous round), encodes it to UTF-8 and checks that
		/// decoding reproduces the expected chars, both when decoding starts at
		/// the first byte that changed since the previous round and when decoding
		/// starts from byte 0 into a separate result object.
		/// </summary>
		public virtual void  TestIncrementalUnicodeStrings()
		{
			r = NewRandom();

			char[] input = new char[20];
			char[] wanted = new char[20];

			UnicodeUtil.UTF8Result encoded = new UnicodeUtil.UTF8Result();
			UnicodeUtil.UTF16Result partialDecode = new UnicodeUtil.UTF16Result();
			UnicodeUtil.UTF16Result fullDecode = new UnicodeUtil.UTF16Result();

			// UTF-8 bytes produced by the previous iteration.
			byte[] previousBytes = new byte[60];
			bool hasIllegal = false;

			for (int iter = 0; iter < 100000; iter++)
			{
				// Keep no chars on the first round or after a round that contained
				// illegal input; otherwise keep a random-length prefix.
				int prefix = (iter == 0 || hasIllegal) ? 0 : NextInt(20);

				hasIllegal = FillUnicode(input, wanted, prefix, 20 - prefix);

				UnicodeUtil.UTF16toUTF8(input, 0, 20, encoded);
				if (!hasIllegal)
				{
					// Cross-check against the framework's own UTF-8 encoder.
					byte[] reference = System.Text.Encoding.GetEncoding("UTF-8").GetBytes(new System.String(input, 0, 20));
					Assert.AreEqual(reference.Length, encoded.length);
					for (int k = 0; k < reference.Length; k++)
					{
						Assert.AreEqual(reference[k], encoded.result[k]);
					}
				}

				// First byte (within the first 20) that differs from the previous
				// round's bytes; 0 when there is nothing usable to compare against.
				int bytePrefix = 0;
				if (iter != 0 && !hasIllegal)
				{
					bytePrefix = 20;
					for (int k = 0; k < 20; k++)
					{
						if (previousBytes[k] != encoded.result[k])
						{
							bytePrefix = k;
							break;
						}
					}
				}
				System.Array.Copy(encoded.result, 0, previousBytes, 0, encoded.length);

				// Decode only from the changed byte onward.
				// NOTE(review): presumably relies on partialDecode keeping state from
				// the previous iteration -- confirm in UnicodeUtil.UTF8toUTF16.
				UnicodeUtil.UTF8toUTF16(encoded.result, bytePrefix, encoded.length - bytePrefix, partialDecode);
				Assert.AreEqual(20, partialDecode.length);
				for (int k = 0; k < 20; k++)
				{
					Assert.AreEqual(wanted[k], partialDecode.result[k]);
				}

				// Decode everything from byte 0 into the second result object.
				UnicodeUtil.UTF8toUTF16(encoded.result, 0, encoded.length, fullDecode);
				Assert.AreEqual(20, fullDecode.length);
				for (int k = 0; k < 20; k++)
				{
					Assert.AreEqual(wanted[k], fullDecode.result[k]);
				}
			}
		}
Beispiel #7
0
		/// <summary>
		/// Fills a 20-char buffer with random content 100000 times, encodes it to
		/// UTF-8 and checks that decoding yields the expected 20 chars. When the
		/// input contained no illegal sequences, the encoded bytes are also
		/// compared with the framework's UTF-8 encoder.
		/// </summary>
		public virtual void  TestRandomUnicodeStrings()
		{
			r = NewRandom();
			
			char[] buffer = new char[20];
			char[] expected = new char[20];
			
			UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result();
			UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
			
			for (int iter = 0; iter < 100000; iter++)
			{
				bool hasIllegal = FillUnicode(buffer, expected, 0, 20);
				
				UnicodeUtil.UTF16toUTF8(buffer, 0, 20, utf8);
				if (!hasIllegal)
				{
					// Only well-formed input can be compared with the framework
					// encoder byte for byte.
					byte[] b = System.Text.Encoding.GetEncoding("UTF-8").GetBytes(new System.String(buffer, 0, 20));
					Assert.AreEqual(b.Length, utf8.length);
					for (int i = 0; i < b.Length; i++)
					{
						Assert.AreEqual(b[i], utf8.result[i]);
					}
				}
				
				UnicodeUtil.UTF8toUTF16(utf8.result, 0, utf8.length, utf16);
				// Expected value goes first so a failure reports "expected 20".
				Assert.AreEqual(20, utf16.length);
				for (int i = 0; i < 20; i++)
				{
					Assert.AreEqual(expected[i], utf16.result[i]);
				}
			}
		}
Beispiel #8
0
		/// <summary>
		/// Round-trips every Unicode code point from U+0000 through U+10FFFF
		/// (skipping the surrogate range U+D800..U+DFFF) through
		/// <c>UnicodeUtil.UTF16toUTF8</c> and <c>UnicodeUtil.UTF8toUTF16</c>, and
		/// compares the encoded bytes with the framework's UTF-8 encoder.
		/// </summary>
		public virtual void  TestAllUnicodeChars()
		{
			
			UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result();
			UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
			char[] chars = new char[2];
			// Inclusive upper bound: U+10FFFF is the last valid code point and must be tested too.
			for (int ch = 0; ch <= 0x0010FFFF; ch++)
			{
				
				if (ch == 0xd800)
				{
					// Skip invalid code points (the surrogate range)
					ch = 0xe000;
				}
				
				int len = 0;
				if (ch <= 0xffff)
				{
					chars[len++] = (char) ch;
				}
				else
				{
					// Supplementary code point: split into a high/low surrogate pair.
					chars[len++] = (char) (((ch - 0x0010000) >> 10) + UnicodeUtil.UNI_SUR_HIGH_START);
					chars[len++] = (char) (((ch - 0x0010000) & 0x3FF) + UnicodeUtil.UNI_SUR_LOW_START);
				}
				
				UnicodeUtil.UTF16toUTF8(chars, 0, len, utf8);
				
				System.String s1 = new System.String(chars, 0, len);
				System.String s2 = System.Text.Encoding.UTF8.GetString(utf8.result, 0, utf8.length);
				Assert.AreEqual(s1, s2, "codepoint " + ch);
				
				UnicodeUtil.UTF8toUTF16(utf8.result, 0, utf8.length, utf16);
				Assert.AreEqual(s1, new System.String(utf16.result, 0, utf16.length), "codepoint " + ch);
				
				// The framework encoder is the reference, so its values are passed as "expected".
				byte[] b = System.Text.Encoding.GetEncoding("UTF-8").GetBytes(s1);
				Assert.AreEqual(b.Length, utf8.length);
				for (int j = 0; j < utf8.length; j++)
				{
					Assert.AreEqual(b[j], utf8.result[j]);
				}
			}
		}