/// <summary>
/// Writes a string by converting it to UTF-8 with <c>UnicodeUtil.UTF16toUTF8</c>,
/// emitting the encoded length through <c>WriteVInt</c> and then the encoded
/// bytes through <c>WriteBytes</c>.
/// </summary>
/// <param name="s">The text to write; its full length is encoded.</param>
/// <seealso cref="IndexInput.ReadString()"/>
public virtual void WriteString(string s)
{
    UnicodeUtil.UTF8Result encoded = new UnicodeUtil.UTF8Result();
    UnicodeUtil.UTF16toUTF8(s, 0, s.Length, encoded);

    // Length prefix first, payload second.
    int encodedLength = encoded.length;
    WriteVInt(encodedLength);
    WriteBytes(encoded.result, 0, encodedLength);
}
/// <summary>
/// Called once per field per document if term vectors are enabled, to write
/// the vectors to RAMOutputStream, which is then quickly flushed to the real
/// term vectors files in the Directory.
/// </summary>
/// <remarks>
/// For each sorted posting the method writes: the length of the byte prefix
/// shared with the previous term, the length of the remaining suffix, the
/// suffix bytes, the term frequency, and then (depending on
/// <c>doVectorPositions</c> / <c>doVectorOffsets</c>) the slices obtained via
/// <c>InitReader(reader, posting, 0)</c> and <c>InitReader(reader, posting, 1)</c>.
/// Both term hashes are reset at the end.
/// </remarks>
internal override void Finish()
{
    System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.finish start"));

    int postingCount = termsHashPerField.numPostings;
    System.Diagnostics.Debug.Assert(postingCount >= 0);

    // Nothing to emit when vectors are off for this field or no postings were collected.
    if (!doVectors || postingCount == 0)
    {
        return;
    }

    // Remember the largest posting count seen so far.
    if (postingCount > maxNumPostings)
    {
        maxNumPostings = postingCount;
    }

    IndexOutput output = perThread.doc.tvf;

    // This runs once, after all occurrences of a given field in the doc
    // have been inverted. At this point the hash is flushed into the DocWriter.
    System.Diagnostics.Debug.Assert(fieldInfo.storeTermVector);
    System.Diagnostics.Debug.Assert(perThread.VectorFieldsInOrder(fieldInfo));

    perThread.doc.AddField(termsHashPerField.fieldInfo.number);

    RawPostingList[] sortedPostings = termsHashPerField.SortPostings();

    // Field header: the number of terms followed by a single flag byte.
    output.WriteVInt(postingCount);
    byte flags = (byte)(0x0);
    if (doVectorPositions)
    {
        flags |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
    }
    if (doVectorOffsets)
    {
        flags |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
    }
    output.WriteByte(flags);

    ByteSliceReader sliceReader = perThread.vectorSliceReader;
    char[][] charBlocks = perThread.termsHashPerThread.charPool.buffers;

    // Index (0 or 1) of the UTF8Result that receives the current term.
    int currentEncoder = 0;
    // Encoded length of the term written in the previous iteration.
    int previousLength = 0;

    for (int i = 0; i < postingCount; i++)
    {
        TermVectorsTermsWriter.PostingList posting = (TermVectorsTermsWriter.PostingList)sortedPostings[i];
        int freq = posting.freq;

        // Split textStart into a block index and an offset within that block.
        int textStart = posting.textStart;
        char[] termChars = charBlocks[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
        int termOffset = textStart & DocumentsWriter.CHAR_BLOCK_MASK;

        // We swap between two encoders to save copying the last term's byte array.
        UnicodeUtil.UTF8Result encoded = perThread.utf8Results[currentEncoder];

        // TODO: we could do this incrementally
        UnicodeUtil.UTF16toUTF8(termChars, termOffset, encoded);
        int currentLength = encoded.length;
        byte[] currentBytes = encoded.result;

        // TODO: UTF16toUTF8 could tell us this prefix
        // Compute the common prefix between the last term and this term.
        int prefix = 0;
        if (i > 0)
        {
            byte[] previousBytes = perThread.utf8Results[1 - currentEncoder].result;
            while (prefix < previousLength
                   && prefix < currentLength
                   && previousBytes[prefix] == currentBytes[prefix])
            {
                prefix++;
            }
        }

        // The buffer just filled becomes the "previous term" for the next iteration.
        currentEncoder = 1 - currentEncoder;
        previousLength = currentLength;

        int suffix = currentLength - prefix;
        output.WriteVInt(prefix);
        output.WriteVInt(suffix);
        output.WriteBytes(currentBytes, prefix, suffix);
        output.WriteVInt(freq);

        if (doVectorPositions)
        {
            termsHashPerField.InitReader(sliceReader, posting, 0);
            sliceReader.WriteTo(output);
        }

        if (doVectorOffsets)
        {
            termsHashPerField.InitReader(sliceReader, posting, 1);
            sliceReader.WriteTo(output);
        }
    }

    termsHashPerField.Reset();
    perThread.termsHashPerThread.Reset(false);
}
/// <summary>
/// Encodes the string as UTF-8 via <c>UnicodeUtil.UTF16toUTF8</c> and passes
/// the encoded bytes to <c>Compress</c> with the specified compression level.
/// </summary>
/// <param name="value_Renamed">The text to compress; its full length is encoded.</param>
/// <param name="compressionLevel">Level forwarded unchanged to <c>Compress</c>
/// (the original notes refer to the constants defined in java.util.zip.Deflater).</param>
/// <returns>Whatever <c>Compress</c> returns for the encoded bytes.</returns>
public static byte[] CompressString(System.String value_Renamed, int compressionLevel)
{
    UnicodeUtil.UTF8Result encoded = new UnicodeUtil.UTF8Result();
    UnicodeUtil.UTF16toUTF8(value_Renamed, 0, value_Renamed.Length, encoded);

    byte[] utf8Bytes = encoded.result;
    int utf8Length = encoded.length;
    return Compress(utf8Bytes, 0, utf8Length, compressionLevel);
}
/// <summary>
/// Writes a string: the text is converted to UTF-8, the encoded length is
/// written with <c>WriteVInt</c>, and the encoded bytes follow via
/// <c>WriteBytes</c>.
/// </summary>
/// <param name="s">The text to write.</param>
/// <seealso cref="IndexInput.ReadString()"/>
public virtual void WriteString(System.String s)
{
    UnicodeUtil.UTF8Result buffer = new UnicodeUtil.UTF8Result();
    UnicodeUtil.UTF16toUTF8(s, 0, s.Length, buffer);

    int byteCount = buffer.length;
    byte[] bytes = buffer.result;

    // The length goes out before the payload.
    WriteVInt(byteCount);
    WriteBytes(bytes, 0, byteCount);
}
/// <summary>
/// Compresses the String value using the specified compressionLevel. The
/// string is first converted to UTF-8 and the resulting bytes are handed to
/// <c>Compress</c>.
/// </summary>
/// <param name="value_Renamed">The text to compress.</param>
/// <param name="compressionLevel">Compression level passed through to
/// <c>Compress</c>; the original notes point to the constants defined in
/// java.util.zip.Deflater.</param>
/// <returns>The result of <c>Compress</c> on the UTF-8 bytes.</returns>
public static byte[] CompressString(System.String value_Renamed, int compressionLevel)
{
    int charCount = value_Renamed.Length;
    UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result();
    UnicodeUtil.UTF16toUTF8(value_Renamed, 0, charCount, utf8);
    return Compress(utf8.result, 0, utf8.length, compressionLevel);
}
/// <summary>
/// Runs 100000 iterations over a 20-char buffer: each iteration refills part
/// of the buffer with <c>FillUnicode</c>, encodes it with
/// <c>UnicodeUtil.UTF16toUTF8</c>, and decodes it twice with
/// <c>UnicodeUtil.UTF8toUTF16</c> - once starting at the first byte that
/// differs from the previous iteration's bytes (reusing the same result
/// object), and once from offset 0 into a separate result object.
/// </summary>
public virtual void TestIncrementalUnicodeStrings()
{
    r = NewRandom();

    char[] buffer = new char[20];
    char[] expected = new char[20];
    byte[] last = new byte[60];

    UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result();
    UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
    UnicodeUtil.UTF16Result utf16a = new UnicodeUtil.UTF16Result();

    // Framework encoder used as the reference; looked up once instead of per iteration.
    System.Text.Encoding referenceEncoding = System.Text.Encoding.GetEncoding("UTF-8");

    bool hasIllegal = false;

    for (int iter = 0; iter < 100000; iter++)
    {
        // hasIllegal still holds the previous iteration's value here: the whole
        // buffer is refilled on the first pass and after such an iteration.
        int prefix = (iter == 0 || hasIllegal) ? 0 : NextInt(20);

        hasIllegal = FillUnicode(buffer, expected, prefix, 20 - prefix);

        UnicodeUtil.UTF16toUTF8(buffer, 0, 20, utf8);

        if (!hasIllegal)
        {
            // Compare against the framework's own encoding of the same chars.
            byte[] b = referenceEncoding.GetBytes(new System.String(buffer, 0, 20));
            Assert.AreEqual(b.Length, utf8.length);
            for (int i = 0; i < b.Length; i++)
            {
                Assert.AreEqual(b[i], utf8.result[i]);
            }
        }

        // Find the first of the leading 20 bytes that differs from the previous
        // iteration's bytes; 20 when none differ, 0 when decoding must restart.
        int bytePrefix = 0;
        if (iter != 0 && !hasIllegal)
        {
            while (bytePrefix < 20 && last[bytePrefix] == utf8.result[bytePrefix])
            {
                bytePrefix++;
            }
        }

        System.Array.Copy(utf8.result, 0, last, 0, utf8.length);

        // Incremental decode: only the bytes from bytePrefix onward are passed in.
        UnicodeUtil.UTF8toUTF16(utf8.result, bytePrefix, utf8.length - bytePrefix, utf16);
        Assert.AreEqual(20, utf16.length);
        for (int i = 0; i < 20; i++)
        {
            Assert.AreEqual(expected[i], utf16.result[i]);
        }

        // Full decode from offset 0 into a separate result object.
        UnicodeUtil.UTF8toUTF16(utf8.result, 0, utf8.length, utf16a);
        Assert.AreEqual(20, utf16a.length);
        for (int i = 0; i < 20; i++)
        {
            Assert.AreEqual(expected[i], utf16a.result[i]);
        }
    }
}
/// <summary>
/// Runs 100000 iterations over a 20-char buffer filled by <c>FillUnicode</c>:
/// the buffer is encoded with <c>UnicodeUtil.UTF16toUTF8</c>, compared with
/// the framework's UTF-8 encoder when <c>FillUnicode</c> returned false, and
/// then decoded back with <c>UnicodeUtil.UTF8toUTF16</c> and compared with
/// the expected chars.
/// </summary>
public virtual void TestRandomUnicodeStrings()
{
    r = NewRandom();

    char[] buffer = new char[20];
    char[] expected = new char[20];

    UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result();
    UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();

    for (int iter = 0; iter < 100000; iter++)
    {
        // NOTE(review): the return value presumably flags illegal (unpaired
        // surrogate) input - confirm against FillUnicode.
        bool hasIllegal = FillUnicode(buffer, expected, 0, 20);

        UnicodeUtil.UTF16toUTF8(buffer, 0, 20, utf8);

        if (!hasIllegal)
        {
            // The framework encoder is the reference, so its output is the expected value.
            byte[] b = System.Text.Encoding.UTF8.GetBytes(new System.String(buffer, 0, 20));
            Assert.AreEqual(b.Length, utf8.length);
            for (int i = 0; i < b.Length; i++)
            {
                Assert.AreEqual(b[i], utf8.result[i]);
            }
        }

        UnicodeUtil.UTF8toUTF16(utf8.result, 0, utf8.length, utf16);

        // Expected value first, so a failure reports "expected 20" rather than the reverse.
        Assert.AreEqual(20, utf16.length);
        for (int i = 0; i < 20; i++)
        {
            Assert.AreEqual(expected[i], utf16.result[i]);
        }
    }
}
/// <summary>
/// Walks every code point from 0 through 0x10FFFF (jumping from 0xD800 to
/// 0xE000), encodes it with <c>UnicodeUtil.UTF16toUTF8</c> and checks the
/// result three ways: decoding with the framework's UTF-8 decoder, decoding
/// with <c>UnicodeUtil.UTF8toUTF16</c>, and comparing byte-for-byte with the
/// framework's UTF-8 encoder.
/// </summary>
public virtual void TestAllUnicodeChars()
{
    UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result();
    UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
    char[] chars = new char[2];

    // Inclusive upper bound so that the last code point, U+10FFFF, is covered too.
    for (int ch = 0; ch <= 0x0010FFFF; ch++)
    {
        if (ch == 0xd800)
        {
            // Skip invalid code points
            ch = 0xe000;
        }

        int len = 0;
        if (ch <= 0xffff)
        {
            chars[len++] = (char) ch;
        }
        else
        {
            // Above 0xFFFF: split into a high/low surrogate pair.
            chars[len++] = (char) (((ch - 0x0010000) >> 10) + UnicodeUtil.UNI_SUR_HIGH_START);
            chars[len++] = (char) (((ch - 0x0010000) & 0x3FF) + UnicodeUtil.UNI_SUR_LOW_START);
        }

        UnicodeUtil.UTF16toUTF8(chars, 0, len, utf8);

        System.String s1 = new System.String(chars, 0, len);
        System.String s2 = System.Text.Encoding.UTF8.GetString(utf8.result, 0, utf8.length);
        Assert.AreEqual(s1, s2, "codepoint " + ch);

        UnicodeUtil.UTF8toUTF16(utf8.result, 0, utf8.length, utf16);
        Assert.AreEqual(s1, new System.String(utf16.result, 0, utf16.length), "codepoint " + ch);

        // The framework encoder is the reference, so its bytes are the expected values.
        byte[] b = System.Text.Encoding.UTF8.GetBytes(s1);
        Assert.AreEqual(b.Length, utf8.length);
        for (int j = 0; j < utf8.length; j++)
        {
            Assert.AreEqual(b[j], utf8.result[j]);
        }
    }
}