/// <summary>
/// Round-trips random 20-char UTF-16 buffers through
/// <c>UnicodeUtil.UTF16toUTF8</c> / <c>UTF8toUTF16</c>, and cross-checks the
/// UTF-8 bytes against the BCL encoder whenever the buffer contains no
/// illegal (unpaired) surrogates.
/// </summary>
public virtual void TestRandomUnicodeStrings()
{
    char[] buffer = new char[20];
    char[] expected = new char[20];
    BytesRef utf8 = new BytesRef(20);
    CharsRef utf16 = new CharsRef(20);
    int num = AtLeast(100000);
    for (int iter = 0; iter < num; iter++)
    {
        // FillUnicode reports whether the buffer it produced contains illegal
        // (unpaired surrogate) sequences; presumably it also writes the chars
        // expected after a UTF-8 round trip into 'expected' — TODO confirm.
        bool hasIllegal = FillUnicode(buffer, expected, 0, 20);
        UnicodeUtil.UTF16toUTF8(buffer, 0, 20, utf8);
        if (!hasIllegal)
        {
            // Only legal sequences can be compared byte-for-byte with the
            // framework encoder (obsolete CHARSET_UTF_8, hence the pragma).
#pragma warning disable 612, 618
            var b = (new string(buffer, 0, 20)).GetBytes(IOUtils.CHARSET_UTF_8);
#pragma warning restore 612, 618
            Assert.AreEqual(b.Length, utf8.Length);
            for (int i = 0; i < b.Length; i++)
            {
                Assert.AreEqual(b[i], utf8.Bytes[i]);
            }
        }
        // Decoding back must reproduce exactly the expected 20 chars.
        UnicodeUtil.UTF8toUTF16(utf8.Bytes, 0, utf8.Length, utf16);
        Assert.AreEqual(utf16.Length, 20);
        for (int i = 0; i < 20; i++)
        {
            Assert.AreEqual(expected[i], utf16.Chars[i]);
        }
    }
}
/// <summary>
/// Writes a string as UTF-8: the encoded byte length as a VInt, followed by the bytes.
/// </summary>
/// <seealso cref="IndexInput.ReadString()"/>
public virtual void WriteString(System.String s)
{
    var encoded = new UnicodeUtil.UTF8Result();
    UnicodeUtil.UTF16toUTF8(s, 0, s.Length, encoded);
    WriteVInt(encoded.length);
    WriteBytes(encoded.result, 0, encoded.length);
}
/// <summary>
/// Adds an input string and its stemmer override output to this builder.
/// </summary>
/// <param name="input"> the input char sequence </param>
/// <param name="output"> the stemmer override output char sequence </param>
/// <returns> <code>false</code> iff the input has already been added to this builder otherwise <code>true</code>. </returns>
public virtual bool add(ICharSequence input, ICharSequence output)
{
    int length = input.length();
    if (ignoreCase)
    {
        // convert on the fly to lowercase, code point by code point.
        // FIX: 'char.toChars' / 'char.codePointAt' do not exist in C# — use the
        // Character helper (same pattern as the other Add overload in this codebase).
        charsSpare.grow(length);
        char[] buffer = charsSpare.chars;
        for (int i = 0; i < length;)
        {
            i += Character.ToChars(Character.ToLower(Character.CodePointAt(input, i), System.Globalization.CultureInfo.InvariantCulture), buffer, i);
        }
        UnicodeUtil.UTF16toUTF8(buffer, 0, length, spare);
    }
    else
    {
        UnicodeUtil.UTF16toUTF8(input, 0, length, spare);
    }
    // hash.add returns >= 0 only for a newly added entry; negative means duplicate.
    if (hash.add(spare) >= 0)
    {
        outputValues.Add(output);
        return(true);
    }
    return(false);
}
/// <summary>
/// Compresses the <see cref="string"/> value using the specified
/// <paramref name="compressionLevel"/>.
/// </summary>
public static byte[] CompressString(string value, CompressionLevel compressionLevel)
{
    // Encode to UTF-8 first, then delegate to the byte[] overload.
    var utf8 = new BytesRef();
    UnicodeUtil.UTF16toUTF8(value.ToCharArray(), 0, value.Length, utf8);
    return Compress(utf8.Bytes, 0, utf8.Length, compressionLevel);
}
/// <summary>
/// Reads the next term from <paramref name="input"/> using shared-prefix
/// (start/length delta) encoding, maintaining both the UTF-8 byte buffer and
/// the UTF-16 char buffer for this term.
/// </summary>
public void Read(IndexInput input, FieldInfos fieldInfos)
{
    this.term = null; // invalidate cache
    // 'start' = number of leading units shared with the previous term,
    // 'length' = number of new units that follow.
    int start = input.ReadVInt();
    int length = input.ReadVInt();
    int totalLength = start + length;
    if (preUTF8Strings)
    {
        // Pre-UTF-8 index format: terms were written directly as chars.
        text.SetLength(totalLength);
        input.ReadChars(text.result, start, length);
    }
    else
    {
        if (dirty)
        {
            // Fully convert all bytes since bytes is dirty
            // (the byte buffer no longer mirrors 'text', so rebuild it first).
            UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes);
            bytes.SetLength(totalLength);
            input.ReadBytes(bytes.result, start, length);
            UnicodeUtil.UTF8toUTF16(bytes.result, 0, totalLength, text);
            dirty = false;
        }
        else
        {
            // Incrementally convert only the UTF8 bytes that are new:
            bytes.SetLength(totalLength);
            input.ReadBytes(bytes.result, start, length);
            UnicodeUtil.UTF8toUTF16(bytes.result, start, length, text);
        }
    }
    // The field number trails the term bytes.
    this.field = fieldInfos.FieldName(input.ReadVInt());
}
/// <summary>
/// Adds an input string and its stemmer override output to this builder.
/// </summary>
/// <param name="input"> the input char sequence </param>
/// <param name="output"> the stemmer override output char sequence </param>
/// <returns> <code>false</code> iff the input has already been added to this builder otherwise <code>true</code>. </returns>
public virtual bool add(CharSequence input, CharSequence output)
{
    int length = input.length();
    if (ignoreCase)
    {
        // convert on the fly to lowercase, code point by code point.
        // FIX: 'char.toChars' / 'char.codePointAt' do not exist in C# — use the
        // Character helper (same pattern as the other Add overload in this codebase).
        charsSpare.grow(length);
        char[] buffer = charsSpare.chars;
        for (int i = 0; i < length;)
        {
            i += Character.ToChars(Character.ToLower(Character.CodePointAt(input, i), System.Globalization.CultureInfo.InvariantCulture), buffer, i);
        }
        UnicodeUtil.UTF16toUTF8(buffer, 0, length, spare);
    }
    else
    {
        UnicodeUtil.UTF16toUTF8(input, 0, length, spare);
    }
    // hash.add returns >= 0 only for a newly added entry; negative means duplicate.
    if (hash.add(spare) >= 0)
    {
        outputValues.Add(output);
        return(true);
    }
    return(false);
}
/// <summary>
/// Adds an input string and its stemmer override output to this builder.
/// </summary>
/// <param name="input"> the input char sequence </param>
/// <param name="output"> the stemmer override output char sequence </param>
/// <returns> <c>false</c> if the input has already been added to this builder otherwise <c>true</c>. </returns>
public virtual bool Add(string input, string output)
{
    int len = input.Length;
    if (ignoreCase)
    {
        // Lowercase code point by code point into a scratch buffer, then encode.
        charsSpare.Grow(len);
        char[] scratch = charsSpare.Chars;
        int i = 0;
        while (i < len)
        {
            int lowered = Character.ToLower(Character.CodePointAt(input, i), CultureInfo.InvariantCulture);
            i += Character.ToChars(lowered, scratch, i);
        }
        UnicodeUtil.UTF16toUTF8(scratch, 0, len, spare);
    }
    else
    {
        UnicodeUtil.UTF16toUTF8(input.ToCharArray(), 0, len, spare);
    }
    // A negative return from the hash means this input was already added.
    if (hash.Add(spare) < 0)
    {
        return false;
    }
    outputValues.Add(output);
    return true;
}
/// <summary>
/// Writes a string.
/// <p>
/// Writes strings as UTF-8 encoded bytes. First the length, in bytes, is
/// written as a <seealso cref="#writeVInt VInt"/>, followed by the bytes.
/// </summary>
/// <seealso cref= DataInput#readString() </seealso>
public virtual void WriteString(string s)
{
    var scratch = new BytesRef(10);
    UnicodeUtil.UTF16toUTF8(s.ToCharArray(), 0, s.Length, scratch);
    WriteVInt(scratch.Length);
    WriteBytes(scratch.Bytes, 0, scratch.Length);
}
/// <summary>
/// Writes a string.
/// <para/>
/// Writes strings as UTF-8 encoded bytes. First the length, in bytes, is
/// written as a <see cref="WriteVInt32"/>, followed by the bytes.
/// </summary>
/// <seealso cref="DataInput.ReadString()"/>
public virtual void WriteString(string s)
{
    BytesRef scratch = new BytesRef(10);
    UnicodeUtil.UTF16toUTF8(s, 0, s.Length, scratch);
    WriteVInt32(scratch.Length);
    WriteBytes(scratch.Bytes, 0, scratch.Length);
}
// NOTE: while it's tempting to make this public, since
// caller's parser likely knows the
// numInput/numOutputWords, sneaky exceptions, much later
// on, will result if these values are wrong; so we always
// recompute ourselves to be safe:
/// <summary>
/// Registers one synonym rule: maps <paramref name="input"/> (word count
/// <paramref name="numInputWords"/>) to <paramref name="output"/>, interning the
/// output bytes in the shared hash and growing the working set / horizontal context.
/// </summary>
internal virtual void Add(CharsRef input, int numInputWords, CharsRef output, int numOutputWords, bool includeOrig)
{
    // first convert to UTF-8
    if (numInputWords <= 0)
    {
        throw new ArgumentOutOfRangeException(nameof(numInputWords), "numInputWords must be > 0 (got " + numInputWords + ")"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
    }
    if (input.Length <= 0)
    {
        // FIX: nameof(input.Length) evaluated to "Length"; ParamName must name the parameter itself.
        throw new ArgumentOutOfRangeException(nameof(input), "input.Length must be > 0 (got " + input.Length + ")"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
    }
    if (numOutputWords <= 0)
    {
        throw new ArgumentOutOfRangeException(nameof(numOutputWords), "numOutputWords must be > 0 (got " + numOutputWords + ")"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
    }
    if (output.Length <= 0)
    {
        // FIX: nameof(output.Length) evaluated to "Length"; ParamName must name the parameter itself.
        throw new ArgumentOutOfRangeException(nameof(output), "output.Length must be > 0 (got " + output.Length + ")"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
    }
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(!HasHoles(input), "input has holes: {0}", input);
        Debugging.Assert(!HasHoles(output), "output has holes: {0}", output);
    }

    // Intern the output's UTF-8 bytes; the ord is what gets stored per input.
    UnicodeUtil.UTF16toUTF8(output.Chars, output.Offset, output.Length, utf8Scratch);
    int ord = words.Add(utf8Scratch);
    if (ord < 0)
    {
        // already exists in our hash
        ord = (-ord) - 1;
    }

    if (!workingSet.TryGetValue(input, out MapEntry e) || e is null)
    {
        e = new MapEntry();
        workingSet[CharsRef.DeepCopyOf(input)] = e; // make a copy, since we will keep around in our map
    }
    e.ords.Add(ord);
    e.includeOrig |= includeOrig;
    maxHorizontalContext = Math.Max(maxHorizontalContext, numInputWords);
    maxHorizontalContext = Math.Max(maxHorizontalContext, numOutputWords);
}
// NOTE: while it's tempting to make this public, since
// caller's parser likely knows the
// numInput/numOutputWords, sneaky exceptions, much later
// on, will result if these values are wrong; so we always
// recompute ourselves to be safe:
/// <summary>
/// Registers one synonym rule: maps <paramref name="input"/> (word count
/// <paramref name="numInputWords"/>) to <paramref name="output"/>, interning the
/// output bytes in the shared hash and growing the working set / horizontal context.
/// </summary>
internal virtual void Add(CharsRef input, int numInputWords, CharsRef output, int numOutputWords, bool includeOrig)
{
    // first convert to UTF-8
    if (numInputWords <= 0)
    {
        throw new System.ArgumentException("numInputWords must be > 0 (got " + numInputWords + ")");
    }
    if (input.Length <= 0)
    {
        throw new System.ArgumentException("input.length must be > 0 (got " + input.Length + ")");
    }
    if (numOutputWords <= 0)
    {
        throw new System.ArgumentException("numOutputWords must be > 0 (got " + numOutputWords + ")");
    }
    if (output.Length <= 0)
    {
        throw new System.ArgumentException("output.length must be > 0 (got " + output.Length + ")");
    }
    Debug.Assert(!HasHoles(input), "input has holes: " + input);
    Debug.Assert(!HasHoles(output), "output has holes: " + output);

    // Intern the output's UTF-8 bytes; the ord is what gets stored per input.
    UnicodeUtil.UTF16toUTF8(output.Chars, output.Offset, output.Length, utf8Scratch);
    int ord = words.Add(utf8Scratch);
    if (ord < 0)
    {
        // already exists in our hash
        ord = (-ord) - 1;
    }

    // FIX: single hash lookup via TryGetValue instead of ContainsKey + indexer (double lookup).
    if (!workingSet.TryGetValue(input, out MapEntry e) || e == null)
    {
        e = new MapEntry();
        workingSet[CharsRef.DeepCopyOf(input)] = e; // make a copy, since we will keep around in our map
    }
    e.ords.Add(ord);
    e.includeOrig |= includeOrig;
    maxHorizontalContext = Math.Max(maxHorizontalContext, numInputWords);
    maxHorizontalContext = Math.Max(maxHorizontalContext, numOutputWords);
}
/// <summary>
/// Resets this cell to represent the given token, re-encoding the token
/// text to UTF-8 and re-deriving the leaf state.
/// </summary>
public virtual void Reset(string token)
{
    Debug.Assert(Level != 0);
    this.token = token;
    shapeRel = SpatialRelation.NULL_VALUE;
    // Encode the token chars to UTF-8 (we avoid Encoding.UTF8.GetBytes here;
    // UnicodeUtil matches Lucene's encoding of the term bytes).
    var scratch = new BytesRef(token.Length);
    UnicodeUtil.UTF16toUTF8(token.ToCharArray(), 0, token.Length, scratch);
    bytes = scratch.bytes.ToByteArray();
    b_off = 0;
    b_len = bytes.Length;
    B_fixLeaf();
}
/// <summary>Called when we are done adding docs to this term.</summary>
internal override void Finish()
{
    // Flush the skip list and remember where it starts within the freq stream.
    long skipPointer = skipListWriter.WriteSkip(out_Renamed);
    // TODO: this is abstraction violation -- we should not
    // peek up into parents terms encoding format
    termInfo.Set(df, parent.freqStart, parent.proxStart, (int)(skipPointer - parent.freqStart));
    // TODO: we could do this incrementally
    UnicodeUtil.UTF16toUTF8(parent.currentTerm, parent.currentTermStart, utf8);
    // Only emit the term if it actually occurred in at least one doc.
    if (df > 0)
    {
        parent.termsOut.Add(fieldInfo.number, utf8.result, utf8.length, termInfo);
    }
    // Reset per-term state for the next term.
    lastDocID = 0;
    df = 0;
}
/// <summary>
/// Exhaustively round-trips every valid Unicode code point (skipping the
/// surrogate range) through <c>UnicodeUtil.UTF16toUTF8</c> / <c>UTF8toUTF16</c>
/// and cross-checks against the BCL UTF-8 codec.
/// </summary>
public virtual void TestAllUnicodeChars()
{
    BytesRef utf8 = new BytesRef(10);
    CharsRef utf16 = new CharsRef(10);
    char[] chars = new char[2];
    for (int ch = 0; ch < 0x0010FFFF; ch++)
    {
        if (ch == 0xd800) // Skip invalid code points
        {
            // Jump past the whole surrogate block D800..DFFF.
            ch = 0xe000;
        }
        int len = 0;
        if (ch <= 0xffff)
        {
            // BMP code point: a single UTF-16 unit.
            chars[len++] = (char)ch;
        }
        else
        {
            // Supplementary code point: encode as a surrogate pair by hand.
            chars[len++] = (char)(((ch - 0x0010000) >> 10) + UnicodeUtil.UNI_SUR_HIGH_START);
            chars[len++] = (char)(((ch - 0x0010000) & 0x3FFL) + UnicodeUtil.UNI_SUR_LOW_START);
        }
        UnicodeUtil.UTF16toUTF8(chars, 0, len, utf8);
        string s1 = new string(chars, 0, len);
        string s2 = Encoding.UTF8.GetString(utf8.Bytes, utf8.Offset, utf8.Length);
        // Lucene's UTF-8 bytes must decode to the same string as the BCL sees.
        Assert.AreEqual(s1, s2, "codepoint " + ch);
        UnicodeUtil.UTF8toUTF16(utf8.Bytes, 0, utf8.Length, utf16);
        Assert.AreEqual(s1, new string(utf16.Chars, 0, utf16.Length), "codepoint " + ch);
        // And the raw bytes must agree byte-for-byte with the BCL encoder.
        var b = s1.GetBytes(Encoding.UTF8);
        Assert.AreEqual(utf8.Length, b.Length);
        for (int j = 0; j < utf8.Length; j++)
        {
            Assert.AreEqual(utf8.Bytes[j], b[j]);
        }
    }
}
/// <summary>Note: doesn't contain a trailing leaf byte.</summary>
/// <remarks>
/// Lazily encodes the token to UTF-8 on first call; afterwards the cached
/// array is returned as-is, which is only valid while it spans the whole buffer.
/// </remarks>
public virtual byte[] GetTokenBytes()
{
    if (bytes == null)
    {
        // First access: encode the token chars to UTF-8
        // (we avoid Encoding.UTF8.GetBytes; UnicodeUtil matches Lucene's term bytes).
        var scratch = new BytesRef(token.Length);
        UnicodeUtil.UTF16toUTF8(token.ToCharArray(), 0, token.Length, scratch);
        bytes = scratch.bytes.ToByteArray();
        b_off = 0;
        b_len = bytes.Length;
    }
    else if (b_off != 0 || b_len != bytes.Length)
    {
        throw new InvalidOperationException("Not supported if byte[] needs to be recreated.");
    }
    return bytes;
}
/// <summary>Compresses the String value using the specified
/// compressionLevel (constants are defined in
/// java.util.zip.Deflater).
/// </summary>
public static byte[] CompressString(System.String value_Renamed, int compressionLevel)
{
    // Encode to UTF-8 first, then delegate to the byte[] overload.
    var utf8 = new UnicodeUtil.UTF8Result();
    UnicodeUtil.UTF16toUTF8(value_Renamed, 0, value_Renamed.Length, utf8);
    return Compress(utf8.result, 0, utf8.length, compressionLevel);
}
/// <summary>
/// Encodes <paramref name="s"/> as UTF-8 into the caller-supplied
/// <paramref name="scratch"/> buffer and writes it via the BytesRef overload.
/// </summary>
public static void Write(DataOutput output, string s, BytesRef scratch)
{
    char[] chars = s.ToCharArray();
    UnicodeUtil.UTF16toUTF8(chars, 0, chars.Length, scratch);
    Write(output, scratch);
}
/// <summary> Add a complete document specified by all its term vectors. If document has no
/// term vectors, add value for tvx.
///
/// Writes the document's pointers to tvx, the field numbers to tvd (first
/// pass) and delta-coded field pointers (second pass), and the per-field
/// terms/positions/offsets to tvf using shared-prefix term encoding.
/// </summary>
/// <param name="vectors"> per-field term vectors for one document; may be null </param>
/// <throws> IOException </throws>
public void AddAllDocVectors(ITermFreqVector[] vectors)
{
    // tvx records where this document starts in tvd and tvf.
    tvx.WriteLong(tvd.FilePointer);
    tvx.WriteLong(tvf.FilePointer);
    if (vectors != null)
    {
        int numFields = vectors.Length;
        tvd.WriteVInt(numFields);
        var fieldPointers = new long[numFields];
        for (int i = 0; i < numFields; i++)
        {
            fieldPointers[i] = tvf.FilePointer;
            int fieldNumber = fieldInfos.FieldNumber(vectors[i].Field);
            // 1st pass: write field numbers to tvd
            tvd.WriteVInt(fieldNumber);
            int numTerms = vectors[i].Size;
            tvf.WriteVInt(numTerms);
            TermPositionVector tpVector;
            byte bits;
            bool storePositions;
            bool storeOffsets;
            if (vectors[i] is TermPositionVector)
            {
                // May have positions & offsets
                tpVector = (TermPositionVector)vectors[i];
                storePositions = tpVector.Size > 0 && tpVector.GetTermPositions(0) != null;
                storeOffsets = tpVector.Size > 0 && tpVector.GetOffsets(0) != null;
                bits = (byte)((storePositions?TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR: (byte)0) + (storeOffsets?TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR: (byte)0));
            }
            else
            {
                tpVector = null;
                bits = 0;
                storePositions = false;
                storeOffsets = false;
            }
            tvf.WriteVInt(bits);
            System.String[] terms = vectors[i].GetTerms();
            int[] freqs = vectors[i].GetTermFrequencies();
            // Two UTF8 scratch buffers are alternated so the previous term's
            // bytes remain available for shared-prefix computation.
            int utf8Upto = 0;
            utf8Results[1].length = 0;
            for (int j = 0; j < numTerms; j++)
            {
                UnicodeUtil.UTF16toUTF8(terms[j], 0, terms[j].Length, utf8Results[utf8Upto]);
                // Length of the byte prefix shared with the previous term.
                int start = StringHelper.BytesDifference(utf8Results[1 - utf8Upto].result, utf8Results[1 - utf8Upto].length, utf8Results[utf8Upto].result, utf8Results[utf8Upto].length);
                int length = utf8Results[utf8Upto].length - start;
                tvf.WriteVInt(start); // write shared prefix length
                tvf.WriteVInt(length); // write delta length
                tvf.WriteBytes(utf8Results[utf8Upto].result, start, length); // write delta bytes
                utf8Upto = 1 - utf8Upto;
                int termFreq = freqs[j];
                tvf.WriteVInt(termFreq);
                if (storePositions)
                {
                    int[] positions = tpVector.GetTermPositions(j);
                    if (positions == null)
                    {
                        throw new System.SystemException("Trying to write positions that are null!");
                    }
                    System.Diagnostics.Debug.Assert(positions.Length == termFreq);
                    // use delta encoding for positions
                    int lastPosition = 0;
                    foreach (int position in positions)
                    {
                        tvf.WriteVInt(position - lastPosition);
                        lastPosition = position;
                    }
                }
                if (storeOffsets)
                {
                    TermVectorOffsetInfo[] offsets = tpVector.GetOffsets(j);
                    if (offsets == null)
                    {
                        throw new System.SystemException("Trying to write offsets that are null!");
                    }
                    System.Diagnostics.Debug.Assert(offsets.Length == termFreq);
                    // use delta encoding for offsets
                    int lastEndOffset = 0;
                    foreach (TermVectorOffsetInfo t in offsets)
                    {
                        int startOffset = t.StartOffset;
                        int endOffset = t.EndOffset;
                        tvf.WriteVInt(startOffset - lastEndOffset);
                        tvf.WriteVInt(endOffset - startOffset);
                        lastEndOffset = endOffset;
                    }
                }
            }
        }
        // 2nd pass: write field pointers to tvd
        // (the first field's pointer is implicit; the rest are deltas).
        if (numFields > 1)
        {
            long lastFieldPointer = fieldPointers[0];
            for (int i = 1; i < numFields; i++)
            {
                long fieldPointer = fieldPointers[i];
                tvd.WriteVLong(fieldPointer - lastFieldPointer);
                lastFieldPointer = fieldPointer;
            }
        }
    }
    else
    {
        // Document has no term vectors: record zero fields.
        tvd.WriteVInt(0);
    }
}
/// <summary>
/// Encodes the current term text (the first <c>TermLength</c> chars of
/// <c>TermBuffer</c>, UTF-16) into <c>Bytes</c> as UTF-8.
/// </summary>
public virtual void FillBytesRef()
{
    UnicodeUtil.UTF16toUTF8(TermBuffer, 0, TermLength, Bytes);
}
/// <summary>Called once per field per document if term vectors
/// are enabled, to write the vectors to
/// RAMOutputStream, which is then quickly flushed to
/// the real term vectors files in the Directory.
/// </summary>
internal override void Finish()
{
    System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.finish start"));
    int numPostings = termsHashPerField.numPostings;
    System.Diagnostics.Debug.Assert(numPostings >= 0);
    // Nothing to do when vectors are disabled or the field had no terms.
    if (!doVectors || numPostings == 0)
    {
        return;
    }
    if (numPostings > maxNumPostings)
    {
        maxNumPostings = numPostings;
    }
    IndexOutput tvf = perThread.doc.perDocTvf;
    // This is called once, after inverting all occurences
    // of a given field in the doc. At this point we flush
    // our hash into the DocWriter.
    System.Diagnostics.Debug.Assert(fieldInfo.storeTermVector);
    System.Diagnostics.Debug.Assert(perThread.VectorFieldsInOrder(fieldInfo));
    perThread.doc.AddField(termsHashPerField.fieldInfo.number);
    RawPostingList[] postings = termsHashPerField.SortPostings();
    tvf.WriteVInt(numPostings);
    // Flag byte says whether positions and/or offsets follow each term.
    byte bits = (byte)(0x0);
    if (doVectorPositions)
    {
        bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
    }
    if (doVectorOffsets)
    {
        bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
    }
    tvf.WriteByte(bits);
    int encoderUpto = 0;
    int lastTermBytesCount = 0;
    ByteSliceReader reader = perThread.vectorSliceReader;
    char[][] charBuffers = perThread.termsHashPerThread.charPool.buffers;
    for (int j = 0; j < numPostings; j++)
    {
        TermVectorsTermsWriter.PostingList posting = (TermVectorsTermsWriter.PostingList)postings[j];
        int freq = posting.freq;
        // Locate this term's chars inside the pooled char blocks.
        char[] text2 = charBuffers[posting.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
        int start2 = posting.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
        // We swap between two encoders to save copying
        // last Term's byte array
        UnicodeUtil.UTF8Result utf8Result = perThread.utf8Results[encoderUpto];
        // TODO: we could do this incrementally
        UnicodeUtil.UTF16toUTF8(text2, start2, utf8Result);
        int termBytesCount = utf8Result.length;
        // TODO: UTF16toUTF8 could tell us this prefix
        // Compute common prefix between last term and
        // this term
        int prefix = 0;
        if (j > 0)
        {
            byte[] lastTermBytes = perThread.utf8Results[1 - encoderUpto].result;
            byte[] termBytes = perThread.utf8Results[encoderUpto].result;
            while (prefix < lastTermBytesCount && prefix < termBytesCount)
            {
                if (lastTermBytes[prefix] != termBytes[prefix])
                {
                    break;
                }
                prefix++;
            }
        }
        // Swap encoders so the current term becomes "last" on the next iteration.
        encoderUpto = 1 - encoderUpto;
        lastTermBytesCount = termBytesCount;
        // Write shared-prefix length, suffix length, then only the suffix bytes.
        int suffix = termBytesCount - prefix;
        tvf.WriteVInt(prefix);
        tvf.WriteVInt(suffix);
        tvf.WriteBytes(utf8Result.result, prefix, suffix);
        tvf.WriteVInt(freq);
        if (doVectorPositions)
        {
            termsHashPerField.InitReader(reader, posting, 0);
            reader.WriteTo(tvf);
        }
        if (doVectorOffsets)
        {
            termsHashPerField.InitReader(reader, posting, 1);
            reader.WriteTo(tvf);
        }
    }
    termsHashPerField.Reset();
    // NOTE: we clear, per-field, at the thread level,
    // because term vectors fully write themselves on each
    // field; this saves RAM (eg if large doc has two large
    // fields w/ term vectors on) because we recycle/reuse
    // all RAM after each field:
    perThread.termsHashPerThread.Reset(false);
}
/// <summary>
/// Returns <c>true</c> iff the UTF-8 encoding of the given code point is
/// accepted by the byte-level automaton.
/// </summary>
private bool Matches(ByteRunAutomaton a, int code)
{
    var utf16 = Character.ToChars(code);
    UnicodeUtil.UTF16toUTF8(utf16, 0, utf16.Length, b);
    return a.Run(b.Bytes, 0, b.Length);
}
/// <summary>
/// Encodes the term's text to UTF-8 and forwards it, with its field number
/// and <paramref name="ti"/>, to the byte-based overload.
/// </summary>
internal void Add(Term term, TermInfo ti)
{
    string text = term.Text;
    UnicodeUtil.UTF16toUTF8(text, 0, text.Length, utf8Result);
    Add(fieldInfos.FieldNumber(term.Field), utf8Result.result, utf8Result.length, ti);
}