public void Read(IndexInput input, FieldInfos fieldInfos)
{
    this.term = null; // invalidate cache
    int start = input.ReadVInt();  // length of the prefix shared with the previous term
    int length = input.ReadVInt(); // length of the new suffix
    int totalLength = start + length;
    if (preUTF8Strings)
    {
        // Older index format (pre-UTF8): term text was stored as chars
        text.SetLength(totalLength);
        input.ReadChars(text.result, start, length);
    }
    else
    {
        if (dirty)
        {
            // Fully convert all bytes since bytes is dirty
            UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes);
            bytes.SetLength(totalLength);
            input.ReadBytes(bytes.result, start, length);
            UnicodeUtil.UTF8toUTF16(bytes.result, 0, totalLength, text);
            dirty = false;
        }
        else
        {
            // Incrementally convert only the UTF8 bytes that are new:
            bytes.SetLength(totalLength);
            input.ReadBytes(bytes.result, start, length);
            UnicodeUtil.UTF8toUTF16(bytes.result, start, length, text);
        }
    }
    this.field = fieldInfos.FieldName(input.ReadVInt());
}
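// The read above implements a prefix-coded term format: each on-disk entry is
// (sharedPrefixLen, suffixLen, suffixBytes), and the full term is rebuilt by
// reusing the leading bytes of the previous term. The following is a minimal,
// hypothetical sketch of that decode loop (not part of Lucene.NET), assuming
// the first term of a block arrives with sharedPrefixLen == 0:
internal sealed class PrefixDeltaTermDecoder
{
    private byte[] buffer = new byte[16];
    private int length; // length in bytes of the most recently decoded term

    public byte[] Next(int sharedPrefixLen, byte[] suffix, int suffixLen)
    {
        int totalLength = sharedPrefixLen + suffixLen;
        if (buffer.Length < totalLength)
        {
            // Grow while preserving the previous term's bytes, since the
            // shared prefix is reused from them.
            byte[] grown = new byte[System.Math.Max(totalLength, buffer.Length * 2)];
            System.Array.Copy(buffer, grown, length);
            buffer = grown;
        }
        // Only the new suffix bytes are copied; the prefix is already in place.
        System.Array.Copy(suffix, 0, buffer, sharedPrefixLen, suffixLen);
        length = totalLength;

        byte[] term = new byte[length];
        System.Array.Copy(buffer, term, length);
        return term;
    }
}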
/// <summary>Writes a string.</summary>
/// <seealso cref="IndexInput.ReadString()"/>
public virtual void WriteString(string s)
{
    UnicodeUtil.UTF8Result utf8Result = new UnicodeUtil.UTF8Result();
    UnicodeUtil.UTF16toUTF8(s, 0, s.Length, utf8Result);
    WriteVInt(utf8Result.length);
    WriteBytes(utf8Result.result, 0, utf8Result.length);
}
/// <summary>Decompresses the byte array previously returned by
/// <see cref="CompressString"/> back into a String.
/// </summary>
public static System.String DecompressString(byte[] value_Renamed)
{
    UnicodeUtil.UTF16Result result = new UnicodeUtil.UTF16Result();
    byte[] bytes = Decompress(value_Renamed);
    UnicodeUtil.UTF8toUTF16(bytes, 0, bytes.Length, result);
    return new System.String(result.result, 0, result.length);
}
public override void SetUp()
{
    base.SetUp();
    dir = NewDirectory();
    fieldName = Random.NextBoolean() ? "field" : ""; // sometimes use an empty string as field name
    RandomIndexWriter writer = new RandomIndexWriter(Random, dir,
        NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random, MockTokenizer.KEYWORD, false))
            .SetMaxBufferedDocs(TestUtil.NextInt32(Random, 50, 1000)));
    Document doc = new Document();
    Field field = NewStringField(fieldName, "", Field.Store.NO);
    doc.Add(field);
    List<string> terms = new List<string>();
    int num = AtLeast(200);
    for (int i = 0; i < num; i++)
    {
        string s = TestUtil.RandomUnicodeString(Random);
        field.SetStringValue(s);
        terms.Add(s);
        writer.AddDocument(doc);
    }

    if (Verbose)
    {
        // utf16 order
        terms.Sort();
        Console.WriteLine("UTF16 order:");
        foreach (string s in terms)
        {
            Console.WriteLine(" " + UnicodeUtil.ToHexString(s));
        }
    }

    reader = writer.GetReader();
    searcher1 = NewSearcher(reader);
    searcher2 = NewSearcher(reader);
    writer.Dispose();
}
internal virtual System.String BytesToString(byte[] bytes)
{
    lock (this)
    {
        // Round-trip the bytes through UTF-16 and back to UTF-8, so invalid
        // byte sequences are replaced the same way on both paths.
        System.String s = System.Text.Encoding.UTF8.GetString(bytes);
        UnicodeUtil.UTF16toUTF8(s, 0, s.Length, utf8Result);
        try
        {
            return System.Text.Encoding.UTF8.GetString(utf8Result.result, 0, utf8Result.length);
        }
        catch (System.Exception)
        {
            // Ported from Java, where an UnsupportedEncodingException was
            // possible here; Encoding.GetString never throws IOException in
            // .NET, so this is purely a defensive fallback.
            return null;
        }
    }
}
/// <summary>Called when we are done adding docs to this term.</summary>
internal override void Finish()
{
    long skipPointer = skipListWriter.WriteSkip(out_Renamed);

    // TODO: this is an abstraction violation -- we should not
    // peek up into the parent's terms encoding format
    termInfo.Set(df, parent.freqStart, parent.proxStart, (int)(skipPointer - parent.freqStart));

    // TODO: we could do this incrementally
    UnicodeUtil.UTF16toUTF8(parent.currentTerm, parent.currentTermStart, utf8);

    if (df > 0)
    {
        parent.termsOut.Add(fieldInfo.number, utf8.result, utf8.length, termInfo);
    }

    lastDocID = 0;
    df = 0;
}
// Currently used only by assert statements
private int CompareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength)
{
    if (lastFieldNumber != fieldNumber)
    {
        int cmp = String.CompareOrdinal(fieldInfos.FieldName(lastFieldNumber), fieldInfos.FieldName(fieldNumber));
        // If there is a field named "" (empty string) then we
        // will get 0 on this comparison, yet, it's "OK". But
        // it's not OK if two different field numbers map to
        // the same name.
        if (cmp != 0 || lastFieldNumber != -1)
        {
            return cmp;
        }
    }

    // Same field: compare in UTF-16 code unit order, so decode both
    // terms' bytes back to chars first.
    UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1);
    UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2);

    int len = System.Math.Min(utf16Result1.length, utf16Result2.length);

    for (int i = 0; i < len; i++)
    {
        char ch1 = utf16Result1.result[i];
        char ch2 = utf16Result2.result[i];
        if (ch1 != ch2)
        {
            return ch1 - ch2;
        }
    }
    return utf16Result1.length - utf16Result2.length;
}
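// Why decode back to UTF-16 before comparing? Raw UTF-8 byte order equals
// Unicode code point order, but the legacy index order here is UTF-16 code
// unit order, and the two disagree for supplementary characters. A small
// standalone demonstration of the disagreement (not part of the writer):
public static class TermOrderDemo
{
    public static void Main()
    {
        string a = "\uE000";                       // U+E000: UTF-8 bytes EE 80 80
        string b = char.ConvertFromUtf32(0x10000); // U+10000: surrogate pair D800 DC00, UTF-8 bytes F0 90 80 80

        // UTF-16 code unit order: a > b, because 0xE000 > 0xD800.
        System.Console.WriteLine(string.CompareOrdinal(a, b) > 0); // True

        // UTF-8 byte order: a < b, because the lead bytes satisfy 0xEE < 0xF0.
        byte[] ua = System.Text.Encoding.UTF8.GetBytes(a);
        byte[] ub = System.Text.Encoding.UTF8.GetBytes(b);
        System.Console.WriteLine(ua[0] < ub[0]); // True -- the two orders flip
    }
}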
/// <summary>Adds a complete document specified by all its term vectors. If the
/// document has no term vectors, only the file pointers are recorded in tvx and
/// a zero field count in tvd.
/// </summary>
/// <param name="vectors"></param>
/// <throws>IOException</throws>
public void AddAllDocVectors(TermFreqVector[] vectors)
{
    tvx.WriteLong(tvd.GetFilePointer());
    tvx.WriteLong(tvf.GetFilePointer());

    if (vectors != null)
    {
        int numFields = vectors.Length;
        tvd.WriteVInt(numFields);

        long[] fieldPointers = new long[numFields];

        for (int i = 0; i < numFields; i++)
        {
            fieldPointers[i] = tvf.GetFilePointer();

            int fieldNumber = fieldInfos.FieldNumber(vectors[i].GetField());

            // 1st pass: write field numbers to tvd
            tvd.WriteVInt(fieldNumber);

            int numTerms = vectors[i].Size();
            tvf.WriteVInt(numTerms);

            TermPositionVector tpVector;
            byte bits;
            bool storePositions;
            bool storeOffsets;

            if (vectors[i] is TermPositionVector)
            {
                // May have positions & offsets
                tpVector = (TermPositionVector)vectors[i];
                storePositions = tpVector.Size() > 0 && tpVector.GetTermPositions(0) != null;
                storeOffsets = tpVector.Size() > 0 && tpVector.GetOffsets(0) != null;
                bits = (byte)((storePositions ? TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR : (byte)0) +
                              (storeOffsets ? TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR : (byte)0));
            }
            else
            {
                tpVector = null;
                bits = 0;
                storePositions = false;
                storeOffsets = false;
            }

            tvf.WriteVInt(bits);

            string[] terms = vectors[i].GetTerms();
            int[] freqs = vectors[i].GetTermFrequencies();

            int utf8Upto = 0;
            utf8Results[1].length = 0;

            for (int j = 0; j < numTerms; j++)
            {
                UnicodeUtil.UTF16toUTF8(terms[j], 0, terms[j].Length, utf8Results[utf8Upto]);

                int start = StringHelper.bytesDifference(
                    utf8Results[1 - utf8Upto].result,
                    utf8Results[1 - utf8Upto].length,
                    utf8Results[utf8Upto].result,
                    utf8Results[utf8Upto].length);

                int length = utf8Results[utf8Upto].length - start;
                tvf.WriteVInt(start);                                         // write shared prefix length
                tvf.WriteVInt(length);                                        // write delta length
                tvf.WriteBytes(utf8Results[utf8Upto].result, start, length);  // write delta bytes
                utf8Upto = 1 - utf8Upto;

                int termFreq = freqs[j];
                tvf.WriteVInt(termFreq);

                if (storePositions)
                {
                    int[] positions = tpVector.GetTermPositions(j);
                    if (positions == null)
                    {
                        throw new System.SystemException("Trying to write positions that are null!");
                    }
                    System.Diagnostics.Debug.Assert(positions.Length == termFreq);

                    // use delta encoding for positions
                    int lastPosition = 0;
                    for (int k = 0; k < positions.Length; k++)
                    {
                        int position = positions[k];
                        tvf.WriteVInt(position - lastPosition);
                        lastPosition = position;
                    }
                }

                if (storeOffsets)
                {
                    TermVectorOffsetInfo[] offsets = tpVector.GetOffsets(j);
                    if (offsets == null)
                    {
                        throw new System.SystemException("Trying to write offsets that are null!");
                    }
                    System.Diagnostics.Debug.Assert(offsets.Length == termFreq);

                    // use delta encoding for offsets
                    int lastEndOffset = 0;
                    for (int k = 0; k < offsets.Length; k++)
                    {
                        int startOffset = offsets[k].GetStartOffset();
                        int endOffset = offsets[k].GetEndOffset();
                        tvf.WriteVInt(startOffset - lastEndOffset);
                        tvf.WriteVInt(endOffset - startOffset);
                        lastEndOffset = endOffset;
                    }
                }
            }
        }

        // 2nd pass: write field pointers to tvd
        long lastFieldPointer = fieldPointers[0];
        for (int i = 1; i < numFields; i++)
        {
            long fieldPointer = fieldPointers[i];
            tvd.WriteVLong(fieldPointer - lastFieldPointer);
            lastFieldPointer = fieldPointer;
        }
    }
    else
    {
        tvd.WriteVInt(0);
    }
}
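// The term loop above delta-encodes each term against the previous one using
// StringHelper.bytesDifference, alternating between two UTF8Result buffers so
// the previous term's bytes never need copying. The following is an
// illustrative re-implementation of the shared-prefix computation (a sketch,
// not the actual Lucene.NET StringHelper source):
internal static class BytesDifferenceSketch
{
    public static int SharedPrefixLength(byte[] prev, int prevLen, byte[] current, int currentLen)
    {
        int limit = System.Math.Min(prevLen, currentLen);
        for (int i = 0; i < limit; i++)
        {
            if (prev[i] != current[i])
            {
                return i; // index of the first differing byte = shared prefix length
            }
        }
        return limit; // one term is a prefix of the other
    }
}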
internal void Add(Term term, TermInfo ti)
{
    UnicodeUtil.UTF16toUTF8(term.Text, 0, term.Text.Length, utf8Result);
    Add(fieldInfos.FieldNumber(term.Field), utf8Result.result, utf8Result.length, ti);
}
/// <summary>Called once per field per document if term vectors
/// are enabled, to write the vectors to the
/// RAMOutputStream, which is then quickly flushed to
/// the real term vectors files in the Directory.
/// </summary>
internal override void Finish()
{
    System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.finish start"));

    int numPostings = termsHashPerField.numPostings;

    System.Diagnostics.Debug.Assert(numPostings >= 0);

    if (!doVectors || numPostings == 0)
    {
        return;
    }

    if (numPostings > maxNumPostings)
    {
        maxNumPostings = numPostings;
    }

    IndexOutput tvf = perThread.doc.tvf;

    // This is called once, after inverting all occurrences
    // of a given field in the doc. At this point we flush
    // our hash into the DocWriter.

    System.Diagnostics.Debug.Assert(fieldInfo.storeTermVector);
    System.Diagnostics.Debug.Assert(perThread.VectorFieldsInOrder(fieldInfo));

    perThread.doc.AddField(termsHashPerField.fieldInfo.number);

    RawPostingList[] postings = termsHashPerField.SortPostings();

    tvf.WriteVInt(numPostings);

    byte bits = (byte)0x0;
    if (doVectorPositions)
    {
        bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
    }
    if (doVectorOffsets)
    {
        bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
    }
    tvf.WriteByte(bits);

    int encoderUpto = 0;
    int lastTermBytesCount = 0;

    ByteSliceReader reader = perThread.vectorSliceReader;
    char[][] charBuffers = perThread.termsHashPerThread.charPool.buffers;

    for (int j = 0; j < numPostings; j++)
    {
        TermVectorsTermsWriter.PostingList posting = (TermVectorsTermsWriter.PostingList)postings[j];
        int freq = posting.freq;

        char[] text2 = charBuffers[posting.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
        int start2 = posting.textStart & DocumentsWriter.CHAR_BLOCK_MASK;

        // We swap between two encoders to save copying
        // the last Term's byte array
        UnicodeUtil.UTF8Result utf8Result = perThread.utf8Results[encoderUpto];

        // TODO: we could do this incrementally
        UnicodeUtil.UTF16toUTF8(text2, start2, utf8Result);
        int termBytesCount = utf8Result.length;

        // TODO: UTF16toUTF8 could tell us this prefix

        // Compute the common prefix between the last term and
        // this term
        int prefix = 0;
        if (j > 0)
        {
            byte[] lastTermBytes = perThread.utf8Results[1 - encoderUpto].result;
            byte[] termBytes = perThread.utf8Results[encoderUpto].result;
            while (prefix < lastTermBytesCount && prefix < termBytesCount)
            {
                if (lastTermBytes[prefix] != termBytes[prefix])
                {
                    break;
                }
                prefix++;
            }
        }
        encoderUpto = 1 - encoderUpto;
        lastTermBytesCount = termBytesCount;

        int suffix = termBytesCount - prefix;
        tvf.WriteVInt(prefix);
        tvf.WriteVInt(suffix);
        tvf.WriteBytes(utf8Result.result, prefix, suffix);
        tvf.WriteVInt(freq);

        if (doVectorPositions)
        {
            termsHashPerField.InitReader(reader, posting, 0);
            reader.WriteTo(tvf);
        }

        if (doVectorOffsets)
        {
            termsHashPerField.InitReader(reader, posting, 1);
            reader.WriteTo(tvf);
        }
    }

    termsHashPerField.Reset();
    perThread.termsHashPerThread.Reset(false);
}
/// <summary>Compresses the String value using the specified
/// compressionLevel (constants are defined in
/// java.util.zip.Deflater).
/// </summary>
public static byte[] CompressString(System.String value_Renamed, int compressionLevel)
{
    UnicodeUtil.UTF8Result result = new UnicodeUtil.UTF8Result();
    UnicodeUtil.UTF16toUTF8(value_Renamed, 0, value_Renamed.Length, result);
    return Compress(result.result, 0, result.length, compressionLevel);
}
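// A hedged usage sketch: round-tripping a string through CompressString above
// and DecompressString shown earlier. It assumes both methods live on a
// CompressionTools-style helper class (the class name is an assumption, not
// confirmed by this excerpt) and passes 9, the java.util.zip.Deflater
// BEST_COMPRESSION level referenced in the doc comment.
public static class CompressionRoundTripDemo
{
    public static void Main()
    {
        string original = "term vectors compress well when text is repetitive";
        byte[] packed = CompressionTools.CompressString(original, 9); // class name assumed
        string restored = CompressionTools.DecompressString(packed);
        System.Console.WriteLine(original == restored); // expected: True
    }
}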
/* Walk through all unique text tokens (Posting
 * instances) found in this field and serialize them
 * into a single RAM segment. */
void AppendPostings(DocumentsWriter.FlushState flushState,
                    FreqProxTermsWriterPerField[] fields,
                    TermInfosWriter termsOut,
                    IndexOutput freqOut,
                    IndexOutput proxOut,
                    DefaultSkipListWriter skipListWriter)
{
    int fieldNumber = fields[0].fieldInfo.number;
    int numFields = fields.Length;

    FreqProxFieldMergeState[] mergeStates = new FreqProxFieldMergeState[numFields];

    for (int i = 0; i < numFields; i++)
    {
        FreqProxFieldMergeState fms = mergeStates[i] = new FreqProxFieldMergeState(fields[i]);

        System.Diagnostics.Debug.Assert(fms.field.fieldInfo == fields[0].fieldInfo); // Should always be true

        bool result = fms.nextTerm();
        System.Diagnostics.Debug.Assert(result);
    }

    int skipInterval = termsOut.skipInterval;
    bool currentFieldOmitTf = fields[0].fieldInfo.omitTf;

    // If the current field omits tf then it cannot store
    // payloads. We silently drop the payloads in this case:
    bool currentFieldStorePayloads = currentFieldOmitTf ? false : fields[0].fieldInfo.storePayloads;

    FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields];

    while (numFields > 0)
    {
        // Get the next term to merge
        termStates[0] = mergeStates[0];
        int numToMerge = 1;

        for (int i = 1; i < numFields; i++)
        {
            char[] text = mergeStates[i].text;
            int textOffset = mergeStates[i].textOffset;
            int cmp = compareText(text, textOffset, termStates[0].text, termStates[0].textOffset);

            if (cmp < 0)
            {
                termStates[0] = mergeStates[i];
                numToMerge = 1;
            }
            else if (cmp == 0)
            {
                termStates[numToMerge++] = mergeStates[i];
            }
        }

        int df = 0;
        int lastPayloadLength = -1;
        int lastDoc = 0;

        char[] text_Renamed = termStates[0].text;
        int start = termStates[0].textOffset;

        long freqPointer = freqOut.GetFilePointer();
        long proxPointer;
        if (proxOut != null)
        {
            proxPointer = proxOut.GetFilePointer();
        }
        else
        {
            proxPointer = 0;
        }

        skipListWriter.ResetSkip();

        // Now termStates has numToMerge FieldMergeStates
        // which all share the same term. Now we must
        // interleave the docID streams.
        while (numToMerge > 0)
        {
            if ((++df % skipInterval) == 0)
            {
                skipListWriter.SetSkipData(lastDoc, currentFieldStorePayloads, lastPayloadLength);
                skipListWriter.BufferSkip(df);
            }

            FreqProxFieldMergeState minState = termStates[0];
            for (int i = 1; i < numToMerge; i++)
            {
                if (termStates[i].docID < minState.docID)
                {
                    minState = termStates[i];
                }
            }

            int doc = minState.docID;
            int termDocFreq = minState.termFreq;

            System.Diagnostics.Debug.Assert(doc < flushState.numDocsInRAM);
            System.Diagnostics.Debug.Assert(doc > lastDoc || df == 1);

            ByteSliceReader prox = minState.prox;

            // Carefully copy over the prox + payload info,
            // changing the format to match Lucene's segment
            // format.
            if (!currentFieldOmitTf)
            {
                // omitTf == false so we do write positions & payload
                System.Diagnostics.Debug.Assert(proxOut != null);
                for (int j = 0; j < termDocFreq; j++)
                {
                    int code = prox.ReadVInt();
                    if (currentFieldStorePayloads)
                    {
                        int payloadLength;
                        if ((code & 1) != 0)
                        {
                            // This position has a payload
                            payloadLength = prox.ReadVInt();
                        }
                        else
                        {
                            payloadLength = 0;
                        }
                        if (payloadLength != lastPayloadLength)
                        {
                            proxOut.WriteVInt(code | 1);
                            proxOut.WriteVInt(payloadLength);
                            lastPayloadLength = payloadLength;
                        }
                        else
                        {
                            proxOut.WriteVInt(code & (~1));
                        }
                        if (payloadLength > 0)
                        {
                            copyBytes(prox, proxOut, payloadLength);
                        }
                    }
                    else
                    {
                        System.Diagnostics.Debug.Assert(0 == (code & 1));
                        proxOut.WriteVInt(code >> 1);
                    }
                }

                int newDocCode = (doc - lastDoc) << 1;
                if (1 == termDocFreq)
                {
                    freqOut.WriteVInt(newDocCode | 1);
                }
                else
                {
                    freqOut.WriteVInt(newDocCode);
                    freqOut.WriteVInt(termDocFreq);
                }
            }
            else
            {
                // omitTf == true: we store only the docs, without
                // term freq, positions, payloads
                freqOut.WriteVInt(doc - lastDoc);
            }

            lastDoc = doc;

            if (!minState.nextDoc())
            {
                // Remove from termStates
                int upto = 0;
                for (int i = 0; i < numToMerge; i++)
                {
                    if (termStates[i] != minState)
                    {
                        termStates[upto++] = termStates[i];
                    }
                }
                numToMerge--;
                System.Diagnostics.Debug.Assert(upto == numToMerge);

                // Advance this state to the next term
                if (!minState.nextTerm())
                {
                    // OK, no more terms, so remove from mergeStates
                    // as well
                    upto = 0;
                    for (int i = 0; i < numFields; i++)
                    {
                        if (mergeStates[i] != minState)
                        {
                            mergeStates[upto++] = mergeStates[i];
                        }
                    }
                    numFields--;
                    System.Diagnostics.Debug.Assert(upto == numFields);
                }
            }
        }

        System.Diagnostics.Debug.Assert(df > 0);

        // Done merging this term

        long skipPointer = skipListWriter.WriteSkip(freqOut);

        // Write term
        termInfo.Set(df, freqPointer, proxPointer, (int)(skipPointer - freqPointer));

        // TODO: we could do this incrementally
        UnicodeUtil.UTF16toUTF8(text_Renamed, start, termsUTF8);

        // TODO: we could save an O(n) re-scan of the term by
        // computing the shared prefix with the last term
        // during the UTF8 encoding
        termsOut.Add(fieldNumber, termsUTF8.result, termsUTF8.length, termInfo);
    }
}
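// A minimal sketch (not Lucene.NET API) of the freq-stream encoding the
// method above writes: the docID delta is shifted left one bit and the low
// bit flags the common freq == 1 case, so single-occurrence terms cost no
// extra VInt. Real code writes through IndexOutput.WriteVInt; here in-memory
// collections stand in for the stream.
internal static class DocFreqCodingSketch
{
    public static void Write(System.Collections.Generic.List<int> stream, int docDelta, int freq)
    {
        int code = docDelta << 1;
        if (freq == 1)
        {
            stream.Add(code | 1); // low bit set: freq is implicitly 1
        }
        else
        {
            stream.Add(code);     // low bit clear: explicit freq follows
            stream.Add(freq);
        }
    }

    public static void Read(System.Collections.Generic.Queue<int> stream, out int docDelta, out int freq)
    {
        int code = stream.Dequeue();
        docDelta = code >> 1; // safe: code is non-negative
        freq = (code & 1) != 0 ? 1 : stream.Dequeue();
    }
}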
protected override AcceptStatus Accept(BytesRef term)
{
    UnicodeUtil.UTF8toUTF16(term.Bytes, term.Offset, term.Length, utf16);
    return runAutomaton.Run(utf16.Chars, 0, utf16.Length) ? AcceptStatus.YES : AcceptStatus.NO;
}