private TermInfo termInfo = new TermInfo(); // minimize consing

/// <summary>Merge one term found in one or more segments. The array <code>smis</code>
/// contains segments that are positioned at the same term. <code>n</code>
/// is the number of cells in the array actually occupied.
/// </summary>
/// <param name="smis">array of segments
/// </param>
/// <param name="n">number of cells in the array actually occupied
/// </param>
private void MergeTermInfo(SegmentMergeInfo[] smis, int n)
{
    long freqPointer = freqOutput.GetFilePointer();
    long proxPointer = proxOutput.GetFilePointer();

    int df = AppendPostings(smis, n); // append posting data

    long skipPointer = WriteSkip();

    if (df > 0)
    {
        // add an entry to the dictionary with pointers to prox and freq files
        termInfo.Set(df, freqPointer, proxPointer, (int)(skipPointer - freqPointer));
        termInfosWriter.Add(smis[0].term, termInfo);
    }
}
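// Illustrative sketch (not part of the original source): why the last
// argument to termInfo.Set is an int delta rather than an absolute pointer.
// Skip data is written into the same .frq stream shortly after a term's
// postings, so the distance from freqPointer stays small and fits an int
// (and encodes compactly as a VInt) even when the absolute file pointers
// are large longs. The values below are hypothetical.
private static void SkipOffsetExample()
{
    long freqPointer = 9000000000L; // hypothetical absolute .frq position
    long skipPointer = 9000000123L; // skip data begins here
    int skipOffset = (int)(skipPointer - freqPointer); // 123: small and VInt-friendly
    System.Console.Out.WriteLine(skipOffset);
}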
/// <summary>Adds a new <<fieldNumber, termText>, TermInfo> pair to the set.
/// Term must be lexicographically greater than all previous Terms added.
/// TermInfo pointers must be positive and greater than all previous.
/// </summary>
internal void Add(int fieldNumber, char[] termText, int termTextStart, int termTextLength, TermInfo ti)
{
    System.Diagnostics.Debug.Assert(
        CompareToLastTerm(fieldNumber, termText, termTextStart, termTextLength) < 0 ||
        (isIndex && termTextLength == 0 && lastTermTextLength == 0),
        "Terms are out of order: field=" + fieldInfos.FieldName(fieldNumber) + " (number " + fieldNumber + ")" +
        " lastField=" + fieldInfos.FieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" +
        " text=" + new String(termText, termTextStart, termTextLength) +
        " lastText=" + new String(lastTermText, 0, lastTermTextLength));

    System.Diagnostics.Debug.Assert(ti.freqPointer >= lastTi.freqPointer,
        "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")");
    System.Diagnostics.Debug.Assert(ti.proxPointer >= lastTi.proxPointer,
        "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")");

    if (!isIndex && size % indexInterval == 0)
    {
        other.Add(lastFieldNumber, lastTermText, 0, lastTermTextLength, lastTi); // add an index term
    }

    WriteTerm(fieldNumber, termText, termTextStart, termTextLength); // write term
    output.WriteVInt(ti.docFreq); // write doc freq
    output.WriteVLong(ti.freqPointer - lastTi.freqPointer); // write pointers
    output.WriteVLong(ti.proxPointer - lastTi.proxPointer);

    if (ti.docFreq >= skipInterval)
    {
        output.WriteVInt(ti.skipOffset);
    }

    if (isIndex)
    {
        output.WriteVLong(other.output.GetFilePointer() - lastIndexPointer);
        lastIndexPointer = other.output.GetFilePointer(); // write pointer
    }

    if (lastTermText.Length < termTextLength)
    {
        lastTermText = new char[(int)(termTextLength * 1.25)];
    }
    Array.Copy(termText, termTextStart, lastTermText, 0, termTextLength);
    lastTermTextLength = termTextLength;
    lastFieldNumber = fieldNumber;

    lastTi.Set(ti);
    size++;
}
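// Illustrative sketch (hypothetical helper, not Lucene.NET code) of the
// sampling that "size % indexInterval == 0" implements above: every
// indexInterval-th term is mirrored into the companion index writer
// ("other", which backs the small .tii file), so a reader can binary-search
// the in-memory index and then scan at most indexInterval - 1 entries of
// the full .tis dictionary.
internal static int CountIndexTerms(int numTerms, int indexInterval)
{
    int count = 0;
    for (int size = 0; size < numTerms; size++)
    {
        if (size % indexInterval == 0)
            count++; // this term would also be passed to other.Add(...)
    }
    return count; // equals (numTerms + indexInterval - 1) / indexInterval
}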
/// <summary>Adds a new <Term, TermInfo> pair to the set.
/// Term must be lexicographically greater than all previous Terms added.
/// TermInfo pointers must be positive and greater than all previous.
/// </summary>
public /*internal*/ void Add(Term term, TermInfo ti)
{
    if (!isIndex && term.CompareTo(lastTerm) <= 0)
    {
        throw new System.IO.IOException("term out of order");
    }
    if (ti.freqPointer < lastTi.freqPointer)
    {
        throw new System.IO.IOException("freqPointer out of order");
    }
    if (ti.proxPointer < lastTi.proxPointer)
    {
        throw new System.IO.IOException("proxPointer out of order");
    }

    if (!isIndex && size % indexInterval == 0)
    {
        other.Add(lastTerm, lastTi); // add an index term
    }

    WriteTerm(term); // write term
    output.WriteVInt(ti.docFreq); // write doc freq
    output.WriteVLong(ti.freqPointer - lastTi.freqPointer); // write pointers
    output.WriteVLong(ti.proxPointer - lastTi.proxPointer);

    if (ti.docFreq >= skipInterval)
    {
        output.WriteVInt(ti.skipOffset);
    }

    if (isIndex)
    {
        output.WriteVLong(other.output.GetFilePointer() - lastIndexPointer);
        lastIndexPointer = other.output.GetFilePointer(); // write pointer
    }

    lastTi.Set(ti);
    size++;
}
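// Illustrative sketch (hypothetical helper, not the IndexOutput API) of the
// variable-byte encoding behind WriteVLong: seven payload bits per byte,
// with the high bit set on every byte except the last. Because Add() writes
// pointer *differences* and pointers grow monotonically, the values are
// usually small and encode to a single byte instead of up to nine.
internal static int VLongByteCount(long value)
{
    int count = 1;
    while ((value & ~0x7FL) != 0) // more than 7 bits remain
    {
        value = (long)((ulong)value >> 7); // unsigned shift, matching the VLong format
        count++;
    }
    return count; // e.g. 5 -> 1 byte, 1000000 -> 3 bytes
}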
/// <summary>Adds a new <<fieldNumber, termBytes>, TermInfo> pair to the set.
/// Term must be lexicographically greater than all previous Terms added.
/// TermInfo pointers must be positive and greater than all previous.
/// </summary>
internal void Add(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti)
{
    System.Diagnostics.Debug.Assert(
        CompareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 ||
        (isIndex && termBytesLength == 0 && lastTermBytesLength == 0),
        "Terms are out of order: field=" + fieldInfos.FieldName(fieldNumber) + " (number " + fieldNumber + ")" +
        " lastField=" + fieldInfos.FieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" +
        " text=" + System.Text.Encoding.UTF8.GetString(termBytes, 0, termBytesLength) +
        " lastText=" + System.Text.Encoding.UTF8.GetString(lastTermBytes, 0, lastTermBytesLength));

    System.Diagnostics.Debug.Assert(ti.freqPointer >= lastTi.freqPointer,
        "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")");
    System.Diagnostics.Debug.Assert(ti.proxPointer >= lastTi.proxPointer,
        "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")");

    if (!isIndex && size % indexInterval == 0)
    {
        other.Add(lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi); // add an index term
    }

    WriteTerm(fieldNumber, termBytes, termBytesLength); // write term
    output.WriteVInt(ti.docFreq); // write doc freq
    output.WriteVLong(ti.freqPointer - lastTi.freqPointer); // write pointers
    output.WriteVLong(ti.proxPointer - lastTi.proxPointer);

    if (ti.docFreq >= skipInterval)
    {
        output.WriteVInt(ti.skipOffset);
    }

    if (isIndex)
    {
        output.WriteVLong(other.output.FilePointer - lastIndexPointer);
        lastIndexPointer = other.output.FilePointer; // write pointer
    }

    lastFieldNumber = fieldNumber;
    lastTi.Set(ti);
    size++;
}
private void WritePostings(Posting[] postings, System.String segment)
{
    IndexOutput freq = null, prox = null;
    TermInfosWriter tis = null;
    TermVectorsWriter termVectorWriter = null;
    try
    {
        // open files for inverse index storage
        freq = directory.CreateOutput(segment + ".frq");
        prox = directory.CreateOutput(segment + ".prx");
        tis = new TermInfosWriter(directory, segment, fieldInfos, termIndexInterval);
        TermInfo ti = new TermInfo();
        System.String currentField = null;

        for (int i = 0; i < postings.Length; i++)
        {
            Posting posting = postings[i];

            // add an entry to the dictionary with pointers to prox and freq files
            ti.Set(1, freq.GetFilePointer(), prox.GetFilePointer(), -1);
            tis.Add(posting.term, ti);

            // add an entry to the freq file
            int postingFreq = posting.freq;
            if (postingFreq == 1)
            {
                freq.WriteVInt(1); // optimize freq=1: set low bit of doc num
            }
            else
            {
                freq.WriteVInt(0); // the document number
                freq.WriteVInt(postingFreq); // frequency in doc
            }

            int lastPosition = 0; // write positions
            int[] positions = posting.positions;
            for (int j = 0; j < postingFreq; j++)
            {
                // use delta-encoding
                int position = positions[j];
                prox.WriteVInt(position - lastPosition);
                lastPosition = position;
            }

            // check to see if we switched to a new field
            System.String termField = posting.term.Field();
            if (currentField != termField)
            {
                // changing field - see if there is something to save
                currentField = termField;
                FieldInfo fi = fieldInfos.FieldInfo(currentField);
                if (fi.storeTermVector)
                {
                    if (termVectorWriter == null)
                    {
                        termVectorWriter = new TermVectorsWriter(directory, segment, fieldInfos);
                        termVectorWriter.OpenDocument();
                    }
                    termVectorWriter.OpenField(currentField);
                }
                else if (termVectorWriter != null)
                {
                    termVectorWriter.CloseField();
                }
            }
            if (termVectorWriter != null && termVectorWriter.IsFieldOpen())
            {
                termVectorWriter.AddTerm(posting.term.Text(), postingFreq, posting.positions, posting.offsets);
            }
        }
        if (termVectorWriter != null)
        {
            termVectorWriter.CloseDocument();
        }
    }
    finally
    {
        // make an effort to close all streams we can but remember and re-throw
        // the first exception encountered in this process
        System.IO.IOException keep = null;
        if (freq != null)
        {
            try { freq.Close(); }
            catch (System.IO.IOException e) { if (keep == null) keep = e; }
        }
        if (prox != null)
        {
            try { prox.Close(); }
            catch (System.IO.IOException e) { if (keep == null) keep = e; }
        }
        if (tis != null)
        {
            try { tis.Close(); }
            catch (System.IO.IOException e) { if (keep == null) keep = e; }
        }
        if (termVectorWriter != null)
        {
            try { termVectorWriter.Close(); }
            catch (System.IO.IOException e) { if (keep == null) keep = e; }
        }
        if (keep != null)
        {
            throw keep; // re-throw the first close failure, keeping its message and type
        }
    }
}
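// Illustrative sketch (hypothetical helper, not from the original source) of
// the close-all pattern used in the finally block above: try to close every
// resource, remember only the first failure, and rethrow it once all close
// attempts have been made, so one failing Close() cannot leak the rest.
internal static void CloseAll(params System.IDisposable[] resources)
{
    System.Exception first = null;
    foreach (System.IDisposable r in resources)
    {
        if (r == null)
            continue;
        try
        {
            r.Dispose();
        }
        catch (System.Exception e)
        {
            if (first == null)
                first = e; // keep only the first exception
        }
    }
    if (first != null)
        throw first; // note: a plain rethrow resets the stack trace origin
}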
/* Walk through all unique text tokens (Posting
 * instances) found in this field and serialize them
 * into a single RAM segment. */
void AppendPostings(DocumentsWriter.FlushState flushState,
                    FreqProxTermsWriterPerField[] fields,
                    TermInfosWriter termsOut,
                    IndexOutput freqOut,
                    IndexOutput proxOut,
                    DefaultSkipListWriter skipListWriter)
{
    int fieldNumber = fields[0].fieldInfo.number;
    int numFields = fields.Length;

    FreqProxFieldMergeState[] mergeStates = new FreqProxFieldMergeState[numFields];

    for (int i = 0; i < numFields; i++)
    {
        FreqProxFieldMergeState fms = mergeStates[i] = new FreqProxFieldMergeState(fields[i]);

        System.Diagnostics.Debug.Assert(fms.field.fieldInfo == fields[0].fieldInfo); // Should always be true

        bool result = fms.nextTerm();
        System.Diagnostics.Debug.Assert(result);
    }

    int skipInterval = termsOut.skipInterval;
    bool currentFieldOmitTf = fields[0].fieldInfo.omitTf;

    // If current field omits tf then it cannot store
    // payloads. We silently drop the payloads in this case:
    bool currentFieldStorePayloads = currentFieldOmitTf ? false : fields[0].fieldInfo.storePayloads;

    FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields];

    while (numFields > 0)
    {
        // Get the next term to merge
        termStates[0] = mergeStates[0];
        int numToMerge = 1;

        for (int i = 1; i < numFields; i++)
        {
            char[] text = mergeStates[i].text;
            int textOffset = mergeStates[i].textOffset;
            int cmp = compareText(text, textOffset, termStates[0].text, termStates[0].textOffset);

            if (cmp < 0)
            {
                termStates[0] = mergeStates[i];
                numToMerge = 1;
            }
            else if (cmp == 0)
                termStates[numToMerge++] = mergeStates[i];
        }

        int df = 0;
        int lastPayloadLength = -1;

        int lastDoc = 0;

        char[] text_Renamed = termStates[0].text;
        int start = termStates[0].textOffset;

        long freqPointer = freqOut.GetFilePointer();
        long proxPointer;
        if (proxOut != null)
            proxPointer = proxOut.GetFilePointer();
        else
            proxPointer = 0;

        skipListWriter.ResetSkip();

        // Now termStates has numToMerge FieldMergeStates
        // which all share the same term. Now we must
        // interleave the docID streams.
        while (numToMerge > 0)
        {
            if ((++df % skipInterval) == 0)
            {
                skipListWriter.SetSkipData(lastDoc, currentFieldStorePayloads, lastPayloadLength);
                skipListWriter.BufferSkip(df);
            }

            FreqProxFieldMergeState minState = termStates[0];
            for (int i = 1; i < numToMerge; i++)
                if (termStates[i].docID < minState.docID)
                    minState = termStates[i];

            int doc = minState.docID;
            int termDocFreq = minState.termFreq;

            System.Diagnostics.Debug.Assert(doc < flushState.numDocsInRAM);
            System.Diagnostics.Debug.Assert(doc > lastDoc || df == 1);

            ByteSliceReader prox = minState.prox;

            // Carefully copy over the prox + payload info,
            // changing the format to match Lucene's segment
            // format.
            if (!currentFieldOmitTf)
            {
                // omitTf == false so we do write positions & payload
                System.Diagnostics.Debug.Assert(proxOut != null);
                for (int j = 0; j < termDocFreq; j++)
                {
                    int code = prox.ReadVInt();
                    if (currentFieldStorePayloads)
                    {
                        int payloadLength;
                        if ((code & 1) != 0)
                        {
                            // This position has a payload
                            payloadLength = prox.ReadVInt();
                        }
                        else
                            payloadLength = 0;
                        if (payloadLength != lastPayloadLength)
                        {
                            proxOut.WriteVInt(code | 1);
                            proxOut.WriteVInt(payloadLength);
                            lastPayloadLength = payloadLength;
                        }
                        else
                            proxOut.WriteVInt(code & (~1));
                        if (payloadLength > 0)
                            copyBytes(prox, proxOut, payloadLength);
                    }
                    else
                    {
                        System.Diagnostics.Debug.Assert(0 == (code & 1));
                        proxOut.WriteVInt(code >> 1);
                    }
                } // End for

                int newDocCode = (doc - lastDoc) << 1;

                if (1 == termDocFreq)
                {
                    freqOut.WriteVInt(newDocCode | 1);
                }
                else
                {
                    freqOut.WriteVInt(newDocCode);
                    freqOut.WriteVInt(termDocFreq);
                }
            }
            else
            {
                // omitTf == true: we store only the docs, without
                // term freq, positions, payloads
                freqOut.WriteVInt(doc - lastDoc);
            }

            lastDoc = doc;

            if (!minState.nextDoc())
            {
                // Remove from termStates
                int upto = 0;
                for (int i = 0; i < numToMerge; i++)
                    if (termStates[i] != minState)
                        termStates[upto++] = termStates[i];
                numToMerge--;
                System.Diagnostics.Debug.Assert(upto == numToMerge);

                // Advance this state to the next term
                if (!minState.nextTerm())
                {
                    // OK, no more terms, so remove from mergeStates as well
                    upto = 0;
                    for (int i = 0; i < numFields; i++)
                        if (mergeStates[i] != minState)
                            mergeStates[upto++] = mergeStates[i];
                    numFields--;
                    System.Diagnostics.Debug.Assert(upto == numFields);
                }
            }
        }

        System.Diagnostics.Debug.Assert(df > 0);

        // Done merging this term

        long skipPointer = skipListWriter.WriteSkip(freqOut);

        // Write term
        termInfo.Set(df, freqPointer, proxPointer, (int)(skipPointer - freqPointer));

        // TODO: we could do this incrementally
        UnicodeUtil.UTF16toUTF8(text_Renamed, start, termsUTF8);

        // TODO: we could save O(n) re-scan of the term by
        // computing the shared prefix with the last term
        // while doing the UTF8 encoding
        termsOut.Add(fieldNumber, termsUTF8.result, termsUTF8.length, termInfo);
    }
}
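// Illustrative sketch (hypothetical helper, not Lucene.NET code) of the
// .frq encoding written above: the doc delta is shifted left one bit and
// the low bit flags the common termDocFreq == 1 case, so the second VInt
// holding the frequency can be omitted entirely for such docs.
internal static void WriteDocAndFreq(System.Collections.Generic.List<int> freqVInts, int doc, int lastDoc, int termDocFreq)
{
    int newDocCode = (doc - lastDoc) << 1; // delta-encoded doc number
    if (termDocFreq == 1)
    {
        freqVInts.Add(newDocCode | 1); // low bit set: freq is implicitly 1
    }
    else
    {
        freqVInts.Add(newDocCode);  // low bit clear...
        freqVInts.Add(termDocFreq); // ...so an explicit freq VInt follows
    }
}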
// FIXME: OG: remove hard-coded file names
public static void Test()
{
    System.IO.FileInfo file = new System.IO.FileInfo("words.txt");
    System.Console.Out.WriteLine(" reading word file containing " + file.Length + " bytes");

    System.DateTime start = System.DateTime.Now;

    System.Collections.ArrayList keys = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
    System.IO.FileStream ws = new System.IO.FileStream(file.FullName, System.IO.FileMode.Open, System.IO.FileAccess.Read);
    System.IO.StreamReader wr = new System.IO.StreamReader(ws, System.Text.Encoding.Default);

    for (System.String key = wr.ReadLine(); key != null; key = wr.ReadLine())
    {
        keys.Add(new Term("word", key));
    }
    wr.Close();

    System.DateTime end = System.DateTime.Now;
    System.Console.Out.Write((end - start).TotalMilliseconds);
    System.Console.Out.WriteLine(" milliseconds to read " + keys.Count + " words");

    start = System.DateTime.Now;

    System.Random gen = new System.Random(1251971);
    long fp = (gen.Next() & 0xF) + 1;
    long pp = (gen.Next() & 0xF) + 1;
    int[] docFreqs = new int[keys.Count];
    long[] freqPointers = new long[keys.Count];
    long[] proxPointers = new long[keys.Count];
    for (int i = 0; i < keys.Count; i++)
    {
        docFreqs[i] = (gen.Next() & 0xF) + 1;
        freqPointers[i] = fp;
        proxPointers[i] = pp;
        fp += (gen.Next() & 0xF) + 1;
        pp += (gen.Next() & 0xF) + 1;
    }

    end = System.DateTime.Now;
    System.Console.Out.Write((end - start).TotalMilliseconds);
    System.Console.Out.WriteLine(" milliseconds to generate values");

    start = System.DateTime.Now;

    Directory store = FSDirectory.GetDirectory("test.store", true);
    FieldInfos fis = new FieldInfos();

    TermInfosWriter writer = new TermInfosWriter(store, "words", fis);
    fis.Add("word", false);

    for (int i = 0; i < keys.Count; i++)
    {
        writer.Add((Term)keys[i], new TermInfo(docFreqs[i], freqPointers[i], proxPointers[i]));
    }
    writer.Close();

    end = System.DateTime.Now;
    System.Console.Out.Write((end - start).TotalMilliseconds);
    System.Console.Out.WriteLine(" milliseconds to write table");
    System.Console.Out.WriteLine(" table occupies " + store.FileLength("words.tis") + " bytes");

    start = System.DateTime.Now;

    TermInfosReader reader = new TermInfosReader(store, "words", fis);

    end = System.DateTime.Now;
    System.Console.Out.Write((end - start).TotalMilliseconds);
    System.Console.Out.WriteLine(" milliseconds to open table");

    start = System.DateTime.Now;

    SegmentTermEnum enumerator = reader.Terms();
    for (int i = 0; i < keys.Count; i++)
    {
        enumerator.Next();
        Term key = (Term)keys[i];
        if (!key.Equals(enumerator.Term()))
        {
            throw new System.Exception("wrong term: " + enumerator.Term() + ", expected: " + key + " at " + i);
        }
        TermInfo ti = enumerator.TermInfo();
        if (ti.docFreq != docFreqs[i])
        {
            throw new System.Exception("wrong value: " + System.Convert.ToString(ti.docFreq, 16) + ", expected: " + System.Convert.ToString(docFreqs[i], 16) + " at " + i);
        }
        if (ti.freqPointer != freqPointers[i])
        {
            throw new System.Exception("wrong value: " + System.Convert.ToString(ti.freqPointer, 16) + ", expected: " + System.Convert.ToString(freqPointers[i], 16) + " at " + i);
        }
        if (ti.proxPointer != proxPointers[i])
        {
            throw new System.Exception("wrong value: " + System.Convert.ToString(ti.proxPointer, 16) + ", expected: " + System.Convert.ToString(proxPointers[i], 16) + " at " + i);
        }
    }

    end = System.DateTime.Now;
    System.Console.Out.Write((end - start).TotalMilliseconds);
    System.Console.Out.WriteLine(" milliseconds to iterate over " + keys.Count + " words");

    start = System.DateTime.Now;

    for (int i = 0; i < keys.Count; i++)
    {
        Term key = (Term)keys[i];
        TermInfo ti = reader.Get(key);
        if (ti.docFreq != docFreqs[i])
        {
            throw new System.Exception("wrong value: " + System.Convert.ToString(ti.docFreq, 16) + ", expected: " + System.Convert.ToString(docFreqs[i], 16) + " at " + i);
        }
        if (ti.freqPointer != freqPointers[i])
        {
            throw new System.Exception("wrong value: " + System.Convert.ToString(ti.freqPointer, 16) + ", expected: " + System.Convert.ToString(freqPointers[i], 16) + " at " + i);
        }
        if (ti.proxPointer != proxPointers[i])
        {
            throw new System.Exception("wrong value: " + System.Convert.ToString(ti.proxPointer, 16) + ", expected: " + System.Convert.ToString(proxPointers[i], 16) + " at " + i);
        }
    }

    end = System.DateTime.Now;
    System.Console.Out.Write((end - start).TotalMilliseconds / keys.Count);
    System.Console.Out.WriteLine(" average milliseconds per lookup");

    TermEnum e = reader.Terms(new Term("word", "azz"));
    System.Console.Out.WriteLine("Word after azz is " + e.Term().text);

    reader.Close();
    store.Close();
}
/* Walk through all unique text tokens (Posting
 * instances) found in this field and serialize them
 * into a single RAM segment. */
internal void AppendPostings(ThreadState.FieldData[] fields,
                             TermInfosWriter termsOut,
                             IndexOutput freqOut,
                             IndexOutput proxOut)
{
    int fieldNumber = fields[0].fieldInfo.number;
    int numFields = fields.Length;

    FieldMergeState[] mergeStates = new FieldMergeState[numFields];

    for (int i = 0; i < numFields; i++)
    {
        FieldMergeState fms = mergeStates[i] = new FieldMergeState();
        fms.field = fields[i];
        fms.postings = fms.field.SortPostings();

        System.Diagnostics.Debug.Assert(fms.field.fieldInfo == fields[0].fieldInfo); // Should always be true

        bool result = fms.NextTerm();
        System.Diagnostics.Debug.Assert(result);
    }

    int skipInterval = termsOut.skipInterval;
    currentFieldStorePayloads = fields[0].fieldInfo.storePayloads;

    FieldMergeState[] termStates = new FieldMergeState[numFields];

    while (numFields > 0)
    {
        // Get the next term to merge
        termStates[0] = mergeStates[0];
        int numToMerge = 1;

        for (int i = 1; i < numFields; i++)
        {
            char[] text = mergeStates[i].text;
            int textOffset = mergeStates[i].textOffset;
            int cmp = CompareText(text, textOffset, termStates[0].text, termStates[0].textOffset);

            if (cmp < 0)
            {
                termStates[0] = mergeStates[i];
                numToMerge = 1;
            }
            else if (cmp == 0)
                termStates[numToMerge++] = mergeStates[i];
        }

        int df = 0;
        int lastPayloadLength = -1;

        int lastDoc = 0;

        char[] text2 = termStates[0].text;
        int start = termStates[0].textOffset;
        int pos = start;
        while (text2[pos] != 0xffff) // in-RAM terms are 0xffff-terminated
            pos++;

        long freqPointer = freqOut.GetFilePointer();
        long proxPointer = proxOut.GetFilePointer();

        skipListWriter.ResetSkip();

        // Now termStates has numToMerge FieldMergeStates
        // which all share the same term. Now we must
        // interleave the docID streams.
        while (numToMerge > 0)
        {
            if ((++df % skipInterval) == 0)
            {
                skipListWriter.SetSkipData(lastDoc, currentFieldStorePayloads, lastPayloadLength);
                skipListWriter.BufferSkip(df);
            }

            FieldMergeState minState = termStates[0];
            for (int i = 1; i < numToMerge; i++)
                if (termStates[i].docID < minState.docID)
                    minState = termStates[i];

            int doc = minState.docID;
            int termDocFreq = minState.termFreq;

            System.Diagnostics.Debug.Assert(doc < numDocsInRAM);
            System.Diagnostics.Debug.Assert(doc > lastDoc || df == 1);

            int newDocCode = (doc - lastDoc) << 1;
            lastDoc = doc;

            ByteSliceReader prox = minState.prox;

            // Carefully copy over the prox + payload info,
            // changing the format to match Lucene's segment
            // format.
            for (int j = 0; j < termDocFreq; j++)
            {
                int code = prox.ReadVInt();
                if (currentFieldStorePayloads)
                {
                    int payloadLength;
                    if ((code & 1) != 0)
                    {
                        // This position has a payload
                        payloadLength = prox.ReadVInt();
                    }
                    else
                        payloadLength = 0;
                    if (payloadLength != lastPayloadLength)
                    {
                        proxOut.WriteVInt(code | 1);
                        proxOut.WriteVInt(payloadLength);
                        lastPayloadLength = payloadLength;
                    }
                    else
                        proxOut.WriteVInt(code & (~1));
                    if (payloadLength > 0)
                        CopyBytes(prox, proxOut, payloadLength);
                }
                else
                {
                    System.Diagnostics.Debug.Assert(0 == (code & 1));
                    proxOut.WriteVInt(code >> 1);
                }
            }

            if (1 == termDocFreq)
            {
                freqOut.WriteVInt(newDocCode | 1);
            }
            else
            {
                freqOut.WriteVInt(newDocCode);
                freqOut.WriteVInt(termDocFreq);
            }

            if (!minState.NextDoc())
            {
                // Remove from termStates
                int upto = 0;
                for (int i = 0; i < numToMerge; i++)
                    if (termStates[i] != minState)
                        termStates[upto++] = termStates[i];
                numToMerge--;
                System.Diagnostics.Debug.Assert(upto == numToMerge);

                // Advance this state to the next term
                if (!minState.NextTerm())
                {
                    // OK, no more terms, so remove from mergeStates as well
                    upto = 0;
                    for (int i = 0; i < numFields; i++)
                        if (mergeStates[i] != minState)
                            mergeStates[upto++] = mergeStates[i];
                    numFields--;
                    System.Diagnostics.Debug.Assert(upto == numFields);
                }
            }
        }

        System.Diagnostics.Debug.Assert(df > 0);

        // Done merging this term

        long skipPointer = skipListWriter.WriteSkip(freqOut);

        // Write term
        termInfo.Set(df, freqPointer, proxPointer, (int)(skipPointer - freqPointer));
        termsOut.Add(fieldNumber, text2, start, pos - start, termInfo);
    }
}
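// Illustrative sketch (hypothetical helper, not Lucene.NET code) of the
// prox rewrite in the payload branch above: "code" carries the position
// delta in its high bits and uses the low bit to signal that a payload
// length VInt follows; the length is only re-written when it differs from
// the previous one, since runs of equal-length payloads are the common case.
internal static void RewriteProxCode(System.Collections.Generic.List<int> proxVInts, int code, int payloadLength, ref int lastPayloadLength)
{
    if (payloadLength != lastPayloadLength)
    {
        proxVInts.Add(code | 1); // low bit set: a payload length follows
        proxVInts.Add(payloadLength);
        lastPayloadLength = payloadLength;
    }
    else
    {
        proxVInts.Add(code & ~1); // low bit clear: reuse the previous length
    }
}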