private Posting[] SortPostingTable()
{
    // copy postingTable into an array
    Posting[] array = new Posting[postingTable.Count];
    System.Collections.IEnumerator postings = postingTable.Values.GetEnumerator();
    for (int i = 0; postings.MoveNext(); i++)
    {
        array[i] = (Posting) postings.Current;
    }

    // sort the array
    QuickSort(array, 0, array.Length - 1);

    return array;
}
private Term termBuffer = new Term("", ""); // avoid consing

private void AddPosition(System.String field, System.String text, int position, TermVectorOffsetInfo offset)
{
    termBuffer.Set(field, text);
    Posting ti = (Posting) postingTable[termBuffer];
    if (ti != null)
    {
        // word seen before
        int freq = ti.freq;
        if (ti.positions.Length == freq)
        {
            // positions array is full; double its size
            int[] newPositions = new int[freq * 2];
            Array.Copy(ti.positions, 0, newPositions, 0, freq);
            ti.positions = newPositions;
        }
        ti.positions[freq] = position; // add new position

        if (offset != null)
        {
            if (ti.offsets.Length == freq)
            {
                // offsets array is full; double its size
                TermVectorOffsetInfo[] newOffsets = new TermVectorOffsetInfo[freq * 2];
                Array.Copy(ti.offsets, 0, newOffsets, 0, freq);
                ti.offsets = newOffsets;
            }
            ti.offsets[freq] = offset;
        }
        ti.freq = freq + 1; // update frequency
    }
    else
    {
        // word not seen before
        Term term = new Term(field, text, false);
        postingTable[term] = new Posting(term, position, offset);
    }
}
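// Note on the growth strategy in AddPosition above: the positions (and
// offsets) arrays are doubled each time freq reaches their capacity, so a
// term occurring n times in a document triggers only O(log n) array copies
// and appending each occurrence stays amortized O(1).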
internal void RecyclePostings(Posting[] postings, int numPostings)
{
    lock (this)
    {
        // Move all Postings from this ThreadState back to our free list.
        // We pre-allocated this array while we were creating Postings to
        // make sure it's large enough.
        System.Diagnostics.Debug.Assert(postingsFreeCount + numPostings <= postingsFreeList.Length);
        Array.Copy(postings, 0, postingsFreeList, postingsFreeCount, numPostings);
        postingsFreeCount += numPostings;
    }
}
/* Allocate more Postings from shared pool */
internal void GetPostings(Posting[] postings)
{
    lock (this)
    {
        numBytesUsed += postings.Length * POSTING_NUM_BYTE;

        // Take as many recycled Postings from the free list as we can
        int numToCopy;
        if (postingsFreeCount < postings.Length)
            numToCopy = postingsFreeCount;
        else
            numToCopy = postings.Length;
        int start = postingsFreeCount - numToCopy;
        Array.Copy(postingsFreeList, start, postings, 0, numToCopy);
        postingsFreeCount -= numToCopy;

        // Directly allocate the remainder if any
        if (numToCopy < postings.Length)
        {
            int extra = postings.Length - numToCopy;
            int newPostingsAllocCount = postingsAllocCount + extra;
            if (newPostingsAllocCount > postingsFreeList.Length)
            {
                postingsFreeList = new Posting[(int) (1.25 * newPostingsAllocCount)];
            }

            BalanceRAM();
            for (int i = numToCopy; i < postings.Length; i++)
            {
                postings[i] = new Posting();
                numBytesAlloc += POSTING_NUM_BYTE;
                postingsAllocCount++;
            }
        }
    }
}
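// Worked example for GetPostings above: if postings.Length == 10 but only 4
// recycled Postings are free, the 4 are copied out of the tail of
// postingsFreeList and the remaining 6 are allocated with new Posting();
// postingsFreeList itself is regrown to 1.25x the new total allocation
// count so that RecyclePostings can later take everything back.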
internal bool NextTerm()
{
    postingUpto++;
    if (postingUpto == field.numPostings)
        return false;

    p = postings[postingUpto];
    docID = 0;

    // Locate this term's text in the shared char block pool
    text = field.threadState.charPool.buffers[p.textStart >> Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SHIFT];
    textOffset = p.textStart & Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_MASK;

    if (p.freqUpto > p.freqStart)
        freq.Init(field.threadState.postingsPool, p.freqStart, p.freqUpto);
    else
        freq.bufferOffset = freq.upto = freq.endIndex = 0;

    prox.Init(field.threadState.postingsPool, p.proxStart, p.proxUpto);

    // Should always be true
    bool result = NextDoc();
    System.Diagnostics.Debug.Assert(result);

    return true;
}
/// <summary>Creates a segment from all Postings in the Postings
/// hashes across all ThreadStates &amp; FieldDatas.
/// </summary>
private System.Collections.IList WriteSegment()
{
    System.Diagnostics.Debug.Assert(AllThreadsIdle());
    System.Diagnostics.Debug.Assert(nextDocID == numDocsInRAM);

    System.String segmentName = segment;

    TermInfosWriter termsOut = new TermInfosWriter(directory, segmentName, fieldInfos, writer.GetTermIndexInterval());

    IndexOutput freqOut = directory.CreateOutput(segmentName + ".frq");
    IndexOutput proxOut = directory.CreateOutput(segmentName + ".prx");

    // Gather all FieldData's that have postings, across all ThreadStates
    System.Collections.ArrayList allFields = new System.Collections.ArrayList();
    System.Diagnostics.Debug.Assert(AllThreadsIdle());
    for (int i = 0; i < threadStates.Length; i++)
    {
        ThreadState state = threadStates[i];
        state.TrimFields();
        int numFields = state.numAllFieldData;
        for (int j = 0; j < numFields; j++)
        {
            ThreadState.FieldData fp = state.allFieldDataArray[j];
            if (fp.numPostings > 0)
                allFields.Add(fp);
        }
    }

    // Sort by field name
    allFields.Sort();
    int numAllFields = allFields.Count;

    skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval, termsOut.maxSkipLevels, numDocsInRAM, freqOut, proxOut);

    // Process the fields in groups that share the same field name
    int start = 0;
    while (start < numAllFields)
    {
        System.String fieldName = ((ThreadState.FieldData) allFields[start]).fieldInfo.name;

        int end = start + 1;
        while (end < numAllFields && ((ThreadState.FieldData) allFields[end]).fieldInfo.name.Equals(fieldName))
            end++;

        ThreadState.FieldData[] fields = new ThreadState.FieldData[end - start];
        for (int i = start; i < end; i++)
            fields[i - start] = (ThreadState.FieldData) allFields[i];

        // If this field has postings then add them to the segment
        AppendPostings(fields, termsOut, freqOut, proxOut);

        for (int i = 0; i < fields.Length; i++)
            fields[i].ResetPostingArrays();

        start = end;
    }

    freqOut.Close();
    proxOut.Close();
    termsOut.Close();

    // Record all files we have flushed
    System.Collections.IList flushedFiles = new System.Collections.ArrayList();
    flushedFiles.Add(SegmentFileName(IndexFileNames.FIELD_INFOS_EXTENSION));
    flushedFiles.Add(SegmentFileName(IndexFileNames.FREQ_EXTENSION));
    flushedFiles.Add(SegmentFileName(IndexFileNames.PROX_EXTENSION));
    flushedFiles.Add(SegmentFileName(IndexFileNames.TERMS_EXTENSION));
    flushedFiles.Add(SegmentFileName(IndexFileNames.TERMS_INDEX_EXTENSION));

    if (hasNorms)
    {
        WriteNorms(segmentName, numDocsInRAM);
        flushedFiles.Add(SegmentFileName(IndexFileNames.NORMS_EXTENSION));
    }

    if (infoStream != null)
    {
        long newSegmentSize = SegmentSize(segmentName);
        System.String message = String.Format(nf, "  oldRAMSize={0:d} newFlushedSize={1:d} docs/MB={2:f} new/old={3:%}",
            new Object[] { numBytesUsed, newSegmentSize, (numDocsInRAM / (newSegmentSize / 1024.0 / 1024.0)), (newSegmentSize / numBytesUsed) });
        infoStream.WriteLine(message);
    }

    ResetPostingsData();

    nextDocID = 0;
    nextWriteDocID = 0;
    numDocsInRAM = 0;
    files = null;

    // Maybe downsize postingsFreeList array
    if (postingsFreeList.Length > 1.5 * postingsFreeCount)
    {
        int newSize = postingsFreeList.Length;
        while (newSize > 1.25 * postingsFreeCount)
        {
            newSize = (int) (newSize * 0.8);
        }
        Posting[] newArray = new Posting[newSize];
        Array.Copy(postingsFreeList, 0, newArray, 0, postingsFreeCount);
        postingsFreeList = newArray;
    }

    return flushedFiles;
}
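// Worked example for the downsizing step above: with a 1000-entry
// postingsFreeList and postingsFreeCount == 400, 1000 > 1.5 * 400 triggers
// the shrink; newSize steps 1000 -> 800 -> 640 -> 512 -> 409 (multiplying
// by 0.8 until no longer > 1.25 * 400 == 500), and the 400 free Postings
// are copied into the smaller array.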
private BufferedNorms[] norms; // Holds norms until we flush

internal DocumentsWriter(Directory directory, IndexWriter writer)
{
    InitBlock();
    this.directory = directory;
    this.writer = writer;

    postingsFreeList = new Posting[0];
}
/// <summary>Compares term text for two Posting instances and
/// returns -1 if p1 &lt; p2; 1 if p1 &gt; p2; else 0.
/// </summary>
internal int ComparePostings(Posting p1, Posting p2)
{
    char[] text1 = charPool.buffers[p1.textStart >> Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SHIFT];
    int pos1 = p1.textStart & Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_MASK;
    char[] text2 = charPool.buffers[p2.textStart >> Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SHIFT];
    int pos2 = p2.textStart & Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_MASK;

    // Term texts are stored in the char pool terminated by the sentinel
    // 0xffff, so hitting the sentinel means that term ended first.
    while (true)
    {
        char c1 = text1[pos1++];
        char c2 = text2[pos2++];
        if (c1 < c2)
        {
            if (0xffff == c2)
                return 1;   // p2 ended first: p1 is longer, sorts after
            else
                return -1;
        }
        else if (c2 < c1)
        {
            if (0xffff == c1)
                return -1;  // p1 ended first: p1 is a prefix, sorts before
            else
                return 1;
        }
        else if (0xffff == c1)
            return 0;       // both ended: equal terms
    }
}
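// Illustration only (not part of Lucene): every Posting stores its term
// text as a packed textStart, decoded by shift/mask exactly as in
// ComparePostings above. The helper below is a hypothetical sketch of that
// decoding; if CHAR_BLOCK_SHIFT were 14 (block size 16384), textStart ==
// 40000 would decode to buffer 2, offset 7232.
private static void DecodeTextStart(int textStart, out int buffer, out int offset)
{
    buffer = textStart >> Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SHIFT; // which char block
    offset = textStart & Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_MASK;   // position inside it
}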
internal void QuickSort(Posting[] postings, int lo, int hi)
{
    if (lo >= hi)
        return;

    int mid = SupportClass.Number.URShift((lo + hi), 1);

    if (ComparePostings(postings[lo], postings[mid]) > 0)
    {
        Posting tmp = postings[lo];
        postings[lo] = postings[mid];
        postings[mid] = tmp;
    }

    if (ComparePostings(postings[mid], postings[hi]) > 0)
    {
        Posting tmp = postings[mid];
        postings[mid] = postings[hi];
        postings[hi] = tmp;

        if (ComparePostings(postings[lo], postings[mid]) > 0)
        {
            Posting tmp2 = postings[lo];
            postings[lo] = postings[mid];
            postings[mid] = tmp2;
        }
    }

    int left = lo + 1;
    int right = hi - 1;

    if (left >= right)
        return;

    Posting partition = postings[mid];

    for (; ;)
    {
        while (ComparePostings(postings[right], partition) > 0)
            --right;

        while (left < right && ComparePostings(postings[left], partition) <= 0)
            ++left;

        if (left < right)
        {
            Posting tmp = postings[left];
            postings[left] = postings[right];
            postings[right] = tmp;
            --right;
        }
        else
        {
            break;
        }
    }

    QuickSort(postings, lo, left);
    QuickSort(postings, left + 1, hi);
}
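// The three conditional swaps at the top of QuickSort order
// postings[lo] <= postings[mid] <= postings[hi] before partitioning
// (median-of-three pivot selection), which helps avoid the quadratic
// worst case on already-sorted runs of postings.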
/// <summary>Do in-place sort of Posting array </summary>
internal void DoPostingSort(Posting[] postings, int numPosting)
{
    QuickSort(postings, 0, numPosting - 1);
}
private void WritePostings(Posting[] postings, System.String segment)
{
    IndexOutput freq = null, prox = null;
    TermInfosWriter tis = null;
    TermVectorsWriter termVectorWriter = null;

    try
    {
        // open files for inverse index storage
        freq = directory.CreateOutput(segment + ".frq");
        prox = directory.CreateOutput(segment + ".prx");
        tis = new TermInfosWriter(directory, segment, fieldInfos, termIndexInterval);
        TermInfo ti = new TermInfo();
        System.String currentField = null;

        for (int i = 0; i < postings.Length; i++)
        {
            Posting posting = postings[i];

            // add an entry to the dictionary with pointers to prox and freq files
            ti.Set(1, freq.GetFilePointer(), prox.GetFilePointer(), -1);
            tis.Add(posting.term, ti);

            // add an entry to the freq file
            int postingFreq = posting.freq;
            if (postingFreq == 1)
            {
                // optimize freq=1: set low bit of doc num
                freq.WriteVInt(1);
            }
            else
            {
                freq.WriteVInt(0);           // the document number
                freq.WriteVInt(postingFreq); // frequency in doc
            }

            // write positions, delta-encoded
            int lastPosition = 0;
            int[] positions = posting.positions;
            for (int j = 0; j < postingFreq; j++)
            {
                int position = positions[j];
                prox.WriteVInt(position - lastPosition);
                lastPosition = position;
            }

            // check to see if we switched to a new field
            System.String termField = posting.term.Field();
            if (currentField != termField)
            {
                // changing field - see if there is something to save
                currentField = termField;
                FieldInfo fi = fieldInfos.FieldInfo(currentField);
                if (fi.storeTermVector)
                {
                    if (termVectorWriter == null)
                    {
                        termVectorWriter = new TermVectorsWriter(directory, segment, fieldInfos);
                        termVectorWriter.OpenDocument();
                    }
                    termVectorWriter.OpenField(currentField);
                }
                else if (termVectorWriter != null)
                {
                    termVectorWriter.CloseField();
                }
            }

            if (termVectorWriter != null && termVectorWriter.IsFieldOpen())
            {
                termVectorWriter.AddTerm(posting.term.Text(), postingFreq, posting.positions, posting.offsets);
            }
        }

        if (termVectorWriter != null)
        {
            termVectorWriter.CloseDocument();
        }
    }
    finally
    {
        // make an effort to close all streams we can but remember and re-throw
        // the first exception encountered in this process
        System.IO.IOException keep = null;
        if (freq != null)
        {
            try { freq.Close(); }
            catch (System.IO.IOException e) { if (keep == null) keep = e; }
        }
        if (prox != null)
        {
            try { prox.Close(); }
            catch (System.IO.IOException e) { if (keep == null) keep = e; }
        }
        if (tis != null)
        {
            try { tis.Close(); }
            catch (System.IO.IOException e) { if (keep == null) keep = e; }
        }
        if (termVectorWriter != null)
        {
            try { termVectorWriter.Close(); }
            catch (System.IO.IOException e) { if (keep == null) keep = e; }
        }
        if (keep != null)
        {
            // re-throw the remembered exception itself, rather than a new
            // exception whose message is only the old stack-trace text
            throw keep;
        }
    }
}
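// Worked example of the position delta-encoding in WritePostings above:
// positions 3, 10, 12 for a single posting are written to the .prx file as
// the VInts 3, 7, 2; a reader recovers each position by adding the delta
// to the previously decoded position.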
private static void QuickSort(Posting[] postings, int lo, int hi)
{
    if (lo >= hi)
    {
        return;
    }

    int mid = (lo + hi) / 2;

    if (postings[lo].term.CompareTo(postings[mid].term) > 0)
    {
        Posting tmp = postings[lo];
        postings[lo] = postings[mid];
        postings[mid] = tmp;
    }

    if (postings[mid].term.CompareTo(postings[hi].term) > 0)
    {
        Posting tmp = postings[mid];
        postings[mid] = postings[hi];
        postings[hi] = tmp;

        if (postings[lo].term.CompareTo(postings[mid].term) > 0)
        {
            Posting tmp2 = postings[lo];
            postings[lo] = postings[mid];
            postings[mid] = tmp2;
        }
    }

    int left = lo + 1;
    int right = hi - 1;

    if (left >= right)
    {
        return;
    }

    Term partition = postings[mid].term;

    for (; ;)
    {
        while (postings[right].term.CompareTo(partition) > 0)
        {
            --right;
        }

        while (left < right && postings[left].term.CompareTo(partition) <= 0)
        {
            ++left;
        }

        if (left < right)
        {
            Posting tmp = postings[left];
            postings[left] = postings[right];
            postings[right] = tmp;
            --right;
        }
        else
        {
            break;
        }
    }

    QuickSort(postings, lo, left);
    QuickSort(postings, left + 1, hi);
}
/// <summary>Called when postings hash is too small (&gt; 50%
/// occupied) or too large (&lt; 20% occupied).
/// </summary>
internal void RehashPostings(int newSize)
{
    int newMask = newSize - 1;

    Posting[] newHash = new Posting[newSize];
    for (int i = 0; i < postingsHashSize; i++)
    {
        Posting p0 = postingsHash[i];
        if (p0 != null)
        {
            // Recompute the hash code from the term text in the char pool
            int start = p0.textStart & Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_MASK;
            char[] text = Enclosing_Instance.charPool.buffers[p0.textStart >> Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SHIFT];
            int pos = start;
            while (text[pos] != 0xffff)
                pos++;
            int code = 0;
            while (pos > start)
                code = (code * 31) + text[--pos];

            int hashPos = code & newMask;
            System.Diagnostics.Debug.Assert(hashPos >= 0);
            if (newHash[hashPos] != null)
            {
                // Slot taken: probe with an odd secondary increment
                int inc = ((code >> 8) + code) | 1;
                do
                {
                    code += inc;
                    hashPos = code & newMask;
                }
                while (newHash[hashPos] != null);
            }
            newHash[hashPos] = p0;
        }
    }

    postingsHashMask = newMask;
    postingsHash = newHash;
    postingsHashSize = newSize;
    postingsHashHalfSize = newSize >> 1;
}
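// Note on the collision handling in RehashPostings above: the secondary
// increment ((code >> 8) + code) | 1 is forced odd. Because newSize is a
// power of two, an odd step is coprime with the table size, so the probe
// sequence (code + k*inc) & newMask visits every slot before repeating and
// is therefore guaranteed to find an empty one.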