public override void GetTermFreqVector(int docNumber, String field, TermVectorMapper mapper)
{
    if (DEBUG) System.Diagnostics.Debug.WriteLine("MemoryIndexReader.getTermFreqVector");
    Info info = GetInfo(field);
    if (info == null) return; // no such field

    info.SortTerms();
    mapper.SetExpectations(field, info.SortedTerms.Length, _index.stride != 1, true);

    for (int i = info.SortedTerms.Length; --i >= 0;)
    {
        ArrayIntList positions = info.SortedTerms[i].Value;
        int size = positions.Size();
        // when stride != 1, positions are interleaved as (pos, startOffset, endOffset)
        var offsets = new TermVectorOffsetInfo[size / _index.stride];
        for (int k = 0, j = 1; j < size; k++, j += _index.stride)
        {
            int start = positions.Get(j);
            int end = positions.Get(j + 1);
            offsets[k] = new TermVectorOffsetInfo(start, end);
        }
        mapper.Map(info.SortedTerms[i].Key,
                   _index.NumPositions(info.SortedTerms[i].Value),
                   offsets,
                   info.SortedTerms[i].Value.ToArray(_index.stride));
    }
}
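// ---------------------------------------------------------------------------
// Illustrative only (not part of the original source): a minimal
// TermVectorMapper subclass that GetTermFreqVector() above could feed,
// assuming the Lucene.Net 3.x contract of SetExpectations()/Map().
// The class name DebugTermVectorMapper is hypothetical.
// ---------------------------------------------------------------------------
internal class DebugTermVectorMapper : TermVectorMapper
{
    public override void SetExpectations(String field, int numTerms, bool storeOffsets, bool storePositions)
    {
        // announced before any Map() calls: term count and what is stored
        System.Diagnostics.Debug.WriteLine(
            "field=" + field + ", terms=" + numTerms +
            ", offsets=" + storeOffsets + ", positions=" + storePositions);
    }

    public override void Map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions)
    {
        // called once per term, highest term first given the reverse loop above
        System.Diagnostics.Debug.WriteLine("'" + term + "' freq=" + frequency);
    }
}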
/*
 * Returns a String representation of the index data for debugging purposes.
 *
 * @return the string representation
 */
public override String ToString()
{
    StringBuilder result = new StringBuilder(256);
    SortFields();
    int sumChars = 0;
    int sumPositions = 0;
    int sumTerms = 0;

    for (int i = 0; i < sortedFields.Length; i++)
    {
        KeyValuePair<String, Info> entry = sortedFields[i];
        String fieldName = entry.Key;
        Info info = entry.Value;
        info.SortTerms();
        result.Append(fieldName + ":\n");

        int numChars = 0;
        int numPos = 0;
        for (int j = 0; j < info.SortedTerms.Length; j++)
        {
            KeyValuePair<String, ArrayIntList> e = info.SortedTerms[j];
            String term = e.Key;
            ArrayIntList positions = e.Value;
            result.Append("\t'" + term + "':" + NumPositions(positions) + ":");
            result.Append(positions.ToString(stride)); // ignore offsets
            result.Append("\n");
            numPos += NumPositions(positions);
            numChars += term.Length;
        }

        result.Append("\tterms=" + info.SortedTerms.Length);
        result.Append(", positions=" + numPos);
        result.Append(", Kchars=" + (numChars / 1000.0f));
        result.Append("\n");
        sumPositions += numPos;
        sumChars += numChars;
        sumTerms += info.SortedTerms.Length;
    }

    result.Append("\nfields=" + sortedFields.Length);
    result.Append(", terms=" + sumTerms);
    result.Append(", positions=" + sumPositions);
    result.Append(", Kchars=" + (sumChars / 1000.0f));
    return result.ToString();
}
public TermVectorOffsetInfo[] GetOffsets(int index)
{
    if (_index.stride == 1) return null; // no offsets stored

    ArrayIntList positions = sortedTerms[index].Value;
    int size = positions.Size();
    // positions are interleaved as (position, startOffset, endOffset) triples,
    // so the offset pair for each occurrence sits at j and j+1, stepping by stride
    TermVectorOffsetInfo[] offsets = new TermVectorOffsetInfo[size / _index.stride];
    for (int i = 0, j = 1; j < size; i++, j += _index.stride)
    {
        int start = positions.Get(j);
        int end = positions.Get(j + 1);
        offsets[i] = new TermVectorOffsetInfo(start, end);
    }
    return offsets;
}
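// ---------------------------------------------------------------------------
// Illustrative only (not part of the original source): a self-contained
// sketch of the stride-3 layout that GetOffsets() above decodes. The packed
// values are made up for demonstration.
// ---------------------------------------------------------------------------
internal static class StrideLayoutExample
{
    internal static void Run()
    {
        const int stride = 3;
        int[] packed = { 0, 5, 10,     // position 0, offsets [5, 10)
                         2, 17, 24 };  // position 2, offsets [17, 24)
        // offsets start at index 1 and advance by stride, exactly as above
        for (int j = 1; j < packed.Length; j += stride)
        {
            System.Diagnostics.Debug.WriteLine("start=" + packed[j] + " end=" + packed[j + 1]);
        }
    }
}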
/*
 * Returns a reasonable approximation of the main memory [bytes] consumed by
 * this instance. Useful for smart memory-sensitive caches/pools. Assumes
 * fieldNames are interned, whereas tokenized terms are memory-overlaid.
 *
 * @return the main memory consumption
 */
public int GetMemorySize()
{
    // for example usage in a smart cache see nux.xom.pool.Pool
    int PTR = VM.PTR;
    int INT = VM.INT;
    int size = 0;
    size += VM.SizeOfObject(2 * PTR + INT); // memory index
    if (sortedFields != null) size += VM.SizeOfObjectArray(sortedFields.Length);

    size += VM.SizeOfHashMap(fields.Count);
    foreach (var entry in fields)
    {
        // for each Field Info
        Info info = entry.Value;
        size += VM.SizeOfObject(2 * INT + 3 * PTR); // Info instance vars
        if (info.SortedTerms != null) size += VM.SizeOfObjectArray(info.SortedTerms.Length);

        int len = info.Terms.Count;
        size += VM.SizeOfHashMap(len);
        var iter2 = info.Terms.GetEnumerator();
        while (--len >= 0)
        {
            iter2.MoveNext();
            // for each term
            KeyValuePair<String, ArrayIntList> e = iter2.Current;
            size += VM.SizeOfObject(PTR + 3 * INT); // assumes substring() memory overlay
            // size += STR + 2 * ((String) e.getKey()).length();
            ArrayIntList positions = e.Value;
            size += VM.SizeOfArrayIntList(positions.Size());
        }
    }
    return size;
}
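// ---------------------------------------------------------------------------
// Illustrative only (not part of the original source): a hedged sketch of
// using GetMemorySize() to keep cached MemoryIndex instances under an
// approximate byte budget. The cache class, key type, and eviction policy
// are hypothetical simplifications.
// ---------------------------------------------------------------------------
internal class BoundedMemoryIndexCache
{
    private readonly System.Collections.Generic.Dictionary<string, MemoryIndex> cache =
        new System.Collections.Generic.Dictionary<string, MemoryIndex>();
    private long totalBytes;
    private readonly long maxBytes;

    internal BoundedMemoryIndexCache(long maxBytes)
    {
        this.maxBytes = maxBytes;
    }

    internal void Put(string key, MemoryIndex index)
    {
        totalBytes += index.GetMemorySize();
        cache[key] = index;
        if (totalBytes > maxBytes)
        {
            // simplest possible policy: drop everything once over budget
            cache.Clear();
            totalBytes = 0;
        }
    }
}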
public void Seek(Term term)
{
    this.term = term;
    if (DEBUG) System.Diagnostics.Debug.WriteLine(".seek: " + term);

    if (term == null)
    {
        hasNext = true; // term==null means match all docs
    }
    else
    {
        Info info = _reader.GetInfo(term.Field);
        current = info == null ? null : info.GetPositions(term.Text);
        hasNext = (current != null);
        cursor = 0;
    }
}
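// ---------------------------------------------------------------------------
// Illustrative only (not part of the original source): how a caller typically
// exercises Seek() above, assuming the Lucene.Net 3.x TermPositions API
// (Next()/Freq/NextPosition()); the field and term values are made up.
// ---------------------------------------------------------------------------
//   TermPositions tp = reader.TermPositions(new Term("content", "lucene"));
//   while (tp.Next())
//   {
//       for (int i = 0; i < tp.Freq; i++)
//       {
//           int position = tp.NextPosition();
//       }
//   }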
private int NumPositions(ArrayIntList positions)
{
    return positions.Size() / stride;
}
/*
 * Iterates over the given token stream and adds the resulting terms to the index;
 * equivalent to adding a tokenized, indexed, termVectorStored, unstored,
 * Lucene {@link org.apache.lucene.document.Field}.
 * Finally closes the token stream. Note that untokenized keywords can be added with this method via
 * {@link #CreateKeywordTokenStream(Collection)}, the Lucene contrib <c>KeywordTokenizer</c> or similar utilities.
 *
 * @param fieldName
 *            a name to be associated with the text
 * @param stream
 *            the token stream to retrieve tokens from
 * @param boost
 *            the boost factor for hits for this field
 * @see org.apache.lucene.document.Field#setBoost(float)
 */
public void AddField(String fieldName, TokenStream stream, float boost)
{
    try
    {
        if (fieldName == null) throw new ArgumentException("fieldName must not be null");
        if (stream == null) throw new ArgumentException("token stream must not be null");
        if (boost <= 0.0f) throw new ArgumentException("boost factor must be greater than 0.0");
        if (fields[fieldName] != null) throw new ArgumentException("field must not be added more than once");

        var terms = new HashMap<String, ArrayIntList>();
        int numTokens = 0;
        int numOverlapTokens = 0;
        int pos = -1;

        var termAtt = stream.AddAttribute<ITermAttribute>();
        var posIncrAttribute = stream.AddAttribute<IPositionIncrementAttribute>();
        var offsetAtt = stream.AddAttribute<IOffsetAttribute>();

        stream.Reset();
        while (stream.IncrementToken())
        {
            String term = termAtt.Term;
            if (term.Length == 0) continue; // nothing to do
            // if (DEBUG) System.Diagnostics.Debug.WriteLine("token='" + term + "'");
            numTokens++;
            int posIncr = posIncrAttribute.PositionIncrement;
            if (posIncr == 0) numOverlapTokens++;
            pos += posIncr;

            ArrayIntList positions = terms[term];
            if (positions == null)
            {
                // term not seen before
                positions = new ArrayIntList(stride);
                terms[term] = positions;
            }
            if (stride == 1)
            {
                positions.Add(pos);
            }
            else
            {
                positions.Add(pos, offsetAtt.StartOffset, offsetAtt.EndOffset);
            }
        }
        stream.End();

        // ensure infos.numTokens > 0 invariant; needed for correct operation of terms()
        if (numTokens > 0)
        {
            boost = boost * docBoost; // see DocumentWriter.addDocument(...)
            fields[fieldName] = new Info(terms, numTokens, numOverlapTokens, boost);
            sortedFields = null; // invalidate sorted view, if any
        }
    }
    catch (IOException e)
    {
        // can never happen
        throw new SystemException(string.Empty, e);
    }
    finally
    {
        try
        {
            if (stream != null) stream.Close();
        }
        catch (IOException e2)
        {
            throw new SystemException(string.Empty, e2);
        }
    }
}
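// ---------------------------------------------------------------------------
// Illustrative only (not part of the original source): typical AddField()
// usage, assuming a Lucene.Net 3.x StandardAnalyzer; the field name and
// sample text are made up.
// ---------------------------------------------------------------------------
//   var analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
//   var index = new MemoryIndex();
//   index.AddField("content",
//                  analyzer.TokenStream("content", new StringReader("some sample text")),
//                  1.0f);
//   float score = index.Search(new TermQuery(new Term("content", "sample")));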