/// <summary> Test term vectors for a segment.</summary>
/// <param name="info"> segment metadata; <c>docCount</c> bounds the doc scan. </param>
/// <param name="reader"> open reader for the segment being checked. </param>
/// <param name="format"> number format used for the printed summary line. </param>
/// <returns> the populated status; any failure is recorded in <c>status.error</c>
/// rather than thrown, so index checking can continue with other tests. </returns>
private Status.TermVectorStatus TestTermVectors(SegmentInfo info, SegmentReader reader, System.Globalization.NumberFormatInfo format)
{
    Status.TermVectorStatus status = new Status.TermVectorStatus();
    try
    {
        if (infoStream != null)
        {
            infoStream.Write(" test: term vectors........");
        }
        for (int j = 0; j < info.docCount; ++j)
        {
            // Only live (non-deleted) docs contribute to the stats.
            if (!reader.IsDeleted(j))
            {
                status.docCount++;
                TermFreqVector[] tfv = reader.GetTermFreqVectors(j);
                if (tfv != null)
                {
                    status.totVectors += tfv.Length;
                }
            }
        }
        // Guard the average: if no live docs were seen (docCount == 0), the float
        // division 0f/0 yields NaN and the summary would print garbage; report 0
        // instead, matching the newer TestTermVectors(AtomicReader, ...) overload.
        float vectorAvg = status.docCount == 0 ? 0 : ((float) status.totVectors) / status.docCount;
        Msg(System.String.Format(format, "OK [{0:d} total vector count; avg {1:f} term/freq vector fields per doc]", new object[] { status.totVectors, vectorAvg }));
    }
    catch (System.Exception e)
    {
        // CheckIndex deliberately swallows here: the error is surfaced via
        // status.error and the message line, not by aborting the whole check.
        Msg("ERROR [" + System.Convert.ToString(e.Message) + "]");
        status.error = e;
        if (infoStream != null)
        {
            infoStream.WriteLine(e.StackTrace);
        }
    }
    return status;
}
/// <summary> Test term vectors for a segment.</summary>
/// <param name="info"> segment metadata; <c>docCount</c> bounds the doc scan. </param>
/// <param name="reader"> open reader for the segment being checked. </param>
/// <param name="format"> number format used for the printed summary line. </param>
/// <returns> the populated status; any failure is recorded in <c>status.error</c>
/// rather than thrown, so index checking can continue with other tests. </returns>
private Status.TermVectorStatus TestTermVectors(SegmentInfo info, SegmentReader reader, System.Globalization.NumberFormatInfo format)
{
    var status = new Status.TermVectorStatus();
    try
    {
        if (infoStream != null)
        {
            infoStream.Write(" test: term vectors........");
        }
        for (int j = 0; j < info.docCount; ++j)
        {
            // Only live (non-deleted) docs contribute to the stats.
            if (!reader.IsDeleted(j))
            {
                status.docCount++;
                ITermFreqVector[] tfv = reader.GetTermFreqVectors(j);
                if (tfv != null)
                {
                    status.totVectors += tfv.Length;
                }
            }
        }
        // Guard the average: if no live docs were seen (docCount == 0), the float
        // division 0f/0 yields NaN and the summary would print garbage; report 0
        // instead, matching the newer TestTermVectors(AtomicReader, ...) overload.
        float vectorAvg = status.docCount == 0 ? 0 : ((float) status.totVectors) / status.docCount;
        Msg(System.String.Format(format, "OK [{0:d} total vector count; avg {1:f} term/freq vector fields per doc]", new object[] { status.totVectors, vectorAvg }));
    }
    catch (System.Exception e)
    {
        // CheckIndex deliberately swallows here: the error is surfaced via
        // status.error and the message line, not by aborting the whole check.
        Msg("ERROR [" + System.Convert.ToString(e.Message) + "]");
        status.error = e;
        if (infoStream != null)
        {
            infoStream.WriteLine(e.StackTrace);
        }
    }
    return status;
}
/// <summary>
/// Test term vectors.
/// Visits every doc's term vectors (including deleted docs, to detect corruption),
/// and when <paramref name="crossCheckTermVectors"/> is set, verifies each vectored
/// term against the main postings (docs, freqs, positions, offsets, payloads).
/// @lucene.experimental
/// </summary>
/// <param name="reader"> atomic reader over the segment to check. </param>
/// <param name="infoStream"> where progress/errors are printed; may be null. </param>
/// <param name="verbose"> passed through to CheckFields for extra output. </param>
/// <param name="crossCheckTermVectors"> if true, cross-check vectors vs. postings. </param>
/// <returns> status object; failures are captured in its Error field, not thrown. </returns>
public static Status.TermVectorStatus TestTermVectors(AtomicReader reader, TextWriter infoStream, bool verbose, bool crossCheckTermVectors)
{
    Status.TermVectorStatus status = new Status.TermVectorStatus();
    FieldInfos fieldInfos = reader.FieldInfos;
    // One-bit, all-clear Bits: treats "the" single doc as deleted when re-running
    // CheckFields below.
    Bits onlyDocIsDeleted = new FixedBitSet(1);
    try
    {
        if (infoStream != null)
        {
            infoStream.Write(" test: term vectors........");
        }
        // Enums are reused across terms/fields/docs (passed back into
        // Docs()/DocsAndPositions() for reuse).
        DocsEnum docs = null;
        DocsAndPositionsEnum postings = null;

        // Only used if crossCheckTermVectors is true:
        DocsEnum postingsDocs = null;
        DocsAndPositionsEnum postingsPostings = null;

        Bits liveDocs = reader.LiveDocs;

        Fields postingsFields;
        // TODO: testTermsIndex
        if (crossCheckTermVectors)
        {
            postingsFields = reader.Fields;
        }
        else
        {
            postingsFields = null;
        }

        TermsEnum termsEnum = null;
        TermsEnum postingsTermsEnum = null;

        for (int j = 0; j < reader.MaxDoc; ++j)
        {
            // Intentionally pull/visit (but don't count in
            // stats) deleted documents to make sure they too
            // are not corrupt:
            Fields tfv = reader.GetTermVectors(j);

            // TODO: can we make a IS(FIR) that searches just
            // this term vector... to pass for searcher?

            if (tfv != null)
            {
                // First run with no deletions:
                CheckFields(tfv, null, 1, fieldInfos, false, true, infoStream, verbose);

                // Again, with the one doc deleted:
                CheckFields(tfv, onlyDocIsDeleted, 1, fieldInfos, false, true, infoStream, verbose);

                // Only agg stats if the doc is live:
                bool doStats = liveDocs == null || liveDocs.Get(j);
                if (doStats)
                {
                    status.DocCount++;
                }

                foreach (string field in tfv)
                {
                    if (doStats)
                    {
                        status.TotVectors++;
                    }

                    // Make sure FieldInfo thinks this field is vector'd:
                    FieldInfo fieldInfo = fieldInfos.FieldInfo(field);
                    if (!fieldInfo.HasVectors())
                    {
                        throw new Exception("docID=" + j + " has term vectors for field=" + field + " but FieldInfo has storeTermVector=false");
                    }

                    if (crossCheckTermVectors)
                    {
                        Terms terms = tfv.Terms(field);
                        termsEnum = terms.Iterator(termsEnum);
                        // Freqs are only comparable when the field was indexed with freqs.
                        bool postingsHasFreq = fieldInfo.FieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS;
                        bool postingsHasPayload = fieldInfo.HasPayloads();
                        bool vectorsHasPayload = terms.HasPayloads();

                        Terms postingsTerms = postingsFields.Terms(field);
                        if (postingsTerms == null)
                        {
                            throw new Exception("vector field=" + field + " does not exist in postings; doc=" + j);
                        }
                        postingsTermsEnum = postingsTerms.Iterator(postingsTermsEnum);

                        bool hasProx = terms.HasOffsets() || terms.HasPositions();
                        BytesRef term = null;
                        // Walk every term in the vector and verify it against postings.
                        while ((term = termsEnum.Next()) != null)
                        {
                            if (hasProx)
                            {
                                postings = termsEnum.DocsAndPositions(null, postings);
                                Debug.Assert(postings != null);
                                docs = null;
                            }
                            else
                            {
                                docs = termsEnum.Docs(null, docs);
                                Debug.Assert(docs != null);
                                postings = null;
                            }

                            // docs2 is whichever enum (positions or docs-only) is active
                            // for the vector side.
                            DocsEnum docs2;
                            if (hasProx)
                            {
                                Debug.Assert(postings != null);
                                docs2 = postings;
                            }
                            else
                            {
                                Debug.Assert(docs != null);
                                docs2 = docs;
                            }

                            DocsEnum postingsDocs2;
                            if (!postingsTermsEnum.SeekExact(term))
                            {
                                throw new Exception("vector term=" + term + " field=" + field + " does not exist in postings; doc=" + j);
                            }
                            postingsPostings = postingsTermsEnum.DocsAndPositions(null, postingsPostings);
                            if (postingsPostings == null)
                            {
                                // Term vectors were indexed w/ pos but postings were not
                                postingsDocs = postingsTermsEnum.Docs(null, postingsDocs);
                                if (postingsDocs == null)
                                {
                                    throw new Exception("vector term=" + term + " field=" + field + " does not exist in postings; doc=" + j);
                                }
                            }

                            if (postingsPostings != null)
                            {
                                postingsDocs2 = postingsPostings;
                            }
                            else
                            {
                                postingsDocs2 = postingsDocs;
                            }

                            // The postings for this term must contain exactly doc j.
                            int advanceDoc = postingsDocs2.Advance(j);
                            if (advanceDoc != j)
                            {
                                throw new Exception("vector term=" + term + " field=" + field + ": doc=" + j + " was not found in postings (got: " + advanceDoc + ")");
                            }

                            // The term vector "index" holds a single doc, so its only
                            // docID must be 0.
                            int doc = docs2.NextDoc();
                            if (doc != 0)
                            {
                                throw new Exception("vector for doc " + j + " didn't return docID=0: got docID=" + doc);
                            }

                            if (postingsHasFreq)
                            {
                                int tf = docs2.Freq();
                                if (postingsHasFreq && postingsDocs2.Freq() != tf)
                                {
                                    throw new Exception("vector term=" + term + " field=" + field + " doc=" + j + ": freq=" + tf + " differs from postings freq=" + postingsDocs2.Freq());
                                }

                                if (hasProx)
                                {
                                    // Check each of the tf occurrences: position, offsets,
                                    // and payload must agree with postings where both
                                    // sides recorded them.
                                    for (int i = 0; i < tf; i++)
                                    {
                                        int pos = postings.NextPosition();
                                        if (postingsPostings != null)
                                        {
                                            int postingsPos = postingsPostings.NextPosition();
                                            if (terms.HasPositions() && pos != postingsPos)
                                            {
                                                throw new Exception("vector term=" + term + " field=" + field + " doc=" + j + ": pos=" + pos + " differs from postings pos=" + postingsPos);
                                            }
                                        }

                                        // Call the methods to at least make
                                        // sure they don't throw exc:
                                        int startOffset = postings.StartOffset();
                                        int endOffset = postings.EndOffset();

                                        // TODO: these are too anal...?
                                        /*
                                        if (endOffset < startOffset) { throw new RuntimeException("vector startOffset=" + startOffset + " is > endOffset=" + endOffset); }
                                        if (startOffset < lastStartOffset) { throw new RuntimeException("vector startOffset=" + startOffset + " is < prior startOffset=" + lastStartOffset); }
                                        lastStartOffset = startOffset;
                                        */

                                        if (postingsPostings != null)
                                        {
                                            int postingsStartOffset = postingsPostings.StartOffset();
                                            int postingsEndOffset = postingsPostings.EndOffset();
                                            // -1 means "offset not stored" on that side; only
                                            // compare when both sides have a real offset.
                                            if (startOffset != -1 && postingsStartOffset != -1 && startOffset != postingsStartOffset)
                                            {
                                                throw new Exception("vector term=" + term + " field=" + field + " doc=" + j + ": startOffset=" + startOffset + " differs from postings startOffset=" + postingsStartOffset);
                                            }
                                            if (endOffset != -1 && postingsEndOffset != -1 && endOffset != postingsEndOffset)
                                            {
                                                throw new Exception("vector term=" + term + " field=" + field + " doc=" + j + ": endOffset=" + endOffset + " differs from postings endOffset=" + postingsEndOffset);
                                            }
                                        }

                                        BytesRef payload = postings.Payload;
                                        if (payload != null)
                                        {
                                            Debug.Assert(vectorsHasPayload);
                                        }

                                        if (postingsHasPayload && vectorsHasPayload)
                                        {
                                            Debug.Assert(postingsPostings != null);
                                            if (payload == null)
                                            {
                                                // we have payloads, but not at this position.
                                                // postings has payloads too, it should not have one at this position
                                                if (postingsPostings.Payload != null)
                                                {
                                                    throw new Exception("vector term=" + term + " field=" + field + " doc=" + j + " has no payload but postings does: " + postingsPostings.Payload);
                                                }
                                            }
                                            else
                                            {
                                                // we have payloads, and one at this position
                                                // postings should also have one at this position, with the same bytes.
                                                if (postingsPostings.Payload == null)
                                                {
                                                    throw new Exception("vector term=" + term + " field=" + field + " doc=" + j + " has payload=" + payload + " but postings does not.");
                                                }
                                                BytesRef postingsPayload = postingsPostings.Payload;
                                                if (!payload.Equals(postingsPayload))
                                                {
                                                    throw new Exception("vector term=" + term + " field=" + field + " doc=" + j + " has payload=" + payload + " but differs from postings payload=" + postingsPayload);
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
        // Avoid NaN when no live docs had vectors.
        float vectorAvg = status.DocCount == 0 ? 0 : status.TotVectors / (float)status.DocCount;
        Msg(infoStream, "OK [" + status.TotVectors + " total vector count; avg " + vectorAvg.ToString(CultureInfo.InvariantCulture.NumberFormat) + " term/freq vector fields per doc]");
    }
    catch (Exception e)
    {
        // Errors are captured in the status (and printed) so CheckIndex can
        // keep going with its other per-segment tests.
        Msg(infoStream, "ERROR [" + Convert.ToString(e.Message) + "]");
        status.Error = e;
        if (infoStream != null)
        {
            // LUCENENET NOTE: Some tests rely on the error type being in
            // the message. We can't get the error type with StackTrace, we
            // need ToString() for that.
            infoStream.WriteLine(e.ToString());
            //infoStream.WriteLine(e.StackTrace);
        }
    }
    return status;
}