/// <summary> Adds terms and frequencies found in vector into the Map termFreqMap</summary>
/// <param name="termFreqMap">a Map of terms and their frequencies</param>
/// <param name="vector">List of terms and their frequencies for a doc/field</param>
protected void AddTermFrequencies(IDictionary<string, Int> termFreqMap, ITermFreqVector vector)
{
    System.String[] terms = vector.GetTerms();
    int[] freqs = vector.GetTermFrequencies();
    for (int j = 0; j < terms.Length; j++)
    {
        System.String term = terms[j];
        if (IsNoiseWord(term))
        {
            continue;
        }

        // increment frequency
        Int cnt = termFreqMap[term];
        if (cnt == null)
        {
            cnt = new Int();
            termFreqMap[term] = cnt;
            cnt.x = freqs[j];
        }
        else
        {
            cnt.x += freqs[j];
        }
    }
}
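Note that the null check on termFreqMap[term] implies the map is Lucene.Net's HashMap, whose indexer returns null for a missing key (a plain Dictionary would throw KeyNotFoundException). Int is MoreLikeThis's mutable counter wrapper; a minimal sketch, assuming the Lucene MoreLikeThis convention of initializing a fresh counter to 1 (the code above overwrites x immediately anyway):

private class Int
{
    internal int x;

    internal Int()
    {
        x = 1; // assumed default, matching Lucene's MoreLikeThis; overwritten by the caller above
    }
}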
private void TestTermVectors()
{
    // check:
    int numDocs = reader.NumDocs();
    long start = 0L;
    for (int docId = 0; docId < numDocs; docId++)
    {
        start = DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond;
        ITermFreqVector[] vectors = reader.GetTermFreqVectors(docId, null);
        timeElapsed += (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond) - start;

        // verify vectors result
        VerifyVectors(vectors, docId);

        start = DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond;
        ITermFreqVector vector = reader.GetTermFreqVector(docId, "field", null);
        timeElapsed += (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond) - start;

        vectors = new ITermFreqVector[1];
        vectors[0] = vector;
        VerifyVectors(vectors, docId);
    }
}
/// <summary> Find words for a more-like-this query former.</summary>
/// <param name="docNum">the id of the lucene document from which to find terms</param>
protected virtual PriorityQueue<object[]> RetrieveTerms(int docNum)
{
    IDictionary<string, Int> termFreqMap = new HashMap<string, Int>();
    for (int i = 0; i < fieldNames.Length; i++)
    {
        System.String fieldName = fieldNames[i];
        ITermFreqVector vector = ir.GetTermFreqVector(docNum, fieldName);

        // field does not store term vector info
        if (vector == null)
        {
            Document d = ir.Document(docNum);
            System.String[] text = d.GetValues(fieldName);
            if (text != null)
            {
                for (int j = 0; j < text.Length; j++)
                {
                    AddTermFrequencies(new System.IO.StringReader(text[j]), termFreqMap, fieldName);
                }
            }
        }
        else
        {
            AddTermFrequencies(termFreqMap, vector);
        }
    }
    return CreateQueue(termFreqMap);
}
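When a field stores no term vector, the code falls back to re-analyzing the stored text through the Reader-based AddTermFrequencies overload. A hedged sketch of what that overload looks like, assuming an `analyzer` field on the enclosing class and omitting MoreLikeThis's max-token cutoff:

// Sketch of the Reader-based overload used in the fallback path above.
protected void AddTermFrequencies(System.IO.TextReader r, IDictionary<string, Int> termFreqMap, string fieldName)
{
    TokenStream ts = analyzer.TokenStream(fieldName, r);
    ITermAttribute termAtt = ts.AddAttribute<ITermAttribute>();
    while (ts.IncrementToken())
    {
        string word = termAtt.Term;
        if (IsNoiseWord(word))
        {
            continue;
        }

        // same HashMap null-for-missing-key pattern as the vector-based overload
        Int cnt = termFreqMap[word];
        if (cnt == null)
        {
            termFreqMap[word] = new Int(); // counter starts at 1
        }
        else
        {
            cnt.x++;
        }
    }
}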
public TermVector(string fieldName, ITermFreqVector tfv)
{
    //
    // Required for Windows Form Designer support
    //
    InitializeComponent();

    lblField.Text = fieldName;

    IntPair[] tvs = new IntPair[tfv.Size];
    String[] terms = tfv.GetTerms();
    int[] freqs = tfv.GetTermFrequencies();
    for (int i = 0; i < terms.Length; i++)
    {
        IntPair ip = new IntPair(freqs[i], terms[i]);
        tvs[i] = ip;
    }

    Array.Sort(tvs, new IntPair.PairComparator(false, true));

    listViewTVF.BeginUpdate();
    for (int i = 0; i < tvs.Length; i++)
    {
        ListViewItem item = new ListViewItem(new string[] { tvs[i].cnt.ToString(), tvs[i].text });
        listViewTVF.Items.Add(item);
    }
    listViewTVF.EndUpdate();
}
public virtual void TestMixedTermVectorSettingsSameField()
{
    Document doc = new Document();
    // f1 first without tv then with tv
    doc.Add(new Field("f1", "v1", Field.Store.YES, Field.Index.NOT_ANALYZED, TermVector.NO));
    doc.Add(new Field("f1", "v2", Field.Store.YES, Field.Index.NOT_ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
    // f2 first with tv then without tv
    doc.Add(new Field("f2", "v1", Field.Store.YES, Field.Index.NOT_ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
    doc.Add(new Field("f2", "v2", Field.Store.YES, Field.Index.NOT_ANALYZED, TermVector.NO));

    IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_CURRENT), true, IndexWriter.MaxFieldLength.LIMITED);
    writer.AddDocument(doc);
    writer.Close();

    _TestUtil.CheckIndex(dir);

    IndexReader reader = IndexReader.Open(dir, true);
    // f1
    ITermFreqVector tfv1 = reader.GetTermFreqVector(0, "f1");
    Assert.IsNotNull(tfv1);
    Assert.AreEqual(2, tfv1.GetTerms().Length, "the 'with_tv' setting should rule!");
    // f2
    ITermFreqVector tfv2 = reader.GetTermFreqVector(0, "f2");
    Assert.IsNotNull(tfv2);
    Assert.AreEqual(2, tfv2.GetTerms().Length, "the 'with_tv' setting should rule!");
}
public virtual void TestMerge()
{
    SegmentMerger merger = new SegmentMerger(mergedDir, mergedSegment);
    merger.Add(reader1);
    merger.Add(reader2);
    int docsMerged = merger.Merge(null);
    merger.CloseReaders();
    Assert.IsTrue(docsMerged == 2);

    // Should be able to open a new SegmentReader against the new directory
    SegmentReader mergedReader = SegmentReader.Get(true, new SegmentInfo(mergedSegment, docsMerged, mergedDir, false, true), IndexReader.DEFAULT_TERMS_INDEX_DIVISOR, null);
    Assert.IsTrue(mergedReader != null);
    Assert.IsTrue(mergedReader.NumDocs() == 2);

    Document newDoc1 = mergedReader.Document(0, null);
    Assert.IsTrue(newDoc1 != null);
    // There are 2 unstored fields on the document
    Assert.IsTrue(DocHelper.NumFields(newDoc1) == DocHelper.NumFields(doc1) - DocHelper.unstored.Count);
    Document newDoc2 = mergedReader.Document(1, null);
    Assert.IsTrue(newDoc2 != null);
    Assert.IsTrue(DocHelper.NumFields(newDoc2) == DocHelper.NumFields(doc2) - DocHelper.unstored.Count);

    TermDocs termDocs = mergedReader.TermDocs(new Term(DocHelper.TEXT_FIELD_2_KEY, "field"), null);
    Assert.IsTrue(termDocs != null);
    Assert.IsTrue(termDocs.Next(null) == true);

    System.Collections.Generic.ICollection<string> stored = mergedReader.GetFieldNames(IndexReader.FieldOption.INDEXED_WITH_TERMVECTOR);
    Assert.IsTrue(stored != null);
    //System.out.println("stored size: " + stored.size());
    Assert.IsTrue(stored.Count == 3, "We do not have 3 fields that were indexed with term vector");

    ITermFreqVector vector = mergedReader.GetTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY, null);
    Assert.IsTrue(vector != null);
    System.String[] terms = vector.GetTerms();
    Assert.IsTrue(terms != null);
    //System.out.println("Terms size: " + terms.length);
    Assert.IsTrue(terms.Length == 3);
    int[] freqs = vector.GetTermFrequencies();
    Assert.IsTrue(freqs != null);
    //System.out.println("Freqs size: " + freqs.length);
    Assert.IsTrue(vector is TermPositionVector);
    for (int i = 0; i < terms.Length; i++)
    {
        System.String term = terms[i];
        int freq = freqs[i];
        //System.out.println("Term: " + term + " Freq: " + freq);
        Assert.IsTrue(DocHelper.FIELD_2_TEXT.IndexOf(term) != -1);
        Assert.IsTrue(DocHelper.FIELD_2_FREQS[i] == freq);
    }

    TestSegmentReader.CheckNorms(mergedReader);
}
// search that reports the position of each hit within the text
public void DoSearch(String db, String querystr, global::Lucene.Net.Store.Directory indexDirectory)
{
    // 1. Specify the analyzer for tokenizing text.
    //    The same analyzer should be used as was used for indexing.
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30, ListStopWords);

    // 2. query
    Query q = new QueryParser(Version.LUCENE_30, "LineText", analyzer).Parse(querystr);

    // 3. search
    int hitsPerPage = 10;
    IndexSearcher searcher = new IndexSearcher(indexDirectory, true);
    IndexReader reader = IndexReader.Open(indexDirectory, true);
    searcher.SetDefaultFieldSortScoring(true, false);
    TopScoreDocCollector collector = TopScoreDocCollector.Create(hitsPerPage, true);
    searcher.Search(q, collector);
    ScoreDoc[] hits = collector.TopDocs().ScoreDocs;

    // 4. display term positions and term indexes
    MessageBox.Show("Found " + hits.Length + " hits.");
    for (int i = 0; i < hits.Length; ++i)
    {
        int docId = hits[i].Doc;
        ITermFreqVector tfvector = reader.GetTermFreqVector(docId, "LineText");
        TermPositionVector tpvector = (TermPositionVector)tfvector;

        // This part works only if there is one term in the query string;
        // otherwise you will have to iterate this section over the query terms.
        int termidx = tfvector.IndexOf(querystr);
        int[] termposx = tpvector.GetTermPositions(termidx);
        TermVectorOffsetInfo[] tvoffsetinfo = tpvector.GetOffsets(termidx);

        for (int j = 0; j < termposx.Length; j++)
        {
            MessageBox.Show("termpos : " + termposx[j]);
        }
        for (int j = 0; j < tvoffsetinfo.Length; j++)
        {
            int offsetStart = tvoffsetinfo[j].StartOffset;
            int offsetEnd = tvoffsetinfo[j].EndOffset;
            MessageBox.Show("offsets : " + offsetStart + " " + offsetEnd);
        }

        // print some info about where the hit was found...
        Document d = searcher.Doc(docId);
        MessageBox.Show((i + 1) + ". " + d.Get("path"));
    }

    // The searcher can only be closed when there is
    // no need to access the documents any more.
    searcher.Dispose();
}
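As the inline comment warns, tfvector.IndexOf(querystr) resolves only a single term. For multi-term queries, a hedged variant is to look up each term separately; note that a lowercase whitespace split only approximates what StandardAnalyzer produces, and re-tokenizing with the same analyzer would be more faithful. IndexOf returns -1 when the term is absent:

// Hypothetical multi-term variant of the lookup above.
foreach (string term in querystr.ToLowerInvariant().Split(' '))
{
    int idx = tfvector.IndexOf(term);
    if (idx < 0)
    {
        continue; // term not present in this document's vector
    }
    foreach (int pos in tpvector.GetTermPositions(idx))
    {
        MessageBox.Show("term '" + term + "' at position " + pos);
    }
}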
public virtual void TestPositionReader()
{
    TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
    Assert.IsTrue(reader != null);
    TermPositionVector vector;
    System.String[] terms;
    vector = (TermPositionVector)reader.Get(0, testFields[0]);
    Assert.IsTrue(vector != null);
    terms = vector.GetTerms();
    Assert.IsTrue(terms != null);
    Assert.IsTrue(terms.Length == testTerms.Length);
    for (int i = 0; i < terms.Length; i++)
    {
        System.String term = terms[i];
        //System.out.println("Term: " + term);
        Assert.IsTrue(term.Equals(testTerms[i]));

        int[] positions = vector.GetTermPositions(i);
        Assert.IsTrue(positions != null);
        Assert.IsTrue(positions.Length == this.positions[i].Length);
        for (int j = 0; j < positions.Length; j++)
        {
            int position = positions[j];
            Assert.IsTrue(position == this.positions[i][j]);
        }

        TermVectorOffsetInfo[] offset = vector.GetOffsets(i);
        Assert.IsTrue(offset != null);
        Assert.IsTrue(offset.Length == this.offsets[i].Length);
        for (int j = 0; j < offset.Length; j++)
        {
            TermVectorOffsetInfo termVectorOffsetInfo = offset[j];
            Assert.IsTrue(termVectorOffsetInfo.Equals(offsets[i][j]));
        }
    }

    ITermFreqVector freqVector = reader.Get(0, testFields[1]); // no pos, no offset
    Assert.IsTrue(freqVector != null);
    Assert.IsTrue(!(freqVector is TermPositionVector));
    terms = freqVector.GetTerms();
    Assert.IsTrue(terms != null);
    Assert.IsTrue(terms.Length == testTerms.Length);
    for (int i = 0; i < terms.Length; i++)
    {
        System.String term = terms[i];
        //System.out.println("Term: " + term);
        Assert.IsTrue(term.Equals(testTerms[i]));
    }
}
private Dictionary<string, float> getTfForDoc(string filename, IndexReader reader)
{
    ITermFreqVector termFreqVector = reader.GetTermFreqVector(docsInfo[filename], "text");
    string[] terms = termFreqVector.GetTerms();
    int[] termFreqs = termFreqVector.GetTermFrequencies();
    var total_words = termFreqs.Sum();
    var results = terms.Zip(termFreqs, (term, freq) => new { Key = term, Value = (float)freq / total_words })
                       .ToDictionary(x => x.Key, x => x.Value);
    return results;
}
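A brief usage sketch of the method above; the filename and the query term are placeholders:

// Hypothetical usage: look up the relative frequency of one term in one indexed file.
Dictionary<string, float> tf = getTfForDoc("doc1.txt", reader);
float weight;
if (!tf.TryGetValue("lucene", out weight))
{
    weight = 0f; // the term does not occur in this document
}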
// get all vectors
public override ITermFreqVector[] GetTermFreqVectors(int n, IState state)
{
    EnsureOpen();
    IList<ITermFreqVector> results = new List<ITermFreqVector>();
    foreach (var e in fieldToReader)
    {
        System.String field = e.Key;
        IndexReader reader = e.Value;
        ITermFreqVector vector = reader.GetTermFreqVector(n, field, state);
        if (vector != null)
        {
            results.Add(vector);
        }
    }
    return results.ToArray();
}
public virtual void DoTestDocument()
{
    sis.Read(dir);
    IndexReader reader = OpenReader();
    Assert.IsTrue(reader != null);
    Document newDoc1 = reader.Document(0);
    Assert.IsTrue(newDoc1 != null);
    Assert.IsTrue(DocHelper.NumFields(newDoc1) == DocHelper.NumFields(doc1) - DocHelper.unstored.Count);
    Document newDoc2 = reader.Document(1);
    Assert.IsTrue(newDoc2 != null);
    Assert.IsTrue(DocHelper.NumFields(newDoc2) == DocHelper.NumFields(doc2) - DocHelper.unstored.Count);
    ITermFreqVector vector = reader.GetTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY);
    Assert.IsTrue(vector != null);
    TestSegmentReader.CheckNorms(reader);
}
public virtual void TestReader()
{
    TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
    Assert.IsTrue(reader != null);
    for (int j = 0; j < 5; j++)
    {
        ITermFreqVector vector = reader.Get(j, testFields[0]);
        Assert.IsTrue(vector != null);
        System.String[] terms = vector.GetTerms();
        Assert.IsTrue(terms != null);
        Assert.IsTrue(terms.Length == testTerms.Length);
        for (int i = 0; i < terms.Length; i++)
        {
            System.String term = terms[i];
            //System.out.println("Term: " + term);
            Assert.IsTrue(term.Equals(testTerms[i]));
        }
    }
}
public virtual void TestTermVectors()
{
    ITermFreqVector result = reader.GetTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY, null);
    Assert.IsTrue(result != null);
    System.String[] terms = result.GetTerms();
    int[] freqs = result.GetTermFrequencies();
    Assert.IsTrue(terms != null && terms.Length == 3 && freqs != null && freqs.Length == 3);
    for (int i = 0; i < terms.Length; i++)
    {
        System.String term = terms[i];
        int freq = freqs[i];
        Assert.IsTrue(DocHelper.FIELD_2_TEXT.IndexOf(term) != -1);
        Assert.IsTrue(freq > 0);
    }

    ITermFreqVector[] results = reader.GetTermFreqVectors(0, null);
    Assert.IsTrue(results != null);
    Assert.IsTrue(results.Length == 3, "We do not have 3 term freq vectors, we have: " + results.Length);
}
public virtual void TestBadParams()
{
    var reader = new TermVectorsReader(dir, seg, fieldInfos);
    Assert.IsTrue(reader != null);
    // Bad document number, good field number
    Assert.Throws<System.IO.IOException>(() => reader.Get(50, testFields[0]));

    reader = new TermVectorsReader(dir, seg, fieldInfos);
    Assert.IsTrue(reader != null);
    // Bad document number, no field
    Assert.Throws<System.IO.IOException>(() => reader.Get(50));

    reader = new TermVectorsReader(dir, seg, fieldInfos);
    Assert.IsTrue(reader != null);
    Assert.DoesNotThrow(() =>
    {
        // Good document number, bad field number
        ITermFreqVector vector = reader.Get(0, "f50");
        Assert.IsTrue(vector == null);
    });
}
private void VerifyVectors(ITermFreqVector[] vectors, int num)
{
    System.Text.StringBuilder temp = new System.Text.StringBuilder();
    System.String[] terms = null;
    for (int i = 0; i < vectors.Length; i++)
    {
        terms = vectors[i].GetTerms();
        for (int z = 0; z < terms.Length; z++)
        {
            temp.Append(terms[z]);
        }
    }

    if (!English.IntToEnglish(num).Trim().Equals(temp.ToString().Trim()))
        System.Console.Out.WriteLine("wrong term result");
}
public static void VerifyEquals(ITermFreqVector[] d1, ITermFreqVector[] d2)
{
    if (d1 == null)
    {
        Assert.IsTrue(d2 == null);
        return;
    }
    Assert.IsTrue(d2 != null);

    Assert.AreEqual(d1.Length, d2.Length);
    for (int i = 0; i < d1.Length; i++)
    {
        ITermFreqVector v1 = d1[i];
        ITermFreqVector v2 = d2[i];
        if (v1 == null || v2 == null)
        {
            System.Console.Out.WriteLine("v1=" + v1 + " v2=" + v2 + " i=" + i + " of " + d1.Length);
        }
        Assert.AreEqual(v1.Size, v2.Size);

        int numTerms = v1.Size;
        System.String[] terms1 = v1.GetTerms();
        System.String[] terms2 = v2.GetTerms();
        int[] freq1 = v1.GetTermFrequencies();
        int[] freq2 = v2.GetTermFrequencies();
        for (int j = 0; j < numTerms; j++)
        {
            if (!terms1[j].Equals(terms2[j]))
            {
                Assert.AreEqual(terms1[j], terms2[j]);
            }
            Assert.AreEqual(freq1[j], freq2[j]);
        }

        if (v1 is TermPositionVector)
        {
            Assert.IsTrue(v2 is TermPositionVector);
            TermPositionVector tpv1 = (TermPositionVector)v1;
            TermPositionVector tpv2 = (TermPositionVector)v2;
            for (int j = 0; j < numTerms; j++)
            {
                int[] pos1 = tpv1.GetTermPositions(j);
                int[] pos2 = tpv2.GetTermPositions(j);
                Assert.AreEqual(pos1.Length, pos2.Length);
                TermVectorOffsetInfo[] offsets1 = tpv1.GetOffsets(j);
                TermVectorOffsetInfo[] offsets2 = tpv2.GetOffsets(j);
                if (offsets1 == null)
                {
                    Assert.IsTrue(offsets2 == null);
                }
                else
                {
                    Assert.IsTrue(offsets2 != null);
                }
                for (int k = 0; k < pos1.Length; k++)
                {
                    Assert.AreEqual(pos1[k], pos2[k]);
                    if (offsets1 != null)
                    {
                        Assert.AreEqual(offsets1[k].StartOffset, offsets2[k].StartOffset);
                        Assert.AreEqual(offsets1[k].EndOffset, offsets2[k].EndOffset);
                    }
                }
            }
        }
    }
}
/// <summary> Add a complete document specified by all its term vectors. If document has no
/// term vectors, add value for tvx.
/// </summary>
/// <param name="vectors"></param>
/// <throws> IOException </throws>
public void AddAllDocVectors(ITermFreqVector[] vectors)
{
    tvx.WriteLong(tvd.FilePointer);
    tvx.WriteLong(tvf.FilePointer);

    if (vectors != null)
    {
        int numFields = vectors.Length;
        tvd.WriteVInt(numFields);

        var fieldPointers = new long[numFields];

        for (int i = 0; i < numFields; i++)
        {
            fieldPointers[i] = tvf.FilePointer;

            int fieldNumber = fieldInfos.FieldNumber(vectors[i].Field);

            // 1st pass: write field numbers to tvd
            tvd.WriteVInt(fieldNumber);

            int numTerms = vectors[i].Size;
            tvf.WriteVInt(numTerms);

            TermPositionVector tpVector;

            byte bits;
            bool storePositions;
            bool storeOffsets;

            if (vectors[i] is TermPositionVector)
            {
                // May have positions & offsets
                tpVector = (TermPositionVector)vectors[i];
                storePositions = tpVector.Size > 0 && tpVector.GetTermPositions(0) != null;
                storeOffsets = tpVector.Size > 0 && tpVector.GetOffsets(0) != null;
                bits = (byte)((storePositions ? TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR : (byte)0) + (storeOffsets ? TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR : (byte)0));
            }
            else
            {
                tpVector = null;
                bits = 0;
                storePositions = false;
                storeOffsets = false;
            }

            tvf.WriteVInt(bits);

            System.String[] terms = vectors[i].GetTerms();
            int[] freqs = vectors[i].GetTermFrequencies();

            int utf8Upto = 0;
            utf8Results[1].length = 0;

            for (int j = 0; j < numTerms; j++)
            {
                UnicodeUtil.UTF16toUTF8(terms[j], 0, terms[j].Length, utf8Results[utf8Upto]);

                int start = StringHelper.BytesDifference(utf8Results[1 - utf8Upto].result, utf8Results[1 - utf8Upto].length, utf8Results[utf8Upto].result, utf8Results[utf8Upto].length);
                int length = utf8Results[utf8Upto].length - start;
                tvf.WriteVInt(start);                                        // write shared prefix length
                tvf.WriteVInt(length);                                       // write delta length
                tvf.WriteBytes(utf8Results[utf8Upto].result, start, length); // write delta bytes
                utf8Upto = 1 - utf8Upto;

                int termFreq = freqs[j];
                tvf.WriteVInt(termFreq);

                if (storePositions)
                {
                    int[] positions = tpVector.GetTermPositions(j);
                    if (positions == null)
                        throw new System.SystemException("Trying to write positions that are null!");
                    System.Diagnostics.Debug.Assert(positions.Length == termFreq);

                    // use delta encoding for positions
                    int lastPosition = 0;
                    foreach (int position in positions)
                    {
                        tvf.WriteVInt(position - lastPosition);
                        lastPosition = position;
                    }
                }

                if (storeOffsets)
                {
                    TermVectorOffsetInfo[] offsets = tpVector.GetOffsets(j);
                    if (offsets == null)
                        throw new System.SystemException("Trying to write offsets that are null!");
                    System.Diagnostics.Debug.Assert(offsets.Length == termFreq);

                    // use delta encoding for offsets
                    int lastEndOffset = 0;
                    foreach (TermVectorOffsetInfo t in offsets)
                    {
                        int startOffset = t.StartOffset;
                        int endOffset = t.EndOffset;
                        tvf.WriteVInt(startOffset - lastEndOffset);
                        tvf.WriteVInt(endOffset - startOffset);
                        lastEndOffset = endOffset;
                    }
                }
            }
        }

        // 2nd pass: write field pointers to tvd
        if (numFields > 1)
        {
            long lastFieldPointer = fieldPointers[0];
            for (int i = 1; i < numFields; i++)
            {
                long fieldPointer = fieldPointers[i];
                tvd.WriteVLong(fieldPointer - lastFieldPointer);
                lastFieldPointer = fieldPointer;
            }
        }
    }
    else
        tvd.WriteVInt(0);
}
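The writer above delta-encodes positions and offsets before writing them as VInts, so small gaps stay small on disk. An illustrative standalone fragment with made-up values:

// Positions [4, 10, 12] are written as the deltas 4, 6, 2.
int[] positions = { 4, 10, 12 };
int lastPosition = 0;
foreach (int position in positions)
{
    int delta = position - lastPosition; // this is the value tvf.WriteVInt receives above
    lastPosition = position;
}
// Offsets are encoded as (startOffset - lastEndOffset, endOffset - startOffset),
// so the pairs (0,3) then (5,9) become the VInts 0, 3, 2, 4.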
protected static BrowseHit[] BuildHits(MyScoreDoc[] scoreDocs, SortField[] sortFields, IDictionary<string, IFacetHandler> facetHandlerMap, bool fetchStoredFields, IEnumerable<string> termVectorsToFetch, IFacetHandler groupBy, CombinedFacetAccessible[] groupAccessibles)
{
    BrowseHit[] hits = new BrowseHit[scoreDocs.Length];
    IEnumerable<IFacetHandler> facetHandlers = facetHandlerMap.Values;
    for (int i = scoreDocs.Length - 1; i >= 0; i--)
    {
        MyScoreDoc fdoc = scoreDocs[i];
        BoboIndexReader reader = fdoc.reader;
        BrowseHit hit = new BrowseHit();
        if (fetchStoredFields)
        {
            hit.StoredFields = reader.Document(fdoc.Doc);
        }
        if (termVectorsToFetch != null && termVectorsToFetch.Count() > 0)
        {
            var tvMap = new Dictionary<string, BrowseHit.TermFrequencyVector>();
            hit.TermFreqMap = tvMap;
            foreach (string field in termVectorsToFetch)
            {
                ITermFreqVector tv = reader.GetTermFreqVector(fdoc.Doc, field);
                if (tv != null)
                {
                    int[] freqs = tv.GetTermFrequencies();
                    string[] terms = tv.GetTerms();
                    tvMap[field] = new BrowseHit.TermFrequencyVector(terms, freqs);
                }
            }
        }
        var map = new Dictionary<string, string[]>();
        var rawMap = new Dictionary<string, object[]>();
        foreach (var facetHandler in facetHandlers)
        {
            map[facetHandler.Name] = facetHandler.GetFieldValues(reader, fdoc.Doc);
            rawMap[facetHandler.Name] = facetHandler.GetRawFieldValues(reader, fdoc.Doc);
        }
        hit.FieldValues = map;
        hit.RawFieldValues = rawMap;
        hit.DocId = fdoc.Doc + fdoc.queue.@base;
        hit.Score = fdoc.Score;
        hit.Comparable = fdoc.Value;
        if (groupBy != null)
        {
            hit.GroupField = groupBy.Name;
            hit.GroupValue = hit.GetField(groupBy.Name);
            hit.RawGroupValue = hit.GetRawField(groupBy.Name);
            if (groupAccessibles != null && groupAccessibles.Length > 0 && hit.GroupValue != null)
            {
                BrowseFacet facet = groupAccessibles[0].GetFacet(hit.GroupValue);
                hit.GroupHitsCount = facet.FacetValueHitCount;
            }
        }
        hits[i] = hit;
    }
    return hits;
}
public virtual void TestTermPositionVectors()
{
    Query query = new TermQuery(new Term("field", "zero"));
    try
    {
        ScoreDoc[] hits = searcher.Search(query, null, 1000).ScoreDocs;
        Assert.AreEqual(1, hits.Length);

        for (int i = 0; i < hits.Length; i++)
        {
            ITermFreqVector[] vector = searcher.reader_ForNUnit.GetTermFreqVectors(hits[i].Doc);
            Assert.IsTrue(vector != null);
            Assert.IsTrue(vector.Length == 1);

            bool shouldBePosVector = hits[i].Doc % 2 == 0;
            Assert.IsTrue(!shouldBePosVector || vector[0] is TermPositionVector);

            bool shouldBeOffVector = hits[i].Doc % 3 == 0;
            Assert.IsTrue(!shouldBeOffVector || vector[0] is TermPositionVector);

            if (shouldBePosVector || shouldBeOffVector)
            {
                TermPositionVector posVec = (TermPositionVector)vector[0];
                System.String[] terms = posVec.GetTerms();
                Assert.IsTrue(terms != null && terms.Length > 0);

                for (int j = 0; j < terms.Length; j++)
                {
                    int[] positions = posVec.GetTermPositions(j);
                    TermVectorOffsetInfo[] offsets = posVec.GetOffsets(j);

                    if (shouldBePosVector)
                    {
                        Assert.IsTrue(positions != null);
                        Assert.IsTrue(positions.Length > 0);
                    }
                    else
                    {
                        Assert.IsTrue(positions == null);
                    }

                    if (shouldBeOffVector)
                    {
                        Assert.IsTrue(offsets != null);
                        Assert.IsTrue(offsets.Length > 0);
                    }
                    else
                    {
                        Assert.IsTrue(offsets == null);
                    }
                }
            }
            else
            {
                try
                {
                    TermPositionVector posVec = (TermPositionVector)vector[0];
                    Assert.IsTrue(false);
                }
                catch (System.InvalidCastException)
                {
                    ITermFreqVector freqVec = vector[0];
                    System.String[] terms = freqVec.GetTerms();
                    Assert.IsTrue(terms != null && terms.Length > 0);
                }
            }
        }
    }
    catch (System.IO.IOException)
    {
        Assert.IsTrue(false);
    }
}
private void TestTermVectors()
{
    // check:
    int numDocs = reader.NumDocs();
    long start = 0L;
    for (int docId = 0; docId < numDocs; docId++)
    {
        start = DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond;
        ITermFreqVector[] vectors = reader.GetTermFreqVectors(docId);
        timeElapsed += (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond) - start;

        // verify vectors result
        VerifyVectors(vectors, docId);

        start = DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond;
        ITermFreqVector vector = reader.GetTermFreqVector(docId, "field");
        timeElapsed += (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond) - start;

        vectors = new ITermFreqVector[1];
        vectors[0] = vector;
        VerifyVectors(vectors, docId);
    }
}
public virtual void TestKnownSetOfDocuments()
{
    System.String test1 = "eating chocolate in a computer lab"; // 6 terms
    System.String test2 = "computer in a computer lab"; // 5 terms
    System.String test3 = "a chocolate lab grows old"; // 5 terms
    System.String test4 = "eating chocolate with a chocolate lab in an old chocolate colored computer lab"; // 13 terms

    System.Collections.IDictionary test4Map = new System.Collections.Hashtable();
    test4Map["chocolate"] = 3;
    test4Map["lab"] = 2;
    test4Map["eating"] = 1;
    test4Map["computer"] = 1;
    test4Map["with"] = 1;
    test4Map["a"] = 1;
    test4Map["colored"] = 1;
    test4Map["in"] = 1;
    test4Map["an"] = 1;
    test4Map["old"] = 1;

    Document testDoc1 = new Document();
    SetupDoc(testDoc1, test1);
    Document testDoc2 = new Document();
    SetupDoc(testDoc2, test2);
    Document testDoc3 = new Document();
    SetupDoc(testDoc3, test3);
    Document testDoc4 = new Document();
    SetupDoc(testDoc4, test4);

    Directory dir = new MockRAMDirectory();

    try
    {
        IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
        Assert.IsTrue(writer != null);
        writer.AddDocument(testDoc1);
        writer.AddDocument(testDoc2);
        writer.AddDocument(testDoc3);
        writer.AddDocument(testDoc4);
        writer.Close();

        IndexSearcher knownSearcher = new IndexSearcher(dir, true);
        TermEnum termEnum = knownSearcher.reader_ForNUnit.Terms();
        TermDocs termDocs = knownSearcher.reader_ForNUnit.TermDocs();
        //System.out.println("Terms: " + termEnum.size() + " Orig Len: " + termArray.length);

        Similarity sim = knownSearcher.Similarity;
        while (termEnum.Next() == true)
        {
            Term term = termEnum.Term;
            //System.out.println("Term: " + term);
            termDocs.Seek(term);
            while (termDocs.Next())
            {
                int docId = termDocs.Doc;
                int freq = termDocs.Freq;
                //System.out.println("Doc Id: " + docId + " freq " + freq);
                ITermFreqVector vector = knownSearcher.reader_ForNUnit.GetTermFreqVector(docId, "field");
                float tf = sim.Tf(freq);
                float idf = sim.Idf(knownSearcher.DocFreq(term), knownSearcher.MaxDoc);
                //float qNorm = sim.queryNorm()
                // This is fine since we don't have stop words
                float lNorm = sim.LengthNorm("field", vector.GetTerms().Length);
                //float coord = sim.coord()
                //System.out.println("TF: " + tf + " IDF: " + idf + " LenNorm: " + lNorm);
                Assert.IsTrue(vector != null);
                System.String[] vTerms = vector.GetTerms();
                int[] freqs = vector.GetTermFrequencies();
                for (int i = 0; i < vTerms.Length; i++)
                {
                    if (term.Text.Equals(vTerms[i]))
                    {
                        Assert.IsTrue(freqs[i] == freq);
                    }
                }
            }
            //System.out.println("--------");
        }

        Query query = new TermQuery(new Term("field", "chocolate"));
        ScoreDoc[] hits = knownSearcher.Search(query, null, 1000).ScoreDocs;
        // doc 3 should be the first hit b/c it is the shortest match
        Assert.IsTrue(hits.Length == 3);
        float score = hits[0].Score;
        /*System.out.println("Hit 0: " + hits.id(0) + " Score: " + hits.score(0) + " String: " + hits.doc(0).toString());
        System.out.println("Explain: " + knownSearcher.explain(query, hits.id(0)));
        System.out.println("Hit 1: " + hits.id(1) + " Score: " + hits.score(1) + " String: " + hits.doc(1).toString());
        System.out.println("Explain: " + knownSearcher.explain(query, hits.id(1)));
        System.out.println("Hit 2: " + hits.id(2) + " Score: " + hits.score(2) + " String: " + hits.doc(2).toString());
        System.out.println("Explain: " + knownSearcher.explain(query, hits.id(2)));*/
        Assert.IsTrue(hits[0].Doc == 2);
        Assert.IsTrue(hits[1].Doc == 3);
        Assert.IsTrue(hits[2].Doc == 0);

        ITermFreqVector vector2 = knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].Doc, "field");
        Assert.IsTrue(vector2 != null);
        //System.out.println("Vector: " + vector);
        System.String[] terms = vector2.GetTerms();
        int[] freqs2 = vector2.GetTermFrequencies();
        Assert.IsTrue(terms != null && terms.Length == 10);
        for (int i = 0; i < terms.Length; i++)
        {
            System.String term = terms[i];
            //System.out.println("Term: " + term);
            int freq = freqs2[i];
            Assert.IsTrue(test4.IndexOf(term) != -1);
            System.Int32 freqInt = -1;
            try
            {
                freqInt = (System.Int32)test4Map[term];
            }
            catch (Exception)
            {
                Assert.IsTrue(false);
            }
            Assert.IsTrue(freqInt == freq);
        }

        SortedTermVectorMapper mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
        knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].Doc, mapper);
        var vectorEntrySet = mapper.TermVectorEntrySet;
        Assert.IsTrue(vectorEntrySet.Count == 10, "mapper.getTermVectorEntrySet() Size: " + vectorEntrySet.Count + " is not: " + 10);
        TermVectorEntry last = null;
        foreach (TermVectorEntry tve in vectorEntrySet)
        {
            if (tve != null && last != null)
            {
                Assert.IsTrue(last.Frequency >= tve.Frequency, "terms are not properly sorted");
                System.Int32 expectedFreq = (System.Int32)test4Map[tve.Term];
                // we expect double the expectedFreq, since there are two fields with the exact same text and we are collapsing all fields
                Assert.IsTrue(tve.Frequency == 2 * expectedFreq, "Frequency is not correct:");
            }
            last = tve;
        }

        FieldSortedTermVectorMapper fieldMapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
        knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].Doc, fieldMapper);
        var map = fieldMapper.FieldToTerms;
        Assert.IsTrue(map.Count == 2, "map Size: " + map.Count + " is not: " + 2);
        vectorEntrySet = map["field"];
        Assert.IsTrue(vectorEntrySet != null, "vectorEntrySet is null and it shouldn't be");
        Assert.IsTrue(vectorEntrySet.Count == 10, "vectorEntrySet Size: " + vectorEntrySet.Count + " is not: " + 10);
        knownSearcher.Close();
    }
    catch (System.IO.IOException e)
    {
        System.Console.Error.WriteLine(e.StackTrace);
        Assert.IsTrue(false);
    }
}