/// <summary>
/// Populates the term-vector dialog: shows the field name and lists every
/// term of the vector with its frequency, sorted alphabetically by term.
/// </summary>
/// <param name="fieldName">Name of the field whose vector is displayed.</param>
/// <param name="tfv">The term frequency vector to display.</param>
public TermVector(string fieldName, TermFreqVector tfv)
{
    //
    // Required for Windows Form Designer support
    //
    InitializeComponent();

    lblField.Text = fieldName;

    // Pair each term with its frequency for display.
    List<TermFrequency> tvs = new List<TermFrequency>(tfv.Size());
    String[] terms = tfv.GetTerms();
    int[] freqs = tfv.GetTermFrequencies();
    for (int i = 0; i < terms.Length; i++)
    {
        tvs.Add(new TermFrequency(terms[i], freqs[i]));
    }

    // BUG FIX: OrderBy is a pure LINQ operator that returns a NEW sorted
    // sequence; the original called it and discarded the result, so the
    // list was displayed unsorted. Capture the sorted result.
    tvs = tvs.OrderBy(p => p.Term).ToList();

    listViewTVF.BeginUpdate();
    foreach (TermFrequency tf in tvs)
    {
        ListViewItem item = new ListViewItem(new string[] { tf.Frequency.ToString(), tf.Term });
        listViewTVF.Items.Add(item);
    }
    listViewTVF.EndUpdate();
}
// Collects the term frequency vector of document n for every field, one per
// sub-reader; fields with no stored vector are skipped.
public override TermFreqVector[] GetTermFreqVectors(int n)
{
    EnsureOpen();
    System.Collections.ArrayList found = new System.Collections.ArrayList();
    // Iterate over a fresh Hashtable copy of fieldToReader, exactly as the
    // original did (snapshot of the field->reader mapping).
    foreach (System.Collections.DictionaryEntry entry in new System.Collections.Hashtable(fieldToReader))
    {
        System.String fieldName = (System.String) entry.Key;
        IndexReader fieldReader = (IndexReader) entry.Value;
        TermFreqVector tfv = fieldReader.GetTermFreqVector(n, fieldName);
        if (tfv != null)
        {
            found.Add(tfv);
        }
    }
    return (TermFreqVector[]) found.ToArray(typeof(TermFreqVector));
}
// Collects the term frequency vector of document n for every field, one per
// sub-reader; fields with no stored vector are skipped.
public override TermFreqVector[] GetTermFreqVectors(int n)
{
    EnsureOpen();
    List<TermFreqVector> found = new List<TermFreqVector>();
    foreach (KeyValuePair<string, IndexReader> entry in fieldToReader)
    {
        TermFreqVector tfv = entry.Value.GetTermFreqVector(n, entry.Key);
        if (tfv != null)
        {
            found.Add(tfv);
        }
    }
    return found.ToArray();
}
// Verifies that stored documents read back with the expected number of
// fields (total fields minus the unstored ones) and that a term vector and
// norms are available.
public virtual void DoTestDocument()
{
    sis.Read(dir);
    IndexReader reader = OpenReader();
    Assert.IsTrue(reader != null);

    // Document 0: stored field count excludes the unstored fields.
    Document storedDoc0 = reader.Document(0);
    Assert.IsTrue(storedDoc0 != null);
    Assert.IsTrue(DocHelper.NumFields(storedDoc0) == DocHelper.NumFields(doc1) - DocHelper.unstored.Count);

    // Document 1: same invariant.
    Document storedDoc1 = reader.Document(1);
    Assert.IsTrue(storedDoc1 != null);
    Assert.IsTrue(DocHelper.NumFields(storedDoc1) == DocHelper.NumFields(doc2) - DocHelper.unstored.Count);

    // The text field of document 0 must expose a term vector.
    TermFreqVector tfv = reader.GetTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY);
    Assert.IsTrue(tfv != null);

    TestSegmentReader.CheckNorms(reader);
}
// Collects the term frequency vector of document n for every field, one per
// sub-reader. The fieldToReader map stores field name as key and the
// owning IndexReader as value.
public override TermFreqVector[] GetTermFreqVectors(int n)
{
    System.Collections.ArrayList found = new System.Collections.ArrayList();
    // Iterate over a fresh Hashtable copy of fieldToReader, exactly as the
    // original did (snapshot of the field->reader mapping).
    foreach (System.Collections.DictionaryEntry entry in new System.Collections.Hashtable(fieldToReader))
    {
        System.String fieldName = (System.String) entry.Key;
        IndexReader fieldReader = (IndexReader) entry.Value;
        TermFreqVector tfv = fieldReader.GetTermFreqVector(n, fieldName);
        if (tfv != null)
        {
            found.Add(tfv);
        }
    }
    return (TermFreqVector[]) found.ToArray(typeof(TermFreqVector));
}
/// <summary>
/// Opens a modal dialog showing the term vector of the currently selected
/// field for the document number typed into textDocNum. Shows a status
/// message instead when no field is selected, no index is open, the
/// document number is not a valid integer, or the field has no vector.
/// </summary>
internal void ShowTV()
{
    if (listDocFields.SelectedItems.Count == 0)
    {
        return;
    }
    if (_luke.IndexReader == null)
    {
        _luke.ShowStatus(_luke.resources.GetString("NoIndex"));
        return;
    }

    // IMPROVED: use TryParse instead of catching the exception thrown by
    // Int32.Parse — same outcome for invalid input, no exception-based
    // control flow.
    int docId;
    if (!Int32.TryParse(textDocNum.Text, out docId))
    {
        _luke.ShowStatus(_luke.resources.GetString("DocNotSelected"));
        return;
    }

    try
    {
        // The list displays field names wrapped in a delimiter character on
        // each side; strip the first and last character to get the raw name.
        string fieldName = listDocFields.SelectedItems[0].SubItems[0].Text;
        fieldName = fieldName.Substring(1, fieldName.Length - 2);
        TermFreqVector tfv = _luke.IndexReader.GetTermFreqVector(docId, fieldName);
        if (tfv == null)
        {
            _luke.ShowStatus(_luke.resources.GetString("NoTV"));
            return;
        }
        TermVector tvDialog = new TermVector(fieldName, tfv);
        tvDialog.ShowDialog(this);
    }
    catch (Exception exc)
    {
        // Any failure (index access, dialog) is reported in the status bar.
        _luke.ShowStatus(exc.Message);
    }
}
// Checks that the text field of document 0 yields a 3-term vector whose
// terms all occur in the source text with positive frequencies, and that
// the document exposes 4 vectors in total.
public virtual void TestTermVectors()
{
    TermFreqVector tfv = reader.GetTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY);
    Assert.IsTrue(tfv != null);

    System.String[] vectorTerms = tfv.GetTerms();
    int[] vectorFreqs = tfv.GetTermFrequencies();
    Assert.IsTrue(vectorTerms != null && vectorTerms.Length == 3 && vectorFreqs != null && vectorFreqs.Length == 3);

    for (int idx = 0; idx < vectorTerms.Length; idx++)
    {
        // Every vector term must appear in the original field text.
        Assert.IsTrue(DocHelper.FIELD_2_TEXT.IndexOf(vectorTerms[idx]) != -1);
        Assert.IsTrue(vectorFreqs[idx] > 0);
    }

    TermFreqVector[] allVectors = reader.GetTermFreqVectors(0);
    Assert.IsTrue(allVectors != null);
    Assert.IsTrue(allVectors.Length == 4, "We do not have 4 term freq vectors, we have: " + allVectors.Length);
}
// Reads the test field's vector for five documents and verifies the terms
// match the expected test terms, in order.
public virtual void TestReader()
{
    TermVectorsReader tvReader = new TermVectorsReader(dir, seg, fieldInfos);
    Assert.IsTrue(tvReader != null);
    for (int docNum = 0; docNum < 5; docNum++)
    {
        TermFreqVector tfv = tvReader.Get(docNum, testFields[0]);
        Assert.IsTrue(tfv != null);
        System.String[] actualTerms = tfv.GetTerms();
        Assert.IsTrue(actualTerms != null);
        Assert.IsTrue(actualTerms.Length == testTerms.Length);
        for (int t = 0; t < actualTerms.Length; t++)
        {
            Assert.IsTrue(actualTerms[t].Equals(testTerms[t]));
        }
    }
}
// Exercises the reader's failure modes: an out-of-range document number
// must throw IOException (with and without a field), while a valid
// document with an unknown field must simply return null.
public virtual void TestBadParams()
{
    try
    {
        TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
        Assert.IsTrue(reader != null);
        //Bad document number, good field number
        reader.Get(50, testFields[0]);
        Assert.Fail();
    }
    // IMPROVED: exception variable was unused in all three catches (CS0168
    // compiler warning); catch by type only.
    catch (System.IO.IOException)
    {
        // expected exception
    }
    try
    {
        TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
        Assert.IsTrue(reader != null);
        //Bad document number, no field
        reader.Get(50);
        Assert.Fail();
    }
    catch (System.IO.IOException)
    {
        // expected exception
    }
    try
    {
        TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
        Assert.IsTrue(reader != null);
        //good document number, bad field number
        TermFreqVector vector = reader.Get(0, "f50");
        Assert.IsTrue(vector == null);
    }
    catch (System.IO.IOException)
    {
        Assert.Fail();
    }
}
// Verifies that a MultiReader returns stored documents with the expected
// field counts (two fields fewer than the full documents) and exposes a
// term vector for document 0's text field.
public virtual void TestDocument()
{
    try
    {
        sis.Read(dir);
        MultiReader multiReader = new MultiReader(dir, sis, false, readers);
        Assert.IsTrue(multiReader != null);

        Document storedDoc0 = multiReader.Document(0);
        Assert.IsTrue(storedDoc0 != null);
        Assert.IsTrue(DocHelper.NumFields(storedDoc0) == DocHelper.NumFields(doc1) - 2);

        Document storedDoc1 = multiReader.Document(1);
        Assert.IsTrue(storedDoc1 != null);
        Assert.IsTrue(DocHelper.NumFields(storedDoc1) == DocHelper.NumFields(doc2) - 2);

        TermFreqVector tfv = multiReader.GetTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY);
        Assert.IsTrue(tfv != null);
    }
    catch (System.IO.IOException ioe)
    {
        // Any I/O failure is a test failure; dump the trace for diagnosis.
        System.Console.Error.WriteLine(ioe.StackTrace);
        Assert.IsTrue(false);
    }
}
// Reads document 0's vector for the first test field and verifies the
// stored terms match the expected test terms, in order.
public virtual void TestReader()
{
    try
    {
        TermVectorsReader tvReader = new TermVectorsReader(dir, seg, fieldInfos);
        Assert.IsTrue(tvReader != null);

        TermFreqVector tfv = tvReader.Get(0, testFields[0]);
        Assert.IsTrue(tfv != null);

        System.String[] actualTerms = tfv.GetTerms();
        Assert.IsTrue(actualTerms != null);
        Assert.IsTrue(actualTerms.Length == testTerms.Length);
        for (int t = 0; t < actualTerms.Length; t++)
        {
            Assert.IsTrue(actualTerms[t].Equals(testTerms[t]));
        }
    }
    catch (System.IO.IOException ioe)
    {
        // Any I/O failure is a test failure; dump the trace for diagnosis.
        System.Console.Error.WriteLine(ioe.StackTrace);
        Assert.IsTrue(false);
    }
}
/// <summary>
/// Asserts that two arrays of term frequency vectors are equivalent:
/// same length, same terms and frequencies per vector, and — when the
/// vectors carry positions — same positions and offsets.
/// </summary>
/// <param name="d1">Expected vectors (may be null).</param>
/// <param name="d2">Actual vectors (must be null iff d1 is null).</param>
public static void VerifyEquals(TermFreqVector[] d1, TermFreqVector[] d2)
{
    if (d1 == null)
    {
        Assert.IsTrue(d2 == null);
        return ;
    }
    Assert.IsTrue(d2 != null);

    Assert.AreEqual(d1.Length, d2.Length);
    for (int i = 0; i < d1.Length; i++)
    {
        TermFreqVector v1 = d1[i];
        TermFreqVector v2 = d2[i];
        if (v1 == null || v2 == null)
        {
            System.Console.Out.WriteLine("v1=" + v1 + " v2=" + v2 + " i=" + i + " of " + d1.Length);
            // BUG FIX: the original printed this diagnostic and then fell
            // through to v1.Size(), raising NullReferenceException instead
            // of a clean assertion failure. Two nulls compare equal; a
            // single null is a mismatch.
            if (v1 == null && v2 == null)
                continue;
            Assert.Fail("term freq vector mismatch at index " + i);
        }
        Assert.AreEqual(v1.Size(), v2.Size());

        int numTerms = v1.Size();
        System.String[] terms1 = v1.GetTerms();
        System.String[] terms2 = v2.GetTerms();
        int[] freq1 = v1.GetTermFrequencies();
        int[] freq2 = v2.GetTermFrequencies();
        for (int j = 0; j < numTerms; j++)
        {
            if (!terms1[j].Equals(terms2[j]))
                Assert.AreEqual(terms1[j], terms2[j]);
            Assert.AreEqual(freq1[j], freq2[j]);
        }

        if (v1 is TermPositionVector)
        {
            Assert.IsTrue(v2 is TermPositionVector);
            TermPositionVector tpv1 = (TermPositionVector) v1;
            TermPositionVector tpv2 = (TermPositionVector) v2;
            for (int j = 0; j < numTerms; j++)
            {
                int[] pos1 = tpv1.GetTermPositions(j);
                int[] pos2 = tpv2.GetTermPositions(j);
                Assert.AreEqual(pos1.Length, pos2.Length);
                TermVectorOffsetInfo[] offsets1 = tpv1.GetOffsets(j);
                TermVectorOffsetInfo[] offsets2 = tpv2.GetOffsets(j);
                // Offsets are optional: both sides must agree on presence.
                if (offsets1 == null)
                    Assert.IsTrue(offsets2 == null);
                else
                    Assert.IsTrue(offsets2 != null);
                for (int k = 0; k < pos1.Length; k++)
                {
                    Assert.AreEqual(pos1[k], pos2[k]);
                    if (offsets1 != null)
                    {
                        Assert.AreEqual(offsets1[k].GetStartOffset(), offsets2[k].GetStartOffset());
                        Assert.AreEqual(offsets1[k].GetEndOffset(), offsets2[k].GetEndOffset());
                    }
                }
            }
        }
    }
}
// Indexes four known documents and verifies, end to end: per-term
// frequencies from the postings match the term vectors, hit ordering of a
// "chocolate" query, the contents of doc test4's vector, and the behavior
// of the sorted and field-sorted TermVectorMapper implementations.
public virtual void TestKnownSetOfDocuments()
{
    System.String test1 = "eating chocolate in a computer lab"; //6 terms
    System.String test2 = "computer in a computer lab"; //5 terms
    System.String test3 = "a chocolate lab grows old"; //5 terms
    System.String test4 = "eating chocolate with a chocolate lab in an old chocolate colored computer lab"; //13 terms
    // Expected frequency of each distinct term of test4 (10 distinct terms).
    System.Collections.IDictionary test4Map = new System.Collections.Hashtable();
    test4Map["chocolate"] = 3;
    test4Map["lab"] = 2;
    test4Map["eating"] = 1;
    test4Map["computer"] = 1;
    test4Map["with"] = 1;
    test4Map["a"] = 1;
    test4Map["colored"] = 1;
    test4Map["in"] = 1;
    test4Map["an"] = 1;
    // NOTE(review): "computer" is assigned twice (same value both times);
    // harmless overwrite, the map still holds 10 distinct keys.
    test4Map["computer"] = 1;
    test4Map["old"] = 1;
    Document testDoc1 = new Document();
    SetupDoc(testDoc1, test1);
    Document testDoc2 = new Document();
    SetupDoc(testDoc2, test2);
    Document testDoc3 = new Document();
    SetupDoc(testDoc3, test3);
    Document testDoc4 = new Document();
    SetupDoc(testDoc4, test4);
    Directory dir = new MockRAMDirectory();
    try
    {
        IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
        Assert.IsTrue(writer != null);
        writer.AddDocument(testDoc1);
        writer.AddDocument(testDoc2);
        writer.AddDocument(testDoc3);
        writer.AddDocument(testDoc4);
        writer.Close();
        IndexSearcher knownSearcher = new IndexSearcher(dir);
        TermEnum termEnum = knownSearcher.reader_ForNUnit.Terms();
        TermDocs termDocs = knownSearcher.reader_ForNUnit.TermDocs();
        //System.out.println("Terms: " + termEnum.size() + " Orig Len: " + termArray.length);

        Similarity sim = knownSearcher.GetSimilarity();
        // Walk every term's postings and cross-check the postings frequency
        // against the frequency stored in the document's term vector.
        while (termEnum.Next() == true)
        {
            Term term = termEnum.Term();
            //System.out.println("Term: " + term);
            termDocs.Seek(term);
            while (termDocs.Next())
            {
                int docId = termDocs.Doc();
                int freq = termDocs.Freq();
                //System.out.println("Doc Id: " + docId + " freq " + freq);
                TermFreqVector vector = knownSearcher.reader_ForNUnit.GetTermFreqVector(docId, "field");
                float tf = sim.Tf(freq);
                float idf = sim.Idf(term, knownSearcher);
                //float qNorm = sim.queryNorm()
                //This is fine since we don't have stop words
                float lNorm = sim.LengthNorm("field", vector.GetTerms().Length);
                //float coord = sim.coord()
                //System.out.println("TF: " + tf + " IDF: " + idf + " LenNorm: " + lNorm);
                Assert.IsTrue(vector != null);
                System.String[] vTerms = vector.GetTerms();
                int[] freqs = vector.GetTermFrequencies();
                for (int i = 0; i < vTerms.Length; i++)
                {
                    if (term.Text().Equals(vTerms[i]))
                    {
                        // Vector frequency must equal postings frequency.
                        Assert.IsTrue(freqs[i] == freq);
                    }
                }
            }
            //System.out.println("--------");
        }
        Query query = new TermQuery(new Term("field", "chocolate"));
        ScoreDoc[] hits = knownSearcher.Search(query, null, 1000).scoreDocs;
        //doc 3 should be the first hit b/c it is the shortest match
        Assert.IsTrue(hits.Length == 3);
        float score = hits[0].score;
        /*System.out.println("Hit 0: " + hits.id(0) + " Score: " + hits.score(0) + " String: " + hits.doc(0).toString());
         * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(0)));
         * System.out.println("Hit 1: " + hits.id(1) + " Score: " + hits.score(1) + " String: " + hits.doc(1).toString());
         * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(1)));
         * System.out.println("Hit 2: " + hits.id(2) + " Score: " + hits.score(2) + " String: " + hits.doc(2).toString());
         * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(2)));*/
        Assert.IsTrue(hits[0].doc == 2);
        Assert.IsTrue(hits[1].doc == 3);
        Assert.IsTrue(hits[2].doc == 0);
        // hits[1] is testDoc4; its vector must contain exactly the 10
        // distinct terms of test4 with the frequencies from test4Map.
        TermFreqVector vector2 = knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].doc, "field");
        Assert.IsTrue(vector2 != null);
        //System.out.println("Vector: " + vector);
        System.String[] terms = vector2.GetTerms();
        int[] freqs2 = vector2.GetTermFrequencies();
        Assert.IsTrue(terms != null && terms.Length == 10);
        for (int i = 0; i < terms.Length; i++)
        {
            System.String term = terms[i];
            //System.out.println("Term: " + term);
            int freq = freqs2[i];
            Assert.IsTrue(test4.IndexOf(term) != -1);
            System.Int32 freqInt = -1;
            try
            {
                // The cast throws (caught below, failing the test) when the
                // term is unexpectedly absent from the map.
                freqInt = (System.Int32)test4Map[term];
            }
            catch (Exception)
            {
                Assert.IsTrue(false);
            }
            Assert.IsTrue(freqInt == freq);
        }
        // Mapper API: entries must come back sorted by descending frequency,
        // and frequencies are doubled because two fields hold the same text.
        SortedTermVectorMapper mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
        knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].doc, mapper);
        System.Collections.Generic.SortedDictionary<object, object> vectorEntrySet = mapper.GetTermVectorEntrySet();
        Assert.IsTrue(vectorEntrySet.Count == 10, "mapper.getTermVectorEntrySet() Size: " + vectorEntrySet.Count + " is not: " + 10);
        TermVectorEntry last = null;
        foreach (TermVectorEntry tve in vectorEntrySet.Keys)
        {
            if (tve != null && last != null)
            {
                Assert.IsTrue(last.GetFrequency() >= tve.GetFrequency(), "terms are not properly sorted");
                System.Int32 expectedFreq = (System.Int32)test4Map[tve.GetTerm()];
                //we expect double the expectedFreq, since there are two fields with the exact same text and we are collapsing all fields
                Assert.IsTrue(tve.GetFrequency() == 2 * expectedFreq, "Frequency is not correct:");
            }
            last = tve;
        }
        // Field-sorted mapper: one entry set per field; "field" must again
        // yield the same 10 entries.
        FieldSortedTermVectorMapper fieldMapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
        knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].doc, fieldMapper);
        System.Collections.IDictionary map = fieldMapper.GetFieldToTerms();
        Assert.IsTrue(map.Count == 2, "map Size: " + map.Count + " is not: " + 2);
        vectorEntrySet = (System.Collections.Generic.SortedDictionary<Object, Object>)map["field"];
        Assert.IsTrue(vectorEntrySet != null, "vectorEntrySet is null and it shouldn't be");
        Assert.IsTrue(vectorEntrySet.Count == 10, "vectorEntrySet Size: " + vectorEntrySet.Count + " is not: " + 10);
        knownSearcher.Close();
    }
    catch (System.IO.IOException e)
    {
        System.Console.Error.WriteLine(e.StackTrace);
        Assert.IsTrue(false);
    }
}
// Verifies that documents store positions when doc%2==0 and offsets when
// doc%3==0: a vector with either must be a TermPositionVector exposing
// non-empty positions/offsets exactly where expected, while a plain vector
// must refuse the cast yet still expose its terms.
public virtual void TestTermPositionVectors()
{
    Query query = new TermQuery(new Term("field", "zero"));
    try
    {
        ScoreDoc[] hits = searcher.Search(query, null, 1000).scoreDocs;
        Assert.AreEqual(1, hits.Length);

        for (int h = 0; h < hits.Length; h++)
        {
            TermFreqVector[] vectors = searcher.reader_ForNUnit.GetTermFreqVectors(hits[h].doc);
            Assert.IsTrue(vectors != null);
            Assert.IsTrue(vectors.Length == 1);

            bool expectPositions = hits[h].doc % 2 == 0;
            Assert.IsTrue(!expectPositions || vectors[0] is TermPositionVector);

            bool expectOffsets = hits[h].doc % 3 == 0;
            Assert.IsTrue(!expectOffsets || vectors[0] is TermPositionVector);

            if (expectPositions || expectOffsets)
            {
                TermPositionVector tpv = (TermPositionVector) vectors[0];
                System.String[] tpvTerms = tpv.GetTerms();
                Assert.IsTrue(tpvTerms != null && tpvTerms.Length > 0);

                for (int t = 0; t < tpvTerms.Length; t++)
                {
                    int[] positions = tpv.GetTermPositions(t);
                    TermVectorOffsetInfo[] offsets = tpv.GetOffsets(t);

                    if (expectPositions)
                    {
                        Assert.IsTrue(positions != null);
                        Assert.IsTrue(positions.Length > 0);
                    }
                    else
                    {
                        Assert.IsTrue(positions == null);
                    }

                    if (expectOffsets)
                    {
                        Assert.IsTrue(offsets != null);
                        Assert.IsTrue(offsets.Length > 0);
                    }
                    else
                    {
                        Assert.IsTrue(offsets == null);
                    }
                }
            }
            else
            {
                // Neither positions nor offsets: the cast must fail, but the
                // plain frequency vector is still readable.
                try
                {
                    TermPositionVector tpv = (TermPositionVector) vectors[0];
                    Assert.IsTrue(false);
                }
                catch (System.InvalidCastException ignore)
                {
                    TermFreqVector plainVector = vectors[0];
                    System.String[] plainTerms = plainVector.GetTerms();
                    Assert.IsTrue(plainTerms != null && plainTerms.Length > 0);
                }
            }
        }
    }
    catch (System.IO.IOException e)
    {
        Assert.IsTrue(false);
    }
}
/// <summary>
/// Asserts that two arrays of term frequency vectors are equivalent:
/// same length, same terms and frequencies per vector, and — when the
/// vectors carry positions — same positions and offsets.
/// </summary>
/// <param name="d1">Expected vectors (may be null).</param>
/// <param name="d2">Actual vectors (must be null iff d1 is null).</param>
public static void VerifyEquals(TermFreqVector[] d1, TermFreqVector[] d2)
{
    if (d1 == null)
    {
        Assert.IsTrue(d2 == null);
        return;
    }
    Assert.IsTrue(d2 != null);

    Assert.AreEqual(d1.Length, d2.Length);
    for (int i = 0; i < d1.Length; i++)
    {
        TermFreqVector v1 = d1[i];
        TermFreqVector v2 = d2[i];
        if (v1 == null || v2 == null)
        {
            System.Console.Out.WriteLine("v1=" + v1 + " v2=" + v2 + " i=" + i + " of " + d1.Length);
            // BUG FIX: the original printed this diagnostic and then fell
            // through to v1.Size(), raising NullReferenceException instead
            // of a clean assertion failure. Two nulls compare equal; a
            // single null is a mismatch.
            if (v1 == null && v2 == null)
            {
                continue;
            }
            Assert.Fail("term freq vector mismatch at index " + i);
        }
        Assert.AreEqual(v1.Size(), v2.Size());

        int numTerms = v1.Size();
        System.String[] terms1 = v1.GetTerms();
        System.String[] terms2 = v2.GetTerms();
        int[] freq1 = v1.GetTermFrequencies();
        int[] freq2 = v2.GetTermFrequencies();
        for (int j = 0; j < numTerms; j++)
        {
            if (!terms1[j].Equals(terms2[j]))
            {
                Assert.AreEqual(terms1[j], terms2[j]);
            }
            Assert.AreEqual(freq1[j], freq2[j]);
        }

        if (v1 is TermPositionVector)
        {
            Assert.IsTrue(v2 is TermPositionVector);
            TermPositionVector tpv1 = (TermPositionVector)v1;
            TermPositionVector tpv2 = (TermPositionVector)v2;
            for (int j = 0; j < numTerms; j++)
            {
                int[] pos1 = tpv1.GetTermPositions(j);
                int[] pos2 = tpv2.GetTermPositions(j);
                Assert.AreEqual(pos1.Length, pos2.Length);
                TermVectorOffsetInfo[] offsets1 = tpv1.GetOffsets(j);
                TermVectorOffsetInfo[] offsets2 = tpv2.GetOffsets(j);
                // Offsets are optional: both sides must agree on presence.
                if (offsets1 == null)
                {
                    Assert.IsTrue(offsets2 == null);
                }
                else
                {
                    Assert.IsTrue(offsets2 != null);
                }
                for (int k = 0; k < pos1.Length; k++)
                {
                    Assert.AreEqual(pos1[k], pos2[k]);
                    if (offsets1 != null)
                    {
                        Assert.AreEqual(offsets1[k].GetStartOffset(), offsets2[k].GetStartOffset());
                        Assert.AreEqual(offsets1[k].GetEndOffset(), offsets2[k].GetEndOffset());
                    }
                }
            }
        }
    }
}
/// <summary>
/// Opens the vector's field, writes every (term, frequency) pair, and
/// closes the field again.
/// </summary>
/// <param name="vector">The term frequency vector to persist.</param>
private void AddTermFreqVectorInternal(TermFreqVector vector)
{
    OpenField(vector.GetField());
    // IMPROVED: hoist the loop-invariant array accessors — the original
    // re-fetched both arrays from the vector on every iteration.
    System.String[] terms = vector.GetTerms();
    int[] freqs = vector.GetTermFrequencies();
    int size = vector.Size();
    for (int i = 0; i < size; i++)
    {
        AddTermInternal(terms[i], freqs[i]);
    }
    CloseField();
}
// Indexes four known documents and verifies per-term frequencies from the
// postings against the stored term vectors, then checks hit ordering and
// the contents of doc test4's vector for a "chocolate" query.
public virtual void TestKnownSetOfDocuments()
{
    // NOTE(review): termArray is never used below (only referenced inside a
    // commented-out debug print); kept byte-identical here.
    System.String[] termArray = new System.String[] { "eating", "chocolate", "in", "a", "computer", "lab", "grows", "old", "colored", "with", "an" };
    System.String test1 = "eating chocolate in a computer lab"; //6 terms
    System.String test2 = "computer in a computer lab"; //5 terms
    System.String test3 = "a chocolate lab grows old"; //5 terms
    System.String test4 = "eating chocolate with a chocolate lab in an old chocolate colored computer lab"; //13 terms
    // Expected frequency of each distinct term of test4 (10 distinct terms).
    System.Collections.IDictionary test4Map = new System.Collections.Hashtable();
    test4Map["chocolate"] = 3;
    test4Map["lab"] = 2;
    test4Map["eating"] = 1;
    test4Map["computer"] = 1;
    test4Map["with"] = 1;
    test4Map["a"] = 1;
    test4Map["colored"] = 1;
    test4Map["in"] = 1;
    test4Map["an"] = 1;
    // NOTE(review): "computer" is assigned twice (same value both times);
    // harmless overwrite, the map still holds 10 distinct keys.
    test4Map["computer"] = 1;
    test4Map["old"] = 1;
    Document testDoc1 = new Document();
    SetupDoc(testDoc1, test1);
    Document testDoc2 = new Document();
    SetupDoc(testDoc2, test2);
    Document testDoc3 = new Document();
    SetupDoc(testDoc3, test3);
    Document testDoc4 = new Document();
    SetupDoc(testDoc4, test4);
    Directory dir = new RAMDirectory();
    try
    {
        IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true);
        Assert.IsTrue(writer != null);
        writer.AddDocument(testDoc1);
        writer.AddDocument(testDoc2);
        writer.AddDocument(testDoc3);
        writer.AddDocument(testDoc4);
        writer.Close();
        IndexSearcher knownSearcher = new IndexSearcher(dir);
        TermEnum termEnum = knownSearcher.reader.Terms();
        TermDocs termDocs = knownSearcher.reader.TermDocs();
        //System.out.println("Terms: " + termEnum.size() + " Orig Len: " + termArray.length);

        Similarity sim = knownSearcher.GetSimilarity();
        // Walk every term's postings and cross-check the postings frequency
        // against the frequency stored in the document's term vector.
        while (termEnum.Next() == true)
        {
            Term term = termEnum.Term();
            //System.out.println("Term: " + term);
            termDocs.Seek(term);
            while (termDocs.Next())
            {
                int docId = termDocs.Doc();
                int freq = termDocs.Freq();
                //System.out.println("Doc Id: " + docId + " freq " + freq);
                TermFreqVector vector = knownSearcher.reader.GetTermFreqVector(docId, "Field");
                float tf = sim.Tf(freq);
                float idf = sim.Idf(term, knownSearcher);
                //float qNorm = sim.queryNorm()
                //This is fine since we don't have stop words
                float lNorm = sim.LengthNorm("Field", vector.GetTerms().Length);
                //float coord = sim.coord()
                //System.out.println("TF: " + tf + " IDF: " + idf + " LenNorm: " + lNorm);
                Assert.IsTrue(vector != null);
                System.String[] vTerms = vector.GetTerms();
                int[] freqs = vector.GetTermFrequencies();
                for (int i = 0; i < vTerms.Length; i++)
                {
                    if (term.Text().Equals(vTerms[i]) == true)
                    {
                        // Vector frequency must equal postings frequency.
                        Assert.IsTrue(freqs[i] == freq);
                    }
                }
            }
            //System.out.println("--------");
        }
        Query query = new TermQuery(new Term("Field", "chocolate"));
        Hits hits = knownSearcher.Search(query);
        //doc 3 should be the first hit b/c it is the shortest match
        Assert.IsTrue(hits.Length() == 3);
        float score = hits.Score(0);
        /*System.out.println("Hit 0: " + hits.id(0) + " Score: " + hits.score(0) + " String: " + hits.doc(0).toString());
         * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(0)));
         * System.out.println("Hit 1: " + hits.id(1) + " Score: " + hits.score(1) + " String: " + hits.doc(1).toString());
         * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(1)));
         * System.out.println("Hit 2: " + hits.id(2) + " Score: " + hits.score(2) + " String: " + hits.doc(2).toString());
         * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(2)));*/
        Assert.IsTrue(testDoc3.ToString().Equals(hits.Doc(0).ToString()));
        Assert.IsTrue(testDoc4.ToString().Equals(hits.Doc(1).ToString()));
        Assert.IsTrue(testDoc1.ToString().Equals(hits.Doc(2).ToString()));
        // Second hit is testDoc4; its vector must contain exactly the 10
        // distinct terms of test4 with the frequencies from test4Map.
        TermFreqVector vector2 = knownSearcher.reader.GetTermFreqVector(hits.Id(1), "Field");
        Assert.IsTrue(vector2 != null);
        //System.out.println("Vector: " + vector);
        System.String[] terms = vector2.GetTerms();
        int[] freqs2 = vector2.GetTermFrequencies();
        Assert.IsTrue(terms != null && terms.Length == 10);
        for (int i = 0; i < terms.Length; i++)
        {
            System.String term = terms[i];
            //System.out.println("Term: " + term);
            int freq = freqs2[i];
            Assert.IsTrue(test4.IndexOf(term) != -1);
            // NOTE(review): the cast happens BEFORE the null check below —
            // if the term were missing from the map the unboxing of null
            // would throw NullReferenceException and the tmpFreqInt assert
            // would never run. Kept byte-identical; the ordering should be
            // swapped (fetch, assert non-null, then cast).
            System.Int32 freqInt = (System.Int32)test4Map[term];
            System.Object tmpFreqInt = test4Map[term];
            Assert.IsTrue(tmpFreqInt != null);
            Assert.IsTrue(freqInt == freq);
        }
        knownSearcher.Close();
    }
    catch (System.IO.IOException e)
    {
        System.Console.Error.WriteLine(e.StackTrace);
        Assert.IsTrue(false);
    }
}
/// <summary>
/// Finds posts similar to the given post: looks up the post's document,
/// reads the term vector of its "exact" field, and runs a SHOULD-combined
/// term query over those terms, returning up to itemsToReturn other posts
/// ranked by relevance. Returns an empty list on any failure.
/// </summary>
/// <param name="postid">Id of the reference post.</param>
/// <param name="itemsToReturn">Maximum number of similar posts to return.</param>
public List<Post> Similar(int postid, int itemsToReturn)
{
    List<Post> TList = new List<Post>();
    int docId = -1;
    IndexSearcher searcher = null;
    IndexReader reader = null;
    if (rd == null)
    {
        BuildIndex();
    }
    lck.AcquireReaderLock(ReaderTimeOut);
    try
    {
        Analyzer analyzer = GetAnalyzer();
        QueryParser parser = GetQueryParser(analyzer);
        parser.SetDefaultOperator(QueryParser.AND_OPERATOR);
        Query q = parser.Parse("postid:" + postid);
        searcher = new IndexSearcher(rd, true);
        //TODO
#pragma warning disable CS0618 // Type or member is obsolete
        Hits hits = searcher.Search(q);
#pragma warning restore CS0618 // Type or member is obsolete
        if (hits != null && hits.Length() > 0)
        {
            docId = hits.Id(0);
        }
        if (docId > -1)
        {
            reader = IndexReader.Open(rd, true);
            TermFreqVector tfv = reader.GetTermFreqVector(docId, "exact");
            // BUG FIX: GetTermFreqVector returns null when the document has
            // no vector for the field; the original dereferenced it and the
            // NullReferenceException was silently swallowed below. Guard
            // explicitly — the observable result (empty list) is unchanged.
            if (tfv != null)
            {
                BooleanQuery booleanQuery = new BooleanQuery();
                // Hoisted out of the loop: GetTerms() was re-fetched per term.
                string[] exactTerms = tfv.GetTerms();
                for (int j = 0; j < tfv.Size(); j++)
                {
                    TermQuery tq = new TermQuery(new Term("exact", exactTerms[j]));
                    booleanQuery.Add(tq, BooleanClause.Occur.SHOULD);
                }
                //TODO
#pragma warning disable CS0618 // Type or member is obsolete
                Hits similarhits = searcher.Search(booleanQuery, Sort.RELEVANCE);
#pragma warning restore CS0618 // Type or member is obsolete
                for (int i = 0; i < similarhits.Length(); i++)
                {
                    Document doc = similarhits.Doc(i);
                    // Skip the reference post itself.
                    if (similarhits.Id(i) != docId)
                    {
                        TList.Add(CreateFromDocument(doc, analyzer, null));
                    }
                    if (TList.Count >= itemsToReturn)
                    {
                        break;
                    }
                }
            }
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort: any index/parse failure yields an empty
        // result rather than propagating to the caller.
    }
    finally
    {
        if (searcher != null)
        {
            searcher.Close();
        }
        if (reader != null)
        {
            reader.Close();
        }
        lck.ReleaseReaderLock();
    }
    return (TList);
}
/// <summary> Add a complete document specified by all its term vectors. If document has no
/// term vectors, add value for tvx.
///
/// </summary>
/// <param name="vectors">All term vectors of the document; may be null.
/// </param>
/// <throws> IOException </throws>
public void AddAllDocVectors(TermFreqVector[] vectors)
{
    OpenDocument();
    if (vectors != null)
    {
        foreach (TermFreqVector v in vectors)
        {
            bool withPositions = false;
            bool withOffsets = false;
            if (v is TermPositionVector)
            {
                TermPositionVector posVector = (TermPositionVector) v;
                // Probe the first term to learn whether positions/offsets
                // were actually stored.
                if (posVector.Size() > 0 && posVector.GetTermPositions(0) != null)
                {
                    withPositions = true;
                }
                if (posVector.Size() > 0 && posVector.GetOffsets(0) != null)
                {
                    withOffsets = true;
                }
                FieldInfo info = fieldInfos.FieldInfo(posVector.GetField());
                OpenField(info.number, withPositions, withOffsets);
                for (int t = 0; t < posVector.Size(); t++)
                {
                    AddTermInternal(posVector.GetTerms()[t], posVector.GetTermFrequencies()[t], posVector.GetTermPositions(t), posVector.GetOffsets(t));
                }
                CloseField();
            }
            else
            {
                // Plain frequency vector: no positions or offsets to write.
                FieldInfo info = fieldInfos.FieldInfo(v.GetField());
                OpenField(info.number, withPositions, withOffsets);
                for (int t = 0; t < v.Size(); t++)
                {
                    AddTermInternal(v.GetTerms()[t], v.GetTermFrequencies()[t], null, null);
                }
                CloseField();
            }
        }
    }
    CloseDocument();
}
/// <summary> Add a complete document specified by all its term vectors. If document has no
/// term vectors, add value for tvx.
///
/// Format written (as visible below): tvx records the current tvd and tvf
/// file pointers; tvd records the field count, the field numbers, and
/// (delta-encoded) per-field tvf pointers; tvf records, per field, the term
/// count, a positions/offsets bits flag, then each term as a
/// prefix-compressed UTF-8 delta followed by its frequency and optional
/// delta-encoded positions and offsets.
/// </summary>
/// <param name="vectors">All term vectors of the document; may be null.
/// </param>
/// <throws> IOException </throws>
public void AddAllDocVectors(TermFreqVector[] vectors)
{
    // Record where this document's data begins in tvd and tvf.
    tvx.WriteLong(tvd.GetFilePointer());
    tvx.WriteLong(tvf.GetFilePointer());
    if (vectors != null)
    {
        int numFields = vectors.Length;
        tvd.WriteVInt(numFields);
        long[] fieldPointers = new long[numFields];
        for (int i = 0; i < numFields; i++)
        {
            fieldPointers[i] = tvf.GetFilePointer();
            int fieldNumber = fieldInfos.FieldNumber(vectors[i].GetField());
            // 1st pass: write field numbers to tvd
            tvd.WriteVInt(fieldNumber);
            int numTerms = vectors[i].Size();
            tvf.WriteVInt(numTerms);
            TermPositionVector tpVector;
            byte bits;
            bool storePositions;
            bool storeOffsets;
            if (vectors[i] is TermPositionVector)
            {
                // May have positions & offsets; probe the first term to find out.
                tpVector = (TermPositionVector) vectors[i];
                storePositions = tpVector.Size() > 0 && tpVector.GetTermPositions(0) != null;
                storeOffsets = tpVector.Size() > 0 && tpVector.GetOffsets(0) != null;
                bits = (byte) ((storePositions?TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR: (byte) 0) + (storeOffsets?TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR: (byte) 0));
            }
            else
            {
                tpVector = null;
                bits = 0;
                storePositions = false;
                storeOffsets = false;
            }
            tvf.WriteVInt(bits);
            System.String[] terms = vectors[i].GetTerms();
            int[] freqs = vectors[i].GetTermFrequencies();
            // Two UTF-8 scratch buffers are alternated so each term can be
            // prefix-compressed against the previous one; clear the "previous"
            // slot so the first term gets a zero-length shared prefix.
            int utf8Upto = 0;
            utf8Results[1].length = 0;
            for (int j = 0; j < numTerms; j++)
            {
                UnicodeUtil.UTF16toUTF8(terms[j], 0, terms[j].Length, utf8Results[utf8Upto]);
                // Length of the byte prefix shared with the previous term.
                int start = StringHelper.BytesDifference(utf8Results[1 - utf8Upto].result, utf8Results[1 - utf8Upto].length, utf8Results[utf8Upto].result, utf8Results[utf8Upto].length);
                int length = utf8Results[utf8Upto].length - start;
                tvf.WriteVInt(start); // write shared prefix length
                tvf.WriteVInt(length); // write delta length
                tvf.WriteBytes(utf8Results[utf8Upto].result, start, length); // write delta bytes
                utf8Upto = 1 - utf8Upto; // swap buffers: current becomes previous
                int termFreq = freqs[j];
                tvf.WriteVInt(termFreq);
                if (storePositions)
                {
                    int[] positions = tpVector.GetTermPositions(j);
                    if (positions == null)
                        throw new System.SystemException("Trying to write positions that are null!");
                    System.Diagnostics.Debug.Assert(positions.Length == termFreq);

                    // use delta encoding for positions
                    int lastPosition = 0;
                    for (int k = 0; k < positions.Length; k++)
                    {
                        int position = positions[k];
                        tvf.WriteVInt(position - lastPosition);
                        lastPosition = position;
                    }
                }
                if (storeOffsets)
                {
                    TermVectorOffsetInfo[] offsets = tpVector.GetOffsets(j);
                    if (offsets == null)
                        throw new System.SystemException("Trying to write offsets that are null!");
                    System.Diagnostics.Debug.Assert(offsets.Length == termFreq);

                    // use delta encoding for offsets: start is relative to the
                    // previous end, end is relative to its own start.
                    int lastEndOffset = 0;
                    for (int k = 0; k < offsets.Length; k++)
                    {
                        int startOffset = offsets[k].GetStartOffset();
                        int endOffset = offsets[k].GetEndOffset();
                        tvf.WriteVInt(startOffset - lastEndOffset);
                        tvf.WriteVInt(endOffset - startOffset);
                        lastEndOffset = endOffset;
                    }
                }
            }
        }

        // 2nd pass: write field pointers to tvd (delta-encoded; the first
        // pointer is implied by the tvf position recorded in tvx).
        if (numFields > 1)
        {
            long lastFieldPointer = fieldPointers[0];
            for (int i = 1; i < numFields; i++)
            {
                long fieldPointer = fieldPointers[i];
                tvd.WriteVLong(fieldPointer - lastFieldPointer);
                lastFieldPointer = fieldPointer;
            }
        }
    }
    else
        // No vectors: record a zero field count so readers skip this doc.
        tvd.WriteVInt(0);
}
/// <summary>Add specified vector to the document. Document must be open but no Field
/// should be open or exception is thrown. The same document can have <code>addTerm</code>
/// and <code>addVectors</code> calls mixed, however a given Field must either be
/// populated with <code>addTerm</code> or with <code>addVector</code>. *
/// </summary>
public void AddTermFreqVector(TermFreqVector vector)
{
    // Guard clauses: require an open document and no open field.
    if (!IsDocumentOpen())
    {
        throw new System.SystemException("Cannot add term vector when document is not open");
    }
    if (IsFieldOpen())
    {
        throw new System.SystemException("Cannot add term vector when Field is open");
    }
    AddTermFreqVectorInternal(vector);
}
/// <summary> Retrieve the term vector for the given document and Field</summary>
/// <param name="docNum">The document number to retrieve the vector for
/// </param>
/// <param name="field">The Field within the document to retrieve
/// </param>
/// <returns> The TermFreqVector for the document and Field, or null when the
/// segment has no vectors, the field has none for this document, or an
/// error occurred (errors are swallowed — see note below)
/// </returns>
public /*internal*/ virtual TermFreqVector Get(int docNum, System.String field)
{
    lock (this)
    {
        // Check if no term vectors are available for this segment at all
        int fieldNumber = fieldInfos.FieldNumber(field);
        TermFreqVector result = null;
        if (tvx != null)
        {
            try
            {
                //We need to account for the FORMAT_SIZE at when seeking in the tvx
                //We don't need to do this in other seeks because we already have the file pointer
                //that was written in another file
                tvx.Seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
                //System.out.println("TVX Pointer: " + tvx.getFilePointer());
                // tvx stores one 8-byte pointer per document into tvd.
                long position = tvx.ReadLong();
                tvd.Seek(position);
                int fieldCount = tvd.ReadVInt();
                //System.out.println("Num Fields: " + fieldCount);
                // There are only a few fields per document. We opt for a full scan
                // rather then requiring that they be ordered. We need to read through
                // all of the fields anyway to get to the tvf pointers.
                // Field numbers are delta-encoded, hence the running sum.
                int number = 0;
                int found = -1;
                for (int i = 0; i < fieldCount; i++)
                {
                    number += tvd.ReadVInt();
                    if (number == fieldNumber)
                    {
                        found = i;
                    }
                }
                if (found != -1)
                {
                    // Compute position in the tvf file: the per-field pointers
                    // are delta-encoded VLongs, so sum up to the found index.
                    position = 0;
                    for (int i = 0; i <= found; i++)
                    {
                        position += tvd.ReadVLong();
                    }
                    result = ReadTermVector(field, position);
                }
                else
                {
                    // This Field, although valid in the segment, was not found
                    // in this document; result stays null.
                    //System.out.println("Field not found");
                }
            }
            // NOTE(review): this swallows ALL exceptions (including
            // IOException) and returns null instead — callers cannot tell
            // "no vector" from "read failed". Kept as-is since callers may
            // rely on the null; consider narrowing/rethrowing.
            catch (System.Exception e)
            {
                //System.Console.Out.WriteLine(e.StackTrace);
            }
        }
        else
        {
            // No .tvx file: segment was written without term vectors.
            System.Console.Out.WriteLine("No tvx file");
        }
        return(result);
    }
}
/// <summary>Add specified vectors to the document.</summary>
public void AddVectors(TermFreqVector[] vectors)
{
    // Guard clauses: require an open document and no open field.
    if (!IsDocumentOpen())
    {
        throw new System.SystemException("Cannot add term vectors when document is not open");
    }
    if (IsFieldOpen())
    {
        throw new System.SystemException("Cannot add term vectors when Field is open");
    }
    foreach (TermFreqVector vector in vectors)
    {
        AddTermFreqVector(vector);
    }
}